[Feature] Support Paddle-OCR (#4396)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FD Image Build (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled

* init

* update code

* fix code style & disable thinking

* adapt for common_engine.update_mm_requests_chunk_size

* use 3d rope

* use flash_attn_unpadded

* opt siglip

* update to be compatible with the latest codebase

* fix typo

* optim OCR performance

* fix bug

* fix bug

* fix bug

* fix bug

* normalize name

* modify xpu rope

* revert logger

* fix bug

* fix bug

* fix bug

* support default_v1

* optim performance

* fix bug

---------

Co-authored-by: root <root@szzj-acg-tge1-fdda9.szzj.baidu.com>
Co-authored-by: zhangyue66 <zhangyue66@baidu.com>
This commit is contained in:
ming1753
2025-10-24 23:34:30 +08:00
committed by GitHub
parent 822dea8d5f
commit e4e3cede7f
21 changed files with 2869 additions and 175 deletions

View File

@@ -1435,11 +1435,14 @@ class FDConfig:
self.model_config.model_format = "torch"
# TODO
self.max_prefill_batch = int(os.getenv("MAX_PREFILL_NUM", "3"))
if current_platform.is_xpu():
self.max_prefill_batch = 1
if self.model_config is not None and self.model_config.enable_mm:
self.max_prefill_batch = 1 # TODO:当前多模prefill阶段只支持并行度为1,待优化
if not envs.FD_ENABLE_MAX_PREFILL:
self.max_prefill_batch = int(os.getenv("MAX_PREFILL_NUM", "3"))
if current_platform.is_xpu():
self.max_prefill_batch = 1
if self.model_config is not None and self.model_config.enable_mm:
self.max_prefill_batch = 1 # TODO:当前多模prefill阶段只支持并行度为1,待优化
else:
self.max_prefill_batch = self.scheduler_config.max_num_seqs
num_ranks = self.parallel_config.tensor_parallel_size * self.parallel_config.data_parallel_size
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8

View File

@@ -538,6 +538,7 @@ class EngineService:
chunk_grid_thw = grid_thw[grid_thw_st : grid_thw_st + chunk_image_num[idx]]
chunk_patch_num = np.sum(np.prod(chunk_grid_thw, axis=1))
chunk_images = inputs["images"][patch_st : patch_st + chunk_patch_num]
chunk_position_ids = inputs["position_ids"][input_ids_st : input_ids_st + chunk_seq_len[idx]]
chunks_info.append(
{
@@ -546,7 +547,7 @@ class EngineService:
"image_type_ids": (chunk_image_type_ids if chunk_image_type_ids.shape[0] else None),
"grid_thw": (chunk_grid_thw if chunk_grid_thw.shape[0] else None),
"images": (chunk_images if chunk_images.shape[0] else None),
"position_ids": None,
"position_ids": chunk_position_ids,
}
)

View File

@@ -27,6 +27,7 @@ from typing import Union
import numpy as np
import paddle
from fastdeploy import envs
from fastdeploy.engine.request import Request, RequestOutput, RequestStatus, RequestType
from fastdeploy.engine.resource_manager import ResourceManager
from fastdeploy.inter_communicator import IPCSignal
@@ -153,6 +154,7 @@ class ResourceManagerV1(ResourceManager):
# Priority queues for requests.
self.waiting: deque[Request] = deque()
self.running: list[Request] = []
self.enable_max_prefill = envs.FD_ENABLE_MAX_PREFILL
self.finish_execution_pool = ThreadPoolExecutor(max_workers=1)
self.lock = threading.Lock()
self.to_be_rescheduled_request_id_set = set()
@@ -544,8 +546,9 @@ class ResourceManagerV1(ResourceManager):
while self.waiting and token_budget > 0:
if len(self.running) == self.max_num_seqs:
break
if (self.config.model_config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(
scheduled_reqs
if not self.enable_max_prefill and (
(self.config.model_config.enable_mm or paddle.is_compiled_with_xpu())
and self.exist_prefill(scheduled_reqs)
):
break
request = self.waiting[0]

View File

@@ -374,7 +374,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
"""
Create a chat completion for the provided prompt and parameters.
"""
api_server_logger.info(f"Chat Received request: {request.model_dump_json()}")
api_server_logger.debug(f"Chat Received request: {request.model_dump_json()}")
if app.state.dynamic_load_weight:
status, msg = app.state.engine_client.is_workers_alive()
if not status:

View File

@@ -84,6 +84,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
# set prealloc block num for decoder
"FD_ENC_DEC_BLOCK_NUM": lambda: int(os.getenv("FD_ENC_DEC_BLOCK_NUM", "2")),
# enable max prefill in one execution step
"FD_ENABLE_MAX_PREFILL": lambda: int(os.getenv("FD_ENABLE_MAX_PREFILL", "0")),
# Whether to use PLUGINS.
"FD_PLUGINS": lambda: None if "FD_PLUGINS" not in os.environ else os.environ["FD_PLUGINS"].split(","),
# set trace attribute job_id.

View File

@@ -0,0 +1,20 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .paddleocr_vl_processor import PaddleOCRVLProcessor
from .process import DataProcessor
__all__ = ["DataProcessor", "PaddleOCRVLProcessor"]

View File

@@ -0,0 +1,275 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""Image processor class for Keye."""
# TODO: Support videos
import json
import logging
import math
from pathlib import Path
from typing import Dict, List, Optional, Union
import numpy as np
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_utils import (
ImageInput,
is_valid_image,
make_list_of_images,
to_numpy_array,
)
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A list of images.
    """
    is_sequence = isinstance(images, (list, tuple))
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        # Nested list of images: flatten one level.
        return [image for sublist in images for image in sublist]
    if is_sequence and is_valid_image(images[0]):
        # Already a flat list of images.
        return images
    if is_valid_image(images):
        # A single image: wrap it in a list.
        return [images]
    raise ValueError(f"Could not make batched images from {images}")
def adjust_size(size, patch_size):
    """Round ``size`` down to the largest multiple of ``patch_size`` spanning an even patch count."""
    patch_count = size // patch_size
    if patch_count % 2:
        patch_count -= 1
    return patch_count * patch_size
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
):
    """Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.
    """
    # Bump an under-sized dimension up to ``factor``, preserving aspect ratio.
    if height < factor:
        logging.debug(f"smart_resize: height={height} < factor={factor}, reset height=factor")
        width = round((width * factor) / height)
        height = factor
    if width < factor:
        logging.debug(f"smart_resize: width={width} < factor={factor}, reset width=factor")
        height = round((height * factor) / width)
        width = factor
    if max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )
    # Snap both sides to the nearest multiple of ``factor``.
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        # Over budget: shrink both sides by the same ratio, flooring so the
        # result stays within ``max_pixels``.
        shrink = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / shrink / factor) * factor
        w_bar = math.floor(width / shrink / factor) * factor
    elif h_bar * w_bar < min_pixels:
        # Under budget: grow both sides by the same ratio, ceiling so the
        # result reaches ``min_pixels``.
        grow = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * grow / factor) * factor
        w_bar = math.ceil(width * grow / factor) * factor
    return h_bar, w_bar
class ImageProcessor(BaseImageProcessor):
    """Image preprocessor that flattens images into vision-transformer patches.

    Produces ``pixel_values`` shaped ``(grid_t * grid_h * grid_w, channel,
    patch_size, patch_size)`` plus a ``[t, h, w]`` patch-grid descriptor.
    """

    # Names of the tensors this processor can emit.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Initialize the image processor.

        Args:
            do_resize: Whether to resize images to factor-aligned dimensions.
            resample: PIL resampling filter id used for resizing.
            do_rescale: Whether to multiply raw pixel values by ``rescale_factor``.
            rescale_factor: Scale applied to raw pixel values (default 1/255).
            do_normalize: Whether to subtract ``image_mean`` and divide by ``image_std``.
            image_mean: Per-channel mean; defaults to the OpenAI CLIP mean.
            image_std: Per-channel std; defaults to the OpenAI CLIP std.
            do_convert_rgb: Whether to convert inputs to RGB first.
            min_pixels: Lower bound on resized pixel count (see ``smart_resize``).
            max_pixels: Upper bound on resized pixel count (see ``smart_resize``).
            patch_size: Spatial patch edge length.
            temporal_patch_size: Temporal patch length (``_preprocess`` asserts 1).
            merge_size: Spatial merge factor; ``patch_size * merge_size`` is the
                resize alignment factor.
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb

    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Build a processor from ``preprocessor_config.json`` in the given directory."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)

    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """Resize, rescale, normalize and patch-flatten ``images``.

        Returns:
            tuple: ``(flatten_patches, grid_thw)`` where ``flatten_patches`` has
            shape ``(grid_t * grid_h * grid_w, channel, patch_size, patch_size)``
            and ``grid_thw`` is ``np.array([grid_t, grid_h, grid_w])``.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]

        # NOTE(review): the target size is computed from the first image only —
        # assumes every image in the list shares that size; confirm for callers
        # passing multi-frame input.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = image.resize((resized_width, resized_height), resample=self.resample)

            image = to_numpy_array(image)

            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)

            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)

            processed_images.append(image)

        patches = np.array(processed_images)
        # NHWC -> NCHW.
        patches = patches.transpose(0, 3, 1, 2)
        # A single frame is tiled to fill one temporal patch.
        if patches.shape[0] == 1:
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split spatial dims into (grid, patch) pairs, then order axes so each
        # flattened row is one (channel, patch, patch) tile.
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        # Temporal patching beyond 1 is not supported by this layout.
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)

        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """Preprocess images into ``pixel_values`` and a ``grid_thw`` descriptor.

        Any argument left as ``None`` falls back to the value configured at
        construction time.

        Raises:
            NotImplementedError: If ``videos`` is provided (not supported yet).
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")

        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)

        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)

View File

@@ -0,0 +1,290 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import numpy as np
from fastdeploy.engine.request import Request
from fastdeploy.input.text_processor import DataProcessor as TextProcessor
from fastdeploy.utils import data_processor_logger
from .process import DataProcessor
class PaddleOCRVLProcessor(TextProcessor):
    """
    PaddleOCR Vision-Language processor for handling multimodal inputs.

    This processor extends TextProcessor to support:
    - Image processing
    - Multimodal feature extraction
    - Tokenization and position encoding
    - Request processing and model input generation

    Attributes:
        processor (DataProcessor): Underlying data processor instance
        tokenizer: Text tokenizer instance
        limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
    """

    def __init__(
        self,
        config,
        model_name_or_path,
        limit_mm_per_prompt=None,
        mm_processor_kwargs=None,
        reasoning_parser_obj=None,
        tool_parser_obj=None,
    ):
        """
        Initialize PaddleOCRVLProcessor instance.

        Args:
            config: Model configuration object
            model_name_or_path (str): Pretrained model name or path
            limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
            mm_processor_kwargs (dict, optional): Multimodal processor arguments
            reasoning_parser_obj: Reasoning parser instance
            tool_parser_obj: Tool parser instance
        """
        super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)
        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")

        # Validate user-supplied processor kwargs before forwarding them.
        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
        self.processor = DataProcessor(
            model_path=model_name_or_path,
            tokens_per_second=config.vision_config.tokens_per_second,
            tokenizer=self.tokenizer,
            **processor_kwargs,
        )
        self.image_patch_id = self.processor.image_patch_id
        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)

    def process_request(self, request, max_model_len=None, **kwargs):
        """
        Process incoming request and generate model inputs.

        Args:
            request: Input request object
            max_model_len (int, optional): Maximum context length
            **kwargs: Additional processing parameters

        Returns:
            Request: Processed request with model inputs
        """
        task = request.to_dict()
        # Thinking mode is disabled by default for this model.
        task["enable_thinking"] = kwargs.get("enable_thinking", False)
        self.process_request_dict(task, max_model_len)
        request = Request.from_dict(task)
        request = self._apply_default_parameters(request)
        return request

    def _parse_processor_kwargs(self, kwargs):
        """
        Parse and validate multimodal processor arguments.

        Args:
            kwargs (dict): Processor configuration arguments

        Returns:
            dict: Validated processor arguments

        Raises:
            ValueError: If arguments format is invalid
        """
        if not kwargs:
            return {}

        try:
            if not isinstance(kwargs, dict):
                raise ValueError("mm-processor-kwargs must be a dictionary")

            # Validate kwargs types against expected schema
            data_processor_logger.info(f"Processing kwargs: {kwargs}")
            expected_types = {
                "video_max_frames": int,  # Maximum video frames parameter
                "video_min_frames": int,  # Minimum video frames parameter
            }

            for key, value in kwargs.items():
                if key in expected_types and not isinstance(value, expected_types[key]):
                    raise ValueError(
                        f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
                    )

            return kwargs
        except Exception as e:
            # Invalid kwargs are logged and dropped rather than aborting the request.
            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
            return {}

    def _parse_limits(self, limits):
        """
        Parse and validate multimodal input limits.

        Args:
            limits (dict): Input limits configuration

        Returns:
            dict: Validated limits with defaults

        Raises:
            ValueError: If limits format is invalid
        """
        DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}

        if not limits:
            return DEFAULT_LIMITS

        try:
            if not isinstance(limits, dict):
                raise ValueError("limit-mm-per-prompt must be a dictionary")
            data_processor_logger.info(f"_parse_limits:{limits}")
            # User-provided limits override the defaults key by key.
            return {**DEFAULT_LIMITS, **limits}
        except Exception as e:
            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
            return DEFAULT_LIMITS

    def _check_mm_limits(self, item):
        """
        Validate multimodal inputs against configured limits.

        Args:
            item: Input request item to validate

        Raises:
            ValueError: If input exceeds configured limits
        """
        if isinstance(item, dict):
            # Request carries a prompt plus multi_modal_data.
            mm_data = item
        else:
            # Request carries a list of chat messages.
            mm_data = {"image": [], "video": []}

            for message in item:
                if isinstance(message.get("content"), list):
                    for part in message["content"]:
                        if part.get("type") in ["image_url", "image"]:
                            mm_data["image"].append(part)
                        elif part.get("type") in ["video_url", "video"]:
                            mm_data["video"].append(part)

        for modality, data in mm_data.items():
            if modality in self.limit_mm_per_prompt:
                limit = self.limit_mm_per_prompt[modality]
                if len(data) > limit:
                    raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")

    def process_request_dict(self, request, max_model_len=None):
        """
        Process request dictionary into model inputs.

        Args:
            request (dict): Input request dictionary
            max_model_len (int, optional): Maximum context length

        Returns:
            dict: Processed request with model inputs

        Raises:
            ValueError: If request format is invalid
        """
        request = self._apply_default_parameters(request)
        if not request.get("eos_token_ids"):
            request["eos_token_ids"] = self.eos_token_ids

        # Convert user-supplied stop strings into token-id sequences.
        stop_sequences = request.get("stop", [])
        if stop_sequences:
            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
            request["stop_token_ids"] = stop_seqs
            request["stop_seqs_len"] = stop_seqs_len

        if request.get("prompt"):
            # Raw-prompt path: visual inputs come from multimodal_data.
            multimodal_data = request.get("multimodal_data")
            if multimodal_data is None:
                multimodal_data = {}
            self._check_mm_limits(multimodal_data)
            images = multimodal_data.get("image", None)
            videos = multimodal_data.get("video", None)
            outputs = self.processor.text2ids(request["prompt"], images, videos)
        elif request.get("messages"):
            # Chat path: visual inputs are embedded in the messages.
            messages = request["messages"]
            self._check_mm_limits(messages)
            outputs = self.processor.request2ids(request)
        else:
            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")

        metadata = request.get("metadata")
        # Handle continuation of previous generation by appending existing tokens
        if metadata and metadata.get("generated_token_ids"):
            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
        outputs = self.pack_outputs(outputs)
        request["prompt_token_ids"] = outputs["input_ids"].tolist()
        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
        request["multimodal_inputs"] = outputs

        # Handle prompt truncation if exceeds model context length
        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
            request["prompt_token_ids"] = request["prompt_token_ids"][
                : max_model_len - 1
            ]  # Leave space for at least 1 new token

        # Set default max_tokens if not specified
        if request.get("max_tokens") is None:
            request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))  # Ensure at least 1 token

        return request

    def append_generated_tokens(self, outputs, generated_token_ids):
        """
        Append generated tokens to existing outputs.

        Args:
            outputs: Current model outputs
            generated_token_ids: Generated tokens to append
        """
        out = {"input_ids": [], "token_type_ids": [], "position_ids": [], "cur_position": outputs["cur_position"]}
        self.processor._add_text(generated_token_ids, out)

        outputs["input_ids"] = np.concatenate(
            [outputs["input_ids"], np.array(out["input_ids"], dtype=np.int64)], axis=0
        )
        outputs["token_type_ids"] = np.concatenate(
            [outputs["token_type_ids"], np.array(out["token_type_ids"], dtype=np.int64)], axis=0
        )
        # NOTE(review): out["position_ids"][0] appears to be the position block
        # for the appended text, concatenated along the sequence axis — confirm
        # against DataProcessor._add_text.
        outputs["position_ids"] = np.concatenate(
            [outputs["position_ids"], out["position_ids"][0]], axis=1, dtype=np.int64
        )
        outputs["cur_position"] = out["cur_position"]

    def pack_outputs(self, outputs):
        """
        Prepare final output dictionary for model.

        Args:
            outputs: Intermediate processing outputs

        Returns:
            dict: Packed output dictionary with all required fields
        """
        outputs["image_patch_id"] = self.processor.image_token_id
        outputs["video_patch_id"] = self.processor.video_token_id
        # Move the sequence axis first; presumably (3, seq_len) -> (seq_len, 3)
        # for the 3D RoPE — confirm against the model-side consumer.
        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
        return outputs

View File

@@ -0,0 +1,471 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Any, Dict, List, Union
import numpy as np
from paddleformers.transformers import AutoTokenizer
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.utils import data_processor_logger
from .image_processor import ImageProcessor
class DataProcessor:
"""
Processes multimodal inputs (text, images, videos) into model-ready formats.
Handles:
- Tokenization of text with special tokens for visual content
- Image and video preprocessing
- Generation of 3D positional embeddings
- Conversion of chat messages to model inputs
Attributes:
tokenizer: Text tokenizer instance
image_processor: Image/video preprocessor
image_token: Special token for image placeholders
video_token: Special token for video placeholders
vision_start: Token marking start of visual content
"""
    def __init__(
        self,
        model_path: str,
        video_min_frames: int = 4,
        video_max_frames: int = 768,
        tokens_per_second: int = 2,
        tokenizer=None,
        **kwargs,
    ) -> None:
        """
        Initialize the data processor.

        Args:
            model_path: Path to pretrained model
            video_min_frames: Minimum frames to sample from videos
            video_max_frames: Maximum frames to sample from videos
            tokens_per_second: Temporal resolution for positional embeddings
            tokenizer: Optional pre-built tokenizer; loaded from model_path if None
            **kwargs: Additional configuration
        """
        self.min_frames = video_min_frames
        self.max_frames = video_max_frames

        # Initialize tokenizer with left padding and fast tokenizer
        if tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True)
            self.tokenizer.ignored_index = -100  # Set ignored index for loss calculation
        else:
            self.tokenizer = tokenizer
        self.image_processor = ImageProcessor.from_pretrained(model_path)  # Initialize image processor

        # Convolution sizes for patch aggregation
        self.spatial_conv_size = self.image_processor.merge_size
        self.temporal_conv_size = self.image_processor.temporal_patch_size

        # Special tokens and IDs
        self.image_token = "<|image_pad|>"
        self.video_token = "<|video_pad|>"
        self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
        self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
        self.image_patch_id = self.image_token_id
        self.vision_start = "<|vision_start|>"
        self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start)

        self.tokens_per_second = tokens_per_second

        # Text prefixes prepended when rendering each chat role.
        self.role_prefixes = {
            "system": "",
            "user": "User: ",
            "bot": "Assistant: ",
            "assistant": "Assistant: ",
        }
    def _pack_outputs(self, outputs):
        """
        Pack and convert all output data into numpy arrays with appropriate types.

        Args:
            outputs (dict): Dictionary containing model outputs with keys:
                - images: List of visual features
                - grid_thw: List of spatial dimensions
                - image_type_ids: List of content type indicators
                - input_ids: List of token IDs
                - token_type_ids: List of type identifiers
                - position_ids: List of position embeddings

        Returns:
            dict: Processed outputs with all values converted to numpy arrays
        """
        # Process visual outputs - stack if exists or set to None if empty
        if not outputs["images"]:
            outputs["images"] = None  # No images case
            outputs["grid_thw"] = None  # No spatial dimensions
            outputs["image_type_ids"] = None  # No type IDs
        else:
            outputs["images"] = np.vstack(outputs["images"])  # Stack image features vertically
            outputs["grid_thw"] = np.vstack(outputs["grid_thw"])  # Stack spatial dimensions
            outputs["image_type_ids"] = np.array(outputs["image_type_ids"])  # Convert to numpy array

        # Convert all outputs to numpy arrays with appropriate types
        outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)  # Token IDs as int64
        outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)  # Type IDs as int64
        outputs["position_ids"] = np.concatenate(
            outputs["position_ids"], axis=1, dtype=np.int64
        )  # Concatenate position IDs
        return outputs
    def text2ids(self, text, images=None, videos=None):
        """
        Convert text with image/video placeholders into model inputs.

        Args:
            text: Input text with <|image_pad|> and <|video@placeholder|> markers
            images: List of PIL Images corresponding to image placeholders
            videos: List of video data corresponding to video placeholders

        Returns:
            Dict containing:
            - input_ids: Token IDs
            - token_type_ids: Type identifiers (text/image/video)
            - position_ids: 3D positional embeddings
            - images: Preprocessed visual features
            - grid_thw: Spatial/temporal dimensions
            - image_type_ids: Visual content type (0=image, 1=video)
        """
        outputs = {
            "input_ids": [],
            "token_type_ids": [],
            "position_ids": [],
            "images": [],
            "grid_thw": [],
            "image_type_ids": [],
            "labels": [],
            "cur_position": 0,
            "pic_cnt": 0,
            "video_cnt": 0,
            "vit_seqlen": [],
            "vit_position_ids": [],
        }

        # Define placeholders and their lengths
        # IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
        IMAGE_PLACEHOLDER = "<|image_pad|>"
        VIDEO_PLACEHOLDER = "<|video@placeholder|>"
        IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
        VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)

        # Initialize tracking variables for text parsing
        st, image_idx, video_idx = 0, 0, 0  # Start position, image counter, video counter

        while st < len(text):
            # Find next image or video placeholder in text
            image_pos = text.find(IMAGE_PLACEHOLDER, st)
            image_pos = len(text) if image_pos == -1 else image_pos  # Set to end if not found
            video_pos = text.find(VIDEO_PLACEHOLDER, st)
            video_pos = len(text) if video_pos == -1 else video_pos  # Set to end if not found
            ed = min(image_pos, video_pos)  # End position is first placeholder found

            # Emit the plain text preceding the placeholder (may be empty).
            self._add_text(text[st:ed], outputs)
            if ed == len(text):
                break

            if ed == image_pos:
                # Consume one image placeholder.
                outputs["pic_cnt"] += 1
                self._add_image(images[image_idx], outputs)
                image_idx += 1
                st = ed + IMAGE_PLACEHOLDER_LEN
            else:
                # Consume one video placeholder; a dict item may carry
                # per-video options alongside the video payload.
                item = videos[video_idx]
                if isinstance(item, dict):
                    frames, meta = self._load_and_process_video(item["video"], item)
                else:
                    frames, meta = self._load_and_process_video(item, {})

                outputs["video_cnt"] += 1
                self._add_video(frames, meta, outputs)
                video_idx += 1
                st = ed + VIDEO_PLACEHOLDER_LEN

        return self._pack_outputs(outputs)
def request2ids(
    self, request: Dict[str, Any], tgts: List[str] = None
) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
    """
    Convert chat request with multimodal messages into model inputs.

    Args:
        request: Dictionary containing:
            - messages: List of chat messages with text/image/video content
            - request_id: Unique identifier for logging
        tgts: Optional target sequences (not read inside this method).

    Returns:
        Dict with same structure as text2ids() output (packed via
        self._pack_outputs).
    """
    # Accumulator mutated in place by _add_text/_add_image/_add_video.
    outputs = {
        "input_ids": [],
        "token_type_ids": [],
        "position_ids": [],
        "images": [],
        "grid_thw": [],
        "image_type_ids": [],
        "labels": [],
        "cur_position": 0,
        "pic_cnt": 0,
        "video_cnt": 0,
        "vit_seqlen": [],
        "vit_position_ids": [],
    }
    # Parse and validate chat messages
    messages = parse_chat_messages(request.get("messages"))
    image_message_list = []  # Visual content items, collected in message order
    for msg in messages:
        role = msg.get("role")
        assert role in self.role_prefixes, f"Unsupported role: {role}"
        # Normalize content to list format
        content_items = msg.get("content")
        if not isinstance(content_items, list):
            content_items = [content_items]
        # Collect all visual content items
        for item in content_items:
            if isinstance(item, dict) and item.get("type") in ["image", "video"]:
                image_message_list.append(item)
    # Temporarily swap in the parsed messages so apply_chat_template sees
    # them, then restore the caller's original messages afterwards.
    raw_messages = request["messages"]
    request["messages"] = messages
    prompt_token_ids = self.apply_chat_template(request)
    if len(prompt_token_ids) == 0:
        raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
    request["messages"] = raw_messages
    # Walk the token stream; every vision_start token marks the splice point
    # for the next visual item from image_message_list.
    vision_start_index = 0
    vision_message_index = 0
    for i in range(len(prompt_token_ids)):
        if prompt_token_ids[i] == self.vision_start_id:
            # Emit the text segment up to and including the vision_start token.
            self._add_text(prompt_token_ids[vision_start_index : i + 1], outputs)
            vision_start_index = i + 1
            image_message = image_message_list[vision_message_index]
            if image_message["type"] == "image":
                img = image_message.get("image")
                if img is None:
                    # NOTE(review): this `continue` skips the
                    # vision_message_index += 1 below, so the next
                    # vision_start token re-reads the same item — confirm
                    # this is the intended handling of None payloads.
                    continue
                outputs["pic_cnt"] += 1
                self._add_image(img, outputs)
            elif image_message["type"] == "video":
                video_bytes = image_message.get("video")
                if video_bytes is None:
                    # NOTE(review): same index-skip concern as the image
                    # branch above.
                    continue
                frames, meta = self._load_and_process_video(video_bytes, image_message)
                outputs["video_cnt"] += 1
                self._add_video(frames, meta, outputs)
            vision_message_index += 1
    # Trailing text after the last visual item.
    self._add_text(prompt_token_ids[vision_start_index:], outputs)
    return self._pack_outputs(outputs)
def _add_text(self, tokens, outputs: Dict) -> None:
    """
    Append text tokens to the accumulating model-input dictionary.

    Args:
        tokens: Raw text string or an already-tokenized list of IDs.
        outputs: Dictionary accumulating model inputs (mutated in place).

    Note:
        - Raw strings are tokenized with self.tokenizer first.
        - 3D position IDs are extended and "cur_position" advanced.
    """
    if not tokens:
        return None
    if isinstance(tokens, str):
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(tokens))
    else:
        token_ids = tokens
    count = len(token_ids)
    outputs["input_ids"].extend(token_ids)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * count)
    # Text positions advance all three rope axes (t/h/w) in lockstep.
    pos = self._compute_text_positions(outputs["cur_position"], count)
    outputs["position_ids"].append(pos)
    outputs["cur_position"] = pos.max() + 1
def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray:
"""
Generate 3D positional embeddings for text tokens.
Args:
start_pos: Starting position index
num_tokens: Number of tokens to generate positions for
Returns:
numpy.ndarray: 3D position IDs shaped (3, num_tokens)
"""
text_array = np.arange(num_tokens).reshape(1, -1)
text_index = np.broadcast_to(text_array, (3, num_tokens))
position = text_index + start_pos
return position
def _add_image(self, img, outputs: Dict) -> None:
    """
    Append one image's tokens and features to the model-input dictionary.

    Args:
        img: PIL Image to process (converted to RGB before preprocessing).
        outputs: Dictionary accumulating model inputs (mutated in place).

    Note:
        - Token count is grid volume divided by merge_size**2.
        - image_type_ids gets a single 0 entry (image marker).
    """
    processed = self.image_processor.preprocess(images=[img.convert("RGB")])
    merge = self.image_processor.merge_size
    num_tokens = processed["grid_thw"].prod() // merge**2
    grid_thw = processed["grid_thw"].tolist()

    outputs["input_ids"].extend([self.image_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
    outputs["images"].append(processed["pixel_values"])
    outputs["grid_thw"].append(grid_thw)
    outputs["image_type_ids"].append(0)

    # 3D rope positions; second_per_grid_t is 0 for still images.
    t, h, w = grid_thw
    pos = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
    outputs["position_ids"].append(pos)
    outputs["cur_position"] = pos.max() + 1

    # ViT bookkeeping: per-image patch count and flat patch positions.
    patch_count = h * w
    outputs["vit_seqlen"].append(patch_count)
    outputs["vit_position_ids"].append(np.arange(patch_count) % patch_count)
def _add_video(self, frames, meta: Dict, outputs: Dict) -> None:
    """
    Add video data to model inputs dictionary.

    Args:
        frames: Video frames (passed straight to the image processor).
        meta: Video metadata; must contain "fps".
        outputs: Dictionary accumulating model inputs (mutated in place).

    Note:
        - Handles temporal dimension in position embeddings
        - Uses video-specific token IDs and type markers
    """
    ret = self.image_processor.preprocess(images=frames)
    # NOTE(review): reads "image_grid_thw" here while _add_image reads
    # "grid_thw" — confirm the processor really returns different keys
    # for image vs. video inputs.
    num_tokens = ret["image_grid_thw"].prod() // self.image_processor.merge_size**2
    grid_thw = ret["image_grid_thw"].tolist()
    outputs["input_ids"].extend([self.video_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
    outputs["images"].append(ret["pixel_values"])
    outputs["grid_thw"].append(grid_thw)
    # One type id (1 = video) per temporal grid step, unlike images which
    # append a single 0.
    outputs["image_type_ids"].extend([1] * grid_thw[0])
    fps = meta["fps"]
    # Seconds of video covered by one temporal grid step.
    second_per_grid_t = self.temporal_conv_size / fps
    t, h, w = grid_thw
    # NOTE(review): _compute_vision_positions truncates second_per_grid_t
    # with int(), so values < 1.0 (fps > temporal_conv_size) collapse the
    # temporal position scale to 0 — confirm this is intended.
    position_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
    outputs["position_ids"].append(position_ids)
    outputs["cur_position"] = position_ids.max() + 1
    numel = h * w
    outputs["vit_seqlen"].append(numel)
    # `% numel` is a no-op here (arange(numel) values are all < numel).
    outputs["vit_position_ids"].append(np.arange(numel) % numel)
def _compute_vision_positions(
self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float
) -> np.ndarray:
"""
Generate 3D position IDs for visual inputs.
Args:
start_pos: Base position in sequence
t: Temporal patches (1 for images)
h: Height in patches
w: Width in patches
second_per_grid_t: Time per temporal patch
Returns:
np.ndarray: Position IDs for [t,h,w] dimensions
"""
h //= self.spatial_conv_size
w //= self.spatial_conv_size
tn = np.arange(t).reshape(-1, 1)
tn = np.broadcast_to(tn, (t, h * w))
tn = tn * int(second_per_grid_t) * self.tokens_per_second
t_index = tn.flatten()
hn = np.arange(h).reshape(1, -1, 1)
h_index = np.broadcast_to(hn, (t, h, w)).flatten()
wn = np.arange(w).reshape(1, 1, -1)
w_index = np.broadcast_to(wn, (t, h, w)).flatten()
position = np.stack([t_index, h_index, w_index]) + start_pos
return position
def apply_chat_template(self, request):
    """
    Apply chat template to convert messages into a token sequence.

    Args:
        request: Dictionary containing chat messages; may also supply
            "add_generation_prompt" (default True) and "chat_template".
            Side effect: request["text_after_process"] is set to the raw
            templated prompt.

    Returns:
        List of token IDs (multimodal placeholder tokens stripped).

    Raises:
        ValueError: If the tokenizer has no chat template configured.
    """
    if self.tokenizer.chat_template is None:
        raise ValueError("This model does not support chat_template.")
    raw_prompt = self.tokenizer.apply_chat_template(
        request["messages"],
        tokenize=False,
        add_generation_prompt=request.get("add_generation_prompt", True),
        chat_template=request.get("chat_template", None),
    )
    # Strip image/video placeholder tokens before tokenization; the visual
    # token IDs are spliced in separately by request2ids().
    prompt_token_str = raw_prompt.replace(self.image_token, "").replace(self.video_token, "")
    request["text_after_process"] = raw_prompt
    tokens = self.tokenizer.tokenize(prompt_token_str)
    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    # BUG FIX: the original f-string was `{request.get('request_id', ''), }` —
    # the trailing comma inside the braces formatted a one-element tuple,
    # logging e.g. "req_id:('abc',)" instead of "req_id:abc".
    data_processor_logger.info(
        f"req_id:{request.get('request_id', '')} prompt: {raw_prompt} tokens: {tokens}, token_ids: {token_ids}"
    )
    return token_ids

View File

@@ -105,6 +105,18 @@ class InputPreprocessor:
reasoning_parser_obj=reasoning_parser_obj,
tool_parser_obj=tool_parser_obj,
)
elif "PaddleOCRVL" in architecture:
from fastdeploy.input.paddleocr_vl_processor import (
PaddleOCRVLProcessor,
)
self.processor = PaddleOCRVLProcessor(
config=self.model_config,
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
)
else:
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor

View File

@@ -419,33 +419,42 @@ class ErnieVlRotaryEmbedding3D:
self.max_position = max_position
self.freq_allocation = freq_allocation
def __call__(self, position_ids):
def __call__(self, position_ids, max_len_lst, cumsum_seqlens):
rot_emb = paddle.zeros((2, 1, self.max_position, 1, self.rotary_dim // 2), dtype="float32")
bsz = len(cumsum_seqlens) - 1
# position_ids_3d: [bsz, seq_len, 3]
position_ids_3d = paddle.tile(
paddle.arange(self.max_position, dtype="int64").unsqueeze(0).unsqueeze(-1),
[1, 1, 3],
[bsz, 1, 3],
)
for i in range(bsz):
position_ids_cur = position_ids[cumsum_seqlens[i] : cumsum_seqlens[i + 1]]
prefix_max_position_ids = paddle.max(position_ids_cur) + 1
dec_pos_ids = paddle.tile(
paddle.arange(max_len_lst[i], dtype="int64").unsqueeze(-1),
[1, 3],
)
dec_pos_ids = dec_pos_ids + prefix_max_position_ids
position_ids_3d_real = paddle.concat([position_ids_cur, dec_pos_ids], axis=0)
position_ids_3d[i, : position_ids_3d_real.shape[0], :] = position_ids_3d_real
position_ids_3d[:, : position_ids.shape[1], :] = position_ids
# position_ids: [bsz, seq_len]
# position_ids: [bsz(1), seq_len]
position_ids = paddle.arange(0, self.max_position, 1, dtype="float32").reshape((1, -1))
position_ids = position_ids / self.paritial_rotary_factor
indices = paddle.arange(0, self.rotary_dim, 2, dtype="float32")
indices = 1 / self.base ** (indices / self.rotary_dim)
# sinusoid_inp: [bsz, seq_len, 1, head_dim // 2]
# sinusoid_inp: [bsz(1), seq_len, 1, head_dim // 2]
sinusoid_inp = position_ids.unsqueeze(-1) * indices.unsqueeze(0)
# pos_emb: [bsz, seq_len, 1, head_dim]
# pos_emb: [bsz(1), seq_len, 1, head_dim]
pos_emb = paddle.concat([paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1)
# pos_emb: [bsz, 1, seq_len, head_dim]
# pos_emb: [bsz(1), 1, seq_len, head_dim]
pos_emb = paddle.reshape(pos_emb, (-1, 1, self.max_position, self.rotary_dim))
# pos_emb: [bsz, seq_len, 1, head_dim]
# pos_emb: [bsz(1), seq_len, 1, head_dim]
pos_emb = pos_emb.transpose([0, 2, 1, 3])
# sin: [bsz, seq_len, 1, head_dim // 2]
# sin: [bsz(1), seq_len, 1, head_dim // 2]
sin, cos = paddle.chunk(pos_emb, 2, axis=-1)
batch_indices = paddle.arange(end=position_ids.shape[0]).cast("int64")
# batch_indices: [[0]]
@@ -454,39 +463,46 @@ class ErnieVlRotaryEmbedding3D:
sin = sin.tile([position_ids.shape[0], 1, 1, 1])
cos = cos.tile([position_ids.shape[0], 1, 1, 1])
tmp_pos_id_0 = position_ids_3d[..., 0].squeeze().astype("int64")
tmp_pos_id_1 = position_ids_3d[..., 1].squeeze().astype("int64")
tmp_pos_id_2 = position_ids_3d[..., 2].squeeze().astype("int64")
tmp_pos_id_0 = position_ids_3d[..., 0].astype("int64")
tmp_pos_id_1 = position_ids_3d[..., 1].astype("int64")
tmp_pos_id_2 = position_ids_3d[..., 2].astype("int64")
sin_bsz = paddle.index_select(sin, index=batch_indices, axis=0)
sin_t = paddle.index_select(sin_bsz, index=tmp_pos_id_0, axis=1)[:, :, :, -self.freq_allocation :]
sin_h = paddle.index_select(sin_bsz, index=tmp_pos_id_1, axis=1)[
:, :, :, : self.rotary_dim // 2 - self.freq_allocation : 2
]
sin_w = paddle.index_select(sin_bsz, index=tmp_pos_id_2, axis=1)[
:, :, :, 1 : self.rotary_dim // 2 - self.freq_allocation : 2
]
sin_hw = paddle.stack([sin_h, sin_w], axis=-1).reshape(sin_h.shape[:-1] + [sin_h.shape[-1] * 2])
sin_thw = paddle.concat([sin_hw, sin_t], axis=-1)
cos_bsz = paddle.index_select(cos, index=batch_indices, axis=0)
cos_t = paddle.index_select(cos_bsz, index=tmp_pos_id_0, axis=1)[:, :, :, -self.freq_allocation :]
cos_h = paddle.index_select(cos_bsz, index=tmp_pos_id_1, axis=1)[
:, :, :, : self.rotary_dim // 2 - self.freq_allocation : 2
]
cos_w = paddle.index_select(cos_bsz, index=tmp_pos_id_2, axis=1)[
:, :, :, 1 : self.rotary_dim // 2 - self.freq_allocation : 2
]
cos_hw = paddle.stack([cos_h, cos_w], axis=-1).reshape(cos_h.shape[:-1] + [cos_h.shape[-1] * 2])
cos_thw = paddle.concat([cos_hw, cos_t], axis=-1)
rot_emb_list = []
for i in range(bsz):
sin_t = paddle.index_select(sin_bsz, index=tmp_pos_id_0[i], axis=1)[:, :, :, -self.freq_allocation :]
sin_h = paddle.index_select(sin_bsz, index=tmp_pos_id_1[i], axis=1)[
:, :, :, : self.rotary_dim // 2 - self.freq_allocation : 2
]
sin_w = paddle.index_select(sin_bsz, index=tmp_pos_id_2[i], axis=1)[
:, :, :, 1 : self.rotary_dim // 2 - self.freq_allocation : 2
]
sin_hw = paddle.stack([sin_h, sin_w], axis=-1).reshape(sin_h.shape[:-1] + [sin_h.shape[-1] * 2])
sin_thw = paddle.concat([sin_hw, sin_t], axis=-1)
rot_emb[0] = cos_thw
rot_emb[1] = sin_thw
cos_bsz = paddle.index_select(cos, index=batch_indices, axis=0)
cos_t = paddle.index_select(cos_bsz, index=tmp_pos_id_0[i], axis=1)[:, :, :, -self.freq_allocation :]
cos_h = paddle.index_select(cos_bsz, index=tmp_pos_id_1[i], axis=1)[
:, :, :, : self.rotary_dim // 2 - self.freq_allocation : 2
]
cos_w = paddle.index_select(cos_bsz, index=tmp_pos_id_2[i], axis=1)[
:, :, :, 1 : self.rotary_dim // 2 - self.freq_allocation : 2
]
cos_hw = paddle.stack([cos_h, cos_w], axis=-1).reshape(cos_h.shape[:-1] + [cos_h.shape[-1] * 2])
cos_thw = paddle.concat([cos_hw, cos_t], axis=-1)
if current_platform.is_iluvatar():
rot_emb = paddle.stack([rot_emb, rot_emb], axis=-1).reshape([2, 1, self.max_position, 1, self.rotary_dim])
rot_emb[0] = cos_thw
rot_emb[1] = sin_thw
return rot_emb
if current_platform.is_iluvatar():
rot_emb = paddle.stack([rot_emb, rot_emb], axis=-1).reshape(
[2, 1, self.max_position, 1, self.rotary_dim]
)
rot_emb_list.append(rot_emb)
return rot_emb_list
class QwenVlRotaryEmbedding3D:
@@ -504,33 +520,42 @@ class QwenVlRotaryEmbedding3D:
self.max_position = max_position
self.freq_allocation = freq_allocation
def __call__(self, position_ids):
def __call__(self, position_ids, max_len_lst, cumsum_seqlens):
rot_emb = paddle.zeros((2, 1, self.max_position, 1, self.rotary_dim // 2), dtype="float32")
bsz = len(cumsum_seqlens) - 1
# position_ids_3d: [bsz, seq_len, 3]
position_ids_3d = paddle.tile(
paddle.arange(self.max_position, dtype="int64").unsqueeze(0).unsqueeze(-1),
[1, 1, 3],
[bsz, 1, 3],
)
for i in range(bsz):
position_ids_cur = position_ids[cumsum_seqlens[i] : cumsum_seqlens[i + 1]]
prefix_max_position_ids = paddle.max(position_ids_cur) + 1
dec_pos_ids = paddle.tile(
paddle.arange(max_len_lst[i], dtype="int64").unsqueeze(-1),
[1, 3],
)
dec_pos_ids = dec_pos_ids + prefix_max_position_ids
position_ids_3d_real = paddle.concat([position_ids_cur, dec_pos_ids], axis=0)
position_ids_3d[i, : position_ids_3d_real.shape[0], :] = position_ids_3d_real
position_ids_3d[:, : position_ids.shape[1], :] = position_ids
# position_ids: [bsz, seq_len]
# position_ids: [bsz(1), seq_len]
position_ids = paddle.arange(0, self.max_position, 1, dtype="float32").reshape((1, -1))
position_ids = position_ids / self.paritial_rotary_factor
indices = paddle.arange(0, self.rotary_dim, 2, dtype="float32")
indices = 1 / self.base ** (indices / self.rotary_dim)
# sinusoid_inp: [bsz, seq_len, 1, head_dim // 2]
# sinusoid_inp: [bsz(1), seq_len, 1, head_dim // 2]
sinusoid_inp = position_ids.unsqueeze(-1) * indices.unsqueeze(0)
# pos_emb: [bsz, seq_len, 1, head_dim]
# pos_emb: [bsz(1), seq_len, 1, head_dim]
pos_emb = paddle.concat([paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1)
# pos_emb: [bsz, 1, seq_len, head_dim]
# pos_emb: [bsz(1), 1, seq_len, head_dim]
pos_emb = paddle.reshape(pos_emb, (-1, 1, self.max_position, self.rotary_dim))
# pos_emb: [bsz, seq_len, 1, head_dim]
# pos_emb: [bsz(1), seq_len, 1, head_dim]
pos_emb = pos_emb.transpose([0, 2, 1, 3])
# sin: [bsz, seq_len, 1, head_dim // 2]
# sin: [bsz(1), seq_len, 1, head_dim // 2]
sin, cos = paddle.chunk(pos_emb, 2, axis=-1)
batch_indices = paddle.arange(end=position_ids.shape[0]).cast("int64")
# batch_indices: [[0]]
@@ -539,9 +564,9 @@ class QwenVlRotaryEmbedding3D:
sin = sin.tile([position_ids.shape[0], 1, 1, 1])
cos = cos.tile([position_ids.shape[0], 1, 1, 1])
tmp_pos_id_0 = position_ids_3d[..., 0].squeeze().astype("int64")
tmp_pos_id_1 = position_ids_3d[..., 1].squeeze().astype("int64")
tmp_pos_id_2 = position_ids_3d[..., 2].squeeze().astype("int64")
tmp_pos_id_0 = position_ids_3d[..., 0].astype("int64")
tmp_pos_id_1 = position_ids_3d[..., 1].astype("int64")
tmp_pos_id_2 = position_ids_3d[..., 2].astype("int64")
# sin_bsz = paddle.index_select(sin, index=batch_indices, axis=0)
# sin_t = paddle.index_select(sin_bsz, index=tmp_pos_id_0, axis=1)[:, :, :, -self.freq_allocation :]
@@ -559,28 +584,37 @@ class QwenVlRotaryEmbedding3D:
section_w = (self.rotary_dim // 2 - self.freq_allocation) // 2 # 24
sin_bsz = paddle.index_select(sin, index=batch_indices, axis=0)
sin_t = paddle.index_select(sin_bsz, index=tmp_pos_id_0, axis=1)[:, :, :, :section_t]
sin_h = paddle.index_select(sin_bsz, index=tmp_pos_id_1, axis=1)[:, :, :, section_t : section_t + section_h]
sin_w = paddle.index_select(sin_bsz, index=tmp_pos_id_2, axis=1)[
:, :, :, section_t + section_h : section_t + section_h + section_w
]
sin_thw = paddle.concat([sin_t, sin_h, sin_w], axis=-1)
cos_bsz = paddle.index_select(cos, index=batch_indices, axis=0)
rot_emb_list = []
for i in range(bsz):
sin_t = paddle.index_select(sin_bsz, index=tmp_pos_id_0[i], axis=1)[:, :, :, :section_t]
sin_h = paddle.index_select(sin_bsz, index=tmp_pos_id_1[i], axis=1)[
:, :, :, section_t : section_t + section_h
]
sin_w = paddle.index_select(sin_bsz, index=tmp_pos_id_2[i], axis=1)[
:, :, :, section_t + section_h : section_t + section_h + section_w
]
sin_thw = paddle.concat([sin_t, sin_h, sin_w], axis=-1)
cos_t = paddle.index_select(cos_bsz, index=tmp_pos_id_0, axis=1)[:, :, :, :section_t]
cos_h = paddle.index_select(cos_bsz, index=tmp_pos_id_1, axis=1)[:, :, :, section_t : section_t + section_h]
cos_w = paddle.index_select(cos_bsz, index=tmp_pos_id_2, axis=1)[
:, :, :, section_t + section_h : section_t + section_h + section_w
]
cos_thw = paddle.concat([cos_t, cos_h, cos_w], axis=-1)
cos_bsz = paddle.index_select(cos, index=batch_indices, axis=0)
rot_emb[0] = cos_thw
rot_emb[1] = sin_thw
cos_t = paddle.index_select(cos_bsz, index=tmp_pos_id_0[i], axis=1)[:, :, :, :section_t]
cos_h = paddle.index_select(cos_bsz, index=tmp_pos_id_1[i], axis=1)[
:, :, :, section_t : section_t + section_h
]
cos_w = paddle.index_select(cos_bsz, index=tmp_pos_id_2[i], axis=1)[
:, :, :, section_t + section_h : section_t + section_h + section_w
]
cos_thw = paddle.concat([cos_t, cos_h, cos_w], axis=-1)
# neox style need
rot_emb_neox = paddle.concat([rot_emb, rot_emb], axis=-1)
return rot_emb_neox
rot_emb[0] = cos_thw
rot_emb[1] = sin_thw
# neox style need
rot_emb_neox = paddle.concat([rot_emb, rot_emb], axis=-1)
rot_emb_list.append(rot_emb_neox)
return rot_emb_list
def get_rope_3d(
@@ -591,6 +625,8 @@ def get_rope_3d(
max_position: int,
freq_allocation: int,
model_type: str,
max_len_lst: list[int],
cumsum_seqlens: list[int],
) -> paddle.Tensor:
"""
Pre-calculate rotary position embedding for position_ids.
@@ -618,10 +654,14 @@ def get_rope_3d(
rotary_emb3d_layer = QwenVlRotaryEmbedding3D(
rotary_dim, base, partial_rotary_factor, max_position, freq_allocation
)
elif "paddleocr" in model_type:
rotary_emb3d_layer = QwenVlRotaryEmbedding3D(
rotary_dim, base, partial_rotary_factor, max_position, freq_allocation
)
else: # default ernie
rotary_emb3d_layer = ErnieVlRotaryEmbedding3D(
rotary_dim, base, partial_rotary_factor, max_position, freq_allocation
)
rotary_emb_3d = rotary_emb3d_layer(position_ids)
rotary_emb_3d = rotary_emb3d_layer(position_ids, max_len_lst, cumsum_seqlens)
return rotary_emb_3d

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,167 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from paddleformers.transformers.configuration_utils import PretrainedConfig
class PPOCRVisionConfig(PretrainedConfig):
    """
    Vision-tower configuration for PaddleOCR-VL.

    Defaults mirror a ViT-base style setup (12 layers, 12 heads, hidden
    size 768, 14x14 patches over 224x224 images).
    """

    model_type = "paddleocr_vl"
    # Key under which this sub-config is nested in the composite config.
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=14,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        spatial_merge_size=2,  # patches merged per spatial side before the LLM
        temporal_patch_size=2,  # frames grouped per temporal patch (video)
        tokens_per_second=2,  # temporal position scale for video rope
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
class PaddleOCRConfig(PretrainedConfig):
    """
    Composite configuration for the PaddleOCR-VL model.

    Holds the language-model hyperparameters plus a nested
    PPOCRVisionConfig under ``vision_config``.
    """

    model_type = "paddleocr_vl"
    keys_to_ignore_at_inference = ["past_key_values"]
    sub_configs = {"vision_config": PPOCRVisionConfig}

    # Tensor-parallel sharding plan for the language tower.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Pipeline-parallel stage I/O plan.
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=768,
        intermediate_size=11008,
        max_position_embeddings=32768,
        num_hidden_layers=2,
        num_attention_heads=2,
        image_token_id=101304,
        video_token_id=101305,
        vision_start_token_id=101306,
        rms_norm_eps=1e-6,
        use_cache=False,
        use_flash_attention=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        head_dim=128,
        hidden_act="silu",
        use_bias=False,
        rope_theta=10000,
        weight_share_add_bias=True,
        ignored_index=-100,
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0,
        compression_ratio: float = 1.0,
        num_key_value_heads=None,
        max_sequence_length=None,
        tie_word_embeddings=False,
        vision_config=None,
        **kwargs,
    ):
        # BUG FIX: the original called super().__init__ twice (once at the
        # top, once near the end with tie_word_embeddings), consuming
        # **kwargs twice. A single call with all base options is equivalent
        # and avoids double initialization of the base config.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        # Build the nested vision config: from a dict, from defaults, or
        # pass through an already-constructed object (the original left
        # self.vision_config unset in that last case).
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            self.vision_config = vision_config
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.use_flash_attention = use_flash_attention
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.head_dim = head_dim
        # Only SiLU is supported by the decoder implementation.
        if hidden_act != "silu":
            raise NotImplementedError
        self.hidden_act = hidden_act
        self.use_bias = use_bias
        self.weight_share_add_bias = weight_share_add_bias
        self.rope_theta = rope_theta
        self.ignored_index = ignored_index
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.compression_ratio = compression_ratio
        self.num_key_value_heads = num_key_value_heads
        self.max_sequence_length = max_sequence_length
        # Currently, these configuration items are hard-coded
        self.fuse_rms_norm = True
        self.use_sparse_flash_attn = True
        self.use_var_len_flash_attn = False
        self.scale_qk_coeff = 1.0
        self.fuse_softmax_mask = False
        self.use_sparse_head_and_loss_fn = False
        self.use_recompute_loss_fn = False
        self.use_fused_head_and_loss_fn = False
        self.fuse_linear = False
        self.token_balance_seqlen = False
        self.use_rmsnorm = True
        self.fuse_ln = False
        self.cachekv_quant = False
        self.fuse_swiglu = False

View File

@@ -0,0 +1,455 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import re
from functools import partial
from typing import Dict, Optional, Union
import numpy as np
import paddle
import paddle.nn as nn
from paddleformers.transformers import PretrainedModel
from paddleformers.transformers.configuration_utils import PretrainedConfig
from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import (
support_graph_optimization,
)
from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
from fastdeploy.model_executor.utils import (
default_weight_loader,
process_weights_after_loading,
)
from .projector import Projector
from .siglip import SiglipVisionModel
@support_graph_optimization
class PaddleOCRVLModel(nn.Layer):
    """
    Decoder-only language backbone of PaddleOCR-VL.

    Embedding table + num_hidden_layers Ernie4.5 decoder layers + final
    RMSNorm. Vision features are injected by the wrapper model before
    forward() is called; this class only consumes input embeddings.
    """

    def __init__(
        self,
        fd_config: FDConfig = None,
    ):
        super().__init__()
        self.config = fd_config.model_config
        self.num_layers = fd_config.model_config.num_hidden_layers
        # Checkpoint weight names for this architecture are under "model.".
        fd_config.model_config.pretrained_config.prefix_name = "model"
        self._dtype = fd_config.model_config.torch_dtype
        self.embed_tokens = VocabParallelEmbedding(
            fd_config=fd_config,
            num_embeddings=fd_config.model_config.vocab_size,
            embedding_dim=fd_config.model_config.hidden_size,
            params_dtype=self._dtype,
            prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"),
        )
        self.layers = nn.LayerList(
            [
                Ernie4_5_DecoderLayer(
                    fd_config=fd_config,
                    prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}",
                )
                for i in range(self.num_layers)
            ]
        )
        # Override each decoder layer's attention with a neox-rotary-style
        # Attention instance (use_neox_rotary_style=True).
        for i, layer in enumerate(self.layers):
            layer.self_attn.attn = Attention(
                fd_config=fd_config,
                layer_id=i,
                prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}.self_attn",
                use_neox_rotary_style=True,
            )
        self.norm = RMSNorm(
            fd_config,
            hidden_size=fd_config.model_config.hidden_size,
            eps=fd_config.model_config.rms_norm_eps,
            prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
        )

    def load_state_dict(self, state_dict):
        """
        Load model parameters from a given state dictionary.

        Args:
            state_dict (dict[str, np.ndarray | paddle.Tensor]):
                A dictionary containing model parameters, where keys are parameter names
                and values are NumPy arrays or PaddlePaddle tensors.
        """
        self.embed_tokens.load_state_dict(state_dict)
        self.norm.load_state_dict(state_dict)
        for i in range(self.num_layers):
            logger.info(f"Start load layer {i}")
            self.layers[i].load_state_dict(state_dict)

    def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor:
        # Token IDs -> embeddings; padding has already been removed upstream.
        return self.embed_tokens(ids_remove_padding=ids_remove_padding)

    def forward(
        self,
        input_embeddings: paddle.Tensor,
        forward_meta: ForwardMeta,
    ):
        """Run all decoder layers plus the final norm over input embeddings."""
        hidden_states = input_embeddings
        residual = None
        for i in range(self.num_layers):
            hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual)
        # Fold the last layer's residual back in before the final norm.
        hidden_states = hidden_states + residual
        out = self.norm(hidden_states)
        return out
@ModelRegistry.register_model_class(
architecture="PaddleOCRVLForConditionalGeneration",
module_name="paddleocr_vl.paddleocr_vl",
category=ModelCategory.MULTIMODAL,
primary_use=ModelCategory.MULTIMODAL,
)
class PaddleOCRVLForConditionalGeneration(ModelForCasualLM):
def __init__(self, fd_config):
super().__init__(fd_config)
config = fd_config.model_config
self.config = config
self.mlp_AR = Projector(config, config.vision_config, prefix="mlp_AR")
self.visual = SiglipVisionModel(config.vision_config, prefix="visual")
self.model = PaddleOCRVLModel(fd_config)
self.vocab_size = config.vocab_size
self.lm_head = ParallelLMHead(
fd_config=fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix="lm_head",
)
# Persistent buffers for CUDA graphs.
if envs.FD_ENABLE_MAX_PREFILL:
max_length = fd_config.scheduler_config.max_num_seqs * fd_config.model_config.max_model_len
else:
max_length = fd_config.model_config.max_model_len
self._input_embeddings = paddle.zeros(
[max_length, fd_config.model_config.hidden_size],
dtype=fd_config.model_config.dtype,
)
@paddle.no_grad()
def load_weights(self, weights_iterator) -> None:
"""
Load model parameters from a given weights_iterator object.
Args:
weights_iterator (Iterator): An iterator yielding (name, weight) pairs.
"""
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("up_gate_proj", "gate_proj", "gate"),
("up_gate_proj", "up_proj", "up"),
("embed_tokens.embeddings", "embed_tokens", None),
("lm_head.linear", "lm_head", None),
]
params_dict = dict(self.named_parameters())
process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))
for loaded_weight_name, loaded_weight in weights_iterator:
loaded_weight_name = (
self.process_weights_before_loading_fn(loaded_weight_name)
if getattr(self, "process_weights_before_loading_fn", None)
else loaded_weight_name
)
if loaded_weight_name is None:
continue
for param_name, weight_name, shard_id in stacked_params_mapping:
if weight_name not in loaded_weight_name:
continue
model_param_name = loaded_weight_name.replace(weight_name, param_name)
if model_param_name not in params_dict:
continue
param = params_dict[model_param_name]
weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config))
weight_loader(param, loaded_weight, shard_id)
break
else:
model_param_name = loaded_weight_name
if model_param_name not in params_dict:
continue
param = params_dict[model_param_name]
weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config))
weight_loader(param, loaded_weight)
model_sublayer_name = re.sub(r"\.(weight)$", "", model_param_name)
process_weights_after_loading_fn(model_sublayer_name, param)
@paddle.no_grad()
def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, paddle.Tensor]]):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
self.visual.load_state_dict(state_dict)
self.projector.load_state_dict(state_dict)
self.lm_head.load_state_dict(state_dict)
    @property
    def projector(self):
        # Alias: the multimodal projector sublayer is stored as ``mlp_AR``.
        return self.mlp_AR
@classmethod
def name(self):
return "PaddleOCRVLForConditionalGeneration"
def compute_logits(self, hidden_states: paddle.Tensor):
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
logits[:, self.vocab_size :] = -float("inf")
return logits
    def get_input_embeddings(
        self,
        ids_remove_padding: paddle.Tensor,
        image_features: Optional[paddle.Tensor] = None,
    ) -> paddle.Tensor:
        """
        Build token embeddings, splicing visual features into the positions
        occupied by image placeholder tokens.

        Args:
            ids_remove_padding: Flattened token ids with padding removed.
            image_features: Projected visual features; must be provided whenever
                ``ids_remove_padding`` contains image placeholder tokens.

        Returns:
            The combined input-embedding tensor.
        """
        input_embeddings = self.model.get_input_embeddings(ids_remove_padding=ids_remove_padding)
        # Positions holding the image placeholder token receive visual features.
        image_mask = ids_remove_padding == self.model.config.image_token_id
        image_token_num = image_mask.sum()
        if image_token_num > 0:
            # NOTE(review): assumes image_features rows align 1:1 with the masked
            # positions and that self._dtype matches the embedding dtype — confirm.
            input_embeddings[image_mask] = image_features.cast(self._dtype)
        return input_embeddings
    def forward(
        self,
        ids_remove_padding: paddle.Tensor,
        image_features: Optional[paddle.Tensor],
        forward_meta: ForwardMeta,
    ):
        """
        Run the language model over the combined text+image embeddings.

        Freshly built embeddings are copied into the preallocated
        ``self._input_embeddings`` buffer so the model always consumes the same
        persistent tensor across steps.
        """
        input_embeddings = self.get_input_embeddings(
            ids_remove_padding=ids_remove_padding, image_features=image_features
        )
        # In-place copy into the persistent buffer (second arg presumably the
        # blocking flag — confirm against paddle.Tensor.copy_ semantics).
        # NOTE(review): the buffer was allocated with a fixed max_length; confirm
        # copy_ handles the per-step variable sequence length as intended.
        self._input_embeddings.copy_(input_embeddings, False)
        hidden_states = self.model(
            input_embeddings=self._input_embeddings,
            forward_meta=forward_meta,
        )
        return hidden_states
class PaddleOCRVLPretrainedModel(PretrainedModel):
    """
    PaddleOCRVL pretrained-model shell: declares how each weight is split or
    merged under tensor parallelism. No parameter initialization is performed.
    """

    config_class = FDConfig

    def _init_weight(self, layer):
        """
        _init_weight

        Intentionally a no-op: weights always come from a checkpoint.
        """
        return None

    @classmethod
    def arch_name(cls):
        """Return the model architecture name."""
        # Fixed: classmethod first parameter renamed from ``self`` to ``cls``.
        return "PaddleOCRVLForConditionalGeneration"

    from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
    from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid
    from fastdeploy.model_executor.models.utils import WeightMeta

    # (weight-name template, is_column_split, optional split mode) for the
    # language-model weights. A duplicated shared_experts.down_proj entry was
    # removed (it only overwrote an identical mapping).
    weight_infos = [
        WeightMeta(
            f".layers.{{{layerid.LAYER_ID}}}.self_attn.qkv_proj.weight",
            True,
            tsm.GQA,
        ),
        WeightMeta(f".layers.{{{layerid.LAYER_ID}}}.self_attn.o_proj.weight", False),
        WeightMeta(
            f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.up_gate_proj.weight",
            True,
            tsm.PairFused,
        ),
        WeightMeta(f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.down_proj.weight", False),
        WeightMeta(
            f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.experts.{{{layerid.TEXT_EXPERT_ID}}}.up_gate_proj.weight",
            True,
            tsm.PairFused,
        ),
        WeightMeta(
            f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.experts.{{{layerid.TEXT_EXPERT_ID}}}.down_proj.weight",
            False,
        ),
        WeightMeta(
            f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.experts.{{{layerid.IMG_EXPERT_ID}}}.up_gate_proj.weight",
            True,
            tsm.PairFused,
        ),
        WeightMeta(
            f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.experts.{{{layerid.IMG_EXPERT_ID}}}.down_proj.weight",
            False,
        ),
        WeightMeta(
            f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.shared_experts.up_gate_proj.weight",
            True,
            tsm.PairFused,
        ),
        WeightMeta(
            f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.shared_experts.down_proj.weight",
            False,
        ),
        WeightMeta(".embed_tokens.weight", False),
        WeightMeta("lm_head.weight", True),
    ]

    # Split metadata for the resampler and vision-tower weights.
    weight_vison = [
        # resampler_model
        WeightMeta("ernie.resampler_model.spatial_linear.0.weight", False),
        WeightMeta("resampler_model.spatial_linear.0.weight", False),
        # vision
        WeightMeta(
            f"vision_model.blocks.{{{layerid.LAYER_ID}}}.attn.proj.weight",
            False,
        ),
        WeightMeta(f"vision_model.blocks.{{{layerid.LAYER_ID}}}.mlp.fc2.weight", False),
        WeightMeta(f"vision_model.blocks.{{{layerid.LAYER_ID}}}.mlp.fc1.weight", True),
        WeightMeta(f"vision_model.blocks.{{{layerid.LAYER_ID}}}.mlp.fc1.bias", True),
        WeightMeta(
            f"vision_model.blocks.{{{layerid.LAYER_ID}}}.attn.qkv.weight",
            True,
            tsm.GQA,
        ),
        WeightMeta(
            f"vision_model.blocks.{{{layerid.LAYER_ID}}}.attn.qkv.bias",
            True,
            tsm.GQA,
        ),
    ]

    @classmethod
    def _get_tensor_parallel_mappings(cls, config: PretrainedConfig, is_split=True):
        """
        get_tensor_parallel_mappings

        Build the weight-name -> split/merge-function mapping for every
        language-model and vision-tower weight under the configured
        tensor-parallel degree.
        """
        from fastdeploy.model_executor.models.tp_utils import (
            build_expanded_keys,
            has_prefix,
            split_or_merge_func_v1,
        )

        # Split/merge function for the language-model weights.
        fn = split_or_merge_func_v1(
            is_split=is_split,
            tensor_parallel_degree=config.tensor_parallel_degree,
            tensor_parallel_rank=config.tensor_parallel_rank,
            num_attention_heads=config.num_attention_heads,
            num_key_value_heads=config.num_key_value_heads,
            head_dim=config.head_dim,
        )
        # The vision tower has its own head geometry.
        vision_fn = split_or_merge_func_v1(
            is_split=is_split,
            tensor_parallel_degree=config.tensor_parallel_degree,
            tensor_parallel_rank=config.tensor_parallel_rank,
            num_attention_heads=config.vision_config.get("num_heads"),
            num_key_value_heads=config.vision_config.get("num_heads"),
            head_dim=config.vision_config.get("hidden_size") // config.vision_config.get("num_heads"),
        )

        def get_tensor_parallel_split_mappings(
            num_layers: int,
            moe_num_experts: list[int],
            moe_layer_start_index: int,
            prefix_name: str,
        ):
            # One action per weight template; expanded per layer/expert below.
            base_actions = {}
            for weight_name, is_column, extra in cls.weight_infos:
                params = {
                    "is_column": is_column,
                    **({extra.value: True} if extra else {}),
                }
                if "lm_head.weight" in weight_name or weight_name == "":
                    key = weight_name
                elif not has_prefix(prefix_name, weight_name):
                    key = f"{prefix_name}{weight_name}"
                else:
                    key = weight_name
                base_actions[key] = partial(fn, **params)
            final_actions = build_expanded_keys(
                base_actions,
                num_layers,
                (moe_layer_start_index if moe_layer_start_index > 0 else num_layers),
                text_num_experts=moe_num_experts[0],
                img_num_experts=moe_num_experts[1],
            )
            return final_actions

        def get_vison_parallel_split_mappings(num_layers: int):
            base_actions = {}
            for weight_name, is_column, extra in cls.weight_vison:
                params = {
                    "is_column": is_column,
                    **({extra.value: True} if extra else {}),
                }
                base_actions[weight_name] = partial(vision_fn, **params)
            final_actions = build_expanded_keys(
                base_actions,
                num_layers,
            )
            return final_actions

        # The earliest MoE layer bounds the dense-layer expansion range.
        moe_layer_start_index = -1
        if isinstance(config.moe_layer_start_index, list):
            moe_layer_start_index = min(config.moe_layer_start_index)
        elif isinstance(config.moe_layer_start_index, int):
            moe_layer_start_index = config.moe_layer_start_index

        mappings = get_tensor_parallel_split_mappings(
            config.num_hidden_layers,
            config.moe_num_experts,
            moe_layer_start_index,
            config.prefix_name,
        )
        vision_mappings = get_vison_parallel_split_mappings(config.vision_config.get("depth"))
        return {**mappings, **vision_mappings}

View File

@@ -0,0 +1,107 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import math
import paddle
import paddle.nn as nn
from fastdeploy.model_executor.layers.utils import get_tensor
class GELUActivation(nn.Layer):
    """
    GELU activation as originally implemented in the Google BERT repo.

    OpenAI GPT's variant differs slightly: 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))).
    See the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        # Either the pure-Python erf formulation or paddle's native gelu.
        self.act = self._gelu_python if use_gelu_python else nn.functional.gelu

    def _gelu_python(self, input):
        # exact GELU: x * Phi(x) written via erf
        return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0)))

    def forward(self, input):
        return self.act(input)
class Projector(nn.Layer):
    """
    Projects vision-encoder features into the language-model hidden space.

    Adjacent patches are merged in 2x2 blocks (``merge_kernel_size``) before a
    two-layer MLP (linear -> GELU -> linear) maps them to
    ``text_config.hidden_size``.
    """

    def __init__(self, text_config, vision_config, prefix=""):
        super().__init__()
        self.prefix_name = prefix
        self.text_config = text_config
        self.vision_config = vision_config
        self.merge_kernel_size = (2, 2)

        # Merged-patch feature size: vision hidden size times the 2x2 block.
        self.hidden_size = self.vision_config.hidden_size * self.merge_kernel_size[0] * self.merge_kernel_size[1]

        self.pre_norm = nn.LayerNorm(self.vision_config.hidden_size, epsilon=1e-05)
        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size)
        self.act = GELUActivation()
        self.linear_2 = nn.Linear(self.hidden_size, self.text_config.hidden_size)

    def forward(self, image_features, image_grid_thw):
        """
        Args:
            image_features: Either a list/tuple of per-image feature tensors
                (each (T*H*W, D)) or a single packed tensor.
            image_grid_thw: Per-image (t, h, w) patch grids; only used on the
                list/tuple path.

        Returns:
            A list of projected tensors (list input) or one projected tensor.
        """
        m1, m2 = self.merge_kernel_size
        if isinstance(image_features, (list, tuple)):
            # Fixed: import hoisted out of the per-image loop (it was executed
            # on every iteration).
            from einops import rearrange

            processed_features = list()
            for image_feature, image_grid in zip(image_features, image_grid_thw):
                image_feature = self.pre_norm(image_feature)  # shape: (T*H*W, D)
                t, h, w = image_grid
                # Fold each m1 x m2 patch block into the channel dimension.
                image_feature = rearrange(
                    image_feature,
                    "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
                    t=int(t),
                    h=int(h // m1),
                    p1=int(m1),
                    w=int(w // m2),
                    p2=int(m2),
                )
                hidden_states = self.linear_1(image_feature)
                hidden_states = self.act(hidden_states)
                hidden_states = self.linear_2(hidden_states)
                processed_features.append(hidden_states)
            return processed_features

        # Packed-tensor path: the 2x2 merge is expressed as a plain reshape.
        dim = image_features.shape[-1]
        image_features = paddle.reshape(image_features, [-1, dim])
        hidden_states = self.pre_norm(image_features)
        hidden_states = paddle.reshape(hidden_states, [-1, self.hidden_size])
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states

    def load_state_dict(self, state_dict):
        """
        Load (and pop) this module's parameters from ``state_dict`` using keys
        prefixed with ``self.prefix_name``.

        Raises:
            ValueError: If a key is missing or a shape mismatches.
        """
        params_dict = dict(self.named_parameters())
        for param_name, param in params_dict.items():
            state_dict_key = f"{self.prefix_name}.{param_name}"
            if state_dict_key not in state_dict:
                raise ValueError(f"The key {state_dict_key} does not exist in state_dict. ")
            tensor = get_tensor(state_dict.pop(state_dict_key))
            if param.shape != tensor.shape:
                raise ValueError(f"{state_dict_key} param.shape={param.shape} tensor.shape={tensor.shape}")
            param.copy_(tensor, False)

View File

@@ -0,0 +1,740 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import List, Optional, Tuple, Union
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.functional.flash_attention import flash_attn_unpadded
from paddleformers.transformers.activations import ACT2FN
from paddleformers.transformers.model_utils import PretrainedModel
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.utils import slice_fn
try:
from paddle.nn.functional.flash_attention import flash_attention_v3_varlen
except:
flash_attention_v3_varlen = None
from .config import PPOCRVisionConfig
def rotate_half(x):
    """Rotate the two halves of the last dimension: (x1, x2) -> (-x2, x1)."""
    half = x.shape[-1] // 2
    return paddle.concat([-x[..., half:], x[..., :half]], axis=-1)
def _ensure_cos_sin_dim(cos, sin, dim_needed):
last = cos.shape[-1]
if last == dim_needed:
return cos, sin
elif last * 2 == dim_needed:
cos = paddle.concat([cos, cos], axis=-1)
sin = paddle.concat([sin, sin], axis=-1)
return cos, sin
else:
raise ValueError(f"Unexpected cos/sin last-dim: {last}, expected {dim_needed} or {dim_needed//2}")
def apply_rotary_pos_emb_vision(x, cos, sin):
    """Apply rotary position embedding in float32, casting back to x's dtype."""
    dtype_in = x.dtype
    xf = x.astype("float32")
    rotated = xf * cos + rotate_half(xf) * sin
    return rotated.astype(dtype_in)
class QKVLinear(nn.Linear):
    """
    Fused QKV projection whose parameters are filled from separate q/k/v
    checkpoint shards through a custom ``weight_loader``.
    """

    def __init__(self, config, in_features, out_features, weight_attr=None, bias_attr=None):
        super().__init__(in_features, out_features, weight_attr, bias_attr)
        self.config = config
        self.in_features = in_features
        self.out_features = out_features
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        assert self.head_dim * self.num_heads == self.embed_dim
        # Route checkpoint shards through our loader (weight and bias alike).
        self.weight.weight_loader = self.weight_loader
        self.bias.weight_loader = self.weight_loader

    def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
        """
        Copy one q/k/v checkpoint shard into its slice of the fused parameter.

        Args:
            param: The fused weight or bias parameter.
            loaded_weight: The checkpoint tensor for one shard.
            loaded_shard_id: "q", "k" or "v"; anything else falls into the "v"
                branch.
        """
        # Tensor parallelism splits the weight along the output_dim
        loaded_weight = get_tensor(loaded_weight)
        if not param._is_initialized():
            param.initialize()
        # q, k and v each occupy num_heads * head_dim output slots, laid out
        # consecutively in that order.
        if loaded_shard_id == "q":
            param_shard_offset = 0
            param_shard_size = self.num_heads * self.head_dim
        elif loaded_shard_id == "k":
            param_shard_offset = self.num_heads * self.head_dim
            param_shard_size = self.num_heads * self.head_dim
        else:
            # loaded_shard_id == "v"
            param_shard_offset = self.num_heads * self.head_dim * 2
            param_shard_size = self.num_heads * self.head_dim
        # slice_fn selects the shard's slice of the fused parameter
        # (presumably a view so copy_ below writes through — confirm).
        param = slice_fn(param, self.out_features, start=param_shard_offset, end=param_shard_offset + param_shard_size)
        assert param.shape == loaded_weight.shape, (
            f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
        )
        # Ensure loaded weight dtype matches model param dtype
        if loaded_weight.dtype != param.dtype:
            if loaded_weight.dtype == paddle.int8 and param.dtype == paddle.float8_e4m3fn:
                # Same bit-width: reinterpret int8 checkpoint bytes as fp8.
                loaded_weight = loaded_weight.view(param.dtype)
            else:
                loaded_weight = loaded_weight.cast(param.dtype)
        param.copy_(loaded_weight, False)
class SiglipAttention(nn.Layer):
    """
    Variable-length multi-head self-attention with rotary embeddings for the
    vision tower. Dispatches to flash-attention v3 on SM90+ GPUs when the
    paddle build supports it, otherwise to ``flash_attn_unpadded``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        assert self.head_dim * self.num_heads == self.embed_dim
        self.scale = self.head_dim**-0.5
        self.qkv_proj = QKVLinear(config, self.embed_dim, self.embed_dim * 3, bias_attr=True)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
        # FA3 needs both an SM90+ device and a paddle build compiled with an
        # SM90+ arch; otherwise fall back to flash_attn_unpadded, which takes
        # explicit scale/training kwargs.
        prop = paddle.device.cuda.get_device_properties()
        cc = prop.major * 10 + prop.minor
        is_current_sm_supported = cc >= 90
        is_paddle_supported = any(num >= 90 for num in paddle.version.cuda_archs())
        if is_current_sm_supported and is_paddle_supported:
            self.flash_attn_func = flash_attention_v3_varlen
            self.flash_attn_kwargs = {}
        else:
            self.flash_attn_func = flash_attn_unpadded
            self.flash_attn_kwargs = {"scale": self.scale, "training": False}

    def forward(
        self,
        hidden_states: paddle.Tensor,  # [B, L, D]
        attention_mask: Optional[paddle.Tensor] = None,
        output_attentions: Optional[bool] = False,
        cu_seqlens: Optional[List[paddle.Tensor]] = None,
        max_seqlen: Optional[paddle.Tensor] = None,
        rope_emb: Optional[Tuple[paddle.Tensor, paddle.Tensor]] = None,  # (cos, sin)
    ):
        """Varlen attention over a packed sequence delimited by ``cu_seqlens``."""
        B, seq_length, D = hidden_states.shape
        # NOTE(review): this reshape drops the batch axis, so the path assumes
        # B == 1 (all tokens packed into one varlen sequence) — confirm.
        qkv = (
            self.qkv_proj(hidden_states)
            .reshape(
                [
                    seq_length,
                    3,
                    self.num_heads,
                    -1,
                ]
            )
            .transpose(perm=[1, 0, 2, 3])
        )
        q, k, v = qkv.unbind(axis=0)
        cos, sin = rope_emb
        # --------
        q = apply_rotary_pos_emb_vision(q, cos, sin)
        k = apply_rotary_pos_emb_vision(k, cos, sin)
        # Non-causal varlen attention; q and k share the same seq boundaries.
        attn_output = self.flash_attn_func(
            q,
            k,
            v,
            cu_seqlens,
            cu_seqlens,
            max_seqlen,
            max_seqlen,
            causal=False,
            **self.flash_attn_kwargs,
        )[0]
        # --------
        attn_output = attn_output.reshape(seq_length, -1)
        attn_output = self.out_proj(attn_output)
        return attn_output
class SiglipVisionEmbeddings(nn.Layer):
    """
    Patchify pixel inputs and add position embeddings for the SigLIP vision
    tower. Variable image grids are handled by bilinearly interpolating the
    learned position table, with an LFU cache keyed by (h, w).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size  # 1152
        self.image_size = config.image_size  # 384
        self.patch_size = config.patch_size  # 14
        self.patch_embedding = nn.Conv2D(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="VALID",
        )
        self.num_patches = (self.image_size // self.patch_size) ** 2  # 729
        self.num_positions = self.num_patches
        # LFU cache of interpolated position tables, keyed by (h, w) grid.
        self.cache_position_embedding = dict()
        self.cache_position_count = dict()
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        # Fallback table addressed by flat position ids for packed sequences.
        self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
        self.register_buffer(
            "position_ids",
            paddle.arange(self.num_positions).unsqueeze(0),
            persistable=False,
        )

    def interpolate_pos_encoding(self, embeddings, height: int, width: int, is_after_patchify: bool = False):
        """
        Bilinearly resize the square learned position table to (new_h, new_w).

        ``height``/``width`` are a patch grid when ``is_after_patchify`` is
        True, otherwise raw pixel sizes that are divided by patch_size first.
        """
        num_positions = self.position_embedding.weight.shape[0]
        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
        dim = embeddings.shape[-1]
        if is_after_patchify:
            new_height = height
            new_width = width
        else:
            new_height = height // self.patch_size
            new_width = width // self.patch_size
        # int64 cast truncates; num_positions is expected to be a perfect square.
        sqrt_num_positions = paddle.to_tensor(num_positions**0.5, dtype=paddle.int64)
        patch_pos_embed = patch_pos_embed.reshape((1, sqrt_num_positions, sqrt_num_positions, dim))
        patch_pos_embed = patch_pos_embed.transpose((0, 3, 1, 2))
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bilinear",
            align_corners=False,
        )
        patch_pos_embed = patch_pos_embed.transpose((0, 2, 3, 1)).reshape((1, -1, dim))
        return patch_pos_embed

    @staticmethod
    def flatten_list(image_grid_thw):
        """Flatten one level of list nesting in a list of (t, h, w) grids."""
        tmp_image_grid_thw = list()
        for image_grid in image_grid_thw:
            if isinstance(image_grid, list):
                tmp_image_grid_thw.extend(image_grid)
            else:
                tmp_image_grid_thw.append(image_grid)
        return tmp_image_grid_thw

    def fetch_position_embedding_lfu_cache(self, embeddings, h, w, max_cache=20):
        """Return the interpolated position table for grid (h, w), evicting the
        least-frequently-used entry once ``max_cache`` grids are cached."""
        grid = (h, w)
        if grid in self.cache_position_embedding:
            self.cache_position_count[grid] += 1
            return self.cache_position_embedding[grid]
        if len(self.cache_position_embedding) >= max_cache:
            min_hit_grid = min(self.cache_position_count, key=self.cache_position_count.get)
            self.cache_position_count.pop(min_hit_grid)
            self.cache_position_embedding.pop(min_hit_grid)
        position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
        self.cache_position_count[grid] = 1
        self.cache_position_embedding[grid] = position_embedding
        return position_embedding

    def forward(
        self,
        pixel_values: paddle.Tensor,  # [B, L, C, H, W]
        position_ids: Optional[paddle.Tensor] = None,  # [B or 1, S]
        image_grid_thw: Optional[List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]] = None,
        interpolate_pos_encoding: bool = False,
    ) -> paddle.Tensor:
        """
        Patch-embed the packed pixel tensor and add position embeddings,
        either interpolated per image grid or looked up by flat position id.
        """
        # Promote a 4-D [L, C, H, W] input to a singleton batch.
        if pixel_values.dim() == 4:
            pixel_values = pixel_values.unsqueeze(0)
        if pixel_values.dim() == 5:
            assert position_ids is not None
            from einops import rearrange

            batch_size, squence_len, channel, height, width = pixel_values.shape
            target_dtype = self.patch_embedding.weight.dtype
            pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
            embeddings = patch_embeds.flatten(-2).squeeze(-1)
            embeddings = rearrange(embeddings, "(b l) d -> b l d", b=batch_size, l=squence_len)
            # todo: not debug
            if interpolate_pos_encoding and image_grid_thw is not None:
                flatten_image_grid_thw = self.flatten_list(image_grid_thw)
                flatten_image_grid_thw = np.array(flatten_image_grid_thw)
                assert batch_size == 1
                start = 0
                # Total patches across all grids must match the packed length.
                assert sum([np.prod(x) for x in flatten_image_grid_thw]) == embeddings.shape[1], (
                    flatten_image_grid_thw,
                    embeddings.shape,
                )
                embeddings = embeddings.squeeze(0)
                tmp_embeddings = list()
                for image_grid in image_grid_thw:
                    t, h, w = image_grid
                    end = start + t * h * w
                    image_embeddings = embeddings[int(start) : int(end), :]
                    # One (h, w) table per image, tiled over its t frames.
                    position_embedding = (
                        self.interpolate_pos_encoding(image_embeddings, h, w, True).squeeze(0).tile((t, 1))
                    ).astype(image_embeddings.dtype)
                    image_embeddings = image_embeddings + position_embedding
                    tmp_embeddings.append(image_embeddings)
                    start = end
                embeddings = paddle.concat(tmp_embeddings, axis=0).unsqueeze(0)
            else:
                embeddings = embeddings + self.packing_position_embedding(position_ids)
            return embeddings
        else:
            raise NotImplementedError(str(pixel_values.shape))
class SiglipMLP(nn.Layer):
    """Two-layer feed-forward block: fc1 -> activation -> fc2."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Fixed: resolve the activation name locally instead of mutating the
        # (possibly shared) config object in place.
        hidden_act = config.hidden_act
        if hidden_act == "gelu_pytorch_tanh":
            # NOTE(review): remapping gelu_pytorch_tanh -> silu substitutes a
            # different activation; kept as-is from the original — confirm intent.
            hidden_act = "silu"
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states
class SiglipEncoderLayer(paddle.nn.Layer):
    """Pre-norm transformer block: LN -> attention -> residual, then
    LN -> MLP -> residual."""

    def __init__(self, config):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.layer_norm1 = paddle.nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps)
        self.self_attn = SiglipAttention(config)
        self.layer_norm2 = paddle.nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    # @paddle.jit.to_static
    def forward(
        self,
        hidden_states,
        attention_mask,
        output_attentions=False,
        cu_seqlens=None,
        max_seqlen=None,
        rope_emb=None,
    ):
        """Return a 1-tuple holding the transformed hidden states."""
        attn_out = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            rope_emb=rope_emb,
        )
        hidden_states = hidden_states + attn_out
        hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))
        return (hidden_states,)
class SigLIPRotaryEmbedding(nn.Layer):
    """Rotary embedding table: forward(seqlen) yields the angle matrix
    positions x inverse-frequencies."""

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        self.dim = dim
        self.theta = theta
        self.rope_init()

    def rope_init(self):
        """Precompute inverse frequencies and register them as a buffer."""
        even_idx = paddle.arange(0, self.dim, 2, dtype="float32")
        inv_freq = 1.0 / (self.theta ** (even_idx / self.dim))
        self.register_buffer("inv_freq", inv_freq.astype(paddle.get_default_dtype()), persistable=False)

    def forward(self, seqlen: int) -> paddle.Tensor:
        """Return the outer product of positions [0, seqlen) and inv_freq."""
        positions = paddle.arange(seqlen, dtype=self.inv_freq.dtype)
        return paddle.outer(positions, self.inv_freq)
class SiglipEncoder(nn.Layer):
    """
    Stack of ``SiglipEncoderLayer`` blocks operating on a packed (varlen)
    sequence, with optional 2-D rotary embeddings and optional window
    attention for the vision tower.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.layers = nn.LayerList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # head_dim // 2: rope covers the (h, w) coordinate pair.
        self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
        self.gradient_checkpointing = False

    @staticmethod
    def flatten_list(image_grid_thw):
        """Flatten one level of list nesting in a list of (t, h, w) grids."""
        tmp_image_grid_thw = list()
        for image_grid in image_grid_thw:
            if isinstance(image_grid, list):
                tmp_image_grid_thw.extend(image_grid)
            else:
                tmp_image_grid_thw.append(image_grid)
        return tmp_image_grid_thw

    def build_window_index(self, image_grid, window_size):
        """
        Build the token permutation and per-window boundaries for window
        attention.

        Returns:
            window_indices: int64 [sum(t*h*w_valid)] permutation of token ids.
            cu_seqlens_within_windows: int32 prefix sums of window lengths with
                a leading 0 (flash-attention style).
        """
        from einops import rearrange

        window_indices = list()
        pad_values = -100
        start_window_index = 0
        cu_seqlens_within_windows = list()
        # BUGFIX: the original iterated ``map(int, image_grid)``, which calls
        # int() on each whole (t, h, w) triple and raises TypeError; cast the
        # components individually instead.
        for t, h, w in image_grid:
            t, h, w = int(t), int(h), int(w)
            window_index = paddle.arange(t * h * w).reshape((t, h, w))
            # Pad h/w up to multiples of window_size with a sentinel value.
            pad_h = (-h) % window_size
            pad_w = (-w) % window_size
            assert pad_h >= 0 and pad_w >= 0, (pad_h, pad_w)
            window_index = F.pad(window_index, (0, pad_w, 0, pad_h), value=pad_values)
            window_index = rearrange(
                window_index,
                "t (h p1) (w p2) -> t (h w) (p1 p2)",
                p1=window_size,
                p2=window_size,
            )
            # Count valid (non-padded) tokens per window, then drop padding.
            window_seqlens = (window_index != pad_values).long().sum(-1).reshape(-1)
            window_index = window_index.reshape(-1)
            window_index = window_index[window_index != pad_values]
            window_indices.append(window_index + start_window_index)
            cu_seqlens_within_windows.append(window_seqlens.cumsum(0) + start_window_index)
            start_window_index += t * h * w
        window_indices = paddle.concat(window_indices, axis=0)
        cu_seqlens_within_windows = paddle.concat(cu_seqlens_within_windows, axis=0)
        cu_seqlens_within_windows = F.pad(cu_seqlens_within_windows, (1, 0), value=0).astype("int32")
        return window_indices, cu_seqlens_within_windows

    def forward(
        self,
        inputs_embeds: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cu_seqlens: Optional[paddle.Tensor] = None,
        image_grid_thw: Optional[List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]] = None,
        height_position_ids: Optional[paddle.Tensor] = None,
        width_position_ids: Optional[paddle.Tensor] = None,
        use_rope: Optional[bool] = False,
        window_size: Optional[int] = -1,
        vision_or_text: str = "vision",
    ):
        """
        Run the layer stack over a packed sequence.

        Rope and window attention only apply on the vision path. Returns the
        final hidden states with any window permutation undone.
        """
        assert vision_or_text in ["vision", "text"]
        use_window_attn = window_size > 0 and vision_or_text == "vision"
        use_rope = (use_rope is True) and (vision_or_text == "vision")
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        hidden_states = inputs_embeds
        attention_mask = attention_mask.to(inputs_embeds.dtype) if attention_mask is not None else None
        if use_rope is True:
            flatten_image_grid_thw = self.flatten_list(image_grid_thw)
            flatten_image_grid_thw = np.array(flatten_image_grid_thw)
            # Total patches across all grids must match the packed length.
            assert sum([np.prod(x) for x in flatten_image_grid_thw]) == hidden_states.shape[1], (
                flatten_image_grid_thw,
                hidden_states.shape,
            )
            # Derive per-token (h, w) coordinates when not supplied.
            if width_position_ids is None or height_position_ids is None:
                split_hids = list()
                split_wids = list()
                for t, h, w in flatten_image_grid_thw:
                    t, h, w = map(int, (t, h, w))
                    image_pids = paddle.arange(t * h * w) % (h * w)
                    sample_hids = image_pids // w
                    sample_wids = image_pids % w
                    split_hids.append(sample_hids)
                    split_wids.append(sample_wids)
                width_position_ids = paddle.concat(split_wids, axis=0)
                height_position_ids = paddle.concat(split_hids, axis=0)
            window_indices, cu_seqlens_within_windows = None, None
            if use_window_attn:
                window_indices, cu_seqlens_within_windows = self.build_window_index(
                    flatten_image_grid_thw, window_size
                )
                reversed_window_indices = window_indices.argsort()
                # Permute coordinates into window order before building rope.
                height_position_ids = height_position_ids[window_indices]
                width_position_ids = width_position_ids[window_indices]
            pids = paddle.stack([height_position_ids, width_position_ids], axis=-1).astype(paddle.int64)
            max_grid_size = pids.max() + 1
            rope_emb_max_grid = self.rotary_pos_emb(max_grid_size)
            rope_emb = rope_emb_max_grid[pids].flatten(1)
            rope_emb = rope_emb.tile((1, 2))
            cos = rope_emb.cos().astype("float32")
            sin = rope_emb.sin().astype("float32")
            cos = cos.unsqueeze(-2)
            sin = sin.unsqueeze(-2)
            rope_emb = (cos, sin)
        else:
            rope_emb = None
            window_indices, cu_seqlens_within_windows = None, None
            if use_window_attn:
                flatten_image_grid_thw = self.flatten_list(image_grid_thw)
                # NOTE(review): this branch treats grid entries as tensors
                # (.astype/.cpu) unlike the rope branch above — confirm inputs.
                assert (
                    sum([np.prod(x.astype("float32").cpu().numpy()) for x in flatten_image_grid_thw])
                    == hidden_states.shape[1]
                ), (flatten_image_grid_thw, hidden_states.shape)
                window_indices, cu_seqlens_within_windows = self.build_window_index(
                    flatten_image_grid_thw, window_size
                )
                reversed_window_indices = window_indices.argsort()
        if use_window_attn:
            assert cu_seqlens_within_windows is not None
            attn_cu_seqlens = cu_seqlens_within_windows
            # Reorder tokens so each attention window is contiguous.
            hidden_states = hidden_states[:, window_indices, :]
        else:
            attn_cu_seqlens = cu_seqlens
        max_seqlen = (attn_cu_seqlens[1:] - attn_cu_seqlens[:-1]).max().item()
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (
                    (hidden_states[:, reversed_window_indices, :],) if use_window_attn else (hidden_states,)
                )
            layer_outputs = encoder_layer(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                cu_seqlens=attn_cu_seqlens,
                max_seqlen=max_seqlen,
                rope_emb=rope_emb,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)
        if use_window_attn:
            # Undo the window permutation.
            hidden_states = hidden_states[:, reversed_window_indices, :]
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)
        return hidden_states
class SiglipMultiheadAttentionPoolingHead(nn.Layer):
    """Multihead Attention Pooling."""

    def __init__(self, config: PPOCRVisionConfig):
        super().__init__()
        # Learned query ("probe") that attends over all patch tokens.
        self.probe = self.create_parameter(
            shape=(1, 1, config.hidden_size),
            default_initializer=paddle.nn.initializer.Normal(),
        )
        self.attention = nn.MultiHeadAttention(config.hidden_size, config.num_attention_heads)
        self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(self, hidden_state, key_padding_mask=None):
        """Pool the token sequence into one vector per batch element."""
        n = hidden_state.shape[0]
        query = self.probe.tile((n, 1, 1))
        pooled = self.attention(query, hidden_state, hidden_state)[0]
        # Residual MLP refinement, then keep only the probe position.
        pooled = pooled + self.mlp(self.layernorm(pooled))
        return pooled[:, 0]
class SiglipVisionTransformer(nn.Layer):
    """
    Full SigLIP vision tower: embeddings -> encoder -> final layer norm, with
    per-sample splitting of the packed sequence on output.
    """

    def __init__(self, config: PPOCRVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps)
        # Pooling head is constructed per config but not used by forward below.
        self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
        if self.use_head:
            self.head = SiglipMultiheadAttentionPoolingHead(config)

    def forward(
        self,
        pixel_values,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        attention_mask=None,
        sample_indices=None,
        image_indices=None,
        position_ids=None,
        height_position_ids=None,
        width_position_ids=None,
        cu_seqlens=None,
        padding_mask=None,
        vision_return_embed_list: Optional[bool] = False,
        image_grid_thw: Optional[List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]] = None,
        return_pooler_output: Optional[bool] = True,
        use_rope: Optional[bool] = False,
        window_size: Optional[bool] = -1,
    ):
        """
        Encode packed pixel inputs and split the result per sample.

        Several parameters (sample_indices, image_indices, padding_mask,
        vision_return_embed_list, return_pooler_output) are accepted for API
        compatibility but not used in this implementation.

        Returns:
            list[paddle.Tensor]: one hidden-state tensor per sample, sliced
            out of the packed sequence by ``cu_seqlens``.
        """
        hidden_states = self.embeddings(
            pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            position_ids=position_ids,
            image_grid_thw=image_grid_thw,
        )
        last_hidden_state = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            attention_mask=attention_mask,
            cu_seqlens=cu_seqlens,
            image_grid_thw=image_grid_thw,
            use_rope=use_rope,
            height_position_ids=height_position_ids,
            width_position_ids=width_position_ids,
            window_size=window_size,
            vision_or_text="vision",
        )
        last_hidden_state = self.post_layernorm(last_hidden_state)
        # Split the packed batch back into per-sample tensors.
        sample_hidden_state = list()
        assert cu_seqlens is not None
        for i in range(cu_seqlens.shape[0] - 1):
            start = cu_seqlens[i]
            end = cu_seqlens[i + 1]
            tensor = last_hidden_state[:, start:end, :].squeeze(0)
            sample_hidden_state.append(tensor)
        return sample_hidden_state
class SiglipVisionModel(PretrainedModel):
    """
    Thin ``PretrainedModel`` wrapper around ``SiglipVisionTransformer`` that
    also knows how to load fused qkv parameters from separate q/k/v
    checkpoint tensors.
    """

    config_class = PPOCRVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: PPOCRVisionConfig, prefix=""):
        super().__init__(config)
        self.prefix_name = prefix
        self.vision_model = SiglipVisionTransformer(config)

    def get_input_embeddings(self) -> nn.Layer:
        """Return the patch-embedding conv layer that consumes pixel values."""
        return self.vision_model.embeddings.patch_embedding

    def forward(
        self,
        pixel_values,
        sample_indices=None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        position_ids=None,
        vision_return_embed_list: Optional[bool] = False,
        image_grid_thw: Optional[List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]] = None,
        cu_seqlens=None,
        return_pooler_output: Optional[bool] = True,
        use_rope: Optional[bool] = False,
        window_size: Optional[bool] = -1,
    ):
        """Forward all arguments to the wrapped vision transformer."""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            position_ids=position_ids,
            vision_return_embed_list=vision_return_embed_list,
            image_grid_thw=image_grid_thw,
            sample_indices=sample_indices,
            cu_seqlens=cu_seqlens,
            return_pooler_output=return_pooler_output,
            use_rope=use_rope,
            window_size=window_size,
        )

    def load_state_dict(self, state_dict):
        """
        Load (and pop) this module's parameters from ``state_dict``.

        Checkpoints store attention projections as separate ``q_proj`` /
        ``k_proj`` / ``v_proj`` tensors; they are concatenated into the fused
        ``qkv_proj`` weight/bias here.

        Raises:
            ValueError: On a missing key or a shape mismatch.
        """
        params_dict = dict(self.named_parameters())
        for param_name, param in params_dict.items():
            state_dict_key = f"{self.prefix_name}.{param_name}"
            if state_dict_key not in state_dict:
                if "self_attn.qkv_proj.weight" in state_dict_key:
                    q_weight_key = state_dict_key.replace("qkv_proj", "q_proj")
                    k_weight_key = state_dict_key.replace("qkv_proj", "k_proj")
                    v_weight_key = state_dict_key.replace("qkv_proj", "v_proj")
                    q_tensor = get_tensor(state_dict.pop(q_weight_key))
                    k_tensor = get_tensor(state_dict.pop(k_weight_key))
                    v_tensor = get_tensor(state_dict.pop(v_weight_key))
                    # Fixed: the original transposed the concat result and then
                    # immediately transposed it back — a no-op; concatenate once.
                    tensor = paddle.concat([q_tensor, k_tensor, v_tensor], axis=-1)
                elif "self_attn.qkv_proj.bias" in state_dict_key:
                    q_bias_key = state_dict_key.replace("qkv_proj", "q_proj")
                    k_bias_key = state_dict_key.replace("qkv_proj", "k_proj")
                    v_bias_key = state_dict_key.replace("qkv_proj", "v_proj")
                    q_bias = get_tensor(state_dict.pop(q_bias_key))
                    k_bias = get_tensor(state_dict.pop(k_bias_key))
                    v_bias = get_tensor(state_dict.pop(v_bias_key))
                    tensor = paddle.concat([q_bias, k_bias, v_bias], axis=-1)
                else:
                    raise ValueError(f"The key {state_dict_key} does not exist in state_dict. ")
            else:
                tensor = get_tensor(state_dict.pop(state_dict_key))
            if param.shape != tensor.shape:
                raise ValueError(f"{state_dict_key} param.shape={param.shape} tensor.shape={tensor.shape}")
            param.copy_(tensor, False)

View File

@@ -254,7 +254,10 @@ def is_paddle_support_v1_loader():
def v1_loader_support(fd_config):
_v1_no_support_archs = ["Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration"]
_v1_no_support_archs = [
"Qwen2VLForConditionalGeneration",
"Qwen2_5_VLForConditionalGeneration",
]
def _err_msg(msg: str) -> str:
logger.info(msg + "; fallback to the v0 loader for model loading.")

View File

@@ -24,6 +24,7 @@ from typing import Dict, List, Optional, Tuple
import crcmod
from redis import ConnectionPool
from fastdeploy import envs
from fastdeploy.engine.request import Request, RequestOutput
from fastdeploy.scheduler import utils
from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
@@ -542,22 +543,23 @@ class GlobalScheduler:
remaining_request.append((request_queue_name, serialized_request))
continue
if self.enable_chunked_prefill:
if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
long_partial_requests += 1
if long_partial_requests > self.max_long_partial_prefills:
if not envs.FD_ENABLE_MAX_PREFILL:
if self.enable_chunked_prefill:
if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
long_partial_requests += 1
if long_partial_requests > self.max_long_partial_prefills:
remaining_request.append((request_queue_name, serialized_request))
continue
else:
short_partial_requests += 1
if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
remaining_request.append((request_queue_name, serialized_request))
continue
else:
short_partial_requests += 1
if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
remaining_request.append((request_queue_name, serialized_request))
continue
else:
if current_prefill_tokens > max_num_batched_tokens:
remaining_request.append((request_queue_name, serialized_request))
continue
if current_prefill_tokens > max_num_batched_tokens:
remaining_request.append((request_queue_name, serialized_request))
continue
scheduled_requests.append(request)

View File

@@ -18,6 +18,7 @@ import threading
import time
from typing import Dict, List, Optional, Tuple
from fastdeploy import envs
from fastdeploy.engine.request import Request, RequestOutput
from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
from fastdeploy.utils import scheduler_logger
@@ -258,20 +259,21 @@ class LocalScheduler:
if required_total_blocks > available_blocks:
break
if self.enable_chunked_prefill:
if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
# 长请求
long_partial_requests += 1
if long_partial_requests > self.max_long_partial_prefills:
if not envs.FD_ENABLE_MAX_PREFILL:
if self.enable_chunked_prefill:
if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
# 长请求
long_partial_requests += 1
if long_partial_requests > self.max_long_partial_prefills:
break
else:
short_partial_requests += 1
if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
break
else:
short_partial_requests += 1
if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
break
else:
if current_prefill_tokens > max_num_batched_tokens:
break
if current_prefill_tokens > max_num_batched_tokens:
break
requests.append(request.raw)
self.ids_read_cursor += len(requests)

View File

@@ -310,6 +310,14 @@ class GPUModelRunner(ModelRunnerBase):
req_len = len(req_dicts)
has_prefill_task = False
has_decode_task = False
multi_vision_inputs = {"images_lst": [], "grid_thw_lst": [], "vit_position_ids_lst": [], "cu_seqlens": [0]}
rope_3d_position_ids = {
"position_ids_idx": [],
"position_ids_lst": [],
"position_ids_offset": [0],
"max_tokens_lst": [],
}
for i in range(req_len):
request = req_dicts[i]
idx = request.idx
@@ -320,39 +328,49 @@ class GPUModelRunner(ModelRunnerBase):
if self.enable_mm:
inputs = request.multimodal_inputs
if request.with_image:
vision_inputs = {}
vision_inputs["input_ids"] = paddle.to_tensor(
inputs["input_ids"][prefill_start_index:prefill_end_index], dtype=paddle.int64
)
vision_inputs["token_type_ids"] = paddle.to_tensor(
inputs["token_type_ids"][prefill_start_index:prefill_end_index], dtype=paddle.int64
)
vision_inputs["image_type_ids"] = paddle.to_tensor(
inputs["image_type_ids"][request.image_type_ids_start : request.image_type_ids_end],
dtype=paddle.int64,
)
vision_inputs["images"] = paddle.to_tensor(
inputs["images"][request.image_start : request.image_end],
dtype="uint8" if "ernie" in self.model_config.model_type else "bfloat16",
)
vision_inputs["grid_thw"] = paddle.to_tensor(
inputs["grid_thw"][request.num_image_start : request.num_image_end], dtype="int64"
)
self.share_inputs["image_features"] = self.extract_vision_features(vision_inputs)
if envs.FD_ENABLE_MAX_PREFILL:
multi_vision_inputs["images_lst"].append(
inputs["images"][request.image_start : request.image_end].cuda()
)
multi_vision_inputs["grid_thw_lst"].extend(
inputs["grid_thw"][request.num_image_start : request.num_image_end]
)
multi_vision_inputs["cu_seqlens"].extend(
inputs["vit_seqlen"][request.num_image_start : request.num_image_end]
)
multi_vision_inputs["vit_position_ids_lst"].extend(
inputs["vit_position_ids"][request.num_image_start : request.num_image_end]
)
else:
vision_inputs = {}
vision_inputs["input_ids"] = paddle.to_tensor(
inputs["input_ids"][prefill_start_index:prefill_end_index], dtype=paddle.int64
)
vision_inputs["token_type_ids"] = paddle.to_tensor(
inputs["token_type_ids"][prefill_start_index:prefill_end_index], dtype=paddle.int64
)
vision_inputs["image_type_ids"] = paddle.to_tensor(
inputs["image_type_ids"][request.image_type_ids_start : request.image_type_ids_end],
dtype=paddle.int64,
)
vision_inputs["images"] = paddle.to_tensor(
inputs["images"][request.image_start : request.image_end],
dtype="uint8" if "ernie" in self.model_config.model_type else "bfloat16",
)
vision_inputs["grid_thw"] = paddle.to_tensor(
inputs["grid_thw"][request.num_image_start : request.num_image_end], dtype="int64"
)
self.share_inputs["image_features"] = self.extract_vision_features(vision_inputs)
else:
self.share_inputs["image_features"] = None
if inputs["position_ids"] is not None:
position_ids = paddle.to_tensor(
request.multimodal_inputs["position_ids"],
dtype="int64",
).unsqueeze([0])
else:
position_ids = None
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
position_ids = request.multimodal_inputs["position_ids"]
rope_3d_position_ids["position_ids_idx"].append(idx)
rope_3d_position_ids["position_ids_lst"].append(position_ids)
rope_3d_position_ids["position_ids_offset"].append(
position_ids.shape[0] + rope_3d_position_ids["position_ids_offset"][-1]
)
rope_3d_position_ids["max_tokens_lst"].append(request.get("max_tokens", 2048))
if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
# Enable thinking
@@ -466,6 +484,21 @@ class GPUModelRunner(ModelRunnerBase):
else:
self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = 0
if len(multi_vision_inputs["images_lst"]) > 0:
self.share_inputs["image_features"] = self.extract_vision_features(multi_vision_inputs)
if len(rope_3d_position_ids["position_ids_idx"]) > 0:
packed_position_ids = paddle.to_tensor(
np.concatenate(rope_3d_position_ids["position_ids_lst"]), dtype="int64"
)
rope_3d_lst = self.prepare_rope3d(
packed_position_ids,
rope_3d_position_ids["max_tokens_lst"],
rope_3d_position_ids["position_ids_offset"],
)
for i, idx in enumerate(rope_3d_position_ids["position_ids_idx"]):
self.share_inputs["rope_emb"][idx : idx + 1, :] = rope_3d_lst[i]
if has_prefill_task or has_decode_task:
self.share_inputs["not_need_stop"][0] = True
self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]
@@ -545,7 +578,7 @@ class GPUModelRunner(ModelRunnerBase):
position_ids = paddle.to_tensor(
request.multimodal_inputs["position_ids"],
dtype="int64",
).unsqueeze([0])
)
else:
position_ids = None
token_chunk_size = inputs["input_ids"].shape[1]
@@ -582,8 +615,8 @@ class GPUModelRunner(ModelRunnerBase):
if self.enable_mm:
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
)
position_ids, [request.get("max_tokens", 2048)], [0, position_ids.shape[0]]
)[0]
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
@@ -994,7 +1027,9 @@ class GPUModelRunner(ModelRunnerBase):
if self.enable_mm:
head_dim = self.model_config.head_dim
if "qwen" in self.model_config.model_type: # neox style = True
if (
"qwen" in self.model_config.model_type or "paddleocr" in self.model_config.model_type
): # neox style = True
rope_head_dim = head_dim
else: # neox style = False
rope_head_dim = head_dim // 2
@@ -2221,7 +2256,7 @@ class GPUModelRunner(ModelRunnerBase):
grid_thw = None
if one["position_ids"] is not None:
position_ids = paddle.to_tensor(one["position_ids"], dtype="int64").unsqueeze([0])
position_ids = paddle.to_tensor(one["position_ids"], dtype="int64")
else:
position_ids = None
@@ -2288,6 +2323,49 @@ class GPUModelRunner(ModelRunnerBase):
return image_features
    def extract_vision_features_paddleocr(self, inputs: dict) -> paddle.Tensor:
        """Run the PaddleOCR vision tower + projector over image inputs.

        Two input layouts are accepted:
        - When ``envs.FD_ENABLE_MAX_PREFILL`` is set, ``inputs`` carries
          pre-batched per-request lists (``images_lst``, ``grid_thw_lst``,
          ``vit_position_ids_lst``, ``cu_seqlens``) that are concatenated here.
        - Otherwise ``inputs`` carries single-request tensors (``images``,
          ``grid_thw``) and position ids / cumulative sequence lengths are
          derived from each image's (t, h, w) grid.

        Returns a single tensor of projected image features, concatenated
        across images along axis 0.
        """
        if envs.FD_ENABLE_MAX_PREFILL:
            # Flatten the per-request position-id arrays into one numpy array
            # before converting to a tensor.
            inputs["vit_position_ids_lst"] = np.concatenate(inputs["vit_position_ids_lst"])
            images = paddle.concat(inputs["images_lst"]).cast("bfloat16")
            grid_thw = paddle.to_tensor(inputs["grid_thw_lst"], dtype="int64")
            position_ids = paddle.to_tensor(inputs["vit_position_ids_lst"], dtype="int64")
            # cu_seqlens arrives as per-image lengths; cumsum turns it into offsets.
            cu_seqlens = paddle.cumsum(paddle.to_tensor(inputs["cu_seqlens"])).cast("int32")
        else:
            assert inputs["images"] is not None
            grid_thw = inputs["grid_thw"]
            images = inputs["images"]
            position_ids = []
            cu_seqlens = [0]
            for idx, thw in enumerate(grid_thw):
                # Total patch count for this image (t * h * w).
                numel = np.prod(np.array(thw))
                # Position ids repeat per temporal frame: index modulo (h * w).
                position_ids.append(paddle.arange(numel) % np.prod(thw[1:]))
                cu_seqlens.append(cu_seqlens[-1] + numel)
            position_ids = paddle.concat(position_ids, axis=0).to(images.place)
            cu_seqlens = paddle.to_tensor(cu_seqlens, dtype=paddle.int32).to(images.place)
        # NOTE(review): rendering lost the original indentation; the projector
        # and concat are placed inside the autocast scope here — confirm
        # against the upstream file.
        with paddle.amp.auto_cast(
            True,
            custom_black_list=self.amp_black,
            custom_white_list=self.amp_white,
            level="O2",
            dtype=self.model_config.dtype,
        ):
            image_features = self.model.visual(
                pixel_values=images,
                image_grid_thw=grid_thw,
                position_ids=position_ids,
                interpolate_pos_encoding=True,
                cu_seqlens=cu_seqlens,
                use_rope=True,
                window_size=-1,
            )
            # Projector presumably returns a list of per-image feature tensors;
            # concat merges them into one tensor — TODO confirm.
            image_features = self.model.projector(image_features, grid_thw)
            image_features = paddle.concat(image_features, axis=0)
        return image_features
@paddle.no_grad()
def extract_vision_features(self, inputs: list[paddle.Tensor]) -> paddle.Tensor:
"""extract_vision_features"""
@@ -2295,28 +2373,26 @@ class GPUModelRunner(ModelRunnerBase):
return self.extract_vision_features_ernie(inputs)
elif "qwen" in self.model_config.model_type:
return self.extract_vision_features_qwen(inputs)
elif "paddleocr" in self.model_config.model_type:
return self.extract_vision_features_paddleocr(inputs)
else:
raise ValueError(f"multiple modalities model {self.model_config.model_type} is not supported")
@paddle.no_grad()
def prepare_rope3d(self, position_ids: paddle.Tensor, max_len: int) -> paddle.Tensor:
def prepare_rope3d(
self, position_ids: paddle.Tensor, max_len_lst: list[int], cumsum_seqlens: list[int]
) -> list[paddle.Tensor]:
"""prepare_rope3d"""
prefix_max_position_ids = paddle.max(position_ids) + 1
dec_pos_ids = paddle.tile(
paddle.arange(max_len, dtype="int64").unsqueeze(0).unsqueeze(-1),
[1, 1, 3],
)
dec_pos_ids = dec_pos_ids + prefix_max_position_ids
position_ids_3d_real = paddle.concat([position_ids, dec_pos_ids], axis=1)
rope_emb = get_rope_3d(
position_ids=position_ids_3d_real,
rope_emb_lst = get_rope_3d(
position_ids=position_ids,
rotary_dim=self.model_config.head_dim,
partial_rotary_factor=1.0,
base=self.model_config.rope_theta,
max_position=self.model_config.max_model_len,
freq_allocation=getattr(self.model_config, "freq_allocation", 20),
model_type=self.model_config.model_type,
max_len_lst=max_len_lst,
cumsum_seqlens=cumsum_seqlens,
)
return rope_emb
return rope_emb_lst

View File

@@ -420,6 +420,12 @@ class XPUModelRunner(ModelRunnerBase):
req_len = len(req_dicts)
has_prefill_task = False
has_decode_task = False
rope_3d_position_ids = {
"position_ids_idx": [],
"position_ids_lst": [],
"position_ids_offset": [0],
"max_tokens_lst": [],
}
for i in range(req_len):
request = req_dicts[i]
idx = request.idx
@@ -451,17 +457,13 @@ class XPUModelRunner(ModelRunnerBase):
else:
self.share_inputs["image_features"] = None
if inputs["position_ids"] is not None:
position_ids = paddle.to_tensor(
request.multimodal_inputs["position_ids"],
dtype="int64",
).unsqueeze([0])
else:
position_ids = None
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
position_ids = request.multimodal_inputs["position_ids"]
rope_3d_position_ids["position_ids_idx"].append(idx)
rope_3d_position_ids["position_ids_lst"].append(position_ids)
rope_3d_position_ids["position_ids_offset"].append(
position_ids.shape[0] + rope_3d_position_ids["position_ids_offset"][-1]
)
rope_3d_position_ids["max_tokens_lst"].append(request.get("max_tokens", 2048))
if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
# Enable thinking
@@ -568,6 +570,18 @@ class XPUModelRunner(ModelRunnerBase):
else:
self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = 0
if len(rope_3d_position_ids["position_ids_idx"]) > 0:
packed_position_ids = paddle.to_tensor(
np.concatenate(rope_3d_position_ids["position_ids_lst"]), dtype="int64"
)
rope_3d_lst = self.prepare_rope3d(
packed_position_ids,
rope_3d_position_ids["max_tokens_lst"],
rope_3d_position_ids["position_ids_offset"],
)
for i, idx in enumerate(rope_3d_position_ids["position_ids_idx"]):
self.share_inputs["rope_emb"][idx : idx + 1, :] = rope_3d_lst[i]
if has_prefill_task or has_decode_task:
self.share_inputs["not_need_stop"][0] = True
@@ -603,8 +617,8 @@ class XPUModelRunner(ModelRunnerBase):
if self.enable_mm:
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
)
position_ids, [request.get("max_tokens", 2048)], [0, position_ids.shape[0]]
)[0]
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
@@ -1310,7 +1324,7 @@ class XPUModelRunner(ModelRunnerBase):
grid_thw = None
if one["position_ids"] is not None:
position_ids = paddle.to_tensor(one["position_ids"], dtype="int64").unsqueeze([0])
position_ids = paddle.to_tensor(one["position_ids"], dtype="int64")
else:
position_ids = None
@@ -1365,24 +1379,20 @@ class XPUModelRunner(ModelRunnerBase):
return image_features
@paddle.no_grad()
def prepare_rope3d(self, position_ids: paddle.Tensor, max_len: int) -> paddle.Tensor:
def prepare_rope3d(
self, position_ids: paddle.Tensor, max_len_lst: list[int], cumsum_seqlens: list[int]
) -> list[paddle.Tensor]:
"""prepare_rope3d"""
prefix_max_position_ids = paddle.max(position_ids) + 1
dec_pos_ids = paddle.tile(
paddle.arange(max_len, dtype="int64").unsqueeze(0).unsqueeze(-1),
[1, 1, 3],
)
dec_pos_ids = dec_pos_ids + prefix_max_position_ids
position_ids_3d_real = paddle.concat([position_ids, dec_pos_ids], axis=1)
rope_emb = get_rope_3d(
position_ids=position_ids_3d_real,
rope_emb_lst = get_rope_3d(
position_ids=position_ids,
rotary_dim=self.model_config.head_dim,
partial_rotary_factor=1.0,
base=self.model_config.rope_theta,
max_position=self.model_config.max_model_len,
freq_allocation=getattr(self.model_config, "freq_allocation", 20),
model_type=self.model_config.model_type,
max_len_lst=max_len_lst,
cumsum_seqlens=cumsum_seqlens,
)
return rope_emb
return rope_emb_lst