"""
|
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
|
|
import numpy as np
|
|
|
|
from fastdeploy.engine.request import Request
|
|
from fastdeploy.input.qwen_mm_processor import DataProcessor
|
|
from fastdeploy.input.text_processor import DataProcessor as TextProcessor
|
|
from fastdeploy.utils import data_processor_logger
|
|
|
|
|
|


class QwenVLProcessor(TextProcessor):
    """
    Qwen Vision-Language processor for handling multimodal inputs.

    This processor extends TextProcessor to support:
    - Image and video processing
    - Multimodal feature extraction
    - Tokenization and position encoding
    - Request processing and model input generation

    Attributes:
        processor (DataProcessor): Underlying data processor instance
        tokenizer: Text tokenizer instance
        limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
    """

    def __init__(
        self,
        config,
        model_name_or_path,
        limit_mm_per_prompt=None,
        mm_processor_kwargs=None,
        reasoning_parser_obj=None,
        tool_parser_obj=None,
    ):
        """
        Initialize QwenVLProcessor instance.

        Args:
            config: Model configuration object
            model_name_or_path (str): Pretrained model name or path
            limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
            mm_processor_kwargs (dict, optional): Multimodal processor arguments
            reasoning_parser_obj: Reasoning parser instance
            tool_parser_obj: Tool parser instance
        """
        super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)

        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
        self.processor = DataProcessor(
            model_path=model_name_or_path,
            tokens_per_second=config.vision_config.tokens_per_second,
            tokenizer=self.tokenizer,
            **processor_kwargs,
        )

        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
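
    # A minimal usage sketch (illustrative only): the config object is assumed to
    # expose `vision_config.tokens_per_second`, and the model path and limit values
    # below are placeholders rather than FastDeploy defaults.
    #
    #     processor = QwenVLProcessor(
    #         config=model_config,
    #         model_name_or_path="Qwen/Qwen2.5-VL-7B-Instruct",
    #         limit_mm_per_prompt={"image": 4, "video": 1},
    #         mm_processor_kwargs={"video_max_frames": 16, "video_min_frames": 4},
    #     )
    #     processed = processor.process_request(request, max_model_len=8192)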

    def process_request(self, request, max_model_len=None, **kwargs):
        """
        Process incoming request and generate model inputs.

        Args:
            request: Input request object
            max_model_len (int, optional): Maximum context length
            **kwargs: Additional processing parameters

        Returns:
            Request: Processed request with model inputs
        """
        task = request.to_dict()
        task["enable_thinking"] = kwargs.get("enable_thinking", False)
        self.process_request_dict(task, max_model_len)
        request = Request.from_dict(task)
        request = self._apply_default_parameters(request)
        return request

    def _parse_processor_kwargs(self, kwargs):
        """
        Parse and validate multimodal processor arguments.

        Args:
            kwargs (dict): Processor configuration arguments

        Returns:
            dict: Validated processor arguments; falls back to an empty dict
                (with a warning) if the provided arguments are invalid
        """
        if not kwargs:
            return {}

        try:
            if not isinstance(kwargs, dict):
                raise ValueError("mm-processor-kwargs must be a dictionary")

            # Validate kwargs types against expected schema
            data_processor_logger.info(f"Processing kwargs: {kwargs}")
            expected_types = {
                "video_max_frames": int,  # Maximum number of frames sampled per video
                "video_min_frames": int,  # Minimum number of frames sampled per video
            }

            for key, value in kwargs.items():
                if key in expected_types and not isinstance(value, expected_types[key]):
                    raise ValueError(
                        f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
                    )

            return kwargs

        except Exception as e:
            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
            return {}
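
    # Illustrative examples of the validation above (values are placeholders):
    #
    #     _parse_processor_kwargs({"video_max_frames": 16, "video_min_frames": 4})
    #         -> {"video_max_frames": 16, "video_min_frames": 4}
    #     _parse_processor_kwargs({"video_max_frames": "16"})
    #         -> {}   (type mismatch is logged as a warning and the kwargs are dropped)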

    def _parse_limits(self, limits):
        """
        Parse and validate multimodal input limits.

        Args:
            limits (dict): Input limits configuration

        Returns:
            dict: Validated limits merged over the defaults; falls back to the
                defaults (with a warning) if the provided limits are invalid
        """
        DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}

        if not limits:
            return DEFAULT_LIMITS

        try:
            if not isinstance(limits, dict):
                raise ValueError("limit-mm-per-prompt must be a dictionary")
            data_processor_logger.info(f"_parse_limits: {limits}")
            return {**DEFAULT_LIMITS, **limits}
        except Exception as e:
            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
            return DEFAULT_LIMITS
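
    # For example (placeholder values): _parse_limits({"image": 10}) would yield
    # {"image": 10, "video": 1, "audio": 1}; user-supplied limits override the
    # defaults while unspecified modalities keep theirs.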

    def _check_mm_limits(self, item):
        """
        Validate multimodal inputs against configured limits.

        Args:
            item: Input request item to validate

        Raises:
            ValueError: If input exceeds configured limits
        """
        if isinstance(item, dict):
            # Request carries a prompt plus multi_modal_data
            mm_data = item
        else:
            # Request carries a list of messages
            mm_data = {"image": [], "video": []}

            for message in item:
                if isinstance(message.get("content"), list):
                    for part in message["content"]:
                        if part.get("type") in ["image_url", "image"]:
                            mm_data["image"].append(part)
                        elif part.get("type") in ["video_url", "video"]:
                            mm_data["video"].append(part)

        for modality, data in mm_data.items():
            if modality in self.limit_mm_per_prompt:
                limit = self.limit_mm_per_prompt[modality]
                if len(data) > limit:
                    raise ValueError(f"Too many {modality} items in prompt, got {len(data)} but limit is {limit}")
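
    # Sketch of what gets counted against the limits (hypothetical messages payload;
    # only the "type" field of each content part matters here):
    #
    #     messages = [{"role": "user", "content": [
    #         {"type": "image_url", "image_url": {"url": "file:///tmp/a.png"}},
    #         {"type": "text", "text": "Describe the image."},
    #     ]}]
    #
    # With the default limit of one image per prompt this passes; a second
    # image part would raise ValueError.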

    def process_request_dict(self, request, max_model_len=None):
        """
        Process request dictionary into model inputs.

        Args:
            request (dict): Input request dictionary
            max_model_len (int, optional): Maximum context length

        Returns:
            dict: Processed request with model inputs

        Raises:
            ValueError: If request format is invalid
        """
        request = self._apply_default_parameters(request)
        if not request.get("eos_token_ids"):
            request["eos_token_ids"] = self.eos_token_ids

        stop_sequences = request.get("stop", [])
        if stop_sequences:
            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
            request["stop_token_ids"] = stop_seqs
            request["stop_seqs_len"] = stop_seqs_len

        bad_words = request.get("bad_words")
        bad_words_token_ids = request.get("bad_words_token_ids")
        if bad_words:
            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
            request["bad_words_token_ids"] = bad_words_token_ids

        if request.get("prompt"):
            multimodal_data = request.get("multimodal_data")
            if multimodal_data is None:
                multimodal_data = {}
            self._check_mm_limits(multimodal_data)
            images = multimodal_data.get("image", None)
            videos = multimodal_data.get("video", None)
            outputs = self.processor.text2ids(request["prompt"], images, videos)

        elif request.get("messages"):
            messages = request["messages"]
            self._check_mm_limits(messages)
            outputs = self.processor.request2ids(request)

        else:
            raise ValueError(f"Request must contain 'prompt' or 'messages': {request}")

        metadata = request.get("metadata")
        # Handle continuation of a previous generation by appending the already generated tokens
        if metadata and metadata.get("generated_token_ids"):
            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
        outputs = self.pack_outputs(outputs)

        request["prompt_token_ids"] = outputs["input_ids"].tolist()
        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
        request["multimodal_inputs"] = outputs

        # Truncate the prompt if it exceeds the model context length
        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
            request["prompt_token_ids"] = request["prompt_token_ids"][
                : max_model_len - 1
            ]  # Leave space for at least 1 new token

        # Set default max_tokens if not specified
        if request.get("max_tokens") is None:
            request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))  # Ensure at least 1 token
        data_processor_logger.info(f"Processed request {request}")

        return request
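
    # A minimal request-dict sketch for the "prompt" branch (values are placeholders;
    # the accepted image object type and any image placeholder tokens in the prompt
    # are determined by the underlying qwen_mm_processor.DataProcessor):
    #
    #     request = {
    #         "prompt": "Describe the image.",
    #         "multimodal_data": {"image": [pil_image]},
    #     }
    #     request = processor.process_request_dict(request, max_model_len=8192)
    #     # request now carries "prompt_token_ids", "prompt_token_ids_len",
    #     # "multimodal_inputs" and a default "max_tokens".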

    def append_generated_tokens(self, outputs, generated_token_ids):
        """
        Append generated tokens to existing outputs.

        Args:
            outputs: Current model outputs
            generated_token_ids: Generated tokens to append
        """
        out = {"input_ids": [], "token_type_ids": [], "position_ids": [], "cur_position": outputs["cur_position"]}
        self.processor._add_text(generated_token_ids, out)

        outputs["input_ids"] = np.concatenate(
            [outputs["input_ids"], np.array(out["input_ids"], dtype=np.int64)], axis=0
        )
        outputs["token_type_ids"] = np.concatenate(
            [outputs["token_type_ids"], np.array(out["token_type_ids"], dtype=np.int64)], axis=0
        )
        outputs["position_ids"] = np.concatenate(
            [outputs["position_ids"], out["position_ids"][0]], axis=1, dtype=np.int64
        )
        outputs["cur_position"] = out["cur_position"]

    def pack_outputs(self, outputs):
        """
        Prepare final output dictionary for model.

        Args:
            outputs: Intermediate processing outputs

        Returns:
            dict: Packed output dictionary with all required fields
        """
        outputs["image_patch_id"] = self.processor.image_token_id
        outputs["video_patch_id"] = self.processor.video_token_id
        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)

        return outputs
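
    # Note on the transpose above: position_ids are built with the position-component
    # axis first (see the axis=1 concatenation in append_generated_tokens) and are
    # flipped here so that each row corresponds to one token. The number and meaning
    # of the components (e.g. temporal/height/width in Qwen's multimodal RoPE) are
    # defined by the underlying qwen_mm_processor.DataProcessor.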