rename ernie_xxx to ernie4_5_xxx (#3621)

* rename ernie_xxx to ernie4_5_xxx

* ci fix
Yuanle Liu
2025-08-26 19:29:27 +08:00
committed by GitHub
parent 642480f5f6
commit cbce94a00e
37 changed files with 126 additions and 100 deletions

View File

@@ -1,6 +1,7 @@
# Offline Inference
## 1. Usage
FastDeploy supports offline inference by loading models locally and processing user data. Usage examples:
### Chat Interface (LLM.chat)
@@ -91,10 +92,10 @@ from PIL import Image
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
PATH = "baidu/ERNIE-4.5-VL-28B-A3B-Paddle"
tokenizer = ErnieBotTokenizer.from_pretrained(PATH)
tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
messages = [
{
@@ -144,15 +145,16 @@ for output in outputs:
```
>Note: The `generate interface` does not currently support passing parameters to control the thinking function (on/off). It always uses the model's default parameters.
> Note: The `generate interface` does not currently support passing parameters to control the thinking function (on/off). It always uses the model's default parameters.
## 2. API Documentation
### 2.1 fastdeploy.LLM
For ```LLM``` configuration, refer to [Parameter Documentation](parameters.md).
For ``LLM`` configuration, refer to [Parameter Documentation](parameters.md).
> Configuration Notes:
>
> 1. `port` and `metrics_port` are only used for online inference.
> 2. After startup, the service logs KV Cache block count (e.g. `total_block_num:640`). Multiply this by block_size (default 64) to get total cacheable tokens.
> 3. Calculate `max_num_seqs` based on cacheable tokens. Example: avg input=800 tokens, output=500 tokens, blocks=640 → `kv_cache_ratio = 800/(800+500)=0.6`, `max_num_seqs = 640*64/(800+500)=31` (worked through in the sketch below).
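The arithmetic in notes 2 and 3 can be reproduced directly. A minimal sketch, assuming the example values above (the 640-block log line, the default block_size of 64, and the 800/500 token averages); these are illustrative numbers, not defaults:

```python
# Worked example for notes 2-3 (illustrative values taken from the text above).
total_block_num = 640             # from the startup log, e.g. "total_block_num:640"
block_size = 64                   # default KV Cache block size
avg_input, avg_output = 800, 500  # observed average prompt / completion lengths

cacheable_tokens = total_block_num * block_size               # 40960 cacheable tokens
kv_cache_ratio = avg_input / (avg_input + avg_output)         # ~0.6
max_num_seqs = cacheable_tokens // (avg_input + avg_output)   # 31 concurrent requests

print(cacheable_tokens, round(kv_cache_ratio, 2), max_num_seqs)
```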
@@ -163,7 +165,7 @@ For ```LLM``` configuration, refer to [Parameter Documentation](parameters.md).
* sampling_params: See 2.4 for parameter details
* use_tqdm: Enable progress visualization
* chat_template_kwargs(dict): Extra template parameters (currently supports enable_thinking(bool))
*usage example: `chat_template_kwargs={"enable_thinking": False}`*
*usage example: `chat_template_kwargs={"enable_thinking": False}`*
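For example, turning thinking off for a single chat request via `chat_template_kwargs` (a minimal sketch; the model path is the example used elsewhere on this page, and the constructor arguments follow the parameter documentation linked above):

```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

# Example model from this page; any reasoning-capable ERNIE 4.5 model works the same way.
llm = LLM(model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle")

outputs = llm.chat(
    [{"role": "user", "content": "Hello, how are you?"}],
    sampling_params=SamplingParams(temperature=0.8),
    use_tqdm=True,
    chat_template_kwargs={"enable_thinking": False},  # disable the thinking phase
)
for output in outputs:
    print(output.outputs.text)
```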
### 2.3 fastdeploy.LLM.generate

View File

@@ -1,6 +1,7 @@
# Offline Inference
## 1. Usage
FastDeploy supports offline inference by loading models locally and processing user data. Usage examples:
### Chat Interface (LLM.chat)
@@ -32,9 +33,9 @@ for output in outputs:
generated_text = output.outputs.text
```
In the example above, the ``LLM`` configuration, together with `SamplingParams`, `LLM.generate`, `LLM.chat`, and the output structure `RequestOutput`, is documented in the sections below.
> Note: For reasoning models, the `reasoning_parser` parameter must be specified when loading the model; thinking can then be toggled per request via the `enable_thinking` key in `chat_template_kwargs`.
```python
from fastdeploy.entrypoints.llm import LLM
@@ -82,7 +83,7 @@ for output in outputs:
> Note: The completion interface suits scenarios where the user has already composed the full context and only wants the model to continue it; no additional `prompt` concatenation is performed during inference.
> For `chat` models, the chat interface (LLM.chat) is recommended.
For multimodal models such as `baidu/ERNIE-4.5-VL-28B-A3B-Paddle`, the prompt passed to the `generate` interface must include images; usage is as follows:
```python
import io
@@ -91,10 +92,10 @@ from PIL import Image
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
PATH = "baidu/ERNIE-4.5-VL-28B-A3B-Paddle"
tokenizer = ErnieBotTokenizer.from_pretrained(PATH)
tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
messages = [
{
@@ -153,7 +154,8 @@ for output in outputs:
For supported configuration parameters, see the [FastDeploy Parameter Documentation](./parameters.md).
> Configuration notes:
>
> 1. Offline inference does not require the `port` and `metrics_port` parameters.
> 2. After the model service starts, a line such as `Doing profile, the total_block_num:640` is written to the log file log/fastdeploy.log, where 640 is the automatically computed number of KV Cache blocks. Multiply it by block_size (default 64) to get the total number of tokens that can be cached in the KV Cache after deployment.
> 3. `max_num_seqs` sets the maximum number of requests processed concurrently during the decode phase. A reasonable value can be derived from the cacheable token count in note 2. For example, with an observed average of 800 input tokens and 500 output tokens, and 640 KV Cache blocks of block_size 64 computed here, set `kv_cache_ratio = 800 / (800 + 500) = 0.6` and `max_num_seqs = 640 * 64 / (800 + 500) = 31`.
@@ -163,12 +165,12 @@ for output in outputs:
* sampling_params: Model sampling hyperparameters; see 2.4 for details
* use_tqdm: Whether to enable inference progress visualization
* chat_template_kwargs(dict): Extra parameters passed to the chat template; currently supports enable_thinking(bool)
  *Usage example: `chat_template_kwargs={"enable_thinking": False}`*
### 2.3 fastdeploy.LLM.generate
* prompts(str, list[str], list[int], list[list[int]], dict[str, Any], list[dict[str, Any]]): Input prompts; batch prompts and pre-tokenized token ids are both supported
  *Usage example of the dict form (see the sketch after this list): `prompts={"prompt": prompt, "multimodal_data": {"image": images}}`*
* sampling_params: Model sampling hyperparameters; see 2.4 for details
* use_tqdm: Whether to enable inference progress visualization
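A minimal sketch of the dict-type prompt described above (the prompt text and local image path are placeholders; building the actual multimodal prompt follows the ERNIE-4.5-VL example earlier in this document):

```python
from PIL import Image

from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

llm = LLM(model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle")

prompt = "..."                      # text prompt built as in the multimodal example above
images = [Image.open("demo.jpg")]   # hypothetical local image file

outputs = llm.generate(
    prompts={"prompt": prompt, "multimodal_data": {"image": images}},
    sampling_params=SamplingParams(max_tokens=128),
    use_tqdm=True,
)
```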
@@ -193,7 +195,7 @@ for output in outputs:
* outputs(fastdeploy.engine.request.CompletionOutput): The generation result
* finished(bool): Whether inference for the current query has finished
* metrics(fastdeploy.engine.request.RequestMetrics): Inference timing metrics
* num_cached_tokens(int): Number of cached tokens; only meaningful when ``enable_prefix_caching`` is enabled
* error_code(int): Error code
* error_msg(str): Error message
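Reading these fields off each result (a minimal sketch; `outputs` is the return value of `LLM.generate`/`LLM.chat` as in the earlier examples, and treating a falsy `error_code` as success is an assumption):

```python
for output in outputs:
    if output.error_code:                          # assumption: 0 / falsy means no error
        print("request failed:", output.error_msg)
        continue
    print("text:", output.outputs.text)            # CompletionOutput with the generated text
    print("finished:", output.finished)            # whether decoding of this query is done
    print("metrics:", output.metrics)              # RequestMetrics timing information
    print("cached:", output.num_cached_tokens)     # only meaningful with enable_prefix_caching
```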

View File

@@ -19,14 +19,14 @@ import os
import numpy as np
from paddleformers.generation import GenerationConfig
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.input.text_processor import BaseDataProcessor
from fastdeploy.utils import data_processor_logger
_SAMPLING_EPS = 1e-5
class ErnieProcessor(BaseDataProcessor):
class Ernie4_5Processor(BaseDataProcessor):
"""
Initialize the model instance.
@@ -431,9 +431,9 @@ class ErnieProcessor(BaseDataProcessor):
]
for i in range(len(vocab_file_names)):
if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
break
self.tokenizer = ErnieBotTokenizer.from_pretrained(self.model_name_or_path)
self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
def get_pad_id(self):
"""

View File

@@ -27,7 +27,7 @@ from paddleformers.transformers.tokenizer_utils_base import PaddingStrategy, Tex
from paddleformers.utils.log import logger
class ErnieBotTokenizer(PretrainedTokenizer):
class Ernie4_5Tokenizer(PretrainedTokenizer):
"""
An improved `ErnieBotTokenizer`.
encode currently supports the special tokens used in the sft/ppo stages, and multimodal input is also supported.
@@ -164,7 +164,7 @@ class ErnieBotTokenizer(PretrainedTokenizer):
"""doc"""
if "add_special_tokens" in kwargs:
kwargs.pop("add_special_tokens")
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
# logger.warning(f'Ernie4_5Tokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:

View File

@@ -14,14 +14,15 @@
# limitations under the License.
"""
from .process import IDS_TYPE_FLAG, DataProcessor, fancy_print
from .ernie4_5_vl_processor import Ernie4_5_VLProcessor
from .process import DataProcessor, fancy_print
from .process_video import read_video_decord
from .utils.video_utils import VideoReaderWrapper
__all__ = [
"DataProcessor",
"fancy_print",
"IDS_TYPE_FLAG",
"VideoReaderWrapper",
"read_video_decord",
"Ernie4_5_VLProcessor",
]

View File

@@ -20,12 +20,14 @@ import numpy as np
from paddleformers.generation import GenerationConfig
from fastdeploy.engine.request import Request
from fastdeploy.input.ernie_processor import ErnieProcessor
from fastdeploy.input.mm_processor import IDS_TYPE_FLAG, DataProcessor
from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.utils import data_processor_logger
from .process import DataProcessor
class ErnieMoEVLProcessor(ErnieProcessor):
class Ernie4_5_VLProcessor(Ernie4_5Processor):
"""The processor class for ERNIE MoE VL models."""
def __init__(
@@ -41,14 +43,14 @@ class ErnieMoEVLProcessor(ErnieProcessor):
preprocessor_path = model_name_or_path
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
self.ernie_processor = DataProcessor(
self.ernie4_5_processor = DataProcessor(
tokenizer_name=tokenizer_path,
image_preprocessor_name=preprocessor_path,
**processor_kwargs,
)
self.ernie_processor.eval()
self.image_patch_id = self.ernie_processor.image_patch_id
self.spatial_conv_size = self.ernie_processor.spatial_conv_size
self.ernie4_5_processor.eval()
self.image_patch_id = self.ernie4_5_processor.image_patch_id
self.spatial_conv_size = self.ernie4_5_processor.spatial_conv_size
self.tool_parser_dict = dict()
self.decode_status = dict()
@@ -86,7 +88,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
Returns:
tokenizer (AutoTokenizer)
"""
self.tokenizer = self.ernie_processor.tokenizer
self.tokenizer = self.ernie4_5_processor.tokenizer
def _apply_default_parameters(self, request):
"""
@@ -222,7 +224,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
images = multimodal_data.get("image", None)
videos = multimodal_data.get("video", None)
request["text_after_process"] = request.get("prompt")
outputs = self.ernie_processor.text2ids(request["prompt"], images, videos)
outputs = self.ernie4_5_processor.text2ids(request["prompt"], images, videos)
elif request.get("messages"):
messages = request["messages"]
self._check_mm_limits(messages)
@@ -235,7 +237,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
request.setdefault("enable_thinking", True)
outputs = self.ernie_processor.request2ids(request)
outputs = self.ernie4_5_processor.request2ids(request)
else:
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")

View File

@@ -26,15 +26,14 @@ from paddleformers.transformers.image_utils import ChannelDimension
from PIL import Image
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.utils import data_processor_logger
from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
from .process_video import read_frames_decord, read_video_decord
from .utils.render_timestamp import render_frame_timestamp
IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
def fancy_print(input_ids, tokenizer, image_patch_id=None):
"""
@@ -477,9 +476,9 @@ class DataProcessor:
]
for i in range(len(vocab_file_names)):
if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
break
self.tokenizer = ErnieBotTokenizer.from_pretrained(self.model_name_or_path)
self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
def apply_chat_template(self, request):
"""

View File

@@ -14,6 +14,6 @@
# limitations under the License.
"""
from .tokenizer_vl import ErnieVLTokenizer
from .ernie4_5_vl_tokenizer import Ernie4_5_VLTokenizer
__all__ = ["ErnieVLTokenizer"]
__all__ = ["Ernie4_5_VLTokenizer"]

View File

@@ -14,9 +14,6 @@
# limitations under the License.
"""
"""
ErnieVLTokenizer
"""
import os
import re
from shutil import copyfile
@@ -31,7 +28,7 @@ from paddleformers.transformers.tokenizer_utils_base import PaddingStrategy, Tex
from fastdeploy.utils import console_logger as logger
class ErnieVLTokenizer(PretrainedTokenizer):
class Ernie4_5_VLTokenizer(PretrainedTokenizer):
"""doc"""
resource_files_names = {
@@ -157,7 +154,7 @@ class ErnieVLTokenizer(PretrainedTokenizer):
"""doc"""
if "add_special_tokens" in kwargs:
kwargs.pop("add_special_tokens")
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
# logger.warning(f'Ernie4_5Tokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:

View File

@@ -89,18 +89,18 @@ class InputPreprocessor:
tool_parser_obj=tool_parser_obj,
)
else:
from fastdeploy.input.ernie_processor import ErnieProcessor
from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
self.processor = ErnieProcessor(
self.processor = Ernie4_5Processor(
model_name_or_path=self.model_name_or_path,
reasoning_parser_obj=reasoning_parser_obj,
tool_parser_obj=tool_parser_obj,
)
else:
if ErnieArchitectures.contains_ernie_arch(architectures):
from fastdeploy.input.ernie_vl_processor import ErnieMoEVLProcessor
from fastdeploy.input.ernie4_5_vl_processor import Ernie4_5_VLProcessor
self.processor = ErnieMoEVLProcessor(
self.processor = Ernie4_5_VLProcessor(
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,

View File

@@ -14,9 +14,10 @@
# limitations under the License.
"""
from .process import IDS_TYPE_FLAG, DataProcessor
from .process import DataProcessor
from .qwen_vl_processor import QwenVLProcessor
__all__ = [
"DataProcessor",
"IDS_TYPE_FLAG",
"QwenVLProcessor",
]

View File

@@ -21,7 +21,7 @@ import numpy as np
from paddleformers.transformers import AutoTokenizer
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.mm_processor import IDS_TYPE_FLAG
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.utils import data_processor_logger
from .image_processor import ImageProcessor

View File

@@ -20,7 +20,7 @@ from typing import Optional, Union
import numpy as np
from PIL import Image
from fastdeploy.input.mm_processor import read_video_decord
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
def read_frames(video_path):

View File

@@ -17,10 +17,11 @@
import numpy as np
from fastdeploy.engine.request import Request
from fastdeploy.input.qwen_mm_processor import DataProcessor
from fastdeploy.input.text_processor import DataProcessor as TextProcessor
from fastdeploy.utils import data_processor_logger
from .process import DataProcessor
class QwenVLProcessor(TextProcessor):
"""

fastdeploy/input/utils.py Normal file (21 lines added)
View File

@@ -0,0 +1,21 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
__all__ = [
"IDS_TYPE_FLAG",
]
IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
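With this new module, processors import the constant from the shared location instead of `fastdeploy.input.mm_processor` (a trivial usage sketch):

```python
from fastdeploy.input.utils import IDS_TYPE_FLAG

# Maps a modality name to the integer flag stored alongside token ids.
assert IDS_TYPE_FLAG["image"] == 1
```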

View File

@@ -279,7 +279,7 @@ class BackendBase:
tokenizer = PreTrainedTokenizerFast(__slow_tokenizer=tokenizer)
else:
from fastdeploy.model_executor.guided_decoding.ernie_tokenizer import (
ErnieBotTokenizer,
Ernie4_5Tokenizer,
)
vocab_file_names = [
@@ -294,10 +294,10 @@ class BackendBase:
vocab_file_names[i],
)
):
ErnieBotTokenizer.vocab_files_names["vocab_file"] = vocab_file_names[i]
Ernie4_5Tokenizer.vocab_files_names["vocab_file"] = vocab_file_names[i]
break
tokenizer = ErnieBotTokenizer.from_pretrained(self.fd_config.model_config.model)
tokenizer = Ernie4_5Tokenizer.from_pretrained(self.fd_config.model_config.model)
return tokenizer
except Exception as e:

View File

@@ -30,7 +30,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
class ErnieBotTokenizer(PreTrainedTokenizer):
class Ernie4_5Tokenizer(PreTrainedTokenizer):
"""
Construct an ErnieBot tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:

View File

@@ -66,7 +66,7 @@ if not (current_platform.is_dcu() or current_platform.is_iluvatar()):
from fastdeploy.spec_decode import MTPProposer, NgramProposer
from fastdeploy import envs
from fastdeploy.input.mm_processor import DataProcessor
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ScatterOp
from fastdeploy.worker.model_runner_base import ModelRunnerBase

View File

@@ -26,7 +26,7 @@ from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request, RequestType
from fastdeploy.input.mm_processor import DataProcessor
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.utils import (
profile_run_guard,

View File

@@ -38,7 +38,7 @@ from fastdeploy.config import (
ParallelConfig,
SpeculativeConfig,
)
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.model_executor.layers.quantization import get_quantization_config
@@ -106,7 +106,7 @@ def init_distributed_environment(seed: int = 20) -> Tuple[int, int]:
def update_fd_config_for_mm(fd_config: FDConfig) -> None:
if fd_config.model_config.enable_mm:
tokenizer = ErnieBotTokenizer.from_pretrained(
tokenizer = Ernie4_5Tokenizer.from_pretrained(
fd_config.model_config.model,
model_max_length=fd_config.parallel_config.max_model_len,
padding_side="right",

View File

@@ -12,7 +12,7 @@ from paddleformers.utils.env import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME
from paddleformers.utils.log import logger
from safetensors.numpy import save_file as safe_save_file
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.load_weight_utils import (
get_all_safetensors,
@@ -140,9 +140,9 @@ def main():
]
for i in range(len(vocab_file_names)):
if os.path.exists(os.path.join(args.model_name_or_path, vocab_file_names[i])):
ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
break
tokenizer = ErnieBotTokenizer.from_pretrained(args.model_name_or_path)
tokenizer = Ernie4_5Tokenizer.from_pretrained(args.model_name_or_path)
_, safetensor_files = get_all_safetensors(args.model_name_or_path)
weights_iterator = safetensors_weights_iterator(safetensor_files)
state_dict = {}

View File

@@ -211,7 +211,7 @@ setup(
"model_executor/ops/iluvatar/*",
"model_executor/models/*",
"model_executor/layers/*",
"input/mm_processor/utils/*",
"input/ernie4_5_vl_processor/utils/*",
"model_executor/ops/gcu/*",
"version.txt",
]

View File

@@ -738,8 +738,8 @@ def test_non_streaming_chat_with_disable_chat_template(openai_client, capsys):
assert hasattr(enabled_response, "choices")
assert len(enabled_response.choices) > 0
# from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
# tokenizer = ErnieBotTokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True)
# from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
# tokenizer = Ernie4_5Tokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True)
# prompt = tokenizer.apply_chat_template([{"role": "user", "content": "Hello, how are you?"}], tokenize=False)
prompt = "<|begin_of_sentence|>User: Hello, how are you?\nAssistant: "
disabled_response = openai_client.chat.completions.create(
@@ -821,9 +821,9 @@ def test_non_streaming_chat_with_bad_words(openai_client, capsys):
assert hasattr(response_0.choices[0].message, "completion_token_ids")
assert isinstance(response_0.choices[0].message.completion_token_ids, list)
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
tokenizer = ErnieBotTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True)
output_tokens_0 = []
output_ids_0 = []
for ids in response_0.choices[0].message.completion_token_ids:
@@ -977,9 +977,9 @@ def test_non_streaming_completion_with_bad_words(openai_client, capsys):
assert hasattr(response_0.choices[0], "completion_token_ids")
assert isinstance(response_0.choices[0].completion_token_ids, list)
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
tokenizer = ErnieBotTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True)
output_tokens_0 = []
output_ids_0 = []
for ids in response_0.choices[0].completion_token_ids:

View File

@@ -733,8 +733,8 @@ def test_non_streaming_chat_completion_disable_chat_template(openai_client, caps
assert hasattr(enabled_response, "choices")
assert len(enabled_response.choices) > 0
# from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
# tokenizer = ErnieBotTokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True)
# from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
# tokenizer = Ernie4_5Tokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True)
# prompt = tokenizer.apply_chat_template([{"role": "user", "content": "Hello, how are you?"}], tokenize=False)
prompt = "<|begin_of_sentence|>User: Hello, how are you?\nAssistant: "
disabled_response = openai_client.chat.completions.create(
@@ -816,9 +816,9 @@ def test_non_streaming_chat_with_bad_words(openai_client, capsys):
assert hasattr(response_0.choices[0].message, "completion_token_ids")
assert isinstance(response_0.choices[0].message.completion_token_ids, list)
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
tokenizer = ErnieBotTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True)
output_tokens_0 = []
output_ids_0 = []
for ids in response_0.choices[0].message.completion_token_ids:
@@ -972,9 +972,9 @@ def test_non_streaming_completion_with_bad_words(openai_client, capsys):
assert hasattr(response_0.choices[0], "completion_token_ids")
assert isinstance(response_0.choices[0].completion_token_ids, list)
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
tokenizer = ErnieBotTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True)
output_tokens_0 = []
output_ids_0 = []
for ids in response_0.choices[0].completion_token_ids:

View File

@@ -1,14 +1,14 @@
import unittest
from unittest.mock import MagicMock, patch
from fastdeploy.input.ernie_processor import ErnieProcessor
from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
class TestErnieProcessorProcessResponseDictStreaming(unittest.TestCase):
class TestErnie4_5ProcessorProcessResponseDictStreaming(unittest.TestCase):
def setUp(self):
# Create a mock of an ErnieProcessor instance
with patch.object(ErnieProcessor, "__init__", return_value=None) as mock_init:
self.processor = ErnieProcessor("model_path")
# Create a mock of an Ernie4_5Processor instance
with patch.object(Ernie4_5Processor, "__init__", return_value=None) as mock_init:
self.processor = Ernie4_5Processor("model_path")
mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}")
# Set the required attributes

View File

@@ -101,7 +101,7 @@ class TestQwenVLProcessor(unittest.TestCase):
self.patcher_parse_video.start()
self.patcher_read_frames = patch(
"fastdeploy.input.qwen_mm_processor.process.read_frames", return_value=mock_read_frames(480, 640, 5, 2)
"fastdeploy.input.qwen_vl_processor.process.read_frames", return_value=mock_read_frames(480, 640, 5, 2)
)
self.patcher_read_frames.start()

View File

@@ -9,8 +9,8 @@ from fastdeploy.entrypoints.chat_utils import load_chat_template
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest
from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat
from fastdeploy.input.ernie_processor import ErnieProcessor
from fastdeploy.input.ernie_vl_processor import ErnieMoEVLProcessor
from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
from fastdeploy.input.ernie4_5_vl_processor import Ernie4_5_VLProcessor
from fastdeploy.input.text_processor import DataProcessor
@@ -108,10 +108,10 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
chat_completion = await self.chat_completion_handler.create_chat_completion(request)
self.assertEqual("hello", chat_completion["chat_template"])
@patch("fastdeploy.input.ernie_vl_processor.ErnieMoEVLProcessor.__init__")
def test_vl_processor(self, mock_class):
@patch("fastdeploy.input.ernie4_5_vl_processor.Ernie4_5_VLProcessor.__init__")
def test_ernie4_5_vl_processor(self, mock_class):
mock_class.return_value = None
vl_processor = ErnieMoEVLProcessor()
ernie4_5_vl_processor = Ernie4_5_VLProcessor()
mock_request = Request.from_dict({"request_id": "123"})
def mock_apply_default_parameters(request):
@@ -120,9 +120,9 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
def mock_process_request(request, max_model_len):
return request
vl_processor._apply_default_parameters = mock_apply_default_parameters
vl_processor.process_request_dict = mock_process_request
result = vl_processor.process_request(mock_request, chat_template="hello")
ernie4_5_vl_processor._apply_default_parameters = mock_apply_default_parameters
ernie4_5_vl_processor.process_request_dict = mock_process_request
result = ernie4_5_vl_processor.process_request(mock_request, chat_template="hello")
self.assertEqual("hello", result.chat_template)
@patch("fastdeploy.input.text_processor.DataProcessor.__init__")
@@ -149,10 +149,10 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
result = text_processor.process_request(mock_request, chat_template="hello")
self.assertEqual("hello", result.chat_template)
@patch("fastdeploy.input.ernie_processor.ErnieProcessor.__init__")
def test_ernie_processor_process(self, mock_class):
@patch("fastdeploy.input.ernie4_5_processor.Ernie4_5Processor.__init__")
def test_ernie4_5_processor_process(self, mock_class):
mock_class.return_value = None
ernie_processor = ErnieProcessor()
ernie4_5_processor = Ernie4_5Processor()
mock_request = Request.from_dict(
{"request_id": "123", "messages": ["hi"], "max_tokens": 128, "temperature": 1, "top_p": 1}
)
@@ -166,12 +166,12 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
def mock_messages2ids(text):
return [1]
ernie_processor._apply_default_parameters = mock_apply_default_parameters
ernie_processor.process_request_dict = mock_process_request
ernie_processor.messages2ids = mock_messages2ids
ernie_processor.eos_token_ids = [1]
ernie_processor.reasoning_parser = MagicMock()
result = ernie_processor.process_request(mock_request, chat_template="hello")
ernie4_5_processor._apply_default_parameters = mock_apply_default_parameters
ernie4_5_processor.process_request_dict = mock_process_request
ernie4_5_processor.messages2ids = mock_messages2ids
ernie4_5_processor.eos_token_ids = [1]
ernie4_5_processor.reasoning_parser = MagicMock()
result = ernie4_5_processor.process_request(mock_request, chat_template="hello")
self.assertEqual("hello", result.chat_template)
@patch("fastdeploy.entrypoints.llm.LLM.__init__")