rename ernie_xxx to ernie4_5_xxx (#3621)

* rename ernie_xxx to ernie4_5_xxx

* ci fix
Yuanle Liu
2025-08-26 19:29:27 +08:00
committed by GitHub
parent 642480f5f6
commit cbce94a00e
37 changed files with 126 additions and 100 deletions
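For downstream code, the practical effect of this commit is the module and class rename. A minimal before/after sketch of the import changes, using only the paths that appear in the diff below:

```python
# Before this commit
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.ernie_processor import ErnieProcessor
from fastdeploy.input.ernie_vl_processor import ErnieMoEVLProcessor

# After this commit
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
from fastdeploy.input.ernie4_5_vl_processor import Ernie4_5_VLProcessor
```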

View File

@@ -1,6 +1,7 @@
# Offline Inference
## 1. Usage
FastDeploy supports offline inference by loading models locally and processing user data. Usage examples:
### Chat Interface (LLM.chat)
@@ -91,10 +92,10 @@ from PIL import Image
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.engine.sampling_params import SamplingParams
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
PATH = "baidu/ERNIE-4.5-VL-28B-A3B-Paddle"
-tokenizer = ErnieBotTokenizer.from_pretrained(PATH)
+tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
messages = [
{
@@ -144,15 +145,16 @@ for output in outputs:
```
->Note: The `generate interface` does not currently support passing parameters to control the thinking function (on/off). It always uses the model's default parameters.
+> Note: The `generate interface` does not currently support passing parameters to control the thinking function (on/off). It always uses the model's default parameters.
## 2. API Documentation
### 2.1 fastdeploy.LLM
-For ```LLM``` configuration, refer to [Parameter Documentation](parameters.md).
+For ``LLM`` configuration, refer to [Parameter Documentation](parameters.md).
> Configuration Notes:
+>
> 1. `port` and `metrics_port` are only used for online inference.
> 2. After startup, the service logs the KV Cache block count (e.g. `total_block_num:640`). Multiply this by block_size (default 64) to get total cacheable tokens.
> 3. Calculate `max_num_seqs` based on cacheable tokens. Example: avg input=800 tokens, output=500 tokens, blocks=640 → `kv_cache_ratio = 800/(800+500)=0.6`, `max_num_seqs = 640*64/(800+500)=31`.
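The arithmetic in note 3 can be scripted directly; a small sketch using the example values from the note above (not tuned defaults):

```python
# Example values from the configuration note above.
avg_input_tokens = 800
avg_output_tokens = 500
total_block_num = 640   # reported in the service log after profiling
block_size = 64         # default block size

cacheable_tokens = total_block_num * block_size                              # 40960
kv_cache_ratio = avg_input_tokens / (avg_input_tokens + avg_output_tokens)   # ~0.6
max_num_seqs = cacheable_tokens // (avg_input_tokens + avg_output_tokens)    # 31
```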

View File

@@ -1,6 +1,7 @@
# Offline Inference
## 1. Usage
FastDeploy supports offline inference by loading models locally and processing user data. Usage examples:
### Chat Interface (LLM.chat)
@@ -32,9 +33,9 @@ for output in outputs:
generated_text = output.outputs.text
```
-For the ```LLM``` configuration used in the example above, see the documentation below for `SamplingParams`, `LLM.generate`, `LLM.chat`, and the output structure `RequestOutput`.
+For the ``LLM`` configuration used in the example above, see the documentation below for `SamplingParams`, `LLM.generate`, `LLM.chat`, and the output structure `RequestOutput`.
> Note: For reasoning models, specify the `reasoning_parser` parameter when loading the model; thinking can then be toggled per request via the `enable_thinking` field in `chat_template_kwargs`.
```python
from fastdeploy.entrypoints.llm import LLM
@@ -82,7 +83,7 @@ for output in outputs:
> Note: The continuation interface suits scenarios where the user has already composed the full context and only wants the model to continue it; inference adds no extra `prompt` concatenation.
> For `chat` models, the chat interface (LLM.chat) is recommended.
For multimodal models such as `baidu/ERNIE-4.5-VL-28B-A3B-Paddle`, a prompt containing the image must be provided when calling the `generate interface`, as follows:
```python
import io
@@ -91,10 +92,10 @@ from PIL import Image
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.engine.sampling_params import SamplingParams
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
PATH = "baidu/ERNIE-4.5-VL-28B-A3B-Paddle"
-tokenizer = ErnieBotTokenizer.from_pretrained(PATH)
+tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
messages = [
{
@@ -153,7 +154,8 @@ for output in outputs:
For supported configuration parameters, see the [FastDeploy parameter documentation](./parameters.md)
> Configuration notes:
-> 1. Offline inference does not require the `port` and `metrics_port` parameters.
+>
+> 1. Offline inference does not require the `port` and `metrics_port` parameters.
> 2. After the model service starts, the log file log/fastdeploy.log prints a line such as `Doing profile, the total_block_num:640`, where 640 is the automatically computed number of KV Cache blocks; multiply it by block_size (default 64) to get the total number of tokens that can be cached in the KV Cache after deployment.
> 3. `max_num_seqs` sets the maximum number of requests processed concurrently in the decode phase. A reasonable value can be derived from the cacheable token count above: for example, with an average of 800 input tokens and 500 output tokens observed online, and 640 KV Cache blocks with block_size 64, we can set `kv_cache_ratio = 800 / (800 + 500) = 0.6` and `max_num_seqs = 640 * 64 / (800 + 500) = 31`.
@@ -163,12 +165,12 @@ for output in outputs:
* sampling_params: model hyperparameter settings; see 2.4 for details
* use_tqdm: whether to enable the inference progress bar
* chat_template_kwargs(dict): extra parameters passed to the chat template; currently supports enable_thinking(bool)
  *Usage example: `chat_template_kwargs={"enable_thinking": False}`*
### 2.3 fastdeploy.LLM.generate
* prompts(str, list[str], list[int], list[list[int]], dict[str, Any], list[dict[str, Any]]): input prompts; supports batched prompts and decoded token ids as input
  *Usage example for the dict type: `prompts={"prompt": prompt, "multimodal_data": {"image": images}}`*
* sampling_params: model hyperparameter settings; see 2.4 for details
* use_tqdm: whether to enable the inference progress bar
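A rough sketch tying the two interfaces to the parameters listed above (a sketch only; the model path and image objects are placeholders, and the call signatures follow the descriptions in this document):

```python
from fastdeploy.entrypoints.llm import LLM

llm = LLM(model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle")  # placeholder model path

# Chat interface: thinking toggled through chat_template_kwargs.
chat_outputs = llm.chat(
    [{"role": "user", "content": "Hello"}],
    chat_template_kwargs={"enable_thinking": False},
)

# Generate interface with a dict prompt carrying multimodal data;
# `images` would be a list of PIL.Image objects prepared beforehand.
# gen_outputs = llm.generate(prompts={"prompt": prompt, "multimodal_data": {"image": images}})
```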
@@ -193,7 +195,7 @@ for output in outputs:
* outputs(fastdeploy.engine.request.CompletionOutput): output results
* finished(bool): whether inference for the current query has finished
* metrics(fastdeploy.engine.request.RequestMetrics): inference timing metrics
-* num_cached_tokens(int): number of cached tokens; only valid when ```enable_prefix_caching``` is enabled
+* num_cached_tokens(int): number of cached tokens; only valid when ``enable_prefix_caching`` is enabled
* error_code(int): error code
* error_msg(str): error message
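A short sketch of consuming these fields (field names as documented above; treating a non-zero error_code as an error is an assumption):

```python
for output in outputs:
    if not output.finished:
        continue
    print(output.outputs.text)        # generated text
    print(output.num_cached_tokens)   # meaningful only with enable_prefix_caching
    if output.error_code:             # assumption: non-zero signals an error
        print(output.error_msg)
```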

View File

@@ -19,14 +19,14 @@ import os
import numpy as np
from paddleformers.generation import GenerationConfig
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.input.text_processor import BaseDataProcessor
from fastdeploy.utils import data_processor_logger
_SAMPLING_EPS = 1e-5
-class ErnieProcessor(BaseDataProcessor):
+class Ernie4_5Processor(BaseDataProcessor):
"""
Initialize the model instance
@@ -431,9 +431,9 @@ class ErnieProcessor(BaseDataProcessor):
]
for i in range(len(vocab_file_names)):
if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
-ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
+Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
break
-self.tokenizer = ErnieBotTokenizer.from_pretrained(self.model_name_or_path)
+self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
def get_pad_id(self):
"""

View File

@@ -27,7 +27,7 @@ from paddleformers.transformers.tokenizer_utils_base import PaddingStrategy, Tex
from paddleformers.utils.log import logger
-class ErnieBotTokenizer(PretrainedTokenizer):
+class Ernie4_5Tokenizer(PretrainedTokenizer):
"""
A more user-friendly `ErnieBotTokenizer`
encode currently supports the special tokens used in the sft/ppo stages, as well as multimodal inputs
@@ -164,7 +164,7 @@ class ErnieBotTokenizer(PretrainedTokenizer):
"""doc""" """doc"""
if "add_special_tokens" in kwargs: if "add_special_tokens" in kwargs:
kwargs.pop("add_special_tokens") kwargs.pop("add_special_tokens")
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`') # logger.warning(f'Ernie4_5Tokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs) return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:

View File

@@ -14,14 +14,15 @@
# limitations under the License.
"""
-from .process import IDS_TYPE_FLAG, DataProcessor, fancy_print
+from .ernie4_5_vl_processor import Ernie4_5_VLProcessor
+from .process import DataProcessor, fancy_print
from .process_video import read_video_decord
from .utils.video_utils import VideoReaderWrapper
__all__ = [
"DataProcessor",
"fancy_print",
-"IDS_TYPE_FLAG",
"VideoReaderWrapper",
"read_video_decord",
+"Ernie4_5_VLProcessor",
]

View File

@@ -20,12 +20,14 @@ import numpy as np
from paddleformers.generation import GenerationConfig
from fastdeploy.engine.request import Request
-from fastdeploy.input.ernie_processor import ErnieProcessor
+from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
-from fastdeploy.input.mm_processor import IDS_TYPE_FLAG, DataProcessor
+from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.utils import data_processor_logger
+from .process import DataProcessor
-class ErnieMoEVLProcessor(ErnieProcessor):
+class Ernie4_5_VLProcessor(Ernie4_5Processor):
"""The processor class for ERNIE MoE VL models."""
def __init__(
@@ -41,14 +43,14 @@ class ErnieMoEVLProcessor(ErnieProcessor):
preprocessor_path = model_name_or_path
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
-self.ernie_processor = DataProcessor(
+self.ernie4_5_processor = DataProcessor(
tokenizer_name=tokenizer_path,
image_preprocessor_name=preprocessor_path,
**processor_kwargs,
)
-self.ernie_processor.eval()
+self.ernie4_5_processor.eval()
-self.image_patch_id = self.ernie_processor.image_patch_id
+self.image_patch_id = self.ernie4_5_processor.image_patch_id
-self.spatial_conv_size = self.ernie_processor.spatial_conv_size
+self.spatial_conv_size = self.ernie4_5_processor.spatial_conv_size
self.tool_parser_dict = dict()
self.decode_status = dict()
@@ -86,7 +88,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
Returns:
tokenizer (AutoTokenizer)
"""
-self.tokenizer = self.ernie_processor.tokenizer
+self.tokenizer = self.ernie4_5_processor.tokenizer
def _apply_default_parameters(self, request):
"""
@@ -222,7 +224,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
images = multimodal_data.get("image", None)
videos = multimodal_data.get("video", None)
request["text_after_process"] = request.get("prompt")
-outputs = self.ernie_processor.text2ids(request["prompt"], images, videos)
+outputs = self.ernie4_5_processor.text2ids(request["prompt"], images, videos)
elif request.get("messages"):
messages = request["messages"]
self._check_mm_limits(messages)
@@ -235,7 +237,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
request.setdefault("enable_thinking", True)
-outputs = self.ernie_processor.request2ids(request)
+outputs = self.ernie4_5_processor.request2ids(request)
else:
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")

View File

@@ -26,15 +26,14 @@ from paddleformers.transformers.image_utils import ChannelDimension
from PIL import Image
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
+from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.utils import data_processor_logger
from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
from .process_video import read_frames_decord, read_video_decord
from .utils.render_timestamp import render_frame_timestamp
-IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
def fancy_print(input_ids, tokenizer, image_patch_id=None):
"""
@@ -477,9 +476,9 @@ class DataProcessor:
]
for i in range(len(vocab_file_names)):
if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
-ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
+Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
break
-self.tokenizer = ErnieBotTokenizer.from_pretrained(self.model_name_or_path)
+self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
def apply_chat_template(self, request):
"""

View File

@@ -14,6 +14,6 @@
# limitations under the License.
"""
-from .tokenizer_vl import ErnieVLTokenizer
+from .ernie4_5_vl_tokenizer import Ernie4_5_VLTokenizer
-__all__ = ["ErnieVLTokenizer"]
+__all__ = ["Ernie4_5_VLTokenizer"]

View File

@@ -14,9 +14,6 @@
# limitations under the License.
"""
-"""
-ErnieVLTokenizer
-"""
import os
import re
from shutil import copyfile
@@ -31,7 +28,7 @@ from paddleformers.transformers.tokenizer_utils_base import PaddingStrategy, Tex
from fastdeploy.utils import console_logger as logger
-class ErnieVLTokenizer(PretrainedTokenizer):
+class Ernie4_5_VLTokenizer(PretrainedTokenizer):
"""doc"""
resource_files_names = {
@@ -157,7 +154,7 @@ class ErnieVLTokenizer(PretrainedTokenizer):
"""doc""" """doc"""
if "add_special_tokens" in kwargs: if "add_special_tokens" in kwargs:
kwargs.pop("add_special_tokens") kwargs.pop("add_special_tokens")
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`') # logger.warning(f'Ernie4_5Tokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs) return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:

View File

@@ -89,18 +89,18 @@ class InputPreprocessor:
tool_parser_obj=tool_parser_obj,
)
else:
-from fastdeploy.input.ernie_processor import ErnieProcessor
+from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
-self.processor = ErnieProcessor(
+self.processor = Ernie4_5Processor(
model_name_or_path=self.model_name_or_path,
reasoning_parser_obj=reasoning_parser_obj,
tool_parser_obj=tool_parser_obj,
)
else:
if ErnieArchitectures.contains_ernie_arch(architectures):
-from fastdeploy.input.ernie_vl_processor import ErnieMoEVLProcessor
+from fastdeploy.input.ernie4_5_vl_processor import Ernie4_5_VLProcessor
-self.processor = ErnieMoEVLProcessor(
+self.processor = Ernie4_5_VLProcessor(
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,

View File

@@ -14,9 +14,10 @@
# limitations under the License.
"""
-from .process import IDS_TYPE_FLAG, DataProcessor
+from .process import DataProcessor
+from .qwen_vl_processor import QwenVLProcessor
__all__ = [
"DataProcessor",
-"IDS_TYPE_FLAG",
+"QwenVLProcessor",
]

View File

@@ -21,7 +21,7 @@ import numpy as np
from paddleformers.transformers import AutoTokenizer
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.mm_processor import IDS_TYPE_FLAG
+from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.utils import data_processor_logger
from .image_processor import ImageProcessor

View File

@@ -20,7 +20,7 @@ from typing import Optional, Union
import numpy as np
from PIL import Image
-from fastdeploy.input.mm_processor import read_video_decord
+from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
def read_frames(video_path):

View File

@@ -17,10 +17,11 @@
import numpy as np
from fastdeploy.engine.request import Request
-from fastdeploy.input.qwen_mm_processor import DataProcessor
from fastdeploy.input.text_processor import DataProcessor as TextProcessor
from fastdeploy.utils import data_processor_logger
+from .process import DataProcessor
class QwenVLProcessor(TextProcessor):
"""

fastdeploy/input/utils.py (new file, 21 lines)
View File

@@ -0,0 +1,21 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
__all__ = [
"IDS_TYPE_FLAG",
]
IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
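This new module holds the modality-tag mapping previously defined inside the mm_processor package; a trivial usage sketch of the shared import path:

```python
from fastdeploy.input.utils import IDS_TYPE_FLAG

# Tag a span of token ids as image tokens when assembling multimodal inputs.
image_flag = IDS_TYPE_FLAG["image"]   # -> 1
```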

View File

@@ -279,7 +279,7 @@ class BackendBase:
tokenizer = PreTrainedTokenizerFast(__slow_tokenizer=tokenizer)
else:
from fastdeploy.model_executor.guided_decoding.ernie_tokenizer import (
-ErnieBotTokenizer,
+Ernie4_5Tokenizer,
)
vocab_file_names = [
@@ -294,10 +294,10 @@ class BackendBase:
vocab_file_names[i],
)
):
-ErnieBotTokenizer.vocab_files_names["vocab_file"] = vocab_file_names[i]
+Ernie4_5Tokenizer.vocab_files_names["vocab_file"] = vocab_file_names[i]
break
-tokenizer = ErnieBotTokenizer.from_pretrained(self.fd_config.model_config.model)
+tokenizer = Ernie4_5Tokenizer.from_pretrained(self.fd_config.model_config.model)
return tokenizer
except Exception as e:

View File

@@ -30,7 +30,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
-class ErnieBotTokenizer(PreTrainedTokenizer):
+class Ernie4_5Tokenizer(PreTrainedTokenizer):
"""
Construct a ErnieBot tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:

View File

@@ -66,7 +66,7 @@ if not (current_platform.is_dcu() or current_platform.is_iluvatar()):
from fastdeploy.spec_decode import MTPProposer, NgramProposer
from fastdeploy import envs
-from fastdeploy.input.mm_processor import DataProcessor
+from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ScatterOp
from fastdeploy.worker.model_runner_base import ModelRunnerBase

View File

@@ -26,7 +26,7 @@ from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request, RequestType
-from fastdeploy.input.mm_processor import DataProcessor
+from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.utils import (
profile_run_guard,

View File

@@ -38,7 +38,7 @@ from fastdeploy.config import (
ParallelConfig,
SpeculativeConfig,
)
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.model_executor.layers.quantization import get_quantization_config
@@ -106,7 +106,7 @@ def init_distributed_environment(seed: int = 20) -> Tuple[int, int]:
def update_fd_config_for_mm(fd_config: FDConfig) -> None:
if fd_config.model_config.enable_mm:
-tokenizer = ErnieBotTokenizer.from_pretrained(
+tokenizer = Ernie4_5Tokenizer.from_pretrained(
fd_config.model_config.model,
model_max_length=fd_config.parallel_config.max_model_len,
padding_side="right",

View File

@@ -12,7 +12,7 @@ from paddleformers.utils.env import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME
from paddleformers.utils.log import logger
from safetensors.numpy import save_file as safe_save_file
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.load_weight_utils import (
get_all_safetensors,
@@ -140,9 +140,9 @@ def main():
]
for i in range(len(vocab_file_names)):
if os.path.exists(os.path.join(args.model_name_or_path, vocab_file_names[i])):
-ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
+Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
break
-tokenizer = ErnieBotTokenizer.from_pretrained(args.model_name_or_path)
+tokenizer = Ernie4_5Tokenizer.from_pretrained(args.model_name_or_path)
_, safetensor_files = get_all_safetensors(args.model_name_or_path)
weights_iterator = safetensors_weights_iterator(safetensor_files)
state_dict = {}

View File

@@ -211,7 +211,7 @@ setup(
"model_executor/ops/iluvatar/*", "model_executor/ops/iluvatar/*",
"model_executor/models/*", "model_executor/models/*",
"model_executor/layers/*", "model_executor/layers/*",
"input/mm_processor/utils/*", "input/ernie4_5_vl_processor/utils/*",
"model_executor/ops/gcu/*", "model_executor/ops/gcu/*",
"version.txt", "version.txt",
] ]

View File

@@ -738,8 +738,8 @@ def test_non_streaming_chat_with_disable_chat_template(openai_client, capsys):
assert hasattr(enabled_response, "choices")
assert len(enabled_response.choices) > 0
-# from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+# from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-# tokenizer = ErnieBotTokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True)
+# tokenizer = Ernie4_5Tokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True)
# prompt = tokenizer.apply_chat_template([{"role": "user", "content": "Hello, how are you?"}], tokenize=False)
prompt = "<|begin_of_sentence|>User: Hello, how are you?\nAssistant: "
disabled_response = openai_client.chat.completions.create(
@@ -821,9 +821,9 @@ def test_non_streaming_chat_with_bad_words(openai_client, capsys):
assert hasattr(response_0.choices[0].message, "completion_token_ids")
assert isinstance(response_0.choices[0].message.completion_token_ids, list)
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-tokenizer = ErnieBotTokenizer.from_pretrained(model_path, trust_remote_code=True)
+tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True)
output_tokens_0 = []
output_ids_0 = []
for ids in response_0.choices[0].message.completion_token_ids:
@@ -977,9 +977,9 @@ def test_non_streaming_completion_with_bad_words(openai_client, capsys):
assert hasattr(response_0.choices[0], "completion_token_ids")
assert isinstance(response_0.choices[0].completion_token_ids, list)
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-tokenizer = ErnieBotTokenizer.from_pretrained(model_path, trust_remote_code=True)
+tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True)
output_tokens_0 = []
output_ids_0 = []
for ids in response_0.choices[0].completion_token_ids:

View File

@@ -733,8 +733,8 @@ def test_non_streaming_chat_completion_disable_chat_template(openai_client, caps
assert hasattr(enabled_response, "choices")
assert len(enabled_response.choices) > 0
-# from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+# from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-# tokenizer = ErnieBotTokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True)
+# tokenizer = Ernie4_5Tokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True)
# prompt = tokenizer.apply_chat_template([{"role": "user", "content": "Hello, how are you?"}], tokenize=False)
prompt = "<|begin_of_sentence|>User: Hello, how are you?\nAssistant: "
disabled_response = openai_client.chat.completions.create(
@@ -816,9 +816,9 @@ def test_non_streaming_chat_with_bad_words(openai_client, capsys):
assert hasattr(response_0.choices[0].message, "completion_token_ids")
assert isinstance(response_0.choices[0].message.completion_token_ids, list)
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-tokenizer = ErnieBotTokenizer.from_pretrained(model_path, trust_remote_code=True)
+tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True)
output_tokens_0 = []
output_ids_0 = []
for ids in response_0.choices[0].message.completion_token_ids:
@@ -972,9 +972,9 @@ def test_non_streaming_completion_with_bad_words(openai_client, capsys):
assert hasattr(response_0.choices[0], "completion_token_ids")
assert isinstance(response_0.choices[0].completion_token_ids, list)
-from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-tokenizer = ErnieBotTokenizer.from_pretrained(model_path, trust_remote_code=True)
+tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True)
output_tokens_0 = []
output_ids_0 = []
for ids in response_0.choices[0].completion_token_ids:

View File

@@ -1,14 +1,14 @@
import unittest
from unittest.mock import MagicMock, patch
-from fastdeploy.input.ernie_processor import ErnieProcessor
+from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
-class TestErnieProcessorProcessResponseDictStreaming(unittest.TestCase):
+class TestErnie4_5ProcessorProcessResponseDictStreaming(unittest.TestCase):
def setUp(self):
-# Create a mock ErnieProcessor instance
+# Create a mock Ernie4_5Processor instance
-with patch.object(ErnieProcessor, "__init__", return_value=None) as mock_init:
+with patch.object(Ernie4_5Processor, "__init__", return_value=None) as mock_init:
-self.processor = ErnieProcessor("model_path")
+self.processor = Ernie4_5Processor("model_path")
mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}")
# Set the required attributes

View File

@@ -101,7 +101,7 @@ class TestQwenVLProcessor(unittest.TestCase):
self.patcher_parse_video.start()
self.patcher_read_frames = patch(
-"fastdeploy.input.qwen_mm_processor.process.read_frames", return_value=mock_read_frames(480, 640, 5, 2)
+"fastdeploy.input.qwen_vl_processor.process.read_frames", return_value=mock_read_frames(480, 640, 5, 2)
)
self.patcher_read_frames.start()

View File

@@ -9,8 +9,8 @@ from fastdeploy.entrypoints.chat_utils import load_chat_template
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest
from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat
-from fastdeploy.input.ernie_processor import ErnieProcessor
+from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
-from fastdeploy.input.ernie_vl_processor import ErnieMoEVLProcessor
+from fastdeploy.input.ernie4_5_vl_processor import Ernie4_5_VLProcessor
from fastdeploy.input.text_processor import DataProcessor
@@ -108,10 +108,10 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
chat_completion = await self.chat_completion_handler.create_chat_completion(request)
self.assertEqual("hello", chat_completion["chat_template"])
-@patch("fastdeploy.input.ernie_vl_processor.ErnieMoEVLProcessor.__init__")
+@patch("fastdeploy.input.ernie4_5_vl_processor.Ernie4_5_VLProcessor.__init__")
-def test_vl_processor(self, mock_class):
+def test_ernie4_5_vl_processor(self, mock_class):
mock_class.return_value = None
-vl_processor = ErnieMoEVLProcessor()
+ernie4_5_vl_processor = Ernie4_5_VLProcessor()
mock_request = Request.from_dict({"request_id": "123"})
def mock_apply_default_parameters(request):
@@ -120,9 +120,9 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
def mock_process_request(request, max_model_len):
return request
-vl_processor._apply_default_parameters = mock_apply_default_parameters
+ernie4_5_vl_processor._apply_default_parameters = mock_apply_default_parameters
-vl_processor.process_request_dict = mock_process_request
+ernie4_5_vl_processor.process_request_dict = mock_process_request
-result = vl_processor.process_request(mock_request, chat_template="hello")
+result = ernie4_5_vl_processor.process_request(mock_request, chat_template="hello")
self.assertEqual("hello", result.chat_template)
@patch("fastdeploy.input.text_processor.DataProcessor.__init__")
@@ -149,10 +149,10 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
result = text_processor.process_request(mock_request, chat_template="hello")
self.assertEqual("hello", result.chat_template)
-@patch("fastdeploy.input.ernie_processor.ErnieProcessor.__init__")
+@patch("fastdeploy.input.ernie4_5_processor.Ernie4_5Processor.__init__")
-def test_ernie_processor_process(self, mock_class):
+def test_ernie4_5_processor_process(self, mock_class):
mock_class.return_value = None
-ernie_processor = ErnieProcessor()
+ernie4_5_processor = Ernie4_5Processor()
mock_request = Request.from_dict(
{"request_id": "123", "messages": ["hi"], "max_tokens": 128, "temperature": 1, "top_p": 1}
)
@@ -166,12 +166,12 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
def mock_messages2ids(text):
return [1]
-ernie_processor._apply_default_parameters = mock_apply_default_parameters
+ernie4_5_processor._apply_default_parameters = mock_apply_default_parameters
-ernie_processor.process_request_dict = mock_process_request
+ernie4_5_processor.process_request_dict = mock_process_request
-ernie_processor.messages2ids = mock_messages2ids
+ernie4_5_processor.messages2ids = mock_messages2ids
-ernie_processor.eos_token_ids = [1]
+ernie4_5_processor.eos_token_ids = [1]
-ernie_processor.reasoning_parser = MagicMock()
+ernie4_5_processor.reasoning_parser = MagicMock()
-result = ernie_processor.process_request(mock_request, chat_template="hello")
+result = ernie4_5_processor.process_request(mock_request, chat_template="hello")
self.assertEqual("hello", result.chat_template)
@patch("fastdeploy.entrypoints.llm.LLM.__init__")