mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Feature] Unify the registration name recognition for tool_parser and reasoning_parser to “-” (#4668)
* parser register name unify * change ernie_x1 to ernie-x1 * change ernie4_5_vl to ernie-45-vl * fix unit test
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
reasoning-parser: ernie_x1
|
reasoning-parser: ernie-x1
|
||||||
tool_call_parser: ernie_x1
|
tool_call_parser: ernie-x1
|
||||||
tensor_parallel_size: 4
|
tensor_parallel_size: 4
|
||||||
max_model_len: 65536
|
max_model_len: 65536
|
||||||
max_num_seqs: 128
|
max_num_seqs: 128
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
tensor_parallel_size: 1
|
tensor_parallel_size: 1
|
||||||
max_model_len: 131072
|
max_model_len: 131072
|
||||||
max_num_seqs: 32
|
max_num_seqs: 32
|
||||||
reasoning_parser: ernie_x1
|
reasoning_parser: ernie-x1
|
||||||
tool_call_parser: ernie_x1
|
tool_call_parser: ernie-x1
|
||||||
load_choices: "default_v1"
|
load_choices: "default_v1"
|
||||||
quantization: wint8
|
quantization: wint8
|
||||||
|
|||||||
@@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--tensor-parallel-size 1 \
|
--tensor-parallel-size 1 \
|
||||||
--max-model-len 131072 \
|
--max-model-len 131072 \
|
||||||
--quantization wint8 \
|
--quantization wint8 \
|
||||||
--reasoning-parser ernie_x1 \
|
--reasoning-parser ernie-x1 \
|
||||||
--tool-call-parser ernie_x1 \
|
--tool-call-parser ernie-x1 \
|
||||||
--max-num-seqs 32
|
--max-num-seqs 32
|
||||||
```
|
```
|
||||||
- `--quantization`: Specifies the quantization strategy used by the model. Different quantization strategies result in different model performance and accuracy. It can be one of `wint8` / `wint4` / `block_wise_fp8` (requires the Hopper architecture).
|
- `--quantization`: Specifies the quantization strategy used by the model. Different quantization strategies result in different model performance and accuracy. It can be one of `wint8` / `wint4` / `block_wise_fp8` (requires the Hopper architecture).
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# Whether to use Machete for wint4 dense GEMM.
|
# Whether to use Machete for wint4 dense GEMM.
|
||||||
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),
|
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),
|
||||||
|
|
||||||
# The string inserted to truncate the thinking content during model reasoning. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
|
# The string inserted to truncate the thinking content during model reasoning. (</think> for ernie-45-vl, \n</think>\n\n for ernie-x1)
|
||||||
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
|
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
|
||||||
|
|
||||||
# Timeout for cache_transfer_manager process exit
|
# Timeout for cache_transfer_manager process exit
|
||||||
|
|||||||
@@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--tensor-parallel-size 1 \
|
--tensor-parallel-size 1 \
|
||||||
--max-model-len 131072 \
|
--max-model-len 131072 \
|
||||||
--quantization wint8 \
|
--quantization wint8 \
|
||||||
--reasoning-parser ernie_x1 \
|
--reasoning-parser ernie-x1 \
|
||||||
--tool-call-parser ernie_x1 \
|
--tool-call-parser ernie-x1 \
|
||||||
--max-num-seqs 32
|
--max-num-seqs 32
|
||||||
```
|
```
|
||||||
其中:
|
其中:
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# 是否使用 Machete 后端的 wint4 GEMM.
|
# 是否使用 Machete 后端的 wint4 GEMM.
|
||||||
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),
|
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),
|
||||||
|
|
||||||
# The string inserted to truncate the thinking content during model reasoning. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
|
# The string inserted to truncate the thinking content during model reasoning. (</think> for ernie-45-vl, \n</think>\n\n for ernie-x1)
|
||||||
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
|
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
|
||||||
|
|
||||||
# cache_transfer_manager 进程残留时退出等待超时时间
|
# cache_transfer_manager 进程残留时退出等待超时时间
|
||||||
|
|||||||
@@ -95,6 +95,7 @@ class ToolParserManager:
|
|||||||
|
|
||||||
Raise a KeyError exception if the name is not registered.
|
Raise a KeyError exception if the name is not registered.
|
||||||
"""
|
"""
|
||||||
|
name = name.replace("_", "-")
|
||||||
if name in cls.tool_parsers:
|
if name in cls.tool_parsers:
|
||||||
return cls.tool_parsers[name]
|
return cls.tool_parsers[name]
|
||||||
|
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import (
|
|||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|
||||||
@ToolParserManager.register_module("ernie_45-vl-thinking")
|
@ToolParserManager.register_module("ernie-45-vl-thinking")
|
||||||
class Ernie45VLThinkingToolParser(ToolParser):
|
class Ernie45VLThinkingToolParser(ToolParser):
|
||||||
"""
|
"""
|
||||||
Tool parser for Ernie model version 4.5.1.
|
Tool parser for Ernie model version 4.5.1.
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import (
|
|||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|
||||||
@ToolParserManager.register_module("ernie_x1")
|
@ToolParserManager.register_module("ernie-x1")
|
||||||
class ErnieX1ToolParser(ToolParser):
|
class ErnieX1ToolParser(ToolParser):
|
||||||
"""
|
"""
|
||||||
Tool parser for Ernie model version 4.5.1.
|
Tool parser for Ernie model version 4.5.1.
|
||||||
|
|||||||
@@ -122,7 +122,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"FD_ENABLE_SWAP_SPACE_CLEARING": lambda: int(os.getenv("FD_ENABLE_SWAP_SPACE_CLEARING", "0")),
|
"FD_ENABLE_SWAP_SPACE_CLEARING": lambda: int(os.getenv("FD_ENABLE_SWAP_SPACE_CLEARING", "0")),
|
||||||
# enable return text, used when FD_ENABLE_INTERNAL_ADAPTER=1
|
# enable return text, used when FD_ENABLE_INTERNAL_ADAPTER=1
|
||||||
"FD_ENABLE_RETURN_TEXT": lambda: bool(int(os.getenv("FD_ENABLE_RETURN_TEXT", "0"))),
|
"FD_ENABLE_RETURN_TEXT": lambda: bool(int(os.getenv("FD_ENABLE_RETURN_TEXT", "0"))),
|
||||||
# The string inserted to truncate the thinking content during model reasoning. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
|
# The string inserted to truncate the thinking content during model reasoning. (</think> for ernie-45-vl, \n</think>\n\n for ernie-x1)
|
||||||
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
|
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
|
||||||
# Timeout for cache_transfer_manager process exit
|
# Timeout for cache_transfer_manager process exit
|
||||||
"FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")),
|
"FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")),
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ def limit_thinking_content_length(
|
|||||||
line_break_id: int = None,
|
line_break_id: int = None,
|
||||||
):
|
):
|
||||||
if limit_strategy == "</think>":
|
if limit_strategy == "</think>":
|
||||||
# for ernie4_5_vl
|
# for ernie-45-vl
|
||||||
limit_thinking_content_length_v1(
|
limit_thinking_content_length_v1(
|
||||||
sampled_token_ids,
|
sampled_token_ids,
|
||||||
max_think_lens,
|
max_think_lens,
|
||||||
@@ -110,7 +110,7 @@ def limit_thinking_content_length(
|
|||||||
think_end_id,
|
think_end_id,
|
||||||
)
|
)
|
||||||
elif limit_strategy == "\n</think>\n\n":
|
elif limit_strategy == "\n</think>\n\n":
|
||||||
# for ernie_x1
|
# for ernie-x1
|
||||||
assert line_break_id > 0
|
assert line_break_id > 0
|
||||||
limit_thinking_content_length_v2(
|
limit_thinking_content_length_v2(
|
||||||
sampled_token_ids,
|
sampled_token_ids,
|
||||||
@@ -136,7 +136,7 @@ def speculate_limit_thinking_content_length(
|
|||||||
line_break_id: int = None,
|
line_break_id: int = None,
|
||||||
):
|
):
|
||||||
if limit_strategy == "</think>":
|
if limit_strategy == "</think>":
|
||||||
# for ernie4_5_vl
|
# for ernie-45-vl
|
||||||
speculate_limit_thinking_content_length_v1(
|
speculate_limit_thinking_content_length_v1(
|
||||||
accept_tokens,
|
accept_tokens,
|
||||||
max_think_lens,
|
max_think_lens,
|
||||||
@@ -147,7 +147,7 @@ def speculate_limit_thinking_content_length(
|
|||||||
think_end_id,
|
think_end_id,
|
||||||
)
|
)
|
||||||
elif limit_strategy == "\n</think>\n\n":
|
elif limit_strategy == "\n</think>\n\n":
|
||||||
# for ernie_x1
|
# for ernie-x1
|
||||||
assert line_break_id > 0
|
assert line_break_id > 0
|
||||||
speculate_limit_thinking_content_length_v2(
|
speculate_limit_thinking_content_length_v2(
|
||||||
accept_tokens,
|
accept_tokens,
|
||||||
|
|||||||
@@ -125,6 +125,7 @@ class ReasoningParserManager:
|
|||||||
|
|
||||||
Raise a KeyError exception if the name is not registered.
|
Raise a KeyError exception if the name is not registered.
|
||||||
"""
|
"""
|
||||||
|
name = name.replace("_", "-")
|
||||||
if name in cls.reasoning_parsers:
|
if name in cls.reasoning_parsers:
|
||||||
return cls.reasoning_parsers[name]
|
return cls.reasoning_parsers[name]
|
||||||
|
|
||||||
|
|||||||
@@ -5,10 +5,10 @@ from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest, DeltaM
|
|||||||
from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager
|
from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager
|
||||||
|
|
||||||
|
|
||||||
@ReasoningParserManager.register_module("ernie_x1")
|
@ReasoningParserManager.register_module("ernie-x1")
|
||||||
class ErnieX1ReasoningParser(ReasoningParser):
|
class ErnieX1ReasoningParser(ReasoningParser):
|
||||||
"""
|
"""
|
||||||
Reasoning parser for ernie_x1 model with stricter boundary checking.
|
Reasoning parser for ernie-x1 model with stricter boundary checking.
|
||||||
|
|
||||||
Unified rules:
|
Unified rules:
|
||||||
- Do not strip newline before </think>
|
- Do not strip newline before </think>
|
||||||
|
|||||||
@@ -203,7 +203,7 @@ def xpu_post_process(
|
|||||||
step_idx = share_inputs["step_idx"]
|
step_idx = share_inputs["step_idx"]
|
||||||
limit_think_status = share_inputs["limit_think_status"]
|
limit_think_status = share_inputs["limit_think_status"]
|
||||||
if limit_strategy == "</think>":
|
if limit_strategy == "</think>":
|
||||||
# for ernie4_5_vl
|
# for ernie-45-vl
|
||||||
limit_thinking_content_length_v1(
|
limit_thinking_content_length_v1(
|
||||||
sampled_token_ids,
|
sampled_token_ids,
|
||||||
max_think_lens,
|
max_think_lens,
|
||||||
@@ -212,7 +212,7 @@ def xpu_post_process(
|
|||||||
think_end_id,
|
think_end_id,
|
||||||
)
|
)
|
||||||
elif limit_strategy == "\n</think>\n\n":
|
elif limit_strategy == "\n</think>\n\n":
|
||||||
# for ernie_x1
|
# for ernie-x1
|
||||||
assert line_break_id > 0
|
assert line_break_id > 0
|
||||||
limit_thinking_content_length_v2(
|
limit_thinking_content_length_v2(
|
||||||
sampled_token_ids,
|
sampled_token_ids,
|
||||||
|
|||||||
@@ -73,9 +73,9 @@ class TestOpenAIServingCompletion(unittest.TestCase):
|
|||||||
self.assertTrue(serving_completion._check_master())
|
self.assertTrue(serving_completion._check_master())
|
||||||
|
|
||||||
def test_calc_finish_reason_tool_calls(self):
|
def test_calc_finish_reason_tool_calls(self):
|
||||||
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1"
|
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1"
|
||||||
engine_client = Mock()
|
engine_client = Mock()
|
||||||
engine_client.reasoning_parser = "ernie_x1"
|
engine_client.reasoning_parser = "ernie-x1"
|
||||||
# 创建一个OpenAIServingCompletion实例
|
# 创建一个OpenAIServingCompletion实例
|
||||||
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
|
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
|
||||||
# 创建一个模拟的output,并设置finish_reason为"tool_call"
|
# 创建一个模拟的output,并设置finish_reason为"tool_call"
|
||||||
@@ -86,9 +86,9 @@ class TestOpenAIServingCompletion(unittest.TestCase):
|
|||||||
assert result == "tool_calls"
|
assert result == "tool_calls"
|
||||||
|
|
||||||
def test_calc_finish_reason_stop(self):
|
def test_calc_finish_reason_stop(self):
|
||||||
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1"
|
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1"
|
||||||
engine_client = Mock()
|
engine_client = Mock()
|
||||||
engine_client.reasoning_parser = "ernie_x1"
|
engine_client.reasoning_parser = "ernie-x1"
|
||||||
# 创建一个OpenAIServingCompletion实例
|
# 创建一个OpenAIServingCompletion实例
|
||||||
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
|
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
|
||||||
# 创建一个模拟的output,并设置finish_reason为其他值
|
# 创建一个模拟的output,并设置finish_reason为其他值
|
||||||
|
|||||||
@@ -91,7 +91,7 @@ class TestReasoningParserManager(unittest.TestCase):
|
|||||||
Test that a parser can be registered and retrieved successfully.
|
Test that a parser can be registered and retrieved successfully.
|
||||||
Verifies normal registration and retrieval functionality.
|
Verifies normal registration and retrieval functionality.
|
||||||
"""
|
"""
|
||||||
ReasoningParserManager.register_module(module=TestReasoningParser, name="test_parser", force=True)
|
ReasoningParserManager.register_module(module=TestReasoningParser, name="test-parser", force=True)
|
||||||
parser_cls = ReasoningParserManager.get_reasoning_parser("test_parser")
|
parser_cls = ReasoningParserManager.get_reasoning_parser("test_parser")
|
||||||
self.assertIs(parser_cls, TestReasoningParser)
|
self.assertIs(parser_cls, TestReasoningParser)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user