add Tool Parser (#3272)

* add tool-parser

* add tool-parser

* add tool parser

* add tool parser

* fix

* add offline

* add offline

* fix

* parsers:tool&reasoning

* rename tool parser

* update

* fix reasoning-parser

* add requirements

* fix finish reason

* fix

* fix reasoning-parser

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: zhuzixuan <zhuzixuan@baidu.com>
Authored by luukunn on 2025-08-13 01:06:55 +08:00; committed by GitHub
parent 2d1a4cacdf
commit eda83ca672
23 changed files with 1056 additions and 38 deletions

View File

@@ -15,10 +15,10 @@
 """
 import json
+import os
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
-import os
 
 from fastdeploy.config import (
     CacheConfig,
@@ -94,6 +94,14 @@ class EngineArgs:
     """
     specifies the reasoning parser to use for extracting reasoning content from the model output
     """
+    tool_call_parser: str = None
+    """
+    specifies the tool call parser to use for extracting tool call from the model output
+    """
+    tool_parser_plugin: str = None
+    """
+    tool parser plugin used to register user defined tool parsers
+    """
     enable_mm: bool = False
     """
     Flags to enable multi-modal model
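
For orientation, a minimal sketch of how the two new fields sit next to the existing reasoning_parser. Only the field names come from the diff; the import path, the model path, and the parser/plugin values are assumptions, not values confirmed by this commit.

    from fastdeploy.engine.args_utils import EngineArgs  # assumed module path

    args = EngineArgs(
        model="/path/to/model",              # placeholder
        reasoning_parser="ernie-45",         # hypothetical parser name
        tool_call_parser="ernie-45",         # hypothetical parser name
        tool_parser_plugin="my_parsers.py",  # hypothetical plugin file
    )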
@@ -434,6 +442,18 @@ class EngineArgs:
             help="Flag specifies the reasoning parser to use for extracting "
             "reasoning content from the model output",
         )
+        model_group.add_argument(
+            "--tool-call-parser",
+            type=str,
+            default=EngineArgs.tool_call_parser,
+            help="Flag specifies the tool call parser to use for extracting " "tool call from the model output",
+        )
+        model_group.add_argument(
+            "--tool-parser-plugin",
+            type=str,
+            default=EngineArgs.tool_parser_plugin,
+            help="tool parser plugin used to register user defined tool parsers",
+        )
         model_group.add_argument(
             "--speculative-config",
             type=json.loads,
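
Since the hunk above only shows the flag wiring, here is a small self-contained sketch of how argparse maps these hyphenated flags onto the underscored attribute names used elsewhere in the diff; the "ernie-45" value is a placeholder, not a parser name confirmed by this commit.

    import argparse

    # Standalone mirror of the two flags added above.
    parser = argparse.ArgumentParser()
    model_group = parser.add_argument_group("Model Configuration")
    model_group.add_argument("--tool-call-parser", type=str, default=None)
    model_group.add_argument("--tool-parser-plugin", type=str, default=None)

    ns = parser.parse_args(["--tool-call-parser", "ernie-45"])
    print(ns.tool_call_parser)    # "ernie-45" -- hyphens become underscores
    print(ns.tool_parser_plugin)  # None -- falls back to the default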
@@ -885,10 +905,10 @@ class EngineArgs:
             if self.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
                     self.max_num_batched_tokens = self.max_model_len
                 else:
-                    self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
 
         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
@@ -927,6 +947,7 @@
             mm_processor_kwargs=self.mm_processor_kwargs,
             # enable_mm=self.enable_mm,
             reasoning_parser=self.reasoning_parser,
+            tool_parser=self.tool_call_parser,
             splitwise_role=self.splitwise_role,
             innode_prefill_ports=self.innode_prefill_ports,
             max_num_partial_prefills=self.max_num_partial_prefills,

View File

@@ -86,6 +86,7 @@ class Config:
         max_long_partial_prefills: int = 1,
         long_prefill_token_threshold: int = 0,
         reasoning_parser: str = None,
+        tool_parser: str = None,
         guided_decoding_backend: Optional[str] = None,
         disable_any_whitespace: bool = False,
         enable_logprob: bool = False,
@@ -166,6 +167,7 @@ class Config:
         self.max_long_partial_prefills = max_long_partial_prefills
         self.long_prefill_token_threshold = long_prefill_token_threshold
         self.reasoning_parser = reasoning_parser
+        self.tool_parser = tool_parser
         self.graph_optimization_config = graph_optimization_config
         self.early_stop_config = early_stop_config
         self.guided_decoding_backend = guided_decoding_backend
@@ -245,10 +247,10 @@ class Config:
         if self.cache_config.enable_chunked_prefill:
             self.max_num_batched_tokens = 2048
         else:
-            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+            if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
                 self.max_num_batched_tokens = self.max_model_len
             else:
-                self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
 
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -296,7 +298,7 @@
         )
         if not self.cache_config.enable_chunked_prefill:
-            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+            if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
                 assert self.max_num_batched_tokens >= self.max_model_len, (
                     f"max_num_batched_tokens: {self.max_num_batched_tokens} "
                     f"should be larger than or equal to max_model_len: {self.max_model_len}"

View File

@@ -106,6 +106,7 @@ class LLMEngine:
             cfg.limit_mm_per_prompt,
             cfg.mm_processor_kwargs,
             cfg.enable_mm,
+            cfg.tool_parser,
         )
 
         self.start_queue_service()

View File

@@ -24,6 +24,7 @@ from typing import Any, Dict, Optional, Union
 import numpy as np
 
 from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.entrypoints.openai.protocol import ToolCall
 from fastdeploy.utils import data_processor_logger
 from fastdeploy.worker.output import LogprobsLists, SampleLogprobs

@@ -249,6 +250,7 @@ class CompletionOutput:
     draft_token_ids: list[int] = None
     text: Optional[str] = None
     reasoning_content: Optional[str] = None
+    tool_calls: Optional[ToolCall] = None
 
     def to_dict(self):
         """