add tool parser

2025-09-26 20:41:53 +08:00 · 2025-08-13 01:06:55 +08:00
parent 132a8ef425
commit bbd50c6717
23 changed files with 1050 additions and 32 deletions
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -95,6 +95,14 @@ class EngineArgs:
    """
    specifies the reasoning parser to use for extracting reasoning content from the model output
    """
+    tool_call_parser: str = None
+    """
+    specifies the tool call parser  to use for extracting tool call from the model output
+    """
+    tool_parser_plugin: str = None
+    """
+    tool parser plugin used to register user defined tool parsers
+    """
    enable_mm: bool = False
    """
    Flags to enable multi-modal model
@@ -423,6 +431,18 @@ class EngineArgs:
            help="Flag specifies the reasoning parser to use for extracting "
            "reasoning content from the model output",
        )
+        model_group.add_argument(
+            "--tool-call-parser",
+            type=str,
+            default=EngineArgs.tool_call_parser,
+            help="Flag specifies the tool call parser to use for extracting" "tool call from the model output",
+        )
+        model_group.add_argument(
+            "--tool-parser-plugin",
+            type=str,
+            default=EngineArgs.tool_parser_plugin,
+            help="tool parser plugin used to register user defined tool parsers",
+        )
        model_group.add_argument(
            "--speculative-config",
            type=json.loads,
@@ -913,6 +933,7 @@ class EngineArgs:
            mm_processor_kwargs=self.mm_processor_kwargs,
            enable_mm=self.enable_mm,
            reasoning_parser=self.reasoning_parser,
+            tool_parser=self.tool_call_parser,
            splitwise_role=self.splitwise_role,
            innode_prefill_ports=self.innode_prefill_ports,
            max_num_partial_prefills=self.max_num_partial_prefills,
--- a/fastdeploy/engine/config.py
+++ b/fastdeploy/engine/config.py
@@ -85,6 +85,7 @@ class Config:
        max_long_partial_prefills: int = 1,
        long_prefill_token_threshold: int = 0,
        reasoning_parser: str = None,
+        tool_parser: str = None,
        guided_decoding_backend: Optional[str] = None,
        disable_any_whitespace: bool = False,
        enable_logprob: bool = False,
@@ -165,6 +166,7 @@ class Config:
        self.max_long_partial_prefills = max_long_partial_prefills
        self.long_prefill_token_threshold = long_prefill_token_threshold
        self.reasoning_parser = reasoning_parser
+        self.tool_parser = tool_parser
        self.graph_optimization_config = graph_optimization_config
        self.early_stop_config = early_stop_config
        self.guided_decoding_backend = guided_decoding_backend
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -106,6 +106,7 @@ class LLMEngine:
            cfg.limit_mm_per_prompt,
            cfg.mm_processor_kwargs,
            cfg.enable_mm,
+            cfg.tool_parser,
        )

        self.start_queue_service()
--- a/fastdeploy/engine/request.py
+++ b/fastdeploy/engine/request.py
@@ -24,6 +24,7 @@ from typing import Any, Dict, Optional, Union
 import numpy as np

 from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.entrypoints.openai.protocol import ToolCall
 from fastdeploy.utils import data_processor_logger
 from fastdeploy.worker.output import LogprobsLists, SampleLogprobs

@@ -249,6 +250,7 @@ class CompletionOutput:
    draft_token_ids: list[int] = None
    text: Optional[str] = None
    reasoning_content: Optional[str] = None
+    tool_calls: Optional[ToolCall] = None

    def to_dict(self):
        """
--- a/fastdeploy/entrypoints/chat_utils.py
+++ b/fastdeploy/entrypoints/chat_utils.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """

+import uuid
 from copy import deepcopy
 from typing import List, Literal, Union
 from urllib.parse import urlparse
@@ -156,3 +157,7 @@ def parse_chat_messages(messages):

        conversation.append({"role": role, "content": parsed_content})
    return conversation
+
+
+def random_tool_call_id() -> str:
+    """Return a new tool-call ID: the "chatcmpl-tool-" prefix followed by a random UUID4 in hex."""
+    unique_suffix = uuid.uuid4().hex
+    return f"chatcmpl-tool-{unique_suffix}"
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -45,6 +45,7 @@ class EngineClient:
        data_parallel_size=1,
        enable_logprob=False,
        workers=1,
+        tool_parser=None,
    ):
        input_processor = InputPreprocessor(
            tokenizer,
@@ -52,6 +53,7 @@ class EngineClient:
            limit_mm_per_prompt,
            mm_processor_kwargs,
            enable_mm,
+            tool_parser,
        )
        self.enable_logprob = enable_logprob
        self.enable_mm = enable_mm
--- a/fastdeploy/entrypoints/llm.py
+++ b/fastdeploy/entrypoints/llm.py
@@ -28,6 +28,7 @@ from tqdm import tqdm
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
 from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager

 # from fastdeploy.entrypoints.chat_utils import ChatCompletionMessageParam
 from fastdeploy.utils import llm_logger, retrive_model_from_server
@@ -73,6 +74,9 @@ class LLM:
        **kwargs,
    ):
        model = retrive_model_from_server(model, revision)
+        tool_parser_plugin = kwargs.get("tool_parser_plugin")
+        if tool_parser_plugin:
+            ToolParserManager.import_tool_parser(tool_parser_plugin)
        engine_args = EngineArgs(
            model=model,
            tokenizer=tokenizer,
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -41,6 +41,7 @@ from fastdeploy.entrypoints.openai.protocol import (
 )
 from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat
 from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion
+from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
 from fastdeploy.metrics.metrics import (
    EXCLUDE_LABELS,
    cleanup_prometheus_files,
@@ -73,7 +74,8 @@ parser.add_argument("--max-concurrency", default=512, type=int, help="max concur
 parser = EngineArgs.add_cli_args(parser)
 args = parser.parse_args()
 args.model = retrive_model_from_server(args.model, args.revision)
-
+if args.tool_parser_plugin:
+    ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 llm_engine = None


@@ -126,6 +128,7 @@ async def lifespan(app: FastAPI):
        args.data_parallel_size,
        args.enable_logprob,
        args.workers,
+        args.tool_call_parser,
    )
    app.state.dynamic_load_weight = args.dynamic_load_weight
    chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time)
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -72,7 +72,6 @@ class ToolCall(BaseModel):
    id: str = None
    type: Literal["function"] = "function"
    function: FunctionCall
-    index: int


 class DeltaFunctionCall(BaseModel):
@@ -96,6 +95,18 @@ class DeltaToolCall(BaseModel):
    function: Optional[DeltaFunctionCall] = None


+class ExtractedToolCallInformation(BaseModel):
+    # Result model that a tool parser's extract_tool_calls() is annotated to return.
+
+    # indicates whether tools were called
+    tools_called: bool
+
+    # extracted tool calls; None unless a parser supplies them
+    tool_calls: Optional[list[ToolCall]] = None
+
+    # content - per the OpenAI spec, content AND tool calls are rarely returned together,
+    # but some models will do this intentionally
+    content: Optional[str] = None
+
+
 class FunctionDefinition(BaseModel):
    """
    Function definition.
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -141,6 +141,7 @@ class OpenAIServingChat:
        previous_num_tokens = 0
        num_prompt_tokens = 0
        num_choices = 1
+        tool_called = False
        max_streaming_response_tokens = (
            request.max_streaming_response_tokens
            if request.max_streaming_response_tokens is not None
@@ -245,20 +246,28 @@ class OpenAIServingChat:
                    output = res["outputs"]
                    delta_text = output["text"]
                    output_top_logprobs = output["top_logprobs"]
+                    previous_num_tokens += len(output["token_ids"])
                    logprobs_res: Optional[LogProbs] = None
                    if request.logprobs and output_top_logprobs is not None:
                        logprobs_res = self._create_chat_logprobs(
                            output_top_logprobs, request.logprobs, request.top_logprobs
                        )
-
-                    previous_num_tokens += len(output["token_ids"])
-                    delta_message = DeltaMessage(
-                        content=delta_text,
-                        reasoning_content=output.get("reasoning_content"),
-                        prompt_token_ids=None,
-                        completion_token_ids=None,
-                        tool_calls=output.get("tool_call_content", []),
-                    )
+                    if self.engine_client.data_processor.tool_parser_obj and not res["finished"]:
+                        tool_delta_message = output["tool_delta_message"]
+                        if tool_delta_message is None:
+                            continue
+                        delta_message = tool_delta_message
+                        delta_message.reasoning_content = output.get("reasoning_content")
+                        if delta_message.tool_calls:
+                            tool_called = True
+                    else:
+                        delta_message = DeltaMessage(
+                            content=delta_text,
+                            reasoning_content=output.get("reasoning_content"),
+                            prompt_token_ids=None,
+                            completion_token_ids=None,
+                            tool_calls=None,
+                        )

                    choice = ChatCompletionResponseStreamChoice(
                        index=0,
@@ -276,10 +285,7 @@ class OpenAIServingChat:
                        max_tokens = request.max_completion_tokens or request.max_tokens
                        if has_no_token_limit or previous_num_tokens != max_tokens:
                            choice.finish_reason = "stop"
-                            if (
-                                self.engine_client.reasoning_parser == "ernie_x1"
-                                and output.get("finish_reason", "") == "tool_calls"
-                            ):
+                            if tool_called:
                                choice.finish_reason = "tool_calls"
                        else:
                            choice.finish_reason = "length"
@@ -419,7 +425,7 @@ class OpenAIServingChat:
            role="assistant",
            content=output["text"],
            reasoning_content=output.get("reasoning_content"),
-            tool_calls=output.get("tool_call_content"),
+            tool_calls=output.get("tool_call"),
            prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
            completion_token_ids=completion_token_ids if request.return_token_ids else None,
            text_after_process=text_after_process if request.return_token_ids else None,
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -240,9 +240,9 @@ class OpenAIServingCompletion:
                dealer.close()
                self.engine_client.semaphore.release()

-    def calc_finish_reason(self, max_tokens, token_num, output):
+    def calc_finish_reason(self, max_tokens, token_num, output, tool_called):
        if max_tokens is None or token_num != max_tokens:
-            if self.engine_client.reasoning_parser == "ernie_x1" and output.get("finish_reason", "") == "tool_calls":
+            if tool_called or output.get("tool_call"):
                return "tool_calls"
            else:
                return "stop"
@@ -271,6 +271,7 @@ class OpenAIServingCompletion:
            output_tokens = [0] * num_choices
            inference_start_time = [0] * num_choices
            first_iteration = [True] * num_choices
+            tool_called = False
            max_streaming_response_tokens = (
                request.max_streaming_response_tokens
                if request.max_streaming_response_tokens is not None
@@ -342,24 +343,41 @@ class OpenAIServingCompletion:
                    if request.logprobs and output_top_logprobs is not None:
                        logprobs_res = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0)

-                    choices.append(
-                        CompletionResponseStreamChoice(
+                    output_tokens[idx] += 1
+                    if self.engine_client.data_processor.tool_parser_obj and not res["finished"]:
+                        tool_delta_message = output["tool_delta_message"]
+                        if tool_delta_message is None:
+                            continue
+                        delta_message = CompletionResponseStreamChoice(
                            index=idx,
                            text=output["text"],
-                            prompt_token_ids=None,
                            completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
-                            raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
-                            tool_calls=output.get("tool_call_content"),
+                            tool_calls=tool_delta_message.tool_calls,
                            reasoning_content=output.get("reasoning_content"),
                            arrival_time=arrival_time,
                            logprobs=logprobs_res,
                        )
-                    )
+                        if tool_delta_message.tool_calls:
+                            tool_called = True
+                    else:
+                        delta_message = CompletionResponseStreamChoice(
+                            index=idx,
+                            text=output["text"],
+                            prompt_token_ids=None,
+                            completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
+                            tool_calls=None,
+                            raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
+                            reasoning_content=output.get("reasoning_content"),
+                            arrival_time=arrival_time,
+                            logprobs=logprobs_res,
+                        )
+
+                    choices.append(delta_message)
                    output_tokens[idx] += 1

                    if res["finished"]:
                        choices[-1].finish_reason = self.calc_finish_reason(
-                            request.max_tokens, output_tokens[idx], output
+                            request.max_tokens, output_tokens[idx], output, tool_called
                        )
                    send_idx = output.get("send_idx")
                    # 只有当 send_idx 明确为 0 时才记录日志
@@ -458,7 +476,7 @@ class OpenAIServingCompletion:
                token_ids = output["token_ids"]
                output_text = output["text"]

-            finish_reason = self.calc_finish_reason(request.max_tokens, final_res["output_token_ids"], output)
+            finish_reason = self.calc_finish_reason(request.max_tokens, final_res["output_token_ids"], output, False)

            choice_data = CompletionResponseChoice(
                token_ids=token_ids,
@@ -469,7 +487,7 @@ class OpenAIServingCompletion:
                raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
                text_after_process=text_after_process_list[idx] if request.return_token_ids else None,
                reasoning_content=output.get("reasoning_content"),
-                tool_calls=output.get("tool_call_content"),
+                tool_calls=output.get("tool_call"),
                logprobs=aggregated_logprobs,
                finish_reason=finish_reason,
            )
--- a/fastdeploy/entrypoints/openai/tool_parsers/init.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/init.py
@@ -0,0 +1,24 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from .abstract_tool_parser import ToolParser, ToolParserManager
+from .ernie_x1_tool_parser import ErnieX1ToolParser
+
+__all__ = [
+    "ToolParser",
+    "ToolParserManager",
+    "ErnieX1ToolParser",
+]
--- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -0,0 +1,159 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import os
+from collections.abc import Sequence
+from functools import cached_property
+from typing import Callable, Optional, Union
+
+from fastdeploy.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaMessage,
+    ExtractedToolCallInformation,
+)
+from fastdeploy.utils import data_processor_logger, import_from_path, is_list_of
+
+
+class ToolParser:
+    """
+    Base ToolParser class that should not be used directly. The properties
+    and methods provided here are meant to be used (and the extract_* methods
+    overridden) in derived classes.
+    """
+
+    def __init__(self, tokenizer):
+        """Store the tokenizer and reset the state kept about previously parsed tool calls."""
+        self.prev_tool_call_arr: list[dict] = []
+        # the index of the tool call that is currently being parsed
+        self.current_tool_id: int = -1
+        self.current_tool_name_sent: bool = False
+        self.streamed_args_for_tool: list[str] = []
+
+        self.model_tokenizer = tokenizer
+
+    @cached_property
+    def vocab(self) -> dict[str, int]:
+        """Return the tokenizer vocabulary from get_vocab(); computed once and cached on the instance."""
+        # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
+        # whereas all tokenizers have .get_vocab()
+        return self.model_tokenizer.get_vocab()
+
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        """
+        Hook for adjusting the request parameters.
+        The base implementation returns the request unchanged.
+        """
+        return request
+
+    def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract tool calls from a complete model-generated string.
+        Used for non-streaming responses where we have the entire model response
+        available before sending to the client.
+        Must be implemented by derived classes; the base implementation
+        raises NotImplementedError.
+        """
+        raise NotImplementedError("AbstractToolParser.extract_tool_calls has not been implemented!")
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        """
+        Extract tool calls from an incomplete response; for use when handling
+        tool calls and streaming. It requires state - the current tokens/diffs,
+        but also the information about what has previously been parsed and
+        extracted (see constructor).
+        Must be implemented by derived classes; the base implementation
+        raises NotImplementedError.
+        """
+        raise NotImplementedError("AbstractToolParser.extract_tool_calls_streaming has not been " "implemented!")
+
+
+class ToolParserManager:
+    """
+    Class-level registry mapping parser names to ToolParser subclasses.
+    """
+
+    tool_parsers: dict[str, type] = {}
+
+    @classmethod
+    def get_tool_parser(cls, name) -> type:
+        """
+        Get tool parser by name which is registered by `register_module`.
+
+        Raise a KeyError exception if the name is not registered.
+        """
+        if name in cls.tool_parsers:
+            return cls.tool_parsers[name]
+
+        raise KeyError(f"tool helper: '{name}' not found in tool_parsers")
+
+    @classmethod
+    def _register_module(
+        cls, module: type, module_name: Optional[Union[str, list[str]]] = None, force: bool = True
+    ) -> None:
+        """
+        Store `module` in the registry under one or more names.
+
+        Args:
+            module: the ToolParser subclass to register.
+            module_name: a name or list of names; defaults to the class name.
+            force: when False, refuse to overwrite an existing registration.
+
+        Raises:
+            TypeError: if `module` is not a class derived from ToolParser.
+            KeyError: if `force` is False and a name is already registered.
+        """
+        # issubclass() raises its own opaque TypeError for non-class arguments, so
+        # check for a class first and report the offending object itself.
+        if not (isinstance(module, type) and issubclass(module, ToolParser)):
+            raise TypeError(f"module must be subclass of ToolParser, but got {module!r}")
+        if module_name is None:
+            module_name = module.__name__
+        if isinstance(module_name, str):
+            module_name = [module_name]
+        for name in module_name:
+            if not force and name in cls.tool_parsers:
+                existed_module = cls.tool_parsers[name]
+                raise KeyError(f"{name} is already registered " f"at {existed_module.__module__}")
+            cls.tool_parsers[name] = module
+
+    @classmethod
+    def register_module(
+        cls, name: Optional[Union[str, list[str]]] = None, force: bool = True, module: Union[type, None] = None
+    ) -> Union[type, Callable]:
+        """
+        Register module with the given name or name list. It can be used as a
+        decorator (with module as None) or as a normal function (with module
+        not None).
+
+        Returns:
+            The registered class when `module` is given, otherwise a decorator
+            that registers the decorated class and returns it.
+
+        Raises:
+            TypeError: if `force` is not a bool or `name` has an unsupported type.
+        """
+        if not isinstance(force, bool):
+            raise TypeError(f"force must be a boolean, but got {type(force)}")
+
+        # raise the error ahead of time
+        if not (name is None or isinstance(name, str) or is_list_of(name, str)):
+            raise TypeError("name must be None, an instance of str, or a sequence of str, " f"but got {type(name)}")
+
+        # use it as a normal method: x.register_module(module=SomeClass)
+        if module is not None:
+            cls._register_module(module=module, module_name=name, force=force)
+            return module
+
+        # use it as a decorator: @x.register_module()
+        def _register(module):
+            cls._register_module(module=module, module_name=name, force=force)
+            return module
+
+        return _register
+
+    @classmethod
+    def import_tool_parser(cls, plugin_path: str) -> None:
+        """
+        Import a user-defined tool parser from the file that defines it.
+
+        The module name is the file's base name without its extension.
+        Import failures are logged with their traceback and are not re-raised.
+        """
+        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
+
+        try:
+            import_from_path(module_name, plugin_path)
+        except Exception:
+            data_processor_logger.exception("Failed to load module '%s' from %s.", module_name, plugin_path)
--- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py
@@ -0,0 +1,320 @@
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import re
+import uuid
+from collections.abc import Sequence
+from typing import Union
+
+import partial_json_parser
+
+
+def random_tool_call_id() -> str:
+    """Build a random tool call ID of the form "chatcmpl-tool-<uuid4 hex>"."""
+    return "chatcmpl-tool-" + uuid.uuid4().hex
+
+
+from fastdeploy.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser,
+    ToolParserManager,
+)
+from fastdeploy.utils import data_processor_logger
+
+
+@ToolParserManager.register_module("ernie_x1")
+class ErnieX1ToolParser(ToolParser):
+    """
+    Tool parser for Ernie model version 4.5.1.
+    This parser handles tool calls with newline formats.
+    """
+
+    def __init__(self, tokenizer):
+        """
+        Initialise the parser state on top of the base constructor.
+
+        Raises:
+            ValueError: if the tokenizer stored as `model_tokenizer` is falsy.
+        """
+        super().__init__(tokenizer)
+
+        # buffer for accumulating unprocessed streaming content
+        self.buffer: str = ""
+        # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[str] = []
+        self.current_tool_name_sent: bool = False
+        self.current_tool_id: int = -1
+        self.prev_tool_call_arr: list[dict] = []
+
+        if self.model_tokenizer:
+            return
+        raise ValueError(
+            "The model tokenizer must be passed to the ToolCallParser constructor during construction."
+        )
+
+    def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response.
+        Supports XML-style formats with newlines:
+        - XML format: <think>\n...\n</think>\n\n\n<tool_call>\n{...}\n</tool_call>\n...
+
+        Handles boundary cases:
+        1. Only name and partial arguments: {"name": "get_weather", "arguments": {"location": "Beijing"
+        2. Only partial name: {"name": "get_we
+        3. Only name and arguments field without content: {"name": "get_weather", "argume
+
+        Args:
+            model_output: the full text generated by the model.
+            request: the originating chat request; not consulted by this parser.
+
+        Returns:
+            ExtractedToolCallInformation. `tools_called` is True only when every
+            <tool_call> block parsed as complete JSON carrying both "name" and
+            "arguments". When at least one block was recovered, `tool_calls`
+            holds them (complete or partial) and `content` is empty; otherwise
+            `content` is the unmodified model output. Exceptions are logged
+            rather than propagated.
+        """
+
+        try:
+            # Imported once per call instead of once per <tool_call> block.
+            from partial_json_parser.core.options import Allow
+
+            # Check for invalid <response> tags before tool calls
+            if re.search(r"<response>[\s\S]*?</response>\s*(?=<tool_call>)", model_output):
+                data_processor_logger.error("Invalid format: <response> tags found before <tool_call>")
+                return ExtractedToolCallInformation(tools_called=False, content=model_output)
+
+            function_call_arr = []
+            remaining_text = model_output
+
+            while True:
+                # Find the next <tool_call> block
+                tool_call_pos = remaining_text.find("<tool_call>")
+                if tool_call_pos == -1:
+                    break
+
+                # Locate the content that follows the opening tag
+                tool_content_start = tool_call_pos + len("<tool_call>")
+                tool_content_end = remaining_text.find("</tool_call>", tool_content_start)
+
+                if tool_content_end == -1:
+                    # Unclosed block (truncated output): take everything that is left
+                    tool_json = remaining_text[tool_content_start:].strip()
+                    remaining_text = ""  # nothing more to process
+                else:
+                    # Complete block: take its content and advance past the closing tag
+                    tool_json = remaining_text[tool_content_start:tool_content_end].strip()
+                    remaining_text = remaining_text[tool_content_end + len("</tool_call>") :]
+
+                if not tool_json:
+                    continue
+
+                # Make sure the payload is wrapped in braces before parsing
+                if not tool_json.startswith("{"):
+                    tool_json = "{" + tool_json
+                if not tool_json.endswith("}"):
+                    tool_json = tool_json + "}"
+
+                try:
+                    # Try standard JSON parsing first
+                    try:
+                        tool_data = json.loads(tool_json)
+                    except json.JSONDecodeError:
+                        tool_data = None
+
+                    if isinstance(tool_data, dict) and "name" in tool_data and "arguments" in tool_data:
+                        function_call_arr.append(
+                            {
+                                "name": tool_data["name"],
+                                "arguments": tool_data["arguments"],
+                                "_is_complete": True,  # explicitly marked as fully parsed
+                            }
+                        )
+                        continue
+
+                    # Standard parsing did not yield a usable call: recover what we can
+                    partial_data = {}
+                    flags = Allow.ALL & ~Allow.STR
+
+                    # Recover the name field
+                    name_match = re.search(r'"name"\s*:\s*"([^"]*)"', tool_json)
+                    if name_match:
+                        partial_data["name"] = name_match.group(1)
+
+                    # Recover the arguments field
+                    args_match = re.search(r'"arguments"\s*:\s*(\{.*)', tool_json)
+                    if args_match:
+                        try:
+                            partial_data["arguments"] = partial_json_parser.loads(args_match.group(1), flags=flags)
+                        except Exception:
+                            # Unparseable arguments are recorded as missing, not fatal
+                            partial_data["arguments"] = None
+
+                    function_call_arr.append(
+                        {
+                            "name": partial_data.get("name", ""),
+                            "arguments": partial_data.get("arguments", {}),
+                            "_is_partial": True,  # marked as partially parsed
+                        }
+                    )
+                except Exception as e:
+                    data_processor_logger.debug(f"Failed to parse tool call: {str(e)}")
+                    continue
+
+            if not function_call_arr:
+                data_processor_logger.error("No valid tool calls found")
+                return ExtractedToolCallInformation(tools_called=False, content=model_output)
+
+            tool_calls = []
+            all_complete = True  # flips to False as soon as one call is incomplete
+
+            for tool_call in function_call_arr:
+                # Read the parsing status recorded for this call
+                is_complete = tool_call.get("_is_complete", False)
+                is_partial = tool_call.get("_is_partial", False)
+
+                # A single incomplete call makes the whole result incomplete
+                if not is_complete or is_partial:
+                    all_complete = False
+
+                # Serialise the arguments; anything that is not a dict becomes {}
+                tool_args = tool_call.get("arguments", {})
+                if not isinstance(tool_args, dict):
+                    tool_args = {}
+
+                try:
+                    args_str = json.dumps(tool_args, ensure_ascii=False) if tool_args else "{}"
+                except Exception:
+                    args_str = "{}"
+
+                tool_calls.append(
+                    ToolCall(
+                        type="function",
+                        id=random_tool_call_id(),
+                        function=FunctionCall(
+                            name=tool_call.get("name", ""),
+                            arguments=args_str,
+                        ),
+                    )
+                )
+
+            # tools_called is True only when every tool call was marked complete
+            return ExtractedToolCallInformation(
+                tools_called=all_complete, tool_calls=tool_calls if tool_calls else None, content=""
+            )
+
+        except Exception as e:
+            data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}")
+            return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: dict,
+    ) -> Union[DeltaMessage, None]:
+        """Incrementally extract tool-call deltas from one streamed chunk.
+
+        Of the arguments, only ``previous_text`` and ``delta_text`` are read; the others are unused here.
+        Returns a DeltaMessage carrying a tool name or newly parsed arguments, otherwise None.
+        """
+        # Whitespace-only chunks are ignored and never reach the buffer.
+        if len(delta_text.strip()) == 0:
+            return None
+
+        try:
+            delta = None
+            # Accumulate the streamed text; consumed parts are trimmed from the buffer below.
+            self.buffer += delta_text
+
+            # First <tool_call> tag of the response: open a new tool-call slot.
+            if "<tool_call>" in delta_text and "<tool_call>" not in previous_text:
+                self.current_tool_id += 1
+                self.current_tool_name_sent = False
+                if len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+                data_processor_logger.debug(f"New tool call started with ID: {self.current_tool_id}")
+
+            # 1. Emit the function name once it is fully present in the buffer.
+            if not self.current_tool_name_sent and '"name"' in self.buffer:
+                name_match = re.search(r'"name"\s*:\s*"([^"]*)"', self.buffer)
+                if name_match:
+                    name = name_match.group(1)
+                    if name:
+                        delta = DeltaMessage(
+                            tool_calls=[
+                                DeltaToolCall(
+                                    index=self.current_tool_id,
+                                    type="function",
+                                    id=random_tool_call_id(),
+                                    function=DeltaFunctionCall(name=name).model_dump(exclude_none=True),
+                                )
+                            ]
+                        )
+                        data_processor_logger.debug(f"delta name: {delta}")
+                        # Drop the consumed name field from the buffer.
+                        self.buffer = self.buffer[name_match.end() :]
+                        self.current_tool_name_sent = True
+                        return delta
+            # 2. Emit arguments when the buffered text parses as a JSON object.
+            if '"arguments"' in self.buffer:
+                args_match = re.search(r'"arguments"\s*:\s*(\{.*)', self.buffer)
+                if args_match:
+                    args_content = args_match.group(1)
+                    # Handle a surplus closing brace by cutting at the last one.
+                    open_braces = args_content.count("{")
+                    close_braces = args_content.count("}")
+                    if close_braces > open_braces:
+                        args_content = args_content[: args_content.rfind("}")]
+                    try:
+                        # json.loads raises while the text is not yet valid JSON; the except below logs it.
+                        parsed_args = json.loads(args_content)
+                        if isinstance(parsed_args, dict):
+                            args_json = json.dumps(parsed_args, ensure_ascii=False)
+                            if len(args_json) > len(self.streamed_args_for_tool[self.current_tool_id]):
+                                argument_diff = args_json[len(self.streamed_args_for_tool[self.current_tool_id]) :]
+                                delta = DeltaMessage(
+                                    tool_calls=[
+                                        DeltaToolCall(
+                                            index=self.current_tool_id,
+                                            function=DeltaFunctionCall(arguments=argument_diff).model_dump(
+                                                exclude_none=True
+                                            ),
+                                        )
+                                    ]
+                                )
+                                data_processor_logger.debug(f"delta argument: {delta}")
+                                # Remove the emitted arguments from the buffer.
+                                processed_pos = args_match.start() + len('"arguments":')
+                                self.buffer = (
+                                    self.buffer[:processed_pos] + self.buffer[processed_pos + len(args_json) :]
+                                )
+                                self.streamed_args_for_tool[self.current_tool_id] = args_json
+                                return delta
+                    except Exception as e:
+                        data_processor_logger.debug(f"Partial arguments parsing: {str(e)}")
+
+            if "</tool_call>" in self.buffer:
+                end_pos = self.buffer.find("</tool_call>")
+                self.buffer = self.buffer[end_pos + len("</tool_call>") :]
+                # The current tool call is finished: advance to a fresh slot.
+                self.current_tool_id += 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+
+            return delta
+
+        except Exception as e:
+            data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}")
+            return None
--- a/fastdeploy/entrypoints/openai/tool_parsers/utils.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/utils.py
@@ -0,0 +1,137 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import json
+from json import JSONDecodeError, JSONDecoder
+from typing import Any
+
+import partial_json_parser
+from partial_json_parser.core.options import Allow
+
+
+def find_common_prefix(s1: str, s2: str) -> str:
+    """
+    Return the longest leading run of characters that ``s1`` and ``s2`` share.
+    The result does not depend on the order of the arguments and is empty when
+    the strings differ at index 0 or either one is empty.
+
+    This is a helper for working with JSON produced by partial_json_parser
+    during streaming, so that close-quotes, close-brackets and close-braces
+    are not returned prematurely.
+
+    e.g. find_common_prefix('{"fruit": "ap"}', '{"fruit": "apple"}') ->
+    '{"fruit": "ap'
+    """
+    shared = 0
+    # zip() stops at the shorter string, so no explicit length bound is needed.
+    for left, right in zip(s1, s2):
+        if left != right:
+            break
+        shared += 1
+    # Slice once instead of growing a string character by character.
+    return s1[:shared]
+
+
+def find_common_suffix(s1: str, s2: str) -> str:
+    """
+    Return the trailing run of characters shared by ``s1`` and ``s2``, cut off
+    at the first mismatch or at the first alphanumeric character, whichever
+    comes first when scanning from the end. Argument order does not matter.
+
+    e.g. find_common_suffix('{"fruit": "ap"}', '{"fruit": "apple"}') -> '"}'
+    """
+    length = 0
+    # Walk both strings backwards in lockstep; zip() stops at the shorter one.
+    for tail_a, tail_b in zip(reversed(s1), reversed(s2)):
+        if tail_a != tail_b or tail_a.isalnum():
+            break
+        length += 1
+    # s1[-0:] would return the whole string, so slice from an explicit index.
+    return s1[len(s1) - length :]
+
+
+def extract_intermediate_diff(curr: str, old: str) -> str:
+    """
+    Given two strings, extract the difference in the middle between two strings
+    that are known to have a common prefix and/or suffix.
+
+    This function is provided as a UTILITY for extracting information from JSON
+    generated by partial_json_parser, to help in ensuring that the right tokens
+    are returned in streaming, so that close-quotes, close-brackets and
+    close-braces are not returned prematurely. The order of arguments IS
+    important - the new version of the partially-parsed JSON must be the first
+    argument, and the second argument must be from the previous generation.
+
+    It returns the tokens that should be streamed to the client.
+
+    e.g. extract_intermediate_diff('{"fruit": "apple"}', '{"fruit": "ap"}')
+        -> 'ple'
+
+    """
+    suffix = find_common_suffix(curr, old)
+    # Strip the shared suffix from the end of old: reverse, drop the first match, reverse back.
+    old = old[::-1].replace(suffix[::-1], "", 1)[::-1]
+    prefix = find_common_prefix(curr, old)
+    diff = curr
+    if len(suffix):
+        diff = diff[::-1].replace(suffix[::-1], "", 1)[::-1]
+
+    if len(prefix):
+        # replace the prefix only once in case it's mirrored
+        diff = diff.replace(prefix, "", 1)
+
+    return diff
+
+
+def find_all_indices(string: str, substring: str) -> list[int]:
+    """
+    Return the start index of every occurrence of ``substring`` in ``string``,
+    in ascending order. Overlapping occurrences are included because the
+    search resumes one character after each hit.
+    """
+    positions: list[int] = []
+    cursor = string.find(substring)
+    while cursor != -1:
+        positions.append(cursor)
+        # Resume just past the start of this hit, not past its end.
+        cursor = string.find(substring, cursor + 1)
+    return positions
+
+
+# Parse possibly-incomplete JSON and return (value, end index). raw_decode is the fallback for
+# "Extra data": partial_json_parser doesn't support extra data, raw_decode doesn't support partial JSON.
+def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]:
+    try:
+        parsed = partial_json_parser.loads(input_str, flags)
+    except JSONDecodeError as err:
+        if "Extra data" not in err.msg:
+            raise
+        return JSONDecoder().raw_decode(input_str)
+    return parsed, len(input_str)
+
+
+def is_complete_json(input_str: str) -> bool:
+    try:  # json.loads must accept the entire string for it to count as complete
+        json.loads(input_str)
+    except JSONDecodeError:
+        return False
+    return True
+
+
+def consume_space(i: int, s: str) -> int:
+    """Skip the whitespace run in ``s`` that starts at index ``i`` and return the index just past it."""
+    first_non_space = (pos for pos in range(i, len(s)) if not s[pos].isspace())
+    return next(first_non_space, max(i, len(s)))
--- a/fastdeploy/input/ernie_processor.py
+++ b/fastdeploy/input/ernie_processor.py
@@ -43,13 +43,14 @@ class ErnieProcessor(BaseDataProcessor):
        pad_token_id (int): 存储填充符号的token ID。
    """

-    def __init__(self, model_name_or_path, reasoning_parser_obj=None):
+    def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None):

        self.model_name_or_path = model_name_or_path
        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
        self._init_config()

        self.decode_status = dict()
+        self.tool_parsers = dict()
        self.thinking_parser_dict = dict()
        self._load_tokenizer()
        data_processor_logger.info(
@@ -63,6 +64,7 @@ class ErnieProcessor(BaseDataProcessor):
        self.reasoning_parser = None
        if reasoning_parser_obj:
            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
+        self.tool_parser_obj = tool_parser_obj

    def _init_config(self):
        self.use_hf_tokenizer = int(envs.FD_USE_HF_TOKENIZER) == 1
@@ -204,6 +206,12 @@ class ErnieProcessor(BaseDataProcessor):
            response_dict.outputs.reasoning_content = reasoning_content
        else:
            response_dict.outputs.text = full_text
+        if self.tool_parser_obj:
+            tool_parser = self.tool_parser_obj(self.tokenizer)
+            tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
+            if tool_call_info.tools_called:
+                response_dict.outputs.tool_calls = tool_call_info.tool_calls
+                response_dict.outputs.text = tool_call_info.content
        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
        if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
            return None
@@ -244,12 +252,20 @@ class ErnieProcessor(BaseDataProcessor):
        delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
        if is_end:
            full_text = previous_texts + delta_text
-            if enable_thinking and self.reasoning_parser:
+            if self.reasoning_parser and (
+                enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
+            ):
                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
                response_dict["outputs"]["text"] = text
                response_dict["outputs"]["reasoning_content"] = reasoning_content
            else:
                response_dict["outputs"]["text"] = full_text
+            if self.tool_parser_obj:
+                tool_parser = self.tool_parser_obj(self.tokenizer)
+                tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
+                if tool_call_info.tools_called:
+                    response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
+                    response_dict["outputs"]["text"] = tool_call_info.content
            response_dict["outputs"]["raw_prediction"] = full_text
            data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
            del self.decode_status[req_id]
@@ -275,7 +291,9 @@ class ErnieProcessor(BaseDataProcessor):
                token_ids = token_ids[:-1]
        delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
        response_dict["outputs"]["raw_prediction"] = delta_text
-        if enable_thinking and self.reasoning_parser:
+        if self.reasoning_parser and (
+            enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
+        ):
            reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming(
                previous_texts,
                previous_texts + delta_text,
@@ -288,10 +306,25 @@ class ErnieProcessor(BaseDataProcessor):
            response_dict["outputs"]["reasoning_content"] = reasoning_content
        else:
            response_dict["outputs"]["text"] = delta_text
-        response_dict["outputs"]["raw_prediction"] = delta_text
+        if self.tool_parser_obj:
+            if req_id not in self.tool_parsers:
+                self.tool_parsers[req_id] = self.tool_parser_obj(self.tokenizer)
+            tool_parser = self.tool_parsers[req_id]
+            tool_call = tool_parser.extract_tool_calls_streaming(
+                previous_texts,
+                previous_texts + delta_text,
+                delta_text,
+                previous_token_ids,
+                previous_token_ids + token_ids,
+                token_ids,
+                response_dict,
+            )
+            response_dict["outputs"]["tool_delta_message"] = tool_call
        if is_end:
            data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
            del self.decode_status[req_id]
+            if req_id in self.tool_parsers:
+                del self.tool_parsers[req_id]
        return response_dict

    def messages2ids(self, request_or_messages):
--- a/fastdeploy/input/ernie_vl_processor.py
+++ b/fastdeploy/input/ernie_vl_processor.py
@@ -34,6 +34,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
        limit_mm_per_prompt=None,
        mm_processor_kwargs=None,
        reasoning_parser_obj=None,
+        tool_parser_obj=None,
    ):
        self.use_hf_tokenizer = False

@@ -53,6 +54,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
        self.image_patch_id = self.ernie_processor.image_patch_id
        self.spatial_conv_size = self.ernie_processor.spatial_conv_size

+        self.tool_parsers = dict()
        self.decode_status = dict()
        self._load_tokenizer()
        self.eos_token_ids = [self.tokenizer.eos_token_id]
@@ -62,6 +64,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
        self.reasoning_parser = None
        if reasoning_parser_obj:
            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
+        self.tool_parser_obj = tool_parser_obj

        # Generation config
        try:
--- a/fastdeploy/input/preprocess.py
+++ b/fastdeploy/input/preprocess.py
@@ -18,6 +18,7 @@ from typing import Any, Dict, Optional

 from fastdeploy.config import ErnieArchitectures
 from fastdeploy.engine.config import ModelConfig
+from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
 from fastdeploy.reasoning import ReasoningParserManager


@@ -48,6 +49,7 @@ class InputPreprocessor:
        limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
        enable_mm: bool = False,
+        tool_parser: str = None,
    ) -> None:

        self.model_name_or_path = model_name_or_path
@@ -55,6 +57,7 @@ class InputPreprocessor:
        self.enable_mm = enable_mm
        self.limit_mm_per_prompt = limit_mm_per_prompt
        self.mm_processor_kwargs = mm_processor_kwargs
+        self.tool_parser = tool_parser

    def create_processor(self):
        """
@@ -68,8 +71,11 @@ class InputPreprocessor:
            DataProcessor or MultiModalRegistry.Processor (Union[DataProcessor, MultiModalRegistry.Processor]): 数据处理器。
        """
        reasoning_parser_obj = None
+        tool_parser_obj = None
        if self.reasoning_parser:
            reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser(self.reasoning_parser)
+        if self.tool_parser:
+            tool_parser_obj = ToolParserManager.get_tool_parser(self.tool_parser)
        architectures = ModelConfig({"model": self.model_name_or_path}).architectures[0]
        if not self.enable_mm:
            if not ErnieArchitectures.contains_ernie_arch(architectures):
@@ -78,6 +84,7 @@ class InputPreprocessor:
                self.processor = DataProcessor(
                    model_name_or_path=self.model_name_or_path,
                    reasoning_parser_obj=reasoning_parser_obj,
+                    tool_parser_obj=tool_parser_obj,
                )
            else:
                from fastdeploy.input.ernie_processor import ErnieProcessor
@@ -85,6 +92,7 @@ class InputPreprocessor:
                self.processor = ErnieProcessor(
                    model_name_or_path=self.model_name_or_path,
                    reasoning_parser_obj=reasoning_parser_obj,
+                    tool_parser_obj=tool_parser_obj,
                )
        else:
            if not architectures.startswith("Ernie4_5_VLMoeForConditionalGeneration"):
@@ -97,5 +105,6 @@ class InputPreprocessor:
                    limit_mm_per_prompt=self.limit_mm_per_prompt,
                    mm_processor_kwargs=self.mm_processor_kwargs,
                    reasoning_parser_obj=reasoning_parser_obj,
+                    tool_parser_obj=tool_parser_obj,
                )
        return self.processor
--- a/fastdeploy/input/text_processor.py
+++ b/fastdeploy/input/text_processor.py
@@ -148,7 +148,7 @@ class BaseDataProcessor(ABC):


 class DataProcessor(BaseDataProcessor):
-    def __init__(self, model_name_or_path, reasoning_parser_obj=None):
+    def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None):
        """
            Initializes the DecodeStatus object.

@@ -168,6 +168,7 @@ class DataProcessor(BaseDataProcessor):
        self._init_config()

        self.decode_status = dict()
+        self.tool_parsers = dict()
        self.tokenizer = self._load_tokenizer()
        data_processor_logger.info(
            f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, \
@@ -180,6 +181,7 @@ class DataProcessor(BaseDataProcessor):
        self.eos_token_id_len = len(self.eos_token_ids)
        self.pad_token_id = self.get_pad_id()
        self.reasoning_parser = None
+        self.tool_parser_obj = tool_parser_obj
        if reasoning_parser_obj:
            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
        self.tokenizer.pad_token_id = self.pad_token_id
@@ -329,6 +331,12 @@ class DataProcessor(BaseDataProcessor):
        else:
            # 模型不支持思考,并且没单独设置enable_thinking为false
            response_dict.outputs.text = full_text
+        if self.tool_parser_obj:
+            tool_parser = self.tool_parser_obj(self.tokenizer)
+            tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
+            if tool_call_info.tools_called:
+                response_dict.outputs.tool_calls = tool_call_info.tool_calls
+                response_dict.outputs.text = tool_call_info.content
        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")

        return response_dict
@@ -360,6 +368,12 @@ class DataProcessor(BaseDataProcessor):
                response_dict["outputs"]["reasoning_content"] = reasoning_content
            else:
                response_dict["outputs"]["text"] = full_text
+            if self.tool_parser_obj:
+                tool_parser = self.tool_parser_obj(self.tokenizer)
+                tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
+                if tool_call_info.tools_called:
+                    response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
+                    response_dict["outputs"]["text"] = tool_call_info.content
            data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
            del self.decode_status[req_id]
        return response_dict
@@ -397,9 +411,25 @@ class DataProcessor(BaseDataProcessor):
            response_dict["outputs"]["reasoning_content"] = reasoning_content
        else:
            response_dict["outputs"]["text"] = delta_text
+        if self.tool_parser_obj and not is_end:
+            if req_id not in self.tool_parsers:
+                self.tool_parsers[req_id] = self.tool_parser_obj(self.tokenizer)
+            tool_parser = self.tool_parsers[req_id]
+            tool_call = tool_parser.extract_tool_calls_streaming(
+                previous_texts,
+                previous_texts + delta_text,
+                delta_text,
+                previous_token_ids,
+                previous_token_ids + token_ids,
+                token_ids,
+                response_dict,
+            )
+            response_dict["outputs"]["tool_delta_message"] = tool_call
        if is_end:
            data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
            del self.decode_status[req_id]
+            if req_id in self.tool_parsers:
+                del self.tool_parsers[req_id]
        return response_dict

    def process_response_dict(self, response_dict, **kwargs):
--- a/fastdeploy/reasoning/init.py
+++ b/fastdeploy/reasoning/init.py
@@ -16,6 +16,7 @@

 from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
 from .ernie_vl_reasoning_parsers import ErnieVLReasoningParser
+from .ernie_x1_reasoning_parsers import ErnieX1ReasoningParser
 from .qwen3_reasoning_parsers import Qwen3ReasoningParser

 __all__ = [
@@ -23,4 +24,5 @@ __all__ = [
    "ReasoningParserManager",
    "ErnieVLReasoningParser",
    "Qwen3ReasoningParser",
+    "ErnieX1ReasoningParser",
 ]
--- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
+++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+#
+from collections.abc import Sequence
+from typing import Tuple
+
+from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest
+from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager
+
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+@ReasoningParserManager.register_module("ernie_x1")
+class ErnieX1ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for the ernie_x1 model.
+
+    Everything generated before the first </think> tag is reasoning content.
+    After that tag, a <response>...</response> section is returned as the
+    answer text, while a <tool_call> section yields no answer text from this
+    parser.
+    """
+
+    def __init__(self, tokenizer):
+        """
+        Store the tag strings and look up the vocabulary id of </think>.
+
+        Raises:
+            ValueError: if no tokenizer is available on the parser.
+            RuntimeError: if </think> is missing from the vocabulary.
+        """
+        super().__init__(tokenizer)
+        self.think_end_token = "</think>"
+        self.response_start_token = "<response>"
+        self.response_end_token = "</response>"
+        self.tool_call_start_token = "<tool_call>"
+        self.tool_call_end_token = "</tool_call>"
+
+        if not self.model_tokenizer:
+            raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.")
+
+        self.think_end_token_id = self.vocab.get("</think>")
+        if self.think_end_token_id is None:
+            raise RuntimeError("Could not find think end token id in tokenizer vocabulary")
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> tuple[str, str]:
+        """
+        Classify one streamed chunk as reasoning text or answer text.
+
+        Only ``previous_text`` and ``delta_text`` are read; the other
+        arguments are unused by this implementation.
+
+        Returns:
+            A ``(reasoning_delta, content_delta)`` pair. At most one element is
+            non-empty; chunks that must not be forwarded yield ``("", "")``.
+        """
+        # The thinking phase lasts until </think> has appeared anywhere in the
+        # earlier text. Checking only whether previous_text *ends* with the tag
+        # would put every later chunk back into the reasoning branch.
+        if self.think_end_token not in previous_text:
+            # Forward only what precedes the closing tag, never the tag itself.
+            # Without a tag in the chunk this is the whole chunk.
+            reasoning_delta, _, _ = delta_text.partition(self.think_end_token)
+            return reasoning_delta, ""
+
+        # Text generated after the first </think>, minus the newlines behind it.
+        generated = previous_text + delta_text
+        after_think = generated[generated.find(self.think_end_token) + len(self.think_end_token) :]
+        after_think = after_think.lstrip("\n")
+
+        # A tool-call section produces neither reasoning nor answer text here.
+        if after_think.startswith(self.tool_call_start_token):
+            return "", ""
+
+        if after_think.startswith(self.response_start_token):
+            # The <response> and </response> tags themselves are not forwarded.
+            if delta_text.strip("\n") in (self.response_start_token, self.response_end_token):
+                return "", ""
+            # Once the section has been closed, nothing more is answer text.
+            if self.response_end_token in after_think:
+                return "", ""
+            # Forward the raw chunk so the answer is streamed incrementally.
+            return "", delta_text
+
+        # Neither section has started yet, e.g. only newlines after </think>.
+        return "", ""
+
+    def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest) -> Tuple[str, str]:
+        """
+        Split a complete model output into reasoning text and answer text.
+
+        ``request`` is not read by this method. Newlines inside both parts are
+        kept; the answer's leading newlines and the single newline directly in
+        front of </think> and </response> are removed.
+
+        Returns:
+            A ``(reasoning_content, response_content)`` pair. Without a </think>
+            tag the whole output is returned as reasoning content.
+        """
+        think_end_pos = model_output.find(self.think_end_token)
+        if think_end_pos == -1:
+            # No </think> tag: the whole output counts as reasoning.
+            return model_output, ""
+
+        reasoning_content = model_output[:think_end_pos]
+        if reasoning_content.endswith("\n"):
+            reasoning_content = reasoning_content[:-1]
+
+        # Newlines between </think> and the next tag are skipped.
+        remaining = model_output[think_end_pos + len(self.think_end_token) :].lstrip("\n")
+        if not remaining.startswith(self.response_start_token):
+            # A <tool_call> section (or anything else) carries no answer text.
+            return reasoning_content, ""
+
+        # Leading newlines of the answer are dropped as well.
+        remaining = remaining[len(self.response_start_token) :].lstrip("\n")
+        response_end_pos = remaining.find(self.response_end_token)
+        if response_end_pos == -1:
+            # Unterminated <response>: everything that is left is the answer.
+            return reasoning_content, remaining
+
+        response_content = remaining[:response_end_pos]
+        if response_content.endswith("\n"):
+            response_content = response_content[:-1]
+        return reasoning_content, response_content
+
+
+import unittest
+from unittest.mock import MagicMock
+
+
+class TestErnieX1ReasoningParser(unittest.TestCase):
+    def setUp(self):
+        self.tokenizer = MagicMock()
+        self.tokenizer.vocab = {
+            "\n</think>\n\n": 1001,
+            "<response>\n": 1002,
+            "\n</response>\n": 1003,
+            "<tool_call>\n": 1004,
+            "\n</tool_call>\n": 1005,
+        }
+        self.parser = ErnieX1ReasoningParser(self.tokenizer)
+
+    def test_streaming_with_think_and_response(self):
+        # Standard case: \n</think>\n\n<response>\ncontent\n</response>\n
+        prev_text = "thinking"
+        delta_text = "\n</think>\n\n<response>\nanswer\n</response>\n"
+        result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [], [], [])
+        self.assertEqual(result, ("thinking", "answer"))  # NOTE(review): parser never returns prev_text - confirm
+
+    def test_streaming_with_think_and_tool_call(self):
+        # tool_call case
+        prev_text = "thinking"
+        delta_text = "\n</think>\n\n<tool_call>\ndetails\n</tool_call>\n"
+        result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [], [], [])
+        self.assertEqual(result, ("thinking", ""))
+
+    def test_streaming_with_think_no_newline(self):
+        # No newline in front of </think>
+        prev_text = "thinking"
+        delta_text = "</think>\n\n<response>answer</response>\n"
+        result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [], [], [])
+        self.assertEqual(result, ("thinking", "answer"))
+
+    def test_streaming_response_without_leading_newline(self):
+        # Response content without a leading newline
+        prev_text = "thinking\n</think>\n\n"
+        delta_text = "<response>answer\n</response>\n"
+        result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [1001], [], [])
+        self.assertEqual(result, ("thinking", "answer"))
+
+    def test_streaming_response_with_middle_newline(self):
+        # Newline in the middle of the response content
+        prev_text = "thinking\n</think>\n\n<response>\n"
+        delta_text = "line1\nline2\n</response>\n"
+        result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [1001], [], [])
+        self.assertEqual(result, ("thinking", "line1\nline2"))
+
+    def test_streaming_partial_response(self):
+        # Incomplete response during streaming
+        prev_text = "thinking\n</think>\n\n<response>\n"
+        delta_text = "partial answer"
+        result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [1001], [], [])
+        self.assertEqual(result, ("thinking", "partial answer"))
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/fastdeploy/utils.py
+++ b/fastdeploy/utils.py
@@ -23,6 +23,7 @@ import os
 import random
 import re
 import socket
+import sys
 import tarfile
 import time
 from datetime import datetime
@@ -591,6 +592,22 @@ def is_list_of(
    assert_never(check)


+def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
+    """Load ``file_path`` as module ``module_name``, register it in sys.modules and return it."""
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ModuleNotFoundError(f"No module named '{module_name}'")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module importable after a failed execution.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
 def version():
    """
    Prints the contents of the version.txt file located in the parent directory of this script.
--- a/requirements.txt
+++ b/requirements.txt
@@ -37,3 +37,4 @@ opentelemetry-instrumentation-mysql
 opentelemetry-distro 
 opentelemetry-exporter-otlp
 opentelemetry-instrumentation-fastapi
+partial_json_parser