Release/2.1 (#3414)

* Pre-CE modified (#3335) (#3360)

* Pre-CE modified (#3335)

* update

* update

* fix

* fix

* update

* update

* update

* fix

* update

* update

* update

* add UT fix (#3367)

* [Bug Fix] Fix V1 video bug (#3387)

* fix stopseq error info (#3342)

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>

* [BugFix] Fix default log level of paddleformers (#3377)

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>

* [Polish Code] Remove useless notes

* feat(log): add_request_and_response_log (#3392)

* Optimize CI execution workflow. (#3371) (#3384)

* fix

* [BugFix] fix control signal release failure (#3374)

* [BugFix]

* [BugFix]

* [BugFix]

* [BugFix]

* fix

* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>

* Revert "Merge branch 'feature/online/vs_think_20250813' into release/2.1"

This reverts commit 02596fc537, reversing
changes made to 03347626a6.

* [XPU] Fixed the issue of performance degradation caused by enabling ENABLE_V1_KVCACHE_SCHEDULER (#3393)

* fix v1 schedule oom bug

* fix v1 schedule oom bug

* [BugFix] fix ErnieProcessor not setting raw_prediction (#3401)

* [Doc] Release fastdeploy-xpu 2.1.0 (#3407)

* fix v1 schedule oom bug

* fix v1 schedule oom bug

* update release note

* [Doc] Release fastdeploy-xpu 2.0.3 (#3408)

* fix v1 schedule oom bug

* fix v1 schedule oom bug

* update release note

* update info

---------

Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com>
Co-authored-by: JYChen <zoooo0820@qq.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
Co-authored-by: xiaolei373 <zley373@gmail.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
Co-authored-by: yinwei <yinwei_hust@163.com>
Co-authored-by: memoryCoderC <1137889088@qq.com>
luukunn committed on 2025-08-14 20:53:47 +08:00 (committed by GitHub)
parent e11331927f
commit 132a8ef425
30 changed files with 132 additions and 1068 deletions

@@ -20,6 +20,8 @@ from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
 
+import paddle
+
 from fastdeploy.config import (
     CacheConfig,
     EarlyStopConfig,
@@ -93,14 +95,6 @@ class EngineArgs:
     """
     specifies the reasoning parser to use for extracting reasoning content from the model output
     """
-    tool_call_parser: str = None
-    """
-    specifies the tool call parser to use for extracting tool call from the model output
-    """
-    tool_parser_plugin: str = None
-    """
-    tool parser plugin used to register user defined tool parsers
-    """
     enable_mm: bool = False
     """
     Flags to enable multi-modal model
@@ -429,18 +423,6 @@ class EngineArgs:
         help="Flag specifies the reasoning parser to use for extracting "
         "reasoning content from the model output",
     )
-    model_group.add_argument(
-        "--tool-call-parser",
-        type=str,
-        default=EngineArgs.tool_call_parser,
-        help="Flag specifies the tool call parser to use for extracting" "tool call from the model output",
-    )
-    model_group.add_argument(
-        "--tool-parser-plugin",
-        type=str,
-        default=EngineArgs.tool_parser_plugin,
-        help="tool parser plugin used to register user defined tool parsers",
-    )
     model_group.add_argument(
         "--speculative-config",
         type=json.loads,
@@ -889,7 +871,10 @@ class EngineArgs:
         if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
             self.max_num_batched_tokens = self.max_model_len
         else:
-            self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+            if paddle.is_compiled_with_xpu():
+                self.max_num_batched_tokens = self.max_model_len
+            else:
+                self.max_num_batched_tokens = 8192
 
         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
@@ -928,7 +913,6 @@ class EngineArgs:
             mm_processor_kwargs=self.mm_processor_kwargs,
             enable_mm=self.enable_mm,
             reasoning_parser=self.reasoning_parser,
-            tool_parser=self.tool_call_parser,
             splitwise_role=self.splitwise_role,
             innode_prefill_ports=self.innode_prefill_ports,
             max_num_partial_prefills=self.max_num_partial_prefills,
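
Taken together, the two `max_num_batched_tokens` hunks change how the default is chosen. As a reading aid, here is a minimal standalone sketch of the merged logic; it is not FastDeploy's actual code, and the `is_xpu_build` parameter is a stand-in for `paddle.is_compiled_with_xpu()` so the snippet runs without Paddle installed:

import os

def default_max_num_batched_tokens(max_model_len: int, is_xpu_build: bool) -> int:
    # V1 KV-cache scheduler disabled: keep the old default, the full context length.
    if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
        return max_model_len
    # XPU builds: the 8192 cap caused the performance degradation fixed by #3393,
    # so the full model length is restored there.
    if is_xpu_build:
        return max_model_len
    # Other backends: cap at 8192, since defaulting to max_model_len makes OOM easy to hit.
    return 8192

For example, with ENABLE_V1_KVCACHE_SCHEDULER=1 and max_model_len=131072, a GPU build now defaults to 8192 while an XPU build keeps 131072.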