Release/2.1 (#3414)

* Pre-CE modified (#3335) (#3360)

* Pre-CE modified (#3335)

* update

* update

* fix

* fix

* update

* update

* update

* fix

* update

* update

* update

* add UT fix (#3367)

* [BugFix] Fix V1 video bug (#3387)

* fix stop sequence error info (#3342)

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>

* [BugFix] Fix default log level of paddleformers (#3377)

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>

* [Polish Code] Remove useless comments

* feat(log): add request and response log (#3392)

* Optimize CI execution workflow. (#3371) (#3384)

* fix

* [BugFix] fix control signal release failure (#3374)

* [BugFix]

* [BugFix]

* [BugFix]

* [BugFix]

* fix

* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>

* Revert "Merge branch 'feature/online/vs_think_20250813' into release/2.1"

This reverts commit 02596fc537, reversing
changes made to 03347626a6.

* [XPU] Fixed the issue of performance degradation caused by enabling ENABLE_V1_KVCACHE_SCHEDULER (#3393)

* fix v1 schedule OOM bug

* fix v1 schedule OOM bug

* [BugFix] fix ErnieProcessor not setting raw_prediction (#3401)

* [Doc] Release fastdeploy-xpu 2.1.0 (#3407)

* fix v1 schedule OOM bug

* fix v1 schedule OOM bug

* update release note

* [Doc] Release fastdeploy-xpu 2.0.3 (#3408)

* fix v1 schedule OOM bug

* fix v1 schedule OOM bug

* update release note

* update info

---------

Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com>
Co-authored-by: JYChen <zoooo0820@qq.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
Co-authored-by: xiaolei373 <zley373@gmail.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
Co-authored-by: yinwei <yinwei_hust@163.com>
Co-authored-by: memoryCoderC <1137889088@qq.com>
luukunn committed (via GitHub) on 2025-08-14 20:53:47 +08:00
commit 132a8ef425, parent e11331927f
30 changed files with 132 additions and 1068 deletions


@@ -240,9 +240,9 @@ class OpenAIServingCompletion:
             dealer.close()
             self.engine_client.semaphore.release()

-    def calc_finish_reason(self, max_tokens, token_num, output, tool_called):
+    def calc_finish_reason(self, max_tokens, token_num, output):
         if max_tokens is None or token_num != max_tokens:
-            if tool_called or output.get("tool_call"):
+            if self.engine_client.reasoning_parser == "ernie_x1" and output.get("finish_reason", "") == "tool_calls":
                 return "tool_calls"
             else:
                 return "stop"
@@ -271,7 +271,6 @@ class OpenAIServingCompletion:
         output_tokens = [0] * num_choices
         inference_start_time = [0] * num_choices
         first_iteration = [True] * num_choices
-        tool_called = False
         max_streaming_response_tokens = (
             request.max_streaming_response_tokens
             if request.max_streaming_response_tokens is not None
@@ -343,41 +342,24 @@
                 if request.logprobs and output_top_logprobs is not None:
                     logprobs_res = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0)

-                output_tokens[idx] += 1
-                if self.engine_client.data_processor.tool_parser_obj and not res["finished"]:
-                    tool_delta_message = output["tool_delta_message"]
-                    if tool_delta_message is None:
-                        continue
-                    delta_message = CompletionResponseStreamChoice(
-                        index=idx,
-                        text=output["text"],
-                        completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
-                        tool_calls=tool_delta_message.tool_calls,
-                        reasoning_content=output.get("reasoning_content"),
-                        arrival_time=arrival_time,
-                        logprobs=logprobs_res,
-                    )
-                    if tool_delta_message.tool_calls:
-                        tool_called = True
-                else:
-                    delta_message = CompletionResponseStreamChoice(
+                choices.append(
+                    CompletionResponseStreamChoice(
                         index=idx,
                         text=output["text"],
                         prompt_token_ids=None,
                         completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
-                        tool_calls=None,
+                        raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
+                        tool_calls=output.get("tool_call_content"),
                         reasoning_content=output.get("reasoning_content"),
                         arrival_time=arrival_time,
                         logprobs=logprobs_res,
                     )
-                choices.append(delta_message)
+                )
+                output_tokens[idx] += 1
                 if res["finished"]:
                     choices[-1].finish_reason = self.calc_finish_reason(
-                        request.max_tokens, output_tokens[idx], output, tool_called
+                        request.max_tokens, output_tokens[idx], output
                     )
                 send_idx = output.get("send_idx")
                 # Log only when send_idx is explicitly 0
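The translated comment above encodes a deliberate guard: only an explicit send_idx of 0 marks the first streamed chunk, since a missing key comes back as None. A tiny sketch of that check (the helper name is hypothetical):

def should_log_first_chunk(output: dict) -> bool:
    # output.get("send_idx") is None when the key is absent; None must not
    # be conflated with 0, so compare with == rather than a truthiness test.
    return output.get("send_idx") == 0

print(should_log_first_chunk({"send_idx": 0}))  # True
print(should_log_first_chunk({"send_idx": 3}))  # False
print(should_log_first_chunk({}))               # False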
@@ -476,7 +458,7 @@ class OpenAIServingCompletion:
                 token_ids = output["token_ids"]
                 output_text = output["text"]
-                finish_reason = self.calc_finish_reason(request.max_tokens, final_res["output_token_ids"], output, False)
+                finish_reason = self.calc_finish_reason(request.max_tokens, final_res["output_token_ids"], output)

                 choice_data = CompletionResponseChoice(
                     token_ids=token_ids,
@@ -487,7 +469,7 @@ class OpenAIServingCompletion:
                     raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
                     text_after_process=text_after_process_list[idx] if request.return_token_ids else None,
                     reasoning_content=output.get("reasoning_content"),
-                    tool_calls=output.get("tool_call"),
+                    tool_calls=output.get("tool_call_content"),
                     logprobs=aggregated_logprobs,
                     finish_reason=finish_reason,
                 )
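Net effect of the last two hunks: the streaming and non-streaming paths now read tool calls from the same tool_call_content key of the decoded output, rather than tool_call in one place and parser state in the other. A hypothetical consumer-side illustration, with the dicts standing in for one decoded result each:

# Hypothetical shapes before and after this revert.
old_output = {"text": "...", "tool_call": [{"name": "get_weather"}]}
new_output = {"text": "...", "tool_call_content": [{"name": "get_weather"}]}

# After the revert a single key covers both response paths.
tool_calls = new_output.get("tool_call_content")
if tool_calls:
    print(f"{len(tool_calls)} tool call(s) attached to the choice")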