From cc5430e4c2a90249a6975f01394c8bf74e3afb09 Mon Sep 17 00:00:00 2001
From: ltd0924 <32387785+ltd0924@users.noreply.github.com>
Date: Tue, 2 Sep 2025 21:03:36 +0800
Subject: [PATCH] [BugFix] [CP] fix max streaming tokens invalid (#3798)

* Update serving_chat.py

* Update serving_completion.py
---
 fastdeploy/entrypoints/openai/serving_chat.py       | 7 +------
 fastdeploy/entrypoints/openai/serving_completion.py | 5 +----
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 292bf1e8b..48931ba27 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -148,7 +148,7 @@ class OpenAIServingChat:
             if request.max_streaming_response_tokens is not None
             else (request.metadata or {}).get("max_streaming_response_tokens", 1)
         )  # dierctly passed & passed in metadata
-
+        max_streaming_response_tokens = max(1, max_streaming_response_tokens)
         enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
         if enable_thinking is None:
             enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
@@ -316,11 +316,6 @@
                         api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
                         choices = []
 
-            if choices:
-                chunk.choices = choices
-                yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
-                choices = []
-
             if include_usage:
                 completion_tokens = previous_num_tokens
                 usage = UsageInfo(
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 9372b8e4d..e49de00c5 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -285,6 +285,7 @@ class OpenAIServingCompletion:
             if request.max_streaming_response_tokens is not None
             else (request.suffix or {}).get("max_streaming_response_tokens", 1)
         )  # dierctly passed & passed in suffix
+        max_streaming_response_tokens = max(max_streaming_response_tokens, 1)
         choices = []
         chunk = CompletionStreamResponse(
             id=request_id,
@@ -418,10 +419,6 @@
                         )
                         yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
                     api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
-            if choices:
-                chunk.choices = choices
-                yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
-                choices = []
 
         except Exception as e:
             yield f"data: {ErrorResponse(message=str(e), code=400).model_dump_json(exclude_unset=True)}\n\n"
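
Reviewer note (not part of the patch): the core of the fix in both handlers is clamping max_streaming_response_tokens to at least 1, so a zero or negative value from the request (or its metadata/suffix) no longer produces the "invalid" behaviour the subject line refers to; the patch also drops an apparently redundant trailing flush of leftover choices. Below is a minimal sketch of the clamping behaviour only, under the assumption that the value acts as a per-chunk token budget; resolve_max_streaming_tokens is an illustrative helper, not a FastDeploy API.

# Minimal sketch, assuming max_streaming_response_tokens is a per-chunk
# token budget; the helper name below is hypothetical, not FastDeploy code.
def resolve_max_streaming_tokens(explicit, metadata):
    """Pick the requested per-chunk token budget, then clamp it to at least 1."""
    value = (
        explicit
        if explicit is not None
        else (metadata or {}).get("max_streaming_response_tokens", 1)
    )
    # Before this patch a 0 or negative value could slip through unchanged,
    # which the patch title describes as an invalid max streaming tokens value.
    return max(1, value)

assert resolve_max_streaming_tokens(None, None) == 1   # nothing supplied: default of 1
assert resolve_max_streaming_tokens(0, None) == 1      # invalid value clamped up to 1
assert resolve_max_streaming_tokens(4, None) == 4      # valid value passed through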