From cc5430e4c2a90249a6975f01394c8bf74e3afb09 Mon Sep 17 00:00:00 2001
From: ltd0924 <32387785+ltd0924@users.noreply.github.com>
Date: Tue, 2 Sep 2025 21:03:36 +0800
Subject: [PATCH] [BugFix] [CP] fix max streaming tokens invalid (#3798)

* Update serving_chat.py

* Update serving_completion.py
---
 fastdeploy/entrypoints/openai/serving_chat.py       | 7 +------
 fastdeploy/entrypoints/openai/serving_completion.py | 5 +----
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 292bf1e8b..48931ba27 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -148,7 +148,7 @@ class OpenAIServingChat:
             if request.max_streaming_response_tokens is not None
             else (request.metadata or {}).get("max_streaming_response_tokens", 1)
         )  # dierctly passed & passed in metadata
-
+        max_streaming_response_tokens = max(1, max_streaming_response_tokens)
         enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
         if enable_thinking is None:
             enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
@@ -316,11 +316,6 @@
                         api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
                         choices = []
 
-            if choices:
-                chunk.choices = choices
-                yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
-                choices = []
-
             if include_usage:
                 completion_tokens = previous_num_tokens
                 usage = UsageInfo(
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 9372b8e4d..e49de00c5 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -285,6 +285,7 @@ class OpenAIServingCompletion:
             if request.max_streaming_response_tokens is not None
             else (request.suffix or {}).get("max_streaming_response_tokens", 1)
         )  # dierctly passed & passed in suffix
+        max_streaming_response_tokens = max(max_streaming_response_tokens, 1)
         choices = []
         chunk = CompletionStreamResponse(
             id=request_id,
@@ -418,10 +419,6 @@
                         )
                         yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
                     api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
-            if choices:
-                chunk.choices = choices
-                yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
-                choices = []
 
         except Exception as e:
             yield f"data: {ErrorResponse(message=str(e), code=400).model_dump_json(exclude_unset=True)}\n\n"
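
Reviewer note (not part of the patch): the core of the fix in both handlers is clamping max_streaming_response_tokens to at least 1, so a zero or negative value from the request (or its metadata/suffix) no longer produces the "invalid" behaviour the subject line refers to; the patch also drops an apparently redundant trailing flush of leftover choices. Below is a minimal sketch of the clamping behaviour only, under the assumption that the value acts as a per-chunk token budget; resolve_max_streaming_tokens is an illustrative helper, not a FastDeploy API.

# Minimal sketch, assuming max_streaming_response_tokens is a per-chunk
# token budget; the helper name below is hypothetical, not FastDeploy code.
def resolve_max_streaming_tokens(explicit, metadata):
    """Pick the requested per-chunk token budget, then clamp it to at least 1."""
    value = (
        explicit
        if explicit is not None
        else (metadata or {}).get("max_streaming_response_tokens", 1)
    )
    # Before this patch a 0 or negative value could slip through unchanged,
    # which the patch title describes as an invalid max streaming tokens value.
    return max(1, value)

assert resolve_max_streaming_tokens(None, None) == 1   # nothing supplied: default of 1
assert resolve_max_streaming_tokens(0, None) == 1      # invalid value clamped up to 1
assert resolve_max_streaming_tokens(4, None) == 4      # valid value passed through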