diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index df074771c..a71c6d14f 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -182,7 +182,7 @@ class OpenAIServingChat: if request.max_streaming_response_tokens is not None else (request.metadata or {}).get("max_streaming_response_tokens", 1) ) # dierctly passed & passed in metadata - + max_streaming_response_tokens = max(max_streaming_response_tokens, 1) enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None if enable_thinking is None: enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None @@ -370,11 +370,6 @@ class OpenAIServingChat: api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}") choices = [] - if choices: - chunk.choices = choices - yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" - choices = [] - if include_usage: completion_tokens = previous_num_tokens usage = UsageInfo( diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 059c22da3..c42c81839 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -331,6 +331,7 @@ class OpenAIServingCompletion: if request.max_streaming_response_tokens is not None else (request.suffix or {}).get("max_streaming_response_tokens", 1) ) # dierctly passed & passed in suffix + max_streaming_response_tokens = max(max_streaming_response_tokens, 1) choices = [] chunk = CompletionStreamResponse( id=request_id, @@ -461,10 +462,6 @@ class OpenAIServingCompletion: ) yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n" api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}") - if choices: - chunk.choices = choices - yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" - choices = [] except Exception as e: api_server_logger.error(f"Error in completion_stream_generator: {e}, {str(traceback.format_exc())}")