[BugFix] [CP] fix max streaming tokens invalid (#3798)

* Update serving_chat.py

* Update serving_completion.py
Author: ltd0924 (committed by GitHub)
Date:   2025-09-02 21:03:36 +08:00
Parent: 1e19833ba5
Commit: cc5430e4c2
2 changed files with 2 additions and 10 deletions

serving_chat.py

@@ -148,7 +148,7 @@ class OpenAIServingChat:
             if request.max_streaming_response_tokens is not None
             else (request.metadata or {}).get("max_streaming_response_tokens", 1)
         )  # dierctly passed & passed in metadata
+        max_streaming_response_tokens = max(1, max_streaming_response_tokens)
         enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
         if enable_thinking is None:
             enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
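
The added clamp guards against a zero or negative value arriving either as the request field or through metadata, which would otherwise leave the streaming loop with an invalid batch size. A minimal sketch of the resolution order plus the clamp, using a stand-in request type rather than the repo's actual classes:

# Minimal sketch of how the streaming batch size is resolved; FakeChatRequest
# is a stand-in, not FastDeploy's actual request class.
from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeChatRequest:
    max_streaming_response_tokens: Optional[int] = None
    metadata: Optional[dict] = None

def resolve_streaming_batch_size(request: FakeChatRequest) -> int:
    value = (
        request.max_streaming_response_tokens
        if request.max_streaming_response_tokens is not None
        else (request.metadata or {}).get("max_streaming_response_tokens", 1)
    )  # directly passed, or passed in metadata
    return max(1, value)  # the fix: never stream in batches smaller than 1 token

print(resolve_streaming_batch_size(FakeChatRequest(max_streaming_response_tokens=0)))   # 1
print(resolve_streaming_batch_size(FakeChatRequest(metadata={"max_streaming_response_tokens": 8})))  # 8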
@@ -316,11 +316,6 @@ class OpenAIServingChat:
                 api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
                 choices = []
-            if choices:
-                chunk.choices = choices
-                yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
-                choices = []
             if include_usage:
                 completion_tokens = previous_num_tokens
                 usage = UsageInfo(
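
Because `choices` is reset right after every flush (including the final send logged just above), the deleted trailing `if choices:` block had nothing left to emit and is dropped. For orientation, a condensed sketch of the batch-and-flush pattern such a streaming handler follows; `deltas` and the inline string stand in for the real generation and chunk-serialization code:

# Condensed sketch of the batch-and-flush streaming pattern; the helpers here
# are hypothetical stand-ins, not FastDeploy APIs.
def stream_chunks(deltas, batch_size):
    batch_size = max(1, batch_size)  # mirrors the clamp added in this commit
    choices = []
    for i, delta in enumerate(deltas):
        choices.append(delta)
        is_last = i == len(deltas) - 1
        if len(choices) >= batch_size or is_last:
            yield f"data: {choices}\n\n"  # encode_chunk(choices) in real code
            choices = []  # reset after every flush, so no trailing flush is needed

for event in stream_chunks(["Hel", "lo", "!"], batch_size=2):
    print(event, end="")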

serving_completion.py

@@ -285,6 +285,7 @@ class OpenAIServingCompletion:
                 if request.max_streaming_response_tokens is not None
                 else (request.suffix or {}).get("max_streaming_response_tokens", 1)
             )  # dierctly passed & passed in suffix
+            max_streaming_response_tokens = max(max_streaming_response_tokens, 1)
             choices = []
             chunk = CompletionStreamResponse(
                 id=request_id,
@@ -418,10 +419,6 @@ class OpenAIServingCompletion:
                     )
                     yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
                 api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
-            if choices:
-                chunk.choices = choices
-                yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
-                choices = []
         except Exception as e:
             yield f"data: {ErrorResponse(message=str(e), code=400).model_dump_json(exclude_unset=True)}\n\n"