Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-06 00:57:33 +08:00
[BugFix] [CP] fix max streaming tokens invalid (#3798)
* Update serving_chat.py
* Update serving_completion.py
@@ -148,7 +148,7 @@ class OpenAIServingChat:
     if request.max_streaming_response_tokens is not None
     else (request.metadata or {}).get("max_streaming_response_tokens", 1)
 ) # dierctly passed & passed in metadata
+max_streaming_response_tokens = max(1, max_streaming_response_tokens)
 enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
 if enable_thinking is None:
     enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
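The added line clamps the resolved value to at least 1. Below is a minimal, standalone sketch of that resolve-then-clamp pattern; ChatRequest is a hypothetical stand-in for the real request model, not FastDeploy's class.

from dataclasses import dataclass
from typing import Optional


@dataclass
class ChatRequest:
    # Hypothetical stand-in for the real request model.
    max_streaming_response_tokens: Optional[int] = None
    metadata: Optional[dict] = None


def resolve_max_streaming_tokens(request: ChatRequest) -> int:
    # Prefer the explicitly passed field, fall back to metadata, default to 1.
    value = (
        request.max_streaming_response_tokens
        if request.max_streaming_response_tokens is not None
        else (request.metadata or {}).get("max_streaming_response_tokens", 1)
    )
    # The fix in this commit: never let the value drop below 1.
    return max(1, value)


assert resolve_max_streaming_tokens(ChatRequest()) == 1
assert resolve_max_streaming_tokens(ChatRequest(metadata={"max_streaming_response_tokens": 0})) == 1
assert resolve_max_streaming_tokens(ChatRequest(max_streaming_response_tokens=8)) == 8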
@@ -316,11 +316,6 @@ class OpenAIServingChat:
 api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
 choices = []
 
-if choices:
-    chunk.choices = choices
-    yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
-    choices = []
-
 if include_usage:
     completion_tokens = previous_num_tokens
     usage = UsageInfo(
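The removed lines are a trailing flush of any still-buffered choices, issued after the chunk had already been logged as the last send. For orientation, here is a simplified, hedged sketch of the buffer-and-flush pattern that max_streaming_response_tokens drives in a streaming generator; it is illustrative only, not the actual OpenAIServingChat code.

import json
from typing import Iterable, Iterator


def stream_batched(deltas: Iterable[dict], max_streaming_response_tokens: int) -> Iterator[str]:
    # Illustrative buffering loop: collect deltas and emit one SSE line per batch.
    batch_size = max(1, max_streaming_response_tokens)  # the clamp introduced by this commit
    choices = []
    for delta in deltas:
        choices.append(delta)
        if len(choices) >= batch_size:
            yield f"data: {json.dumps({'choices': choices})}\n\n"
            choices = []
    if choices:  # final flush of whatever is left when the stream ends
        yield f"data: {json.dumps({'choices': choices})}\n\n"


# Example: a batch size of 2 turns three deltas into two SSE lines.
print(list(stream_batched([{"i": 0}, {"i": 1}, {"i": 2}], max_streaming_response_tokens=2)))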
@@ -285,6 +285,7 @@ class OpenAIServingCompletion:
     if request.max_streaming_response_tokens is not None
     else (request.suffix or {}).get("max_streaming_response_tokens", 1)
 ) # dierctly passed & passed in suffix
+max_streaming_response_tokens = max(max_streaming_response_tokens, 1)
 choices = []
 chunk = CompletionStreamResponse(
     id=request_id,
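As the comment in this hunk notes, the value can be passed directly on the request or inside suffix (metadata for the chat endpoint). A hedged client-side example follows, assuming an OpenAI-compatible completions endpoint at http://localhost:8000/v1/completions; host, port, and model name are placeholders, not taken from this commit.

import requests

URL = "http://localhost:8000/v1/completions"  # placeholder endpoint

direct = {
    "model": "default",   # placeholder model name
    "prompt": "Hello",
    "stream": True,
    "max_streaming_response_tokens": 4,               # directly passed
}

via_suffix = {
    "model": "default",
    "prompt": "Hello",
    "stream": True,
    "suffix": {"max_streaming_response_tokens": 4},   # passed in suffix
}

# After this commit, a value of 0 or less in either place is clamped to 1 server-side.
for payload in (direct, via_suffix):
    with requests.post(URL, json=payload, stream=True) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if line:
                print(line)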
@@ -418,10 +419,6 @@ class OpenAIServingCompletion:
     )
     yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
 api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
-if choices:
-    chunk.choices = choices
-    yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
-    choices = []
 
 except Exception as e:
     yield f"data: {ErrorResponse(message=str(e), code=400).model_dump_json(exclude_unset=True)}\n\n"
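The except branch in the context lines turns a server-side failure into one final SSE data line instead of dropping the connection silently. Below is a hedged client-side sketch for consuming such a stream; the [DONE] sentinel and the exact error payload shape are assumptions in the style of OpenAI-compatible streaming, not confirmed by this diff.

import json


def iter_stream(lines):
    # Parse "data: ..." SSE lines, yield decoded chunks, and surface error chunks.
    for raw in lines:
        if not raw.startswith("data: "):
            continue
        body = raw[len("data: "):].strip()
        if body == "[DONE]":  # assumed end-of-stream sentinel
            break
        chunk = json.loads(body)
        if "code" in chunk and "message" in chunk:  # shape of the ErrorResponse line above
            raise RuntimeError(f"stream error {chunk['code']}: {chunk['message']}")
        yield chunk


# Example with canned lines, including an error chunk like the one yielded above.
sample = [
    'data: {"choices": [{"text": "Hi"}]}',
    'data: {"message": "boom", "code": 400}',
]
try:
    for chunk in iter_stream(sample):
        print(chunk["choices"][0]["text"])
except RuntimeError as err:
    print(err)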