Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-07 17:41:52 +08:00
top_logprobs supports passing 0; fix max_completion_tokens and the incorrect finish_reason it returned (#2815)

* The /v1/chat/completions endpoint now supports max_completion_tokens and returns the correct finish_reason.
* top_logprobs supports passing 0.
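As a usage sketch (not part of this commit): with an OpenAI-compatible FastDeploy server, both new behaviours can be exercised as below. The base URL, API key and model name are placeholders.

# Usage sketch, not part of this commit. Assumes a FastDeploy OpenAI-compatible server
# on localhost:8000; base_url, api_key and the model name are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",                  # placeholder model name
    messages=[{"role": "user", "content": "Hello"}],
    max_completion_tokens=16,         # honoured in addition to the legacy max_tokens
    logprobs=True,
    top_logprobs=0,                   # 0 is now accepted: only the sampled token's logprob is returned
)

print(resp.choices[0].finish_reason)  # no longer reports "stop" when the 16-token cap is hit
print(resp.choices[0].logprobs)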
@@ -198,7 +198,8 @@ class OpenAIServingChat:
                         sampled_token_ranks=raw_top_logprobs[2],
                     )
                     logprobs_res = self.build_logprobs_response(
-                        logprobs=top_logprobs,
+                        request_logprobs= request.logprobs,
+                        response_logprobs=top_logprobs,
                         request_top_logprobs=request.top_logprobs,
                     )
 
@@ -215,7 +216,9 @@ class OpenAIServingChat:
                 if res["finished"]:
                     num_choices -= 1
                     work_process_metrics.e2e_request_latency.observe(time.time() - res["metrics"]["request_start_time"])
-                    if request.max_tokens is None or previous_num_tokens != request.max_tokens:
+                    has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
+                    max_tokens = request.max_completion_tokens or request.max_tokens
+                    if has_no_token_limit or previous_num_tokens != max_tokens:
                         choice.finish_reason = "stop"
                         if self.engine_client.reasoning_parser == "ernie_x1" and \
                             output.get("finish_reason", "") == "tool_calls":
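The condition introduced here (and repeated in the non-streaming path further down) boils down to the following rule. This is an illustrative standalone helper, not code from the patch, and the helper name is made up:

# Illustrative sketch of the finish_reason rule added above; not code from the patch.
from typing import Optional

def resolve_finish_reason(max_tokens: Optional[int],
                          max_completion_tokens: Optional[int],
                          generated_tokens: int) -> Optional[str]:
    has_no_token_limit = max_tokens is None and max_completion_tokens is None
    effective_limit = max_completion_tokens or max_tokens  # max_completion_tokens takes precedence
    if has_no_token_limit or generated_tokens != effective_limit:
        return "stop"   # generation ended before any configured cap
    return None         # cap was hit; the handler leaves the choice's initial finish_reason in place

assert resolve_finish_reason(None, None, 7) == "stop"    # no cap configured
assert resolve_finish_reason(32, None, 8) == "stop"      # cap configured but not reached
assert resolve_finish_reason(None, 16, 16) is None       # cap reached; the old check still said "stop"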
@@ -323,7 +326,8 @@ class OpenAIServingChat:
                     sampled_token_ranks=raw_top_logprobs[2],
                 )
                 logprobs_res = self.build_logprobs_response(
-                    logprobs=top_logprobs,
+                    request_logprobs=request.logprobs,
+                    response_logprobs=top_logprobs,
                     request_top_logprobs=request.top_logprobs,
                 )
                 if logprobs_res and logprobs_res.content is not None:
@@ -355,7 +359,9 @@ class OpenAIServingChat:
                 logprobs=logprobs_full_res,
                 finish_reason=None
             )
-            if request.max_tokens is None or previous_num_tokens != request.max_tokens:
+            has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
+            max_tokens = request.max_completion_tokens or request.max_tokens
+            if has_no_token_limit or previous_num_tokens != max_tokens:
                 choice.finish_reason = "stop"
                 if self.engine_client.reasoning_parser == "ernie_x1" and \
                     output.get("finish_reason", "") == "tool_calls":
@@ -386,7 +392,8 @@ class OpenAIServingChat:
 
     def build_logprobs_response(
         self,
-        logprobs: Optional[LogprobsLists],
+        request_logprobs: bool,
+        response_logprobs: Optional[LogprobsLists],
         request_top_logprobs: int,
     ) -> Optional[LogProbs]:
         """
@@ -396,17 +403,17 @@ class OpenAIServingChat:
 
         # Parameter validation
         if (
-            logprobs is None
+            response_logprobs is None
+            or not request_logprobs
             or request_top_logprobs is None
-            or request_top_logprobs <= 0
-            or len(logprobs.logprob_token_ids) == 0
+            or request_top_logprobs < 0
         ):
             return None
 
         try:
             # The top-k candidates for the current token
-            topk_token_ids = logprobs.logprob_token_ids[0][:request_top_logprobs + 1]
-            topk_logprobs = logprobs.logprobs[0][:request_top_logprobs + 1]
+            topk_token_ids = response_logprobs.logprob_token_ids[0][:request_top_logprobs + 1]
+            topk_logprobs = response_logprobs.logprobs[0][:request_top_logprobs + 1]
 
             # Construct the candidate token structure (LogProbEntry) of topk
             top_logprob_entries: List[LogProbEntry] = []
 
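Combined with the slice [:request_top_logprobs + 1] kept in the try block, relaxing the guard from "<= 0" to "< 0" is what makes top_logprobs=0 meaningful: the response keeps only the sampled token itself. A small sketch with made-up values (the nested lists stand in for LogprobsLists.logprob_token_ids and LogprobsLists.logprobs):

# Sketch with made-up values; the nested lists stand in for
# LogprobsLists.logprob_token_ids and LogprobsLists.logprobs.
logprob_token_ids = [[1001, 2002, 3003, 4004]]  # sampled token first, then alternatives
logprob_values = [[-0.1, -1.3, -2.7, -4.2]]

request_top_logprobs = 0                         # rejected by the old "<= 0" check
topk_token_ids = logprob_token_ids[0][:request_top_logprobs + 1]
topk_logprobs = logprob_values[0][:request_top_logprobs + 1]

assert topk_token_ids == [1001]
assert topk_logprobs == [-0.1]                   # only the sampled token's logprob is kept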