Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
[Sync Code] Update vs branch (#3403)
* Pre ce modified (#3335) (#3360)
* Pre ce modified (#3335)
* update
* update
* fix
* fix
* update
* update
* update
* fix
* update
* update
* update
* add ut fix pr(3367)
* [Bug Fix] Fix V1 video bug (#3387)
* fix stopseq error info (#3342)
  Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
* [BugFix] Fix default log level of paddleformers (#3377)
  Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
* [Polish Code] Remove useless notes
* feat(log):add_request_and_response_log (#3392)
* Optimize CI execution workflow. (#3371) (#3384)
* fix
* [BugFix] fix control signal release failed (#3374)
* [BugFix]
* [BugFix]
* [BugFix]
* [BugFix]
* fix
* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>

---------

Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com>
Co-authored-by: JYChen <zoooo0820@qq.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
Co-authored-by: xiaolei373 <zley373@gmail.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
@@ -101,6 +101,13 @@ class OpenAIServingCompletion:
         api_server_logger.info(f"start inference for request {num_choices}")
         prompt_batched_token_ids = []
         text_after_process_list = []
+        try:
+            if self.max_waiting_time < 0:
+                await self.engine_client.semaphore.acquire()
+            else:
+                await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
+        except Exception:
+            return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
         try:
             for idx, prompt in enumerate(request_prompts):
                 request_id_idx = f"{request_id}-{idx}"
@@ -117,14 +124,6 @@ class OpenAIServingCompletion:
 
                 del current_req_dict
 
-            try:
-                if self.max_waiting_time < 0:
-                    await self.engine_client.semaphore.acquire()
-                else:
-                    await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
-            except Exception:
-                return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
-
             if request.stream:
                 return self.completion_stream_generator(
                     request=request,
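The two hunks above move the semaphore acquisition ahead of the request-building loop, so a request that waits in the queue longer than `max_waiting_time` is rejected with a 408 before any per-prompt processing happens. Below is a minimal, self-contained sketch of that admission-control pattern, keeping the diff's convention that a negative `max_waiting_time` means wait indefinitely; `handle_request` and `MAX_CONCURRENCY` are illustrative names, not FastDeploy identifiers:

```python
import asyncio

MAX_CONCURRENCY = 8
semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

async def handle_request(payload: str, max_waiting_time: float) -> str:
    # Admission control first, before any per-request work is done.
    try:
        if max_waiting_time < 0:
            # Negative timeout keeps the old behaviour: wait indefinitely.
            await semaphore.acquire()
        else:
            await asyncio.wait_for(semaphore.acquire(), timeout=max_waiting_time)
    except asyncio.TimeoutError:
        return f"408: request queued longer than {max_waiting_time}s"
    try:
        await asyncio.sleep(0.3)  # stand-in for the actual inference call
        return f"ok: {payload}"
    finally:
        semaphore.release()

async def main() -> None:
    # 12 requests against 8 permits: the 4 that queue past 0.2s get a 408.
    results = await asyncio.gather(*(handle_request(f"req-{i}", 0.2) for i in range(12)))
    print(results)

asyncio.run(main())
```

On timeout, `asyncio.wait_for` cancels the pending `acquire()`, so the early return does not leave a permit held.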
@@ -221,8 +220,7 @@ class OpenAIServingCompletion:
                         valid_results[rid] = data
                         num_choices -= 1
                         break
-
-            return self.request_output_to_completion_response(
+            res = self.request_output_to_completion_response(
                 final_res_batch=valid_results,
                 request=request,
                 request_id=request_id,
@@ -232,6 +230,8 @@ class OpenAIServingCompletion:
                 completion_batched_token_ids=completion_batched_token_ids,
                 text_after_process_list=text_after_process_list,
             )
+            api_server_logger.info(f"Completion response: {res.model_dump_json()}")
+            return res
         except Exception as e:
             api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
             raise
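The two hunks above are one refactor: `return f(...)` becomes `res = f(...)` so the fully built response can be logged before it is handed back. A sketch of that pattern in isolation, assuming a pydantic v2 model (the `model_dump_json()` calls in the diff suggest pydantic v2); the `CompletionResponse` fields here are made up for illustration:

```python
import logging
from pydantic import BaseModel

logging.basicConfig(level=logging.INFO)
api_server_logger = logging.getLogger("api_server")

class CompletionResponse(BaseModel):
    id: str
    text: str

def build_response(text: str) -> CompletionResponse:
    # Capture the response in a local instead of returning the call directly,
    # so the serialized payload can be logged before being returned.
    res = CompletionResponse(id="cmpl-0", text=text)
    api_server_logger.info(f"Completion response: {res.model_dump_json()}")
    return res

print(build_response("hello").model_dump_json())
```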
@@ -323,6 +323,9 @@ class OpenAIServingCompletion:
                             ],
                         )
                         yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
+                        api_server_logger.info(
+                            f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}"
+                        )
                         first_iteration[idx] = False
 
                 self.engine_client.data_processor.process_response_dict(
@@ -376,6 +379,15 @@ class OpenAIServingCompletion:
                     choices[-1].finish_reason = self.calc_finish_reason(
                         request.max_tokens, output_tokens[idx], output, tool_called
                     )
+                    send_idx = output.get("send_idx")
+                    # Only log when send_idx is explicitly 0
+                    if send_idx == 0 and not request.return_token_ids:
+                        chunk_temp = chunk
+                        chunk_temp.choices = choices
+                        api_server_logger.info(
+                            f"Completion Streaming response send_idx 0: {chunk_temp.model_dump_json()}"
+                        )
+                        del chunk_temp
 
                 if len(choices) == max_streaming_response_tokens or res["finished"]:
                     chunk = CompletionStreamResponse(
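Both streaming hunks gate the new log line so that only the first chunk of a stream is recorded: the first uses the `first_iteration` flag, and the second checks `send_idx == 0` explicitly (and skips logging when `return_token_ids` is set). A toy generator showing the gating; everything except the `send_idx` key is an illustrative stand-in:

```python
import logging

logging.basicConfig(level=logging.INFO)
api_server_logger = logging.getLogger("api_server")

def stream_chunks(tokens: list[str], return_token_ids: bool = False):
    # Each chunk carries its position in the stream as send_idx; only an
    # explicit send_idx == 0 is logged, so a long stream produces one log
    # line instead of one per token.
    for send_idx, token in enumerate(tokens):
        chunk = {"send_idx": send_idx, "delta": token}
        if chunk.get("send_idx") == 0 and not return_token_ids:
            api_server_logger.info(f"Streaming response send_idx 0: {chunk}")
        yield chunk

for chunk in stream_chunks(["Hello", ",", " world"]):
    pass  # consume the stream; only the first chunk was logged
```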
@@ -401,6 +413,7 @@ class OpenAIServingCompletion:
                         ),
                     )
                     yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
+                    api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
                 if choices:
                     chunk.choices = choices
                     yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
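For context on the "last send" being logged above: the generator frames every chunk as a Server-Sent Event (a `data: <json>` line followed by a blank line) and finishes with a usage chunk. A simplified sketch of that framing, with stand-in payloads rather than the real `CompletionStreamResponse` fields:

```python
import json

def sse_events(chunks: list[dict]):
    # Frame each chunk as one Server-Sent Event: "data: <json>\n\n".
    for chunk in chunks:
        yield f"data: {json.dumps(chunk)}\n\n"
    # The final event carries usage statistics; this is the "last send"
    # that the new log line records before the stream closes.
    usage_chunk = {"usage": {"completion_tokens": len(chunks)}}
    yield f"data: {json.dumps(usage_chunk)}\n\n"

for event in sse_events([{"delta": "Hi"}, {"delta": "!"}]):
    print(event, end="")
```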