[Sync Code] Update vs branch (#3403)

* Pre ce modified (#3335) (#3360)

* Pre ce modified (#3335)

* update

* update

* fix

* fix

* update

* update

* update

* fix

* update

* update

* update

* add ut fix pr(3367)

* [Bug Fix] Fix V1 video bug (#3387)

* fix stopseq error info (#3342)

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>

* [BugFix] Fix default log level of paddleformers (#3377)

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>

* [Polish Code] Remove useless notes

* feat(log):add_request_and_response_log (#3392)

* Optimize CI execution workflow. (#3371) (#3384)

* fix

* [BugFix] fix control signal release failed (#3374)

* [BugFix]

* [BugFix]

* [BugFix]

* [BugFix]

* fix

* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>

---------

Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com>
Co-authored-by: JYChen <zoooo0820@qq.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
Co-authored-by: xiaolei373 <zley373@gmail.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
This commit is contained in:
Jiang-Jia-Jun
2025-08-14 17:14:45 +08:00
committed by GitHub
parent 81092c0fe3
commit e11331927f
16 changed files with 329 additions and 155 deletions

View File

@@ -101,6 +101,13 @@ class OpenAIServingCompletion:
api_server_logger.info(f"start inference for request {num_choices}")
prompt_batched_token_ids = []
text_after_process_list = []
try:
if self.max_waiting_time < 0:
await self.engine_client.semaphore.acquire()
else:
await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
except Exception:
return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
try:
for idx, prompt in enumerate(request_prompts):
request_id_idx = f"{request_id}-{idx}"
@@ -117,14 +124,6 @@ class OpenAIServingCompletion:
del current_req_dict
try:
if self.max_waiting_time < 0:
await self.engine_client.semaphore.acquire()
else:
await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
except Exception:
return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
if request.stream:
return self.completion_stream_generator(
request=request,
@@ -221,8 +220,7 @@ class OpenAIServingCompletion:
valid_results[rid] = data
num_choices -= 1
break
return self.request_output_to_completion_response(
res = self.request_output_to_completion_response(
final_res_batch=valid_results,
request=request,
request_id=request_id,
@@ -232,6 +230,8 @@ class OpenAIServingCompletion:
completion_batched_token_ids=completion_batched_token_ids,
text_after_process_list=text_after_process_list,
)
api_server_logger.info(f"Completion response: {res.model_dump_json()}")
return res
except Exception as e:
api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
raise
@@ -323,6 +323,9 @@ class OpenAIServingCompletion:
],
)
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
api_server_logger.info(
f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}"
)
first_iteration[idx] = False
self.engine_client.data_processor.process_response_dict(
@@ -376,6 +379,15 @@ class OpenAIServingCompletion:
choices[-1].finish_reason = self.calc_finish_reason(
request.max_tokens, output_tokens[idx], output, tool_called
)
send_idx = output.get("send_idx")
# 只有当 send_idx 明确为 0 时才记录日志
if send_idx == 0 and not request.return_token_ids:
chunk_temp = chunk
chunk_temp.choices = choices
api_server_logger.info(
f"Completion Streaming response send_idx 0: {chunk_temp.model_dump_json()}"
)
del chunk_temp
if len(choices) == max_streaming_response_tokens or res["finished"]:
chunk = CompletionStreamResponse(
@@ -401,6 +413,7 @@ class OpenAIServingCompletion:
),
)
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
if choices:
chunk.choices = choices
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"