diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py
index 37fc0c158..2a4c0e7ab 100644
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -251,6 +251,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
     """
     Create a chat completion for the provided prompt and parameters.
     """
+    api_server_logger.info(f"Chat Received request: {request.model_dump_json()}")
     if app.state.dynamic_load_weight:
         status, msg = app.state.engine_client.is_workers_alive()
         if not status:
@@ -279,6 +280,7 @@ async def create_completion(request: CompletionRequest):
     """
     Create a completion for the provided prompt and parameters.
     """
+    api_server_logger.info(f"Completion Received request: {request.model_dump_json()}")
     if app.state.dynamic_load_weight:
         status, msg = app.state.engine_client.is_workers_alive()
         if not status:
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 6632fde3a..b14f28e62 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -239,6 +239,7 @@ class OpenAIServingChat:
                             prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
                         )
                     yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n"
+                    api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}")
                     first_iteration = False

                 output = res["outputs"]
@@ -273,6 +274,7 @@ class OpenAIServingChat:
                     logprobs=logprobs_res,
                     arrival_time=arrival_time,
                 )
+
                 if res["finished"]:
                     num_choices -= 1
                     work_process_metrics.e2e_request_latency.observe(
@@ -304,6 +306,9 @@ class OpenAIServingChat:
                 if len(choices) == max_streaming_response_tokens or res["finished"]:
                     chunk.choices = choices
                     yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
+                    # log the final chunk of the stream
+                    if res["finished"]:
+                        api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
                     choices = []

             if choices:
@@ -456,13 +461,15 @@ class OpenAIServingChat:
             prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=final_res.get("num_cached_tokens", 0)),
         )
         work_process_metrics.e2e_request_latency.observe(time.time() - final_res["metrics"]["request_start_time"])
-        return ChatCompletionResponse(
+        res = ChatCompletionResponse(
             id=request_id,
             created=created_time,
             model=model_name,
             choices=choices,
             usage=usage,
         )
+        api_server_logger.info(f"Chat response: {res.model_dump_json()}")
+        return res

     def _create_chat_logprobs(
         self,
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 71a38ec0d..a6aadcf06 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -221,8 +221,7 @@ class OpenAIServingCompletion:
                         valid_results[rid] = data
                         num_choices -= 1
                         break
-
-            return self.request_output_to_completion_response(
+            res = self.request_output_to_completion_response(
                 final_res_batch=valid_results,
                 request=request,
                 request_id=request_id,
@@ -232,6 +231,8 @@ class OpenAIServingCompletion:
                 completion_batched_token_ids=completion_batched_token_ids,
                 text_after_process_list=text_after_process_list,
             )
+            api_server_logger.info(f"Completion response: {res.model_dump_json()}")
+            return res
         except Exception as e:
             api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
             raise
@@ -323,6 +324,9 @@ class OpenAIServingCompletion:
                             ],
                         )
                         yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
+                        api_server_logger.info(
+                            f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}"
+                        )
                         first_iteration[idx] = False

                     self.engine_client.data_processor.process_response_dict(
@@ -376,6 +380,15 @@ class OpenAIServingCompletion:
                         choices[-1].finish_reason = self.calc_finish_reason(
                             request.max_tokens, output_tokens[idx], output, tool_called
                         )
+                        send_idx = output.get("send_idx")
+                        # only log when send_idx is explicitly 0
+                        if send_idx == 0 and not request.return_token_ids:
+                            chunk_temp = chunk
+                            chunk_temp.choices = choices
+                            api_server_logger.info(
+                                f"Completion Streaming response send_idx 0: {chunk_temp.model_dump_json()}"
+                            )
+                            del chunk_temp

                     if len(choices) == max_streaming_response_tokens or res["finished"]:
                         chunk = CompletionStreamResponse(
@@ -402,6 +415,7 @@ class OpenAIServingCompletion:
                                 ),
                             )
                             yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
+                        api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
                 if choices:
                     chunk.choices = choices
                     yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
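The hunks above all follow one pattern: serialize the pydantic request with model_dump_json() when the endpoint receives it, then serialize the first streaming chunk (send_idx 0), the final chunk, and the non-streaming response before they are sent back. The sketch below only illustrates that pattern in isolation; it is not FastDeploy code, the DemoRequest/DemoChunk models and the stream_chunks() helper are invented for the example, and it assumes pydantic v2 for model_dump_json().

# Standalone sketch of the logging pattern added by the diff (illustrative names,
# not FastDeploy APIs); assumes pydantic v2 for model_dump_json().
import logging
from typing import Iterator, List

from pydantic import BaseModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("api_server")


class DemoRequest(BaseModel):
    prompt: str
    stream: bool = False


class DemoChunk(BaseModel):
    text: str
    finished: bool = False


def stream_chunks(chunks: List[DemoChunk]) -> Iterator[DemoChunk]:
    # Stand-in for the SSE generator: yields chunks in order.
    yield from chunks


def create_completion(request: DemoRequest) -> List[DemoChunk]:
    # Log the full request body on entry (api_server.py hunks).
    logger.info(f"Completion Received request: {request.model_dump_json()}")

    chunks = [DemoChunk(text="hello "), DemoChunk(text="world", finished=True)]
    for idx, chunk in enumerate(stream_chunks(chunks)):
        # Log only the first chunk (send_idx 0) and the last chunk,
        # mirroring the serving_chat.py / serving_completion.py streaming hunks.
        if idx == 0:
            logger.info(f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}")
        if chunk.finished:
            logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
    return chunks


if __name__ == "__main__":
    create_completion(DemoRequest(prompt="hi", stream=True))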