diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py
index fa287a063..1a1bc3e61 100644
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -197,6 +197,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
     """
     Create a chat completion for the provided prompt and parameters.
     """
+    api_server_logger.info(f"Chat Received request: {request.model_dump_json()}")
     if app.state.dynamic_load_weight:
         status, msg = app.state.engine_client.is_workers_alive()
         if not status:
@@ -218,6 +219,7 @@ async def create_completion(request: CompletionRequest):
     """
    Create a completion for the provided prompt and parameters.
     """
+    api_server_logger.info(f"Completion Received request: {request.model_dump_json()}")
     if app.state.dynamic_load_weight:
         status, msg = app.state.engine_client.is_workers_alive()
         if not status:
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index d28eb3c7f..73459c9d4 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -225,6 +225,7 @@ class OpenAIServingChat:
                             prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
                         )
                     yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n"
+                    api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}")
                     first_iteration = False
 
                 output = res["outputs"]
@@ -258,6 +259,7 @@ class OpenAIServingChat:
                     logprobs=logprobs_res,
                     arrival_time=arrival_time,
                 )
+
                 if res["finished"]:
                     num_choices -= 1
                     work_process_metrics.e2e_request_latency.observe(
@@ -291,6 +293,9 @@
                 if len(choices) == max_streaming_response_tokens or res["finished"]:
                     chunk.choices = choices
                     yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
+                    # Log the final streaming chunk
+                    if res["finished"]:
+                        api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
                     choices = []
 
             if choices:
@@ -449,13 +454,15 @@
             prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=final_res.get("num_cached_tokens", 0)),
         )
         work_process_metrics.e2e_request_latency.observe(time.time() - final_res["metrics"]["request_start_time"])
-        return ChatCompletionResponse(
+        res = ChatCompletionResponse(
             id=request_id,
             created=created_time,
             model=model_name,
             choices=choices,
             usage=usage,
         )
+        api_server_logger.info(f"Chat response: {res.model_dump_json()}")
+        return res
 
     def build_logprobs_response(
         self,
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 871604799..0a422d0fa 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -190,8 +190,7 @@ class OpenAIServingCompletion:
                         valid_results[rid] = data
                         num_choices -= 1
                         break
-
-            return self.request_output_to_completion_response(
+            res = self.request_output_to_completion_response(
                 final_res_batch=valid_results,
                 request=request,
                 request_id=request_id,
@@ -200,6 +199,8 @@
                 prompt_batched_token_ids=prompt_batched_token_ids,
                 completion_batched_token_ids=completion_batched_token_ids,
             )
+            api_server_logger.info(f"Completion response: {res.model_dump_json()}")
+            return res
         except Exception as e:
             api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
             raise
@@ -278,6 +279,9 @@
                         ],
                     )
                     yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
+                    api_server_logger.info(
+                        f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}"
+                    )
                     first_iteration[idx] = False
 
                 self.engine_client.data_processor.process_response_dict(res, stream=True)
@@ -313,6 +317,16 @@
 
                 output_tokens[idx] += 1
 
+                send_idx = output.get("send_idx")
+                # Only log when send_idx is explicitly 0
+                if send_idx == 0 and not request.return_token_ids:
+                    chunk_temp = chunk
+                    chunk_temp.choices = choices
+                    api_server_logger.info(
+                        f"Completion Streaming response send_idx 0: {chunk_temp.model_dump_json()}"
+                    )
+                    del chunk_temp
+
                 if len(choices) == max_streaming_response_tokens or res["finished"]:
                     chunk = CompletionStreamResponse(
                         id=request_id,
@@ -337,6 +351,7 @@
                         ),
                     )
                     yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
+                    api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
         if choices:
             chunk.choices = choices
             yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
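A minimal way to exercise the new logging end to end (a sketch, not part of the diff): the base URL, port, API key, and model name below are placeholders for a locally launched FastDeploy OpenAI-compatible server. Sending one streaming chat request and one non-streaming completion request should emit the "Chat Received request", "Chat Streaming response send_idx 0", "Chat Streaming response last send", "Completion Received request", and "Completion response" lines added above in the api_server log.

from openai import OpenAI

# Placeholders: adjust base_url, api_key, and model to the local deployment.
client = OpenAI(base_url="http://127.0.0.1:8188/v1", api_key="EMPTY")

# Streaming chat request: the first chunk and the final chunk are logged server-side.
stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "hello"}],
    stream=True,
)
for _ in stream:
    pass  # consume the stream; the content itself is not needed for the log check

# Non-streaming completion request: the full response object is logged server-side.
client.completions.create(model="default", prompt="hello", max_tokens=8)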