From 2dfbcf3cc9af4a742b28d99dd19bc80c61d3dc84 Mon Sep 17 00:00:00 2001
From: kxz2002 <115912648+kxz2002@users.noreply.github.com>
Date: Mon, 10 Nov 2025 19:28:44 +0800
Subject: [PATCH] [BugFix] Fix inference_start_time (#4922)

* fix inference_start_time

* fix inference_start_time
---
 fastdeploy/entrypoints/openai/serving_chat.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 0cc760ebf..723e48e3f 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -196,6 +196,7 @@ class OpenAIServingChat:
         num_cached_tokens = 0
         num_image_tokens = [0] * num_choices
         tool_called = [False] * num_choices
+        inference_start_time = [0] * num_choices
         max_streaming_response_tokens = (
             request.max_streaming_response_tokens
             if request.max_streaming_response_tokens is not None
@@ -272,9 +273,9 @@ class OpenAIServingChat:
 
                     if res["metrics"]["first_token_time"] is not None:
                         arrival_time = res["metrics"]["first_token_time"]
-                        inference_start_time = res["metrics"]["inference_start_time"]
+                        inference_start_time[idx] = res["metrics"]["inference_start_time"]
                     else:
-                        arrival_time = res["metrics"]["arrival_time"] - inference_start_time[idx]
+                        arrival_time = res["metrics"]["arrival_time"] - inference_start_time[idx]
                     if first_iteration:
                         num_prompt_tokens = len(prompt_token_ids)
                         num_cached_tokens = res.get("num_cached_tokens", 0)
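
Note on the fix: the old code kept inference_start_time as a single scalar shared by
every choice of a multi-choice (n > 1) streaming request, so one choice's first-token
update could clobber the start time used to compute arrival_time for the others. The
patch makes it a per-choice list indexed by idx, matching the existing per-choice
state such as tool_called. Below is a minimal, self-contained sketch of that pattern;
the responses data, timestamps, and idx values are hypothetical, and only the
indexing mirrors the patch.

    # Hypothetical stream of per-choice metric events (not FastDeploy output).
    num_choices = 2
    inference_start_time = [0] * num_choices  # one slot per choice, as in the patch

    responses = [
        {"idx": 0, "metrics": {"first_token_time": 10.0, "inference_start_time": 9.5}},
        {"idx": 1, "metrics": {"first_token_time": 12.0, "inference_start_time": 11.0}},
        {"idx": 0, "metrics": {"first_token_time": None, "arrival_time": 10.4}},
        {"idx": 1, "metrics": {"first_token_time": None, "arrival_time": 12.7}},
    ]

    for res in responses:
        idx = res["idx"]
        if res["metrics"]["first_token_time"] is not None:
            # First token for this choice: record its own inference start time.
            arrival_time = res["metrics"]["first_token_time"]
            inference_start_time[idx] = res["metrics"]["inference_start_time"]
        else:
            # Later tokens: measure against this choice's own start time. With the
            # old shared scalar, choice 0 here would subtract choice 1's start time
            # (11.0) instead of its own (9.5), yielding a negative arrival_time.
            arrival_time = res["metrics"]["arrival_time"] - inference_start_time[idx]
        print(idx, arrival_time)  # -> 0 10.0, 1 12.0, 0 0.9, 1 1.7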