diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py
index 1f46a0952..38d048ec8 100644
--- a/fastdeploy/output/token_processor.py
+++ b/fastdeploy/output/token_processor.py
@@ -344,6 +344,7 @@ class TokenProcessor:
                     first_token_time=time.time() - task.inference_start_time,
                     time_in_queue=task.schedule_start_time - task.preprocess_end_time,
                     preprocess_cost_time=task.preprocess_end_time - task.preprocess_start_time,
+                    request_start_time=task.arrival_time,
                 )

                 self._record_first_token_metrics(task, current_time)
@@ -476,6 +477,7 @@ class TokenProcessor:
                     first_token_time=time.time() - task.inference_start_time,
                     time_in_queue=task.schedule_start_time - task.preprocess_end_time,
                     preprocess_cost_time=task.preprocess_end_time - task.preprocess_start_time,
+                    request_start_time=task.arrival_time,
                 )

                 self._record_first_token_metrics(task, current_time)
diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py
index 9627ea773..beeb2d99d 100644
--- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py
+++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py
@@ -696,3 +696,25 @@ def test_non_streaming_chat_completion_disable_chat_template(openai_client, caps
     assert hasattr(disabled_response, "choices")
     assert len(disabled_response.choices) > 0
     assert enabled_response.choices[0].message.content == disabled_response.choices[0].message.content
+
+
+def test_non_streaming_min_max_token_equals_one(openai_client, capsys):
+    """
+    Test chat/completion when min_tokens equals max_tokens equals 1.
+    Verify it returns exactly one token.
+    """
+    # Test non-streaming chat
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=1,
+        temperature=0.0,
+        stream=False,
+    )
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "content")
+    # Verify usage shows exactly 1 completion token
+    assert hasattr(response, "usage")
+    assert response.usage.completion_tokens == 1