[Metrics] Update time_to_first_token to include tokenization & queue time, and remove redundant metrics (#4993)

* [update] update time_to_first_token to include queue time, and remove first_token_latency and infer_latency * [doc] update docs * [ci] fix test * [chore] delete redundant code --------- Co-authored-by: Jiaxin Sui <95567040+plusNew001@users.noreply.github.com>
2025-12-24 13:28:13 +08:00 · 2025-11-26 14:42:17 +08:00
parent 287751f19d
commit cead6b26fa
9 changed files with 92 additions and 139 deletions
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -702,6 +702,7 @@ class EngineService:
                    batch=num_prefill_batch,
                )
                for task in tasks:
+                    task.schedule_start_time = time.time()
                    trace_print(LoggingEventName.REQUEST_QUEUE_END, task.request_id, getattr(task, "user", ""))

                if self.cfg.scheduler_config.splitwise_role == "decode":
@@ -814,6 +815,7 @@ class EngineService:
                                break
                            else:
                                raise
+
                # 2. Schedule requests
                tasks, error_tasks = self.resource_manager.schedule()