From cead6b26fa2c6b29377695b97dcb21854911fc08 Mon Sep 17 00:00:00 2001 From: Yonghua Li <39643373+liyonghua0910@users.noreply.github.com> Date: Wed, 26 Nov 2025 14:42:17 +0800 Subject: [PATCH] [Metrics] Update time_to_first_token to include tokenization & queue time, and remove redundant metrics (#4993) * [update] update time_to_first_tokens to include queue time, and remove first_token_latency and infer_latency * [doc] update docs * [ci] fix test * [chore] delete redundant code --------- Co-authored-by: Jiaxin Sui <95567040+plusNew001@users.noreply.github.com> --- docs/online_serving/metrics.md | 73 +++++++++---------- docs/zh/online_serving/metrics.md | 73 +++++++++---------- .../cache_manager/prefix_cache_manager.py | 1 + fastdeploy/engine/common_engine.py | 2 + fastdeploy/engine/request.py | 10 +-- .../engine/sched/resource_manager_v1.py | 2 - fastdeploy/metrics/metrics.py | 14 ---- fastdeploy/output/token_processor.py | 43 ++--------- tests/metrics/test_new_metrics.py | 13 ++-- 9 files changed, 92 insertions(+), 139 deletions(-) diff --git a/docs/online_serving/metrics.md b/docs/online_serving/metrics.md index 49af50700..a538db548 100644 --- a/docs/online_serving/metrics.md +++ b/docs/online_serving/metrics.md @@ -4,43 +4,42 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeploy service status through Metrics. When starting FastDeploy, you can specify the port for the Metrics service by configuring the `metrics-port` parameter. -| Metric Name | Type | Description | Unit | -| --------------------------------------------- | --------- |-------------------------------------|------| -| `fastdeploy:num_requests_running` | Gauge | Number of currently running requests | Count | -| `fastdeploy:num_requests_waiting` | Gauge | Number of currently waiting requests | Count | -| `fastdeploy:time_to_first_token_seconds` | Histogram | Time required to generate the first token | Seconds | -| `fastdeploy:time_per_output_token_seconds` | Histogram | Generation time for interval output tokens | Seconds | -| `fastdeploy:e2e_request_latency_seconds` | Histogram | Distribution of end-to-end request latency | Seconds | -| `fastdeploy:request_inference_time_seconds` | Histogram | Time consumed by requests in the RUNNING phase | Seconds | -| `fastdeploy:request_queue_time_seconds` | Histogram | Time consumed by requests in the WAITING phase | Seconds | -| `fastdeploy:request_prefill_time_seconds` | Histogram | Time consumed by requests in the prefill phase | Seconds | -| `fastdeploy:request_decode_time_seconds` | Histogram | Time consumed by requests in the decode phase | Seconds | -| `fastdeploy:prompt_tokens_total` | Counter | Total number of processed prompt tokens | Count | -| `fastdeploy:generation_tokens_total` | Counter | Total number of generated tokens | Count | -| `fastdeploy:request_prompt_tokens` | Histogram | Number of prompt tokens per request | Count | -| `fastdeploy:request_generation_tokens` | Histogram | Number of tokens generated per request | Count | -| `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU KV-cache usage rate | Percentage | -| `fastdeploy:request_params_max_tokens` | Histogram | Distribution of max_tokens for requests | Count | -| `fastdeploy:request_success_total` | Counter | Number of successfully processed requests | Count | -| `fastdeploy:cache_config_info` | Gauge | Information of the engine's CacheConfig | Count | -| `fastdeploy:available_batch_size` | Gauge | Number of requests that can still be inserted during the Decode phase| Count | -| 
`fastdeploy:hit_req_rate` | Gauge | Request-level prefix cache hit rate | Percentage | -| `fastdeploy:hit_token_rate` | Gauge | Token-level prefix cache hit rate | Percentage | -| `fastdeploy:cpu_hit_token_rate` | Gauge | Token-level CPU prefix cache hit rate | Percentage | -| `fastdeploy:gpu_hit_token_rate` | Gauge | Token-level GPU prefix cache hit rate | Percentage | -| `fastdeploy:prefix_cache_token_num` | Counter | Total number of cached tokens | Count | -| `fastdeploy:prefix_gpu_cache_token_num` | Counter | Total number of cached tokens on GPU | Count | -| `fastdeploy:prefix_cpu_cache_token_num` | Counter | Total number of cached tokens on CPU | Count | -| `fastdeploy:batch_size` | Gauge | Real batch size during inference | Count | -| `fastdeploy:max_batch_size` | Gauge | Maximum batch size determined when service started | Count | -| `fastdeploy:available_gpu_block_num` | Gauge | Number of available gpu blocks in cache, including prefix caching blocks that are not officially released | Count | -| `fastdeploy:free_gpu_block_num` | Gauge | Number of free blocks in cache | Count | -| `fastdeploy:max_gpu_block_num` | Gauge | Number of total blocks determined when service started| Count | -| `fastdeploy:available_gpu_resource` | Gauge | Available blocks percentage, i.e. available_gpu_block_num / max_gpu_block_num | Count | -| `fastdeploy:requests_number` | Counter | Total number of requests received | Count | -| `fastdeploy:send_cache_failed_num` | Counter | Total number of failures of sending cache | Count | -| `fastdeploy:first_token_latency` | Gauge | Latest time to generate first token in seconds | Seconds | -| `fastdeploy:infer_latency` | Gauge | Latest time to generate one token in seconds | Seconds | +| Category | Metric Name | Type | Description | Unit | +| :---: | ----------------------------------------- | --------- | ------------------------------ | ------ | +| Request | `fastdeploy:requests_number` | Counter | Total number of received requests | count | +| Request | `fastdeploy:request_success_total` | Counter | Number of successfully processed requests | count | +| Request | `fastdeploy:num_requests_running` | Gauge | Number of requests currently running | count | +| Request | `fastdeploy:num_requests_waiting` | Gauge | Number of requests currently waiting | count | +| Latency | `fastdeploy:time_to_first_token_seconds` | Histogram | Time to generate the first token (TTFT) | s | +| Latency | `fastdeploy:time_per_output_token_seconds` | Histogram | Time interval between generated tokens (TPOT) | s | +| Latency | `fastdeploy:e2e_request_latency_seconds` | Histogram | End-to-end request latency distribution | s | +| Latency | `fastdeploy:request_inference_time_seconds` | Histogram | Time spent in the RUNNING phase | s | +| Latency | `fastdeploy:request_queue_time_seconds` | Histogram | Time spent in the WAITING phase | s | +| Latency | `fastdeploy:request_prefill_time_seconds` | Histogram | Time spent in the Prefill phase | s | +| Latency | `fastdeploy:request_decode_time_seconds` | Histogram | Time spent in the Decode phase | s | +| Token | `fastdeploy:prompt_tokens_total` | Counter | Total number of processed prompt tokens | count | +| Token | `fastdeploy:generation_tokens_total` | Counter | Total number of generated tokens | count | +| Token | `fastdeploy:request_prompt_tokens` | Histogram | Prompt token count per request | count | +| Token | `fastdeploy:request_generation_tokens` | Histogram | Generation token count per request | count | +| Token | 
`fastdeploy:request_params_max_tokens` | Histogram | Distribution of `max_tokens` per request | count | +| Batch | `fastdeploy:available_batch_size` | Gauge | Number of additional requests that can be inserted during Decode | count | +| Batch | `fastdeploy:batch_size` | Gauge | Actual batch size during inference | count | +| Batch | `fastdeploy:max_batch_size` | Gauge | Maximum batch size configured at service startup | count | +| KV Cache | `fastdeploy:cache_config_info` | Gauge | Cache configuration info of the inference engine | count | +| KV Cache | `fastdeploy:hit_req_rate` | Gauge | Prefix cache hit rate at the request level | % | +| KV Cache | `fastdeploy:hit_token_rate` | Gauge | Prefix cache hit rate at the token level | % | +| KV Cache | `fastdeploy:cpu_hit_token_rate` | Gauge | CPU-side token-level prefix cache hit rate | % | +| KV Cache | `fastdeploy:gpu_hit_token_rate` | Gauge | GPU-side token-level prefix cache hit rate | % | +| KV Cache | `fastdeploy:prefix_cache_token_num` | Counter | Total number of tokens in prefix cache | count | +| KV Cache | `fastdeploy:prefix_gpu_cache_token_num` | Counter | Total number of prefix cache tokens on GPU | count | +| KV Cache | `fastdeploy:prefix_cpu_cache_token_num` | Counter | Total number of prefix cache tokens on CPU | count | +| KV Cache | `fastdeploy:available_gpu_block_num` | Gauge | Available GPU blocks in cache (including unreleased prefix blocks) | count | +| KV Cache | `fastdeploy:free_gpu_block_num` | Gauge | Number of free GPU blocks in cache | count | +| KV Cache | `fastdeploy:max_gpu_block_num` | Gauge | Total number of GPU blocks initialized at startup | count | +| KV Cache | `fastdeploy:available_gpu_resource` | Gauge | Ratio of available GPU blocks to total GPU blocks | % | +| KV Cache | `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU KV cache utilization | % | +| KV Cache | `fastdeploy:send_cache_failed_num` | Counter | Total number of cache send failures | count | + ## Accessing Metrics - Access URL: `http://localhost:8000/metrics` diff --git a/docs/zh/online_serving/metrics.md b/docs/zh/online_serving/metrics.md index 82244183b..75576995e 100644 --- a/docs/zh/online_serving/metrics.md +++ b/docs/zh/online_serving/metrics.md @@ -4,43 +4,42 @@ 在 FastDeploy 启动后,支持通过 Metrics 持续监控的 FastDeploy 的服务状态。启动 FastDeploy 时,可以通过配置 `metrics-port` 参数指定 Metircs 服务的端口。 -| 指标名称 | 类型 | 说明 | 单位 | -| ----------------------------------------- | --------- |------------------------------|------| -| `fastdeploy:num_requests_running` | Gauge | 当前正在运行的请求数量 | 个 | -| `fastdeploy:num_requests_waiting` | Gauge | 当前等待中的请求数量 | 个 | -| `fastdeploy:time_to_first_token_seconds` | Histogram | 首 token 生成所需时间 | 秒 | -| `fastdeploy:time_per_output_token_seconds`| Histogram | 间隔输出 token 的生成时间 | 秒 | -| `fastdeploy:e2e_request_latency_seconds` | Histogram | 请求的端到端延迟分布 | 秒 | -| `fastdeploy:request_inference_time_seconds`| Histogram | 请求在 RUNNING 阶段耗时 | 秒 | -| `fastdeploy:request_queue_time_seconds` | Histogram | 请求在 WAITING 阶段耗时 | 秒 | -| `fastdeploy:request_prefill_time_seconds` | Histogram | 请求的 prefill 阶段耗时 | 秒 | -| `fastdeploy:request_decode_time_seconds` | Histogram | 请求的 decode 阶段耗时 | 秒 | -| `fastdeploy:prompt_tokens_total` | Counter | 已处理的 prompt token 总数 | 个 | -| `fastdeploy:generation_tokens_total` | Counter | 已生成的 token 总数 | 个 | -| `fastdeploy:request_prompt_tokens` | Histogram | 每个请求的 prompt token 数量 | 个 | -| `fastdeploy:request_generation_tokens` | Histogram | 每个请求生成的 token 数量 | 个 | -| `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU KV-cache 使用率 | 百分比 | 
-| `fastdeploy:request_params_max_tokens` | Histogram | 请求的 max_tokens 分布 | 个 | -| `fastdeploy:request_success_total` | Counter | 成功处理的请求个数 | 个 | -| `fastdeploy:cache_config_info` | Gauge | 推理引擎的缓存配置信息 | 个 | -| `fastdeploy:available_batch_size` | Gauge | Decode阶段还可以插入的请求数量 | 个 | -| `fastdeploy:hit_req_rate` | Gauge | 请求级别前缀缓存命中率 | 百分比 | -| `fastdeploy:hit_token_rate` | Gauge | token级别前缀缓存命中率 | 百分比 | -| `fastdeploy:cpu_hit_token_rate` | Gauge | token级别CPU前缀缓存命中率 | 百分比 | -| `fastdeploy:gpu_hit_token_rate` | Gauge | token级别GPU前缀缓存命中率 | 百分比 | -| `fastdeploy:prefix_cache_token_num` | Counter | 前缀缓存token总数 | 个 | -| `fastdeploy:prefix_gpu_cache_token_num` | Counter | 位于GPU上的前缀缓存token总数 | 个 | -| `fastdeploy:prefix_cpu_cache_token_num` | Counter | 位于GPU上的前缀缓存token总数 | 个 | -| `fastdeploy:batch_size` | Gauge | 推理时的真实批处理大小 | 个 | -| `fastdeploy:max_batch_size` | Gauge | 服务启动时确定的最大批处理大小 | 个 | -| `fastdeploy:available_gpu_block_num` | Gauge | 缓存中可用的GPU块数量(包含尚未正式释放的前缀缓存块)| 个 | -| `fastdeploy:free_gpu_block_num` | Gauge | 缓存中的可用块数 | 个 | -| `fastdeploy:max_gpu_block_num` | Gauge | 服务启动时确定的总块数 | 个 | -| `fastdeploy:available_gpu_resource` | Gauge | 可用块占比,即可用GPU块数量 / 最大GPU块数量| 个 | -| `fastdeploy:requests_number` | Counter | 已接收的请求总数 | 个 | -| `fastdeploy:send_cache_failed_num` | Counter | 发送缓存失败的总次数 | 个 | -| `fastdeploy:first_token_latency` | Gauge | 最近一次生成首token耗时 | 秒 | -| `fastdeploy:infer_latency` | Gauge | 最近一次生成单个token的耗时 | 秒 | +| 指标分类 | 指标名称 | 数据类型 | 说明 | 单位 | +| :---: | ----------------------------------------- | --------- |------------------------------|------| +| 请求| `fastdeploy:requests_number` | Counter | 已接收的请求总数 | 个 | +| 请求 | `fastdeploy:request_success_total` | Counter | 成功处理的请求个数 | 个 | +| 请求 | `fastdeploy:num_requests_running` | Gauge | 当前正在运行的请求数量 | 个 | +| 请求 | `fastdeploy:num_requests_waiting` | Gauge | 当前等待中的请求数量 | 个 | +| 时延 | `fastdeploy:time_to_first_token_seconds` | Histogram | 首 token 生成所需时间 | 秒 | +| 时延 | `fastdeploy:time_per_output_token_seconds`| Histogram | 间隔输出 token 的生成时间 | 秒 | +| 时延 | `fastdeploy:e2e_request_latency_seconds` | Histogram | 请求的端到端延迟分布 | 秒 | +| 时延 | `fastdeploy:request_inference_time_seconds`| Histogram | 请求在 RUNNING 阶段耗时 | 秒 | +| 时延 | `fastdeploy:request_queue_time_seconds` | Histogram | 请求在 WAITING 阶段耗时 | 秒 | +| 时延 | `fastdeploy:request_prefill_time_seconds` | Histogram | 请求的 Prefill 阶段耗时 | 秒 | +| 时延 | `fastdeploy:request_decode_time_seconds` | Histogram | 请求的 Decode 阶段耗时 | 秒 | +| Token | `fastdeploy:prompt_tokens_total` | Counter | 已处理的 prompt token 总数 | 个 | +| Token | `fastdeploy:generation_tokens_total` | Counter | 已生成的 token 总数 | 个 | +| Token | `fastdeploy:request_prompt_tokens` | Histogram | 每个请求的 prompt token 数量 | 个 | +| Token | `fastdeploy:request_generation_tokens` | Histogram | 每个请求的 generation token 数量 | 个 | +| Token | `fastdeploy:request_params_max_tokens` | Histogram | 请求的 max_tokens 分布 | 个 | +| 批处理 | `fastdeploy:available_batch_size` | Gauge | Decode 阶段还可以插入的请求数量 | 个 | +| 批处理 | `fastdeploy:batch_size` | Gauge | 推理时的真实批处理大小 | 个 | +| 批处理 | `fastdeploy:max_batch_size` | Gauge | 服务启动时确定的最大批处理大小 | 个 | +| KV缓存 | `fastdeploy:cache_config_info` | Gauge | 推理引擎的缓存配置信息 | 个 | +| KV缓存 | `fastdeploy:hit_req_rate` | Gauge | 请求级别前缀缓存命中率 | 百分比 | +| KV缓存 | `fastdeploy:hit_token_rate` | Gauge | token 级别前缀缓存命中率 | 百分比 | +| KV缓存 | `fastdeploy:cpu_hit_token_rate` | Gauge | token 级别 CPU 前缀缓存命中率 | 百分比 | +| KV缓存 | `fastdeploy:gpu_hit_token_rate` | Gauge | token 级别 GPU 前缀缓存命中率 | 百分比 | +| KV缓存 | `fastdeploy:prefix_cache_token_num` | Counter | 前缀缓存token总数 | 个 | +| KV缓存 | 
`fastdeploy:prefix_gpu_cache_token_num` | Counter | 位于 GPU 上的前缀缓存 token 总数 | 个 |
+| KV缓存 | `fastdeploy:prefix_cpu_cache_token_num` | Counter | 位于 CPU 上的前缀缓存 token 总数 | 个 |
+| KV缓存 | `fastdeploy:available_gpu_block_num` | Gauge | 缓存中可用的 GPU 块数量(包含尚未正式释放的前缀缓存块) | 个 |
+| KV缓存 | `fastdeploy:free_gpu_block_num` | Gauge | 缓存中的可用块数 | 个 |
+| KV缓存 | `fastdeploy:max_gpu_block_num` | Gauge | 服务启动时确定的总块数 | 个 |
+| KV缓存 | `fastdeploy:available_gpu_resource` | Gauge | 可用块占比,即可用 GPU 块数量 / 最大 GPU 块数量 | 百分比 |
+| KV缓存 | `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU 上的 KV 缓存使用率 | 百分比 |
+| KV缓存 | `fastdeploy:send_cache_failed_num` | Counter | 发送缓存失败的总次数 | 个 |
+
 ## 指标访问
 - 访问地址:`http://localhost:8000/metrics`
diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py
index 14a86de79..a91e1061c 100644
--- a/fastdeploy/cache_manager/prefix_cache_manager.py
+++ b/fastdeploy/cache_manager/prefix_cache_manager.py
@@ -1820,6 +1820,7 @@ class PrefixCacheManager:
         # reset metrics
         self.metrics.reset_metrics()
         main_process_metrics.free_gpu_block_num.set(len(self.gpu_free_block_list))
+        main_process_metrics.available_gpu_block_num.set(len(self.gpu_free_block_list))
         main_process_metrics.available_gpu_resource.set(self.available_gpu_resource)
 
     def clear_prefix_cache(self):
diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index 136f19508..a8ca3d947 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -702,6 +702,7 @@ class EngineService:
                             batch=num_prefill_batch,
                         )
                     for task in tasks:
+                        task.schedule_start_time = time.time()
                         trace_print(LoggingEventName.REQUEST_QUEUE_END, task.request_id, getattr(task, "user", ""))
 
                 if self.cfg.scheduler_config.splitwise_role == "decode":
@@ -814,6 +815,7 @@ class EngineService:
                         break
                     else:
                         raise
+
            # 2. 
Schedule requests tasks, error_tasks = self.resource_manager.schedule() diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index acd0f7004..ecafaec7f 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -75,8 +75,9 @@ class Request: pooling_params: Optional[PoolingParams] = None, preprocess_start_time: Optional[float] = None, preprocess_end_time: Optional[float] = None, - inference_start_time: float = 0, - llm_engine_recv_req_timestamp: float = 0, + schedule_start_time: Optional[float] = None, + inference_start_time: Optional[float] = None, + llm_engine_recv_req_timestamp: Optional[float] = None, multimodal_inputs: Optional[dict] = None, multimodal_data: Optional[dict] = None, disable_chat_template: bool = False, @@ -122,10 +123,9 @@ class Request: self.arrival_time = arrival_time self.preprocess_start_time = preprocess_start_time self.preprocess_end_time = preprocess_end_time + self.schedule_start_time = schedule_start_time self.inference_start_time = inference_start_time - self.llm_engine_recv_req_timestamp = ( - llm_engine_recv_req_timestamp if llm_engine_recv_req_timestamp else time.time() - ) + self.llm_engine_recv_req_timestamp = llm_engine_recv_req_timestamp or time.time() self.disable_chat_template = disable_chat_template self.disaggregate_info = disaggregate_info diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index fbe30d24b..9f4873112 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -691,7 +691,6 @@ class ResourceManagerV1(ResourceManager): self.running.append(request) scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) request.inference_start_time = time.time() - request.schedule_start_time = time.time() token_budget -= num_new_tokens request.num_computed_tokens += num_new_tokens if self.config.cache_config.enable_prefix_caching: @@ -926,7 +925,6 @@ class ResourceManagerV1(ResourceManager): with self.lock: for request in requests: request.inference_start_time = time.time() - request.schedule_start_time = time.time() self.running.append(request) def preallocate_resource_in_p(self, request: Request): diff --git a/fastdeploy/metrics/metrics.py b/fastdeploy/metrics/metrics.py index a499c3237..679df29de 100644 --- a/fastdeploy/metrics/metrics.py +++ b/fastdeploy/metrics/metrics.py @@ -164,8 +164,6 @@ class MetricsManager: available_gpu_resource: "Gauge" requests_number: "Counter" send_cache_failed_num: "Counter" - first_token_latency: "Gauge" - infer_latency: "Gauge" cache_config_info: "Gauge" available_batch_size: "Gauge" hit_req_rate: "Gauge" @@ -349,18 +347,6 @@ class MetricsManager: "description": "Total number of failures of sending cache", "kwargs": {}, }, - "first_token_latency": { - "type": Gauge, - "name": "fastdeploy:first_token_latency", - "description": "Latest time to first token in seconds", - "kwargs": {}, - }, - "infer_latency": { - "type": Gauge, - "name": "fastdeploy:infer_latency", - "description": "Latest time to generate one token in seconds", - "kwargs": {}, - }, "available_batch_size": { "type": Gauge, "name": "fastdeploy:available_batch_size", diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 7eed4b662..58a64401e 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -15,7 +15,6 @@ """ import copy -import os import threading import time import traceback @@ -37,7 +36,7 @@ from 
fastdeploy.engine.request import ( RequestMetrics, RequestOutput, ) -from fastdeploy.inter_communicator import IPCSignal, ZmqIpcServer +from fastdeploy.inter_communicator import ZmqIpcServer from fastdeploy.metrics.metrics import main_process_metrics from fastdeploy.platforms import current_platform from fastdeploy.trace.constants import LoggingEventName @@ -114,14 +113,6 @@ class TokenProcessor: self.num_accept_requests_per_head = [ 0, ] * MAX_DRAFT_TOKENS - prefill_time_data = np.zeros([100], dtype=np.float32) - self.prefill_time_signal = IPCSignal( - name="prefill_time_signal", - array=prefill_time_data, - dtype=np.float32, - suffix=os.getpid(), - create=True, - ) self.executor = ThreadPoolExecutor(max_workers=1) self.prefill_result_status = dict() self._finalizer = weakref.finalize(self, self._cleanup_resources) @@ -129,9 +120,6 @@ class TokenProcessor: def _cleanup_resources(self): """Cleaning up shared memory resources""" - if hasattr(self, "prefill_time_signal"): - self.prefill_time_signal.clear() - if hasattr(self, "executor"): self.executor.shutdown(wait=False) @@ -255,7 +243,7 @@ class TokenProcessor: arrival_time=task.arrival_time, inference_start_time=task.inference_start_time, first_token_time=time.time() - task.inference_start_time, - time_in_queue=task.schedule_start_time - task.preprocess_end_time, + time_in_queue=task.inference_start_time - task.preprocess_end_time, preprocess_cost_time=task.preprocess_end_time - task.preprocess_start_time, request_start_time=task.arrival_time, ) @@ -416,28 +404,10 @@ class TokenProcessor: if self.output_tokens[0, 0] == -2: continue llm_logger.debug(f"rank_id {rank_id} self.output_tokens[0, 0] {self.output_tokens[0, 0]}") - self._process_prefill_metrics() self._process_batch_output() except Exception as e: llm_logger.info(f"while get input_data error: {e} {traceback.format_exc()!s}") - def _process_prefill_metrics(self): - """Asynchronous processing prefill time indicators""" - - def process_metrics(): - try: - current_index = 0 - while current_index < len(self.prefill_time_signal.value): - prefill_time = self.prefill_time_signal.value[current_index] - if prefill_time > 0: - main_process_metrics.request_prefill_time.observe(prefill_time) - self.prefill_time_signal.value[current_index] = 0 - current_index += 1 - except Exception as e: - llm_logger.error(f"Error processing prefill metrics: {e}, {str(traceback.format_exc())}") - - self.executor.submit(process_metrics) - def postprocess(self, batch_result: List[RequestOutput], mtype=3): """ single post-processing function @@ -654,7 +624,7 @@ class TokenProcessor: inference_start_time=task.inference_start_time, model_execute_time=time.time() - task.inference_start_time, first_token_time=time.time() - task.inference_start_time, - time_in_queue=task.schedule_start_time - task.preprocess_end_time, + time_in_queue=task.inference_start_time - task.preprocess_end_time, preprocess_cost_time=task.preprocess_end_time - task.preprocess_start_time, request_start_time=task.arrival_time, llm_engine_recv_req_timestamp=task.llm_engine_recv_req_timestamp, @@ -784,9 +754,9 @@ class TokenProcessor: task.first_token_time = current_time trace_print(LoggingEventName.FIRST_TOKEN_GENERATED, task.request_id, getattr(task, "user", "")) trace_print(LoggingEventName.DECODE_START, task.request_id, getattr(task, "user", "")) - main_process_metrics.first_token_latency.set(current_time - task.inference_start_time) - main_process_metrics.time_to_first_token.observe(current_time - task.inference_start_time) - 
main_process_metrics.request_queue_time.observe(task.schedule_start_time - task.preprocess_end_time) + main_process_metrics.time_to_first_token.observe(current_time - task.arrival_time) + main_process_metrics.request_queue_time.observe(task.inference_start_time - task.preprocess_end_time) + main_process_metrics.request_prefill_time.observe(current_time - task.inference_start_time) def _record_completion_metrics(self, task, current_time): """Record metrics when request completes""" @@ -797,7 +767,6 @@ class TokenProcessor: trace_print(LoggingEventName.POSTPROCESSING_START, task.request_id, getattr(task, "user", "")) main_process_metrics.num_requests_running.dec(1) main_process_metrics.request_success_total.inc() - main_process_metrics.infer_latency.set(current_time - task.inference_start_time) main_process_metrics.request_inference_time.observe(current_time - task.inference_start_time) main_process_metrics.request_generation_tokens.observe(self.tokens_counter[task.request_id]) diff --git a/tests/metrics/test_new_metrics.py b/tests/metrics/test_new_metrics.py index c1bbcd67b..030acaf42 100644 --- a/tests/metrics/test_new_metrics.py +++ b/tests/metrics/test_new_metrics.py @@ -59,13 +59,12 @@ class TestCoverageFix(unittest.TestCase): self.mock_split_connector = MagicMock() self.mock_resource_manager = MagicMock() - with patch("fastdeploy.output.token_processor.IPCSignal"): - self.processor = TokenProcessor( - cfg=self.mock_cfg, - cached_generated_tokens=self.mock_cached_generated_tokens, - engine_worker_queue=self.mock_engine_worker_queue, - split_connector=self.mock_split_connector, - ) + self.processor = TokenProcessor( + cfg=self.mock_cfg, + cached_generated_tokens=self.mock_cached_generated_tokens, + engine_worker_queue=self.mock_engine_worker_queue, + split_connector=self.mock_split_connector, + ) self.processor.resource_manager = self.mock_resource_manager # 使用 patch 来模拟 token_processor 模块中引用的 main_process_metrics
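
A minimal sketch of how the updated `fastdeploy:time_to_first_token_seconds` histogram can be read back from the `/metrics` endpoint documented above. The metric name and the `http://localhost:8000/metrics` URL come from the docs in this patch; the use of the third-party `requests` and `prometheus_client` packages is an assumption, not part of the change itself.

# Sketch: compute the mean TTFT from the Prometheus text exposition.
# Assumes a FastDeploy server is exporting metrics at localhost:8000 and that
# `requests` and `prometheus_client` are installed.
import requests
from prometheus_client.parser import text_string_to_metric_families

raw = requests.get("http://localhost:8000/metrics", timeout=5).text

for family in text_string_to_metric_families(raw):
    if family.name == "fastdeploy:time_to_first_token_seconds":
        # For a histogram, _sum / _count gives the mean observation in seconds.
        total = sum(s.value for s in family.samples if s.name.endswith("_sum"))
        count = sum(s.value for s in family.samples if s.name.endswith("_count"))
        if count:
            print(f"mean TTFT: {total / count:.3f}s over {int(count)} requests")

Because this patch observes TTFT from request arrival rather than from inference start, the mean reported here includes tokenization and queueing time in addition to prefill.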