[Metrics] move prompt_tokens_total to main process metrics (#5118)

* [update] move prompt_tokens_total to main process metrics

* [update] move the metric update again, from the ZMQ thread to the scheduler
This commit is contained in:
Yonghua Li
2025-11-19 14:12:15 +08:00
committed by GitHub
parent 3672afb487
commit c5510e9b43
4 changed files with 17 additions and 7 deletions

View File

@@ -562,6 +562,8 @@ class EngineService:
else:
continue
for task in tasks:
main_process_metrics.inc_value("prompt_tokens_total", task.prompt_token_ids_len)
main_process_metrics.dec_value("num_requests_waiting", len(tasks))
main_process_metrics.inc_value("num_requests_running", len(tasks))
except Exception as e:
@@ -597,6 +599,7 @@ class EngineService:
)
for task in tasks:
main_process_metrics.inc_value("prompt_tokens_total", task.prompt_token_ids_len)
task.schedule_start_time = time.time()
self.llm_logger.debug(f"get tasks from scheduler: {tasks}")

View File

@@ -159,7 +159,7 @@ class EngineClient:
del task["messages"]
api_server_logger.info(f"task['max_tokens']:{task['max_tokens']}")
work_process_metrics.obs_value("request_params_max_tokens", task["max_tokens"])
work_process_metrics.inc_value("prompt_tokens_total", input_ids_len)
# work_process_metrics.inc_value("prompt_tokens_total", input_ids_len)
work_process_metrics.obs_value("request_prompt_tokens", input_ids_len)
except Exception as e:
api_server_logger.error(f"add_requests error: {e}, {str(traceback.format_exc())}")

View File

@@ -148,6 +148,7 @@ class MetricsManager(MetricsManagerInterface):
request_queue_time: "Histogram"
gpu_cache_usage_perc: "Gauge"
generation_tokens_total: "Counter"
prompt_tokens_total: "Counter"
request_prefill_time: "Histogram"
request_decode_time: "Histogram"
request_generation_tokens: "Histogram"
@@ -252,6 +253,12 @@ class MetricsManager(MetricsManagerInterface):
"description": "GPU KV-cache usage. 1 means 100 percent usage",
"kwargs": {},
},
"prompt_tokens_total": {
"type": Counter,
"name": "fastdeploy:prompt_tokens_total",
"description": "Total number of prompt tokens processed",
"kwargs": {},
},
"generation_tokens_total": {
"type": Counter,
"name": "fastdeploy:generation_tokens_total",
@@ -504,7 +511,7 @@ class MetricsManager(MetricsManagerInterface):
if workers == 1:
registry.register(work_process_metrics.e2e_request_latency)
registry.register(work_process_metrics.request_params_max_tokens)
registry.register(work_process_metrics.prompt_tokens_total)
# registry.register(work_process_metrics.prompt_tokens_total)
registry.register(work_process_metrics.request_prompt_tokens)
if hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
self.register_speculative_metrics(registry)

View File

@@ -72,11 +72,11 @@ class WorkMetricsManager(MetricsManagerInterface):
buckets=build_1_2_5_buckets(33792),
labelnames=LABEL_NAMES,
)
self.prompt_tokens_total = Counter(
name="fastdeploy:prompt_tokens_total",
documentation="Total number of prompt tokens processed",
labelnames=LABEL_NAMES,
)
# self.prompt_tokens_total = Counter(
# name="fastdeploy:prompt_tokens_total",
# documentation="Total number of prompt tokens processed",
# labelnames=LABEL_NAMES,
# )
self.request_prompt_tokens = Histogram(
name="fastdeploy:request_prompt_tokens",
documentation="Number of prefill tokens processed.",