Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
[Metrics] move prompt_tokens_total to main process metrics (#5118)
* [update] move prompt_tokens_total to main process metrics
* [update] move again from zmq thread to scheduler
@@ -562,6 +562,8 @@ class EngineService:
                 else:
                     continue

+                for task in tasks:
+                    main_process_metrics.inc_value("prompt_tokens_total", task.prompt_token_ids_len)
                 main_process_metrics.dec_value("num_requests_waiting", len(tasks))
                 main_process_metrics.inc_value("num_requests_running", len(tasks))
         except Exception as e:

@@ -597,6 +599,7 @@ class EngineService:
                 )

                 for task in tasks:
+                    main_process_metrics.inc_value("prompt_tokens_total", task.prompt_token_ids_len)
                     task.schedule_start_time = time.time()

                 self.llm_logger.debug(f"get tasks from scheduler: {tasks}")

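The pattern added by the two hunks above is a per-task increment of a main-process counter at the point where the scheduler hands tasks to the engine. Below is a minimal sketch of that pattern, assuming plain prometheus_client; the Task dataclass and on_tasks_scheduled helper are illustrative stand-ins, not FastDeploy's actual task class or metrics manager.

from dataclasses import dataclass

from prometheus_client import Counter


@dataclass
class Task:
    # Illustrative stand-in; FastDeploy tasks expose prompt_token_ids_len as used above.
    prompt_token_ids_len: int


# The counter lives in the main (scheduler) process, matching the commit's intent.
prompt_tokens_total = Counter(
    name="fastdeploy:prompt_tokens_total",
    documentation="Total number of prompt tokens processed",
)


def on_tasks_scheduled(tasks: list[Task]) -> None:
    # One increment per task, sized by that task's prompt length,
    # mirroring main_process_metrics.inc_value(...) in the hunks above.
    for task in tasks:
        prompt_tokens_total.inc(task.prompt_token_ids_len)

Counting here rather than in each API worker process yields a single series regardless of how many workers serve requests, which appears to be the point of moving the increment out of the zmq thread and into the scheduler.
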
@@ -159,7 +159,7 @@ class EngineClient:
                 del task["messages"]
             api_server_logger.info(f"task['max_tokens']:{task['max_tokens']}")
             work_process_metrics.obs_value("request_params_max_tokens", task["max_tokens"])
-            work_process_metrics.inc_value("prompt_tokens_total", input_ids_len)
+            # work_process_metrics.inc_value("prompt_tokens_total", input_ids_len)
             work_process_metrics.obs_value("request_prompt_tokens", input_ids_len)
         except Exception as e:
             api_server_logger.error(f"add_requests error: {e}, {str(traceback.format_exc())}")

@@ -148,6 +148,7 @@ class MetricsManager(MetricsManagerInterface):
    request_queue_time: "Histogram"
    gpu_cache_usage_perc: "Gauge"
    generation_tokens_total: "Counter"
+    prompt_tokens_total: "Counter"
    request_prefill_time: "Histogram"
    request_decode_time: "Histogram"
    request_generation_tokens: "Histogram"

@@ -252,6 +253,12 @@ class MetricsManager(MetricsManagerInterface):
            "description": "GPU KV-cache usage. 1 means 100 percent usage",
            "kwargs": {},
        },
+        "prompt_tokens_total": {
+            "type": Counter,
+            "name": "fastdeploy:prompt_tokens_total",
+            "description": "Total number of prompt tokens processed",
+            "kwargs": {},
+        },
        "generation_tokens_total": {
            "type": Counter,
            "name": "fastdeploy:generation_tokens_total",

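The entry added above follows MetricsManager's table-driven style: each metric is described by a type, a fully qualified name, a help string, and extra kwargs. The sketch below shows one way such a spec table can be materialized into attributes; the construction loop, the MainProcessMetrics class, and this inc_value are illustrative assumptions, not FastDeploy's actual implementation.

from prometheus_client import Counter

# Spec table in the same shape as the entry added above.
METRICS_SPEC = {
    "prompt_tokens_total": {
        "type": Counter,
        "name": "fastdeploy:prompt_tokens_total",
        "description": "Total number of prompt tokens processed",
        "kwargs": {},
    },
}


class MainProcessMetrics:
    def __init__(self) -> None:
        for attr, spec in METRICS_SPEC.items():
            metric = spec["type"](
                name=spec["name"],
                documentation=spec["description"],
                **spec["kwargs"],
            )
            setattr(self, attr, metric)  # exposes e.g. self.prompt_tokens_total

    def inc_value(self, attr: str, value: float) -> None:
        # Counter-style increment, as used by the EngineService hunks earlier.
        getattr(self, attr).inc(value)


main_process_metrics = MainProcessMetrics()
main_process_metrics.inc_value("prompt_tokens_total", 128)
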
@@ -504,7 +511,7 @@ class MetricsManager(MetricsManagerInterface):
        if workers == 1:
            registry.register(work_process_metrics.e2e_request_latency)
            registry.register(work_process_metrics.request_params_max_tokens)
-            registry.register(work_process_metrics.prompt_tokens_total)
+            # registry.register(work_process_metrics.prompt_tokens_total)
            registry.register(work_process_metrics.request_prompt_tokens)
        if hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
            self.register_speculative_metrics(registry)

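A plausible reason the worker-side registration above (and the Counter construction in the next hunk) is commented out rather than kept alongside the new main-process metric: prometheus_client rejects two collectors that expose the same time series in one CollectorRegistry. A small self-contained sketch of that failure mode follows; whether this is the actual motivation for the change is an assumption.

from prometheus_client import CollectorRegistry, Counter

registry = CollectorRegistry()

# First collector for the name registers successfully.
Counter(
    "fastdeploy:prompt_tokens_total",
    "Total number of prompt tokens processed",
    registry=registry,
)

try:
    # A second collector with the same name is rejected by the registry.
    Counter(
        "fastdeploy:prompt_tokens_total",
        "Total number of prompt tokens processed",
        registry=registry,
    )
except ValueError as exc:
    print(f"duplicate registration rejected: {exc}")
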
@@ -72,11 +72,11 @@ class WorkMetricsManager(MetricsManagerInterface):
            buckets=build_1_2_5_buckets(33792),
            labelnames=LABEL_NAMES,
        )
-        self.prompt_tokens_total = Counter(
-            name="fastdeploy:prompt_tokens_total",
-            documentation="Total number of prompt tokens processed",
-            labelnames=LABEL_NAMES,
-        )
+        # self.prompt_tokens_total = Counter(
+        #     name="fastdeploy:prompt_tokens_total",
+        #     documentation="Total number of prompt tokens processed",
+        #     labelnames=LABEL_NAMES,
+        # )
        self.request_prompt_tokens = Histogram(
            name="fastdeploy:request_prompt_tokens",
            documentation="Number of prefill tokens processed.",