diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index b30be69bf..7464e2d95 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -562,6 +562,8 @@ class EngineService:
                 else:
                     continue
 
+                for task in tasks:
+                    main_process_metrics.inc_value("prompt_tokens_total", task.prompt_token_ids_len)
                 main_process_metrics.dec_value("num_requests_waiting", len(tasks))
                 main_process_metrics.inc_value("num_requests_running", len(tasks))
         except Exception as e:
@@ -597,6 +599,7 @@ class EngineService:
                 )
 
             for task in tasks:
+                main_process_metrics.inc_value("prompt_tokens_total", task.prompt_token_ids_len)
                 task.schedule_start_time = time.time()
 
             self.llm_logger.debug(f"get tasks from scheduler: {tasks}")
diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index 9987fce89..ffff367cb 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -159,7 +159,7 @@ class EngineClient:
                 del task["messages"]
             api_server_logger.info(f"task['max_tokens']:{task['max_tokens']}")
             work_process_metrics.obs_value("request_params_max_tokens", task["max_tokens"])
-            work_process_metrics.inc_value("prompt_tokens_total", input_ids_len)
+            # work_process_metrics.inc_value("prompt_tokens_total", input_ids_len)
             work_process_metrics.obs_value("request_prompt_tokens", input_ids_len)
         except Exception as e:
             api_server_logger.error(f"add_requests error: {e}, {str(traceback.format_exc())}")
diff --git a/fastdeploy/metrics/metrics.py b/fastdeploy/metrics/metrics.py
index a04ac127c..f82e3c0b0 100644
--- a/fastdeploy/metrics/metrics.py
+++ b/fastdeploy/metrics/metrics.py
@@ -148,6 +148,7 @@ class MetricsManager(MetricsManagerInterface):
     request_queue_time: "Histogram"
    gpu_cache_usage_perc: "Gauge"
    generation_tokens_total: "Counter"
+    prompt_tokens_total: "Counter"
     request_prefill_time: "Histogram"
     request_decode_time: "Histogram"
     request_generation_tokens: "Histogram"
@@ -252,6 +253,12 @@ class MetricsManager(MetricsManagerInterface):
             "description": "GPU KV-cache usage. 1 means 100 percent usage",
             "kwargs": {},
         },
+        "prompt_tokens_total": {
+            "type": Counter,
+            "name": "fastdeploy:prompt_tokens_total",
+            "description": "Total number of prompt tokens processed",
+            "kwargs": {},
+        },
         "generation_tokens_total": {
             "type": Counter,
             "name": "fastdeploy:generation_tokens_total",
@@ -504,7 +511,7 @@ class MetricsManager(MetricsManagerInterface):
         if workers == 1:
             registry.register(work_process_metrics.e2e_request_latency)
             registry.register(work_process_metrics.request_params_max_tokens)
-            registry.register(work_process_metrics.prompt_tokens_total)
+            # registry.register(work_process_metrics.prompt_tokens_total)
             registry.register(work_process_metrics.request_prompt_tokens)
         if hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
             self.register_speculative_metrics(registry)
diff --git a/fastdeploy/metrics/work_metrics.py b/fastdeploy/metrics/work_metrics.py
index 2d211c087..cb93abb41 100644
--- a/fastdeploy/metrics/work_metrics.py
+++ b/fastdeploy/metrics/work_metrics.py
@@ -72,11 +72,11 @@ class WorkMetricsManager(MetricsManagerInterface):
             buckets=build_1_2_5_buckets(33792),
             labelnames=LABEL_NAMES,
         )
-        self.prompt_tokens_total = Counter(
-            name="fastdeploy:prompt_tokens_total",
-            documentation="Total number of prompt tokens processed",
-            labelnames=LABEL_NAMES,
-        )
+        # self.prompt_tokens_total = Counter(
+        #     name="fastdeploy:prompt_tokens_total",
+        #     documentation="Total number of prompt tokens processed",
+        #     labelnames=LABEL_NAMES,
+        # )
         self.request_prompt_tokens = Histogram(
             name="fastdeploy:request_prompt_tokens",
             documentation="Number of prefill tokens processed.",
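
Taken together, these hunks move `prompt_tokens_total` from the per-worker API-server registry (`work_process_metrics`, which the `metrics.py` hunk shows is only registered when `workers == 1`) to the engine's main-process registry (`main_process_metrics`), so the counter is now incremented once per scheduled task by `task.prompt_token_ids_len` rather than at request submission. Below is a minimal sketch of the name-based `inc_value`/`dec_value` dispatch the engine code calls into, assuming a plain `prometheus_client` backend; the `MainMetricsSketch` class and its label-free metrics are illustrative stand-ins, not FastDeploy's actual `MetricsManager`.

```python
# Illustrative sketch only: FastDeploy's real MetricsManager is more elaborate.
# It shows the pattern the diff relies on: metrics are attributes on a manager,
# looked up by name and incremented in the single engine (main) process.
from prometheus_client import Counter, Gauge


class MainMetricsSketch:
    def __init__(self):
        # Mirrors the "prompt_tokens_total" entry added to metrics.py.
        self.prompt_tokens_total = Counter(
            "fastdeploy:prompt_tokens_total",
            "Total number of prompt tokens processed",
        )
        self.num_requests_waiting = Gauge(
            "fastdeploy:num_requests_waiting",
            "Number of requests waiting to be scheduled",
        )

    def inc_value(self, name: str, value: float = 1) -> None:
        # Counters and Gauges both expose .inc(amount).
        getattr(self, name).inc(value)

    def dec_value(self, name: str, value: float = 1) -> None:
        # Only Gauges can go down; a Counter has no .dec attribute.
        getattr(self, name).dec(value)


metrics = MainMetricsSketch()
for prompt_len in (128, 512):  # stand-ins for task.prompt_token_ids_len
    metrics.inc_value("prompt_tokens_total", prompt_len)
metrics.dec_value("num_requests_waiting", 2)
```

A plausible motivation, consistent with the `workers == 1` guard above: when the API server runs multiple worker processes, each worker keeps its own copy of a work-process counter, so accumulating prompt tokens in the single engine process keeps the total consistent regardless of worker count.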