[feat] add metrics for yiyan adapter (#3219)

* [feat] add metrics for yiyan adapter
* [fix] fix metrics num_requests_waiting and num_requests_running
* [fix] fix metrics gpu_cache_usage_perc
* [refactor] change where requests_number increases
* [chore] rename xxx_block_num as xxx_gpu_block_num, and update their values accordingly
* [chore] delete useless code
2025-10-05 16:48:03 +08:00 · 2025-08-21 16:58:10 +08:00
parent 6854506533
commit d18a637a17
7 changed files with 181 additions and 18 deletions
--- a/fastdeploy/metrics/metrics.py
+++ b/fastdeploy/metrics/metrics.py
@@ -154,6 +154,22 @@ class MetricsManager:
    spec_decode_num_emitted_tokens_total: "Counter"
    spec_decode_draft_single_head_acceptance_rate: "list[Gauge]"

+    # for YIYAN Adapter -- each annotation mirrors the metric's "type" in METRICS
+    prefix_cache_token_num: "Counter"  # total number of cached tokens
+    prefix_gpu_cache_token_num: "Counter"  # total number of cached tokens on GPU
+    prefix_cpu_cache_token_num: "Counter"  # total number of cached tokens on CPU
+    prefix_ssd_cache_token_num: "Counter"  # total number of cached tokens on SSD
+    batch_size: "Gauge"  # real batch size during inference
+    max_batch_size: "Gauge"  # maximum batch size determined when service started
+    available_gpu_block_num: "Gauge"  # available gpu blocks, incl. prefix-cache blocks not officially released
+    free_gpu_block_num: "Gauge"  # number of free blocks in cache
+    max_gpu_block_num: "Gauge"  # total blocks determined when service started
+    available_gpu_resource: "Gauge"  # available_gpu_block_num / max_gpu_block_num
+    requests_number: "Counter"  # total number of requests received
+    send_cache_failed_num: "Counter"  # total number of failures of sending cache
+    first_token_latency: "Gauge"  # latest time to first token, in seconds
+    infer_latency: "Gauge"  # latest time to generate one token, in seconds
+
    # 定义所有指标配置
    METRICS = {
        "num_requests_running": {
@@ -258,6 +274,91 @@ class MetricsManager:
            "description": "Total number of successfully processed requests",
            "kwargs": {},
        },
+        # for YIYAN Adapter
+        "prefix_cache_token_num": {
+            "type": Counter,
+            "name": "fastdeploy:prefix_cache_token_num",
+            "description": "Total number of cached tokens",
+            "kwargs": {},
+        },
+        "prefix_gpu_cache_token_num": {
+            "type": Counter,
+            "name": "fastdeploy:prefix_gpu_cache_token_num",
+            "description": "Total number of cached tokens on GPU",
+            "kwargs": {},
+        },
+        "prefix_cpu_cache_token_num": {
+            "type": Counter,
+            "name": "fastdeploy:prefix_cpu_cache_token_num",
+            "description": "Total number of cached tokens on CPU",
+            "kwargs": {},
+        },
+        "prefix_ssd_cache_token_num": {
+            "type": Counter,
+            "name": "fastdeploy:prefix_ssd_cache_token_num",
+            "description": "Total number of cached tokens on SSD",
+            "kwargs": {},
+        },
+        "batch_size": {
+            "type": Gauge,
+            "name": "fastdeploy:batch_size",
+            "description": "Real batch size during inference",
+            "kwargs": {},
+        },
+        "max_batch_size": {
+            "type": Gauge,
+            "name": "fastdeploy:max_batch_size",
+            "description": "Maximum batch size determined when service started",
+            "kwargs": {},
+        },
+        "available_gpu_block_num": {
+            "type": Gauge,
+            "name": "fastdeploy:available_gpu_block_num",
+            "description": "Number of available gpu blocks in cache, including prefix caching blocks that are not officially released",
+            "kwargs": {},
+        },
+        "free_gpu_block_num": {
+            "type": Gauge,
+            "name": "fastdeploy:free_gpu_block_num",
+            "description": "Number of free blocks in cache",
+            "kwargs": {},
+        },
+        "max_gpu_block_num": {
+            "type": Gauge,
+            "name": "fastdeploy:max_gpu_block_num",
+            "description": "Number of total blocks determined when service started",
+            "kwargs": {},
+        },
+        "available_gpu_resource": {
+            "type": Gauge,
+            "name": "fastdeploy:available_gpu_resource",
+            "description": "Available blocks percentage, i.e. available_gpu_block_num / max_gpu_block_num",
+            "kwargs": {},
+        },
+        "requests_number": {
+            "type": Counter,
+            "name": "fastdeploy:requests_number",
+            "description": "Total number of requests received",
+            "kwargs": {},
+        },
+        "send_cache_failed_num": {
+            "type": Counter,
+            "name": "fastdeploy:send_cache_failed_num",
+            "description": "Total number of failures of sending cache",
+            "kwargs": {},
+        },
+        "first_token_latency": {
+            "type": Gauge,
+            "name": "fastdeploy:first_token_latency",
+            "description": "Latest time to first token in seconds",
+            "kwargs": {},
+        },
+        "infer_latency": {
+            "type": Gauge,
+            "name": "fastdeploy:infer_latency",
+            "description": "Latest time to generate one token in seconds",
+            "kwargs": {},
+        },
    }
    SPECULATIVE_METRICS = {}