[Features] Add speculative metrics (#2857)

2025-10-05 16:48:03 +08:00 · 2025-07-17 11:08:55 +08:00
parent 52aca233e8
commit 42d4001400
2 changed files with 164 additions and 10 deletions
--- a/fastdeploy/metrics/metrics.py
+++ b/fastdeploy/metrics/metrics.py
@@ -107,8 +107,6 @@ REQUEST_LATENCY_BUCKETS = [
 ]


-
-
 class MetricsManager:
    """Prometheus Metrics Manager handles all metric updates """

@@ -126,6 +124,12 @@ class MetricsManager:
    request_decode_time: 'Histogram'
    request_generation_tokens: 'Histogram'
    request_success_total: 'Counter'
+    spec_decode_draft_acceptance_rate: 'Gauge'
+    spec_decode_efficiency: 'Gauge'
+    spec_decode_num_accepted_tokens_total: 'Counter'
+    spec_decode_num_draft_tokens_total: 'Counter'
+    spec_decode_num_emitted_tokens_total: 'Counter'
+    spec_decode_draft_single_head_acceptance_rate: 'list[Gauge]'

    # 定义所有指标配置
    METRICS = {
@@ -216,8 +220,9 @@ class MetricsManager:
            'name': 'fastdeploy:request_success_total',
            'description': 'Total number of successfully processed requests',
            'kwargs': {}
-        }
+        },
    }
+    SPECULATIVE_METRICS = {}

    def __init__(self):
        """Initializes the Prometheus metrics and starts the HTTP server if not already initialized."""
@@ -229,6 +234,75 @@ class MetricsManager:
                **config['kwargs']
            ))

+    def _init_speculative_metrics(self, speculative_method, num_speculative_tokens):
+        """Create the speculative-decoding metrics and bind them as attributes.
+
+        Args:
+            speculative_method: "mtp" additionally enables the efficiency, draft-token
+                and per-head acceptance-rate metrics.
+            num_speculative_tokens: number of per-head acceptance-rate gauges to create.
+        """
+        self.SPECULATIVE_METRICS = {
+            "spec_decode_draft_acceptance_rate": {
+                "type": Gauge,
+                "name": "fastdeploy:spec_decode_draft_acceptance_rate",
+                "description": "Acceptance rate of speculative decoding",
+                "kwargs": {},
+            },
+            "spec_decode_num_accepted_tokens_total": {
+                "type": Counter,
+                "name": "fastdeploy:spec_decode_num_accepted_tokens_total",
+                "description": "Total number of tokens accepted by the scoring model and verification program",
+                "kwargs": {},
+            },
+            "spec_decode_num_emitted_tokens_total": {
+                "type": Counter,
+                "name": "fastdeploy:spec_decode_num_emitted_tokens_total",
+                "description": "Total number of tokens output by the entire system",
+                "kwargs": {},
+            },
+        }
+        if speculative_method == "mtp":
+            self.SPECULATIVE_METRICS["spec_decode_efficiency"] = {
+                "type": Gauge,
+                "name": "fastdeploy:spec_decode_efficiency",
+                "description": "Efficiency of speculative decoding",
+                "kwargs": {},
+            }
+            self.SPECULATIVE_METRICS["spec_decode_num_draft_tokens_total"] = {
+                "type": Counter,
+                "name": "fastdeploy:spec_decode_num_draft_tokens_total",
+                "description": "Total number of speculative tokens generated by the proposal method",
+                "kwargs": {},
+            }
+            self.SPECULATIVE_METRICS["spec_decode_draft_single_head_acceptance_rate"] = {
+                "type": list[Gauge],
+                "name": "fastdeploy:spec_decode_draft_single_head_acceptance_rate",
+                "description": "Single head acceptance rate of speculative decoding",
+                "kwargs": {},
+            }
+        for metric_name, config in self.SPECULATIVE_METRICS.items():
+            if metric_name == "spec_decode_draft_single_head_acceptance_rate":
+                # One gauge per draft head; assigned after the list is built so the
+                # attribute exists (as an empty list) even when num_speculative_tokens is 0.
+                gauges = [
+                    Gauge(f"{config['name']}_{i}", f"{config['description']} (head {i})")
+                    for i in range(num_speculative_tokens)
+                ]
+                setattr(self, metric_name, gauges)
+            else:
+                metric = config["type"](config["name"], config["description"], **config["kwargs"])
+                setattr(self, metric_name, metric)
+
+    def register_speculative_metrics(self, registry: CollectorRegistry):
+        """Register each speculative metric; the per-head entry holds a list of gauges."""
+        per_head = "spec_decode_draft_single_head_acceptance_rate"
+        for metric_name in self.SPECULATIVE_METRICS:
+            metric = getattr(self, metric_name)
+            collectors = metric if metric_name == per_head else [metric]
+            for collector in collectors:
+                registry.register(collector)
+
    def register_all(self, registry: CollectorRegistry, workers: int = 1):
        """Register all metrics to the specified registry"""
        for metric_name in self.METRICS:
@@ -238,6 +312,8 @@ class MetricsManager:
            registry.register(work_process_metrics.request_params_max_tokens)
            registry.register(work_process_metrics.prompt_tokens_total)
            registry.register(work_process_metrics.request_prompt_tokens)
+        if hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
+            self.register_speculative_metrics(registry)

    @classmethod
    def get_excluded_metrics(cls) -> Set[str]: