mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
[Features] Add speculative metrics (#2857)
This commit is contained in:
@@ -107,8 +107,6 @@ REQUEST_LATENCY_BUCKETS = [
|
||||
]
|
||||
|
||||
|
||||
|
||||
|
||||
class MetricsManager:
|
||||
"""Prometheus Metrics Manager handles all metric updates """
|
||||
|
||||
@@ -126,6 +124,12 @@ class MetricsManager:
|
||||
request_decode_time: 'Histogram'
|
||||
request_generation_tokens: 'Histogram'
|
||||
request_success_total: 'Counter'
|
||||
spec_decode_draft_acceptance_rate: 'Gauge'
|
||||
spec_decode_efficiency: 'Gauge'
|
||||
spec_decode_num_accepted_tokens_total: 'Counter'
|
||||
spec_decode_num_draft_tokens_total: 'Counter'
|
||||
spec_decode_num_emitted_tokens_total: 'Counter'
|
||||
spec_decode_draft_single_head_acceptance_rate: 'list[Gauge]'
|
||||
|
||||
# Define the configuration for all metrics
|
||||
METRICS = {
|
||||
@@ -216,8 +220,9 @@ class MetricsManager:
|
||||
'name': 'fastdeploy:request_success_total',
|
||||
'description': 'Total number of successfully processed requests',
|
||||
'kwargs': {}
|
||||
}
|
||||
},
|
||||
}
|
||||
SPECULATIVE_METRICS = {}
|
||||
|
||||
def __init__(self):
|
||||
"""Initializes the Prometheus metrics and starts the HTTP server if not already initialized."""
|
||||
@@ -229,6 +234,75 @@ class MetricsManager:
|
||||
**config['kwargs']
|
||||
))
|
||||
|
||||
def _init_speculative_metrics(self, speculative_method, num_speculative_tokens):
    """Build the speculative-decoding metrics and attach them to this manager.

    Three metrics are always configured: the draft acceptance-rate gauge, the
    accepted-token counter and the emitted-token counter. When
    ``speculative_method`` equals ``"mtp"``, an efficiency gauge, a draft-token
    counter and a per-head acceptance-rate entry are configured as well.

    The configuration dict is stored on the instance as
    ``SPECULATIVE_METRICS``; each metric object is then created and stored as
    an instance attribute named after its configuration key.

    Args:
        speculative_method: Name of the speculative method; only the value
            ``"mtp"`` enables the additional metrics.
        num_speculative_tokens: Number of per-head gauges to create for the
            single-head acceptance rate (passed to ``range``).
    """
    per_head_key = "spec_decode_draft_single_head_acceptance_rate"

    # Bind the config to the instance first, then keep extending the same
    # dict object through the local alias.
    specs = self.SPECULATIVE_METRICS = {
        "spec_decode_draft_acceptance_rate": {
            "type": Gauge,
            "name": "fastdeploy:spec_decode_draft_acceptance_rate",
            "description": "Acceptance rate of speculative decoding",
            "kwargs": {},
        },
        "spec_decode_num_accepted_tokens_total": {
            "type": Counter,
            "name": "fastdeploy:spec_decode_num_accepted_tokens_total",
            "description": "Total number of tokens accepted by the scoring model and verification program",
            "kwargs": {},
        },
        "spec_decode_num_emitted_tokens_total": {
            "type": Counter,
            "name": "fastdeploy:spec_decode_num_emitted_tokens_total",
            "description": "Total number of tokens output by the entire system",
            "kwargs": {},
        },
    }

    if speculative_method == "mtp":
        specs["spec_decode_efficiency"] = {
            "type": Gauge,
            "name": "fastdeploy:spec_decode_efficiency",
            "description": "Efficiency of speculative decoding",
            "kwargs": {},
        }
        specs["spec_decode_num_draft_tokens_total"] = {
            "type": Counter,
            "name": "fastdeploy:spec_decode_num_draft_tokens_total",
            "description": "Total number of speculative tokens generated by the proposal method",
            "kwargs": {},
        }
        # The "type" here is descriptive only; the per-head entry is expanded
        # into individual Gauge objects below rather than being called.
        specs[per_head_key] = {
            "type": list[Gauge],
            "name": "fastdeploy:spec_decode_draft_single_head_acceptance_rate",
            "description": "Single head acceptance rate of speculative decoding",
            "kwargs": {},
        }

    for attr_name, spec in specs.items():
        if attr_name != per_head_key:
            # Ordinary metric: instantiate the configured class directly.
            metric_cls = spec["type"]
            metric = metric_cls(spec["name"], spec["description"], **spec["kwargs"])
            setattr(self, attr_name, metric)
            continue

        # Per-head entry: one gauge per speculative token, suffixed by index.
        head_gauges = [
            Gauge(
                f"{spec['name']}_{head}",
                f"{spec['description']} (head {head})",
            )
            for head in range(num_speculative_tokens)
        ]
        setattr(self, attr_name, head_gauges)
|
||||
|
||||
def register_speculative_metrics(self, registry: CollectorRegistry):
    """Register every speculative-decoding metric with ``registry``.

    Iterates the keys of ``self.SPECULATIVE_METRICS`` in order and registers
    the attribute of the same name. The per-head acceptance-rate attribute
    holds an iterable of gauges, so each of its elements is registered
    individually.

    Args:
        registry: Registry whose ``register`` method receives each metric.
    """
    per_head_key = "spec_decode_draft_single_head_acceptance_rate"
    for attr_name in self.SPECULATIVE_METRICS:
        metric = getattr(self, attr_name)
        # Treat a single metric as a one-element group so both cases share
        # the same registration loop.
        collectors = metric if attr_name == per_head_key else (metric,)
        for collector in collectors:
            registry.register(collector)
|
||||
|
||||
def register_all(self, registry: CollectorRegistry, workers: int = 1):
|
||||
"""Register all metrics to the specified registry"""
|
||||
for metric_name in self.METRICS:
|
||||
@@ -238,6 +312,8 @@ class MetricsManager:
|
||||
registry.register(work_process_metrics.request_params_max_tokens)
|
||||
registry.register(work_process_metrics.prompt_tokens_total)
|
||||
registry.register(work_process_metrics.request_prompt_tokens)
|
||||
if hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
|
||||
self.register_speculative_metrics(registry)
|
||||
|
||||
@classmethod
|
||||
def get_excluded_metrics(cls) -> Set[str]:
|
||||
|
Reference in New Issue
Block a user