[metrics] Add several observability metrics (#3868)

* Add several observability metrics

* [wenxin-tools-584] [Observability] Support viewing this node's concurrency, remaining block_size, number of queued requests, and related information

* adjust some metrics and md files

* trigger ci

* adjust ci file

* trigger ci

* trigger ci

---------

Co-authored-by: K11OntheBoat <your_email@example.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
qwes5s5
2025-09-08 14:13:13 +08:00
committed by GitHub
parent 3d0aaa5923
commit 17169a14f2
10 changed files with 244 additions and 6 deletions

View File

@@ -169,7 +169,12 @@ class MetricsManager:
send_cache_failed_num: "Counter"
first_token_latency: "Gauge"
infer_latency: "Gauge"
cache_config_info: "Gauge"
available_batch_size: "Gauge"
hit_req_rate: "Gauge"
hit_token_rate: "Gauge"
cpu_hit_token_rate: "Gauge"
gpu_hit_token_rate: "Gauge"
# Define the configuration for all metrics
METRICS = {
"num_requests_running": {
@@ -359,6 +364,36 @@ class MetricsManager:
"description": "Latest time to generate one token in seconds",
"kwargs": {},
},
"available_batch_size": {
"type": Gauge,
"name": "fastdeploy:available_batch_size",
"description": "Number of requests that can still be inserted during the Decode phase",
"kwargs": {},
},
"hit_req_rate": {
"type": Gauge,
"name": "fastdeploy:hit_req_rate",
"description": "Request-level prefix cache hit rate",
"kwargs": {},
},
"hit_token_rate": {
"type": Gauge,
"name": "fastdeploy:hit_token_rate",
"description": "Token-level prefix cache hit rate",
"kwargs": {},
},
"cpu_hit_token_rate": {
"type": Gauge,
"name": "fastdeploy:cpu_hit_token_rate",
"description": "Token-level CPU prefix cache hit rate",
"kwargs": {},
},
"gpu_hit_token_rate": {
"type": Gauge,
"name": "fastdeploy:gpu_hit_token_rate",
"description": "Token-level GPU prefix cache hit rate",
"kwargs": {},
},
}
SPECULATIVE_METRICS = {}
@@ -434,6 +469,26 @@ class MetricsManager:
),
)
def set_cache_config_info(self, obj) -> None:
    """Publish the engine's CacheConfig as an info-style gauge.

    Reads ``obj.metrics_info()`` — a dict mapping label names to values —
    and records it using the Prometheus "info" pattern: a gauge whose value
    is always 1 and whose payload lives entirely in its labels.

    The gauge is created lazily on the first call because its label names
    are only known once ``metrics_info`` is available.

    Args:
        obj: Any object exposing a ``metrics_info()`` method returning a
            dict of label name -> value (the engine's CacheConfig).
    """
    # Fetch the label payload exactly once (the original called it in two
    # branches); nothing to publish when it is empty or falsy.
    metrics_info = obj.metrics_info()
    if not metrics_info:
        return
    # Lazily create the gauge on first use; label names are derived from
    # the metrics_info keys. "mostrecent" keeps the latest sample when
    # aggregating across worker processes.
    if not (hasattr(self, "cache_config_info") and isinstance(self.cache_config_info, Gauge)):
        self.cache_config_info = Gauge(
            name="fastdeploy:cache_config_info",
            documentation="Information of the engine's CacheConfig",
            labelnames=list(metrics_info.keys()),
            multiprocess_mode="mostrecent",
        )
    # Info-pattern gauge: the value is a constant 1; the data is the labels.
    self.cache_config_info.labels(**metrics_info).set(1)
def register_speculative_metrics(self, registry: CollectorRegistry):
"""Register all speculative metrics to the specified registry"""
for metric_name in self.SPECULATIVE_METRICS:
@@ -447,6 +502,8 @@ class MetricsManager:
"""Register all metrics to the specified registry"""
for metric_name in self.METRICS:
registry.register(getattr(self, metric_name))
if self.cache_config_info is not None:
registry.register(self.cache_config_info)
if workers == 1:
registry.register(work_process_metrics.e2e_request_latency)
registry.register(work_process_metrics.request_params_max_tokens)