mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
[metrics] Add several observability metrics (#3868)
* Add several observability metrics * [wenxin-tools-584] 【可观测性】支持查看本节点的并发数、剩余block_size、排队请求数等信息 * adjust some metrics and md files * trigger ci * adjust ci file * trigger ci * trigger ci --------- Co-authored-by: K11OntheBoat <your_email@example.com> Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
@@ -169,7 +169,12 @@ class MetricsManager:
|
||||
send_cache_failed_num: "Counter"
|
||||
first_token_latency: "Gauge"
|
||||
infer_latency: "Gauge"
|
||||
|
||||
cache_config_info: "Gauge"
|
||||
available_batch_size: "Gauge"
|
||||
hit_req_rate: "Gauge"
|
||||
hit_token_rate: "Gauge"
|
||||
cpu_hit_token_rate: "Gauge"
|
||||
gpu_hit_token_rate: "Gauge"
|
||||
# 定义所有指标配置
|
||||
METRICS = {
|
||||
"num_requests_running": {
|
||||
@@ -359,6 +364,36 @@ class MetricsManager:
|
||||
"description": "Latest time to generate one token in seconds",
|
||||
"kwargs": {},
|
||||
},
|
||||
"available_batch_size": {
|
||||
"type": Gauge,
|
||||
"name": "fastdeploy:available_batch_size",
|
||||
"description": "Number of requests that can still be inserted during the Decode phase",
|
||||
"kwargs": {},
|
||||
},
|
||||
"hit_req_rate": {
|
||||
"type": Gauge,
|
||||
"name": "fastdeploy:hit_req_rate",
|
||||
"description": "Request-level prefix cache hit rate",
|
||||
"kwargs": {},
|
||||
},
|
||||
"hit_token_rate": {
|
||||
"type": Gauge,
|
||||
"name": "fastdeploy:hit_token_rate",
|
||||
"description": "Token-level prefix cache hit rate",
|
||||
"kwargs": {},
|
||||
},
|
||||
"cpu_hit_token_rate": {
|
||||
"type": Gauge,
|
||||
"name": "fastdeploy:cpu_hit_token_rate",
|
||||
"description": "Token-level CPU prefix cache hit rate",
|
||||
"kwargs": {},
|
||||
},
|
||||
"gpu_hit_token_rate": {
|
||||
"type": Gauge,
|
||||
"name": "fastdeploy:gpu_hit_token_rate",
|
||||
"description": "Token-level GPU prefix cache hit rate",
|
||||
"kwargs": {},
|
||||
},
|
||||
}
|
||||
SPECULATIVE_METRICS = {}
|
||||
|
||||
@@ -434,6 +469,26 @@ class MetricsManager:
|
||||
),
|
||||
)
|
||||
|
||||
def set_cache_config_info(self, obj) -> None:
    """Export the engine's CacheConfig as a labeled info-style gauge.

    Reads ``obj.metrics_info()`` (a mapping of label name -> label value)
    and publishes it on the ``fastdeploy:cache_config_info`` gauge with a
    constant sample value of 1, following the Prometheus "info metric"
    convention. The gauge is created lazily on first use; the method is a
    no-op when the config exposes no metrics info.

    Args:
        obj: Object exposing a ``metrics_info()`` method that returns a
            dict of label names to values (presumably the engine's
            CacheConfig — TODO confirm against callers).
    """
    # Fetch once up front: both the existing-gauge and the lazy-creation
    # paths need the same data, so there is no reason to call it twice.
    metrics_info = obj.metrics_info()
    if not metrics_info:
        return

    # Lazily create the gauge the first time metrics info is available.
    # The labelnames are fixed at creation from the first info dict seen.
    if not (hasattr(self, "cache_config_info") and isinstance(self.cache_config_info, Gauge)):
        self.cache_config_info = Gauge(
            name="fastdeploy:cache_config_info",
            documentation="Information of the engine's CacheConfig",
            labelnames=list(metrics_info.keys()),
            # "mostrecent" keeps only the latest sample per labelset in
            # multiprocess mode, which is correct for info-style gauges.
            multiprocess_mode="mostrecent",
        )

    self.cache_config_info.labels(**metrics_info).set(1)
|
||||
|
||||
def register_speculative_metrics(self, registry: CollectorRegistry):
|
||||
"""Register all speculative metrics to the specified registry"""
|
||||
for metric_name in self.SPECULATIVE_METRICS:
|
||||
@@ -447,6 +502,8 @@ class MetricsManager:
|
||||
"""Register all metrics to the specified registry"""
|
||||
for metric_name in self.METRICS:
|
||||
registry.register(getattr(self, metric_name))
|
||||
if self.cache_config_info is not None:
|
||||
registry.register(self.cache_config_info)
|
||||
if workers == 1:
|
||||
registry.register(work_process_metrics.e2e_request_latency)
|
||||
registry.register(work_process_metrics.request_params_max_tokens)
|
||||
|
Reference in New Issue
Block a user