[feat] add metrics for yiyan adapter (#3219)

* [feat] add metrics for yiyan adapter

* [fix] fix metrics num_requests_waiting and num_requests_running

* [fix] fix metrics gpu_cache_usage_perc

* [refactor] change where requests_number increases

* [chore] rename xxx_block_num as xxx_gpu_block_num, and update their values accordingly

* [chore] delete useless code
This commit is contained in:
李泳桦
2025-08-21 16:58:10 +08:00
committed by GitHub
parent 6854506533
commit d18a637a17
7 changed files with 181 additions and 18 deletions

View File

@@ -154,6 +154,22 @@ class MetricsManager:
spec_decode_num_emitted_tokens_total: "Counter"
spec_decode_draft_single_head_acceptance_rate: "list[Gauge]"
# for YIYAN Adapter
# Each annotation names the metric type the attribute is registered with in METRICS.
# The four prefix-cache token metrics are registered there as Counter, so they are
# annotated as Counter here as well.
prefix_cache_token_num: "Counter"
prefix_gpu_cache_token_num: "Counter"
prefix_cpu_cache_token_num: "Counter"
prefix_ssd_cache_token_num: "Counter"
batch_size: "Gauge"
max_batch_size: "Gauge"
available_gpu_block_num: "Gauge"
free_gpu_block_num: "Gauge"
max_gpu_block_num: "Gauge"
available_gpu_resource: "Gauge"
requests_number: "Counter"
send_cache_failed_num: "Counter"
first_token_latency: "Gauge"
infer_latency: "Gauge"
# Configuration for all metrics
METRICS = {
"num_requests_running": {
@@ -258,6 +274,91 @@ class MetricsManager:
"description": "Total number of successfully processed requests",
"kwargs": {},
},
# for YIYAN Adapter
# Each entry below gives the metric "type", its exported "name", its "description"
# text, and "kwargs" (empty for every entry in this group).
# Prefix-cache token totals: overall, then per tier (GPU / CPU / SSD). Registered as Counter.
"prefix_cache_token_num": {
"type": Counter,
"name": "fastdeploy:prefix_cache_token_num",
"description": "Total number of cached tokens",
"kwargs": {},
},
"prefix_gpu_cache_token_num": {
"type": Counter,
"name": "fastdeploy:prefix_gpu_cache_token_num",
"description": "Total number of cached tokens on GPU",
"kwargs": {},
},
"prefix_cpu_cache_token_num": {
"type": Counter,
"name": "fastdeploy:prefix_cpu_cache_token_num",
"description": "Total number of cached tokens on CPU",
"kwargs": {},
},
"prefix_ssd_cache_token_num": {
"type": Counter,
"name": "fastdeploy:prefix_ssd_cache_token_num",
"description": "Total number of cached tokens on SSD",
"kwargs": {},
},
# Batch-size gauges: the current value and the configured maximum.
"batch_size": {
"type": Gauge,
"name": "fastdeploy:batch_size",
"description": "Real batch size during inference",
"kwargs": {},
},
"max_batch_size": {
"type": Gauge,
"name": "fastdeploy:max_batch_size",
"description": "Maximum batch size determined when service started",
"kwargs": {},
},
# GPU cache block gauges: available, free, and total block counts, plus the
# available / total ratio.
"available_gpu_block_num": {
"type": Gauge,
"name": "fastdeploy:available_gpu_block_num",
"description": "Number of available gpu blocks in cache, including prefix caching blocks that are not officially released",
"kwargs": {},
},
"free_gpu_block_num": {
"type": Gauge,
"name": "fastdeploy:free_gpu_block_num",
"description": "Number of free blocks in cache",
"kwargs": {},
},
"max_gpu_block_num": {
"type": Gauge,
"name": "fastdeploy:max_gpu_block_num",
"description": "Number of total blocks determined when service started",
"kwargs": {},
},
"available_gpu_resource": {
"type": Gauge,
"name": "fastdeploy:available_gpu_resource",
"description": "Available blocks percentage, i.e. available_gpu_block_num / max_gpu_block_num",
"kwargs": {},
},
# Counters for received requests and for cache-send failures.
"requests_number": {
"type": Counter,
"name": "fastdeploy:requests_number",
"description": "Total number of requests received",
"kwargs": {},
},
"send_cache_failed_num": {
"type": Counter,
"name": "fastdeploy:send_cache_failed_num",
"description": "Total number of failures of sending cache",
"kwargs": {},
},
# Latency gauges; the descriptions state the unit is seconds and that only the
# latest observation is kept.
"first_token_latency": {
"type": Gauge,
"name": "fastdeploy:first_token_latency",
"description": "Latest time to first token in seconds",
"kwargs": {},
},
"infer_latency": {
"type": Gauge,
"name": "fastdeploy:infer_latency",
"description": "Latest time to generate one token in seconds",
"kwargs": {},
},
}
SPECULATIVE_METRICS = {}