polish code with new pre-commit rule (#2923)

Zero Rains
2025-07-19 23:19:27 +08:00
committed by GitHub
parent b8676d71a8
commit 25698d56d1
424 changed files with 14307 additions and 13518 deletions


@@ -19,30 +19,34 @@ metrics
"""
import os
import shutil
from typing import Set

from prometheus_client import (
CollectorRegistry,
Counter,
Gauge,
Histogram,
generate_latest,
multiprocess,
)
from prometheus_client.registry import Collector

from fastdeploy.metrics import build_1_2_5_buckets
from fastdeploy.metrics.work_metrics import work_process_metrics


def cleanup_prometheus_files(is_main):
"""
Cleans and recreates the Prometheus multiprocess directory.
Cleans and recreates the Prometheus multiprocess directory.
Depending on whether it's the main process or a worker, this function removes the corresponding
Prometheus multiprocess directory (/tmp/prom_main or /tmp/prom_worker) and recreates it as an empty directory.
Depending on whether it's the main process or a worker, this function removes the corresponding
Prometheus multiprocess directory (/tmp/prom_main or /tmp/prom_worker) and recreates it as an empty directory.
Args:
is_main (bool): Indicates whether the current process is the main process.
Args:
is_main (bool): Indicates whether the current process is the main process.
Returns:
str: The path to the newly created Prometheus multiprocess directory.
Returns:
str: The path to the newly created Prometheus multiprocess directory.
"""
PROM_DIR = "/tmp/prom_main" if is_main else "/tmp/prom_worker"
if os.path.exists(PROM_DIR):
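For context, prometheus_client's multiprocess mode discovers this directory through the PROMETHEUS_MULTIPROC_DIR environment variable, so the returned path has to be exported before any metrics are created. A minimal sketch of that wiring (the call site below is an assumption, not part of this diff):

```python
import os

# Hypothetical startup code: reset the directory, then point
# prometheus_client's multiprocess mode at it before creating metrics.
prom_dir = cleanup_prometheus_files(is_main=True)
os.environ["PROMETHEUS_MULTIPROC_DIR"] = prom_dir
```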
@@ -53,30 +57,30 @@ def cleanup_prometheus_files(is_main):
class SimpleCollector(Collector):
"""
    A custom Prometheus collector that filters out specific metrics by name.

    This collector wraps an existing registry and yields only those metrics
    whose names are not in the specified exclusion set.
"""
def __init__(self, base_registry, exclude_names: Set[str]):
"""
        Initializes the SimpleCollector.

        Args:
            base_registry (CollectorRegistry): The source registry from which metrics are collected.
            exclude_names (Set[str]): A set of metric names to exclude from collection.
"""
self.base_registry = base_registry
self.exclude_names = exclude_names
def collect(self):
"""
        Collects and yields metrics not in the exclusion list.

        Yields:
            Metric: Prometheus Metric objects that are not excluded.
        """
for metric in self.base_registry.collect():
            # Compare with startswith: for counters, the collected family name drops
            # the "_total" suffix, while exclude_names holds the full configured name.
            if not any(name.startswith(metric.name) for name in self.exclude_names):
yield metric
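A quick sketch of how the collector can serve a filtered scrape (the multiprocess wiring below is illustrative; only SimpleCollector and MetricsManager.get_excluded_metrics come from this file):

```python
from prometheus_client import CollectorRegistry, generate_latest, multiprocess

# Assumed layout: worker processes write metric files under /tmp/prom_worker.
base = CollectorRegistry()
multiprocess.MultiProcessCollector(base, path="/tmp/prom_worker")

# Expose everything except the main-process metrics.
filtered = CollectorRegistry()
filtered.register(SimpleCollector(base, MetricsManager.get_excluded_metrics()))
print(generate_latest(filtered).decode())
```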
@@ -102,124 +106,157 @@ def get_filtered_metrics(exclude_names: Set[str], extra_register_func=None) -> s
REQUEST_LATENCY_BUCKETS = [
0.3,
0.5,
0.8,
1.0,
1.5,
2.0,
2.5,
5.0,
10.0,
15.0,
20.0,
30.0,
40.0,
50.0,
60.0,
120.0,
240.0,
480.0,
960.0,
1920.0,
7680.0,
]


class MetricsManager:
"""Prometheus Metrics Manager handles all metric updates """
"""Prometheus Metrics Manager handles all metric updates"""
_instance = None
num_requests_running: "Gauge"
num_requests_waiting: "Gauge"
time_to_first_token: "Histogram"
time_per_output_token: "Histogram"
request_inference_time: "Histogram"
request_queue_time: "Histogram"
gpu_cache_usage_perc: "Gauge"
generation_tokens_total: "Counter"
request_prefill_time: "Histogram"
request_decode_time: "Histogram"
request_generation_tokens: "Histogram"
request_success_total: "Counter"
spec_decode_draft_acceptance_rate: "Gauge"
spec_decode_efficiency: "Gauge"
spec_decode_num_accepted_tokens_total: "Counter"
spec_decode_num_draft_tokens_total: "Counter"
spec_decode_num_emitted_tokens_total: "Counter"
spec_decode_draft_single_head_acceptance_rate: "list[Gauge]"
    # Define all metric configurations
METRICS = {
"num_requests_running": {
"type": Gauge,
"name": "fastdeploy:num_requests_running",
"description": "Number of requests currently running",
"kwargs": {},
},
"num_requests_waiting": {
"type": Gauge,
"name": "fastdeploy:num_requests_waiting",
"description": "Number of requests currently waiting",
"kwargs": {},
},
"time_to_first_token": {
"type": Histogram,
"name": "fastdeploy:time_to_first_token_seconds",
"description": "Time to first token in seconds",
"kwargs": {
"buckets": [
0.001,
0.005,
0.01,
0.02,
0.04,
0.06,
0.08,
0.1,
0.25,
0.5,
0.75,
1.0,
]
},
},
"time_per_output_token": {
"type": Histogram,
"name": "fastdeploy:time_per_output_token_seconds",
"description": "Time per output token in seconds",
"kwargs": {
"buckets": [
0.01,
0.025,
0.05,
0.075,
0.1,
0.15,
0.2,
0.3,
0.4,
0.5,
0.75,
1.0,
]
},
},
"request_inference_time": {
"type": Histogram,
"name": "fastdeploy:request_inference_time_seconds",
"description": "Time spent in inference phase (from inference start to last token)",
"kwargs": {"buckets": REQUEST_LATENCY_BUCKETS},
},
"request_queue_time": {
"type": Histogram,
"name": "fastdeploy:request_queue_time_seconds",
"description": "Time spent in waiting queue (from preprocess end to inference start)",
"kwargs": {"buckets": REQUEST_LATENCY_BUCKETS},
},
"gpu_cache_usage_perc": {
"type": Gauge,
"name": "fastdeploy:gpu_cache_usage_perc",
"description": "GPU KV-cache usage. 1 means 100 percent usage",
"kwargs": {},
},
"generation_tokens_total": {
"type": Counter,
"name": "fastdeploy:generation_tokens_total",
"description": "Total number of generation tokens processed",
"kwargs": {},
},
"request_prefill_time": {
"type": Histogram,
"name": "fastdeploy:request_prefill_time_seconds",
"description": "Time spent in prefill phase (from preprocess start to preprocess end)",
"kwargs": {"buckets": REQUEST_LATENCY_BUCKETS},
},
"request_decode_time": {
"type": Histogram,
"name": "fastdeploy:request_decode_time_seconds",
"description": "Time spent in decode phase (from first token to last token)",
"kwargs": {"buckets": REQUEST_LATENCY_BUCKETS},
},
"request_generation_tokens": {
"type": Histogram,
"name": "fastdeploy:request_generation_tokens",
"description": "Number of generation tokens processed.",
"kwargs": {"buckets": build_1_2_5_buckets(33792)},
},
"request_success_total": {
"type": Counter,
"name": "fastdeploy:request_success_total",
"description": "Total number of successfully processed requests",
"kwargs": {},
},
}
SPECULATIVE_METRICS = {}
@@ -228,11 +265,11 @@ class MetricsManager:
"""Initializes the Prometheus metrics and starts the HTTP server if not already initialized."""
        # Dynamically create all metrics
for metric_name, config in self.METRICS.items():
setattr(
self,
metric_name,
config["type"](config["name"], config["description"], **config["kwargs"]),
)
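To unpack the dynamic creation, one iteration of this loop is equivalent to the hand-written assignment below (shown for the num_requests_running entry; values are taken from METRICS above):

```python
# Inside _init_metrics: hand-written equivalent of one loop iteration.
self.num_requests_running = Gauge(
    "fastdeploy:num_requests_running",
    "Number of requests currently running",
)
```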
def _init_speculative_metrics(self, speculative_method, num_speculative_tokens):
self.SPECULATIVE_METRICS = {
@@ -256,19 +293,19 @@ class MetricsManager:
},
}
if speculative_method == "mtp":
self.SPECULATIVE_METRICS["spec_decode_efficiency"]={
self.SPECULATIVE_METRICS["spec_decode_efficiency"] = {
"type": Gauge,
"name": "fastdeploy:spec_decode_efficiency",
"description": "Efficiency of speculative decoding",
"kwargs": {},
}
self.SPECULATIVE_METRICS["spec_decode_num_draft_tokens_total"]={
self.SPECULATIVE_METRICS["spec_decode_num_draft_tokens_total"] = {
"type": Counter,
"name": "fastdeploy:spec_decode_num_draft_tokens_total",
"description": "Total number of speculative tokens generated by the proposal method",
"kwargs": {},
}
self.SPECULATIVE_METRICS["spec_decode_draft_single_head_acceptance_rate"]={
self.SPECULATIVE_METRICS["spec_decode_draft_single_head_acceptance_rate"] = {
"type": list[Gauge],
"name": "fastdeploy:spec_decode_draft_single_head_acceptance_rate",
"description": "Single head acceptance rate of speculative decoding",
@@ -290,7 +327,9 @@ class MetricsManager:
self,
metric_name,
config["type"](
config["name"], config["description"], **config["kwargs"]
config["name"],
config["description"],
**config["kwargs"],
),
)
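Note that the spec_decode_draft_single_head_acceptance_rate entry is typed list[Gauge], so it cannot be built by the plain config["type"](...) call in this loop; its per-head expansion happens outside the visible hunks. A guess at its shape, with the per-head naming scheme purely assumed:

```python
# Assumed expansion for the list[Gauge] entry; the real code and the
# per-head metric naming are not visible in this diff.
self.spec_decode_draft_single_head_acceptance_rate = [
    Gauge(
        f"fastdeploy:spec_decode_draft_single_head_acceptance_rate_head_{i}",
        f"Single head acceptance rate of speculative decoding (head {i})",
    )
    for i in range(num_speculative_tokens)
]
```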
@@ -318,7 +357,7 @@ class MetricsManager:
@classmethod
def get_excluded_metrics(cls) -> Set[str]:
"""Get the set of indicator names that need to be excluded"""
return {config["name"] for config in cls.METRICS.values()}


main_process_metrics = MetricsManager()
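Call sites then update this module-level singleton directly; a couple of illustrative (assumed) updates:

```python
# Hypothetical scheduler/engine hooks, not part of this file.
main_process_metrics.num_requests_running.inc()
main_process_metrics.time_to_first_token.observe(0.042)  # seconds
main_process_metrics.num_requests_running.dec()
```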