Sync v2.0 of the codebase to the GitHub repository

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

View File

@@ -21,14 +21,14 @@ import os
import shutil
from typing import Set, TYPE_CHECKING
from prometheus_client import Gauge, Histogram, multiprocess, CollectorRegistry, generate_latest
from prometheus_client import Gauge, Histogram, multiprocess, CollectorRegistry, generate_latest, Counter
from prometheus_client.registry import Collector
from fastdeploy.metrics import build_1_2_5_buckets
from fastdeploy.metrics.work_metrics import work_process_metrics
from fastdeploy.utils import api_server_logger
if TYPE_CHECKING:
from prometheus_client import Gauge, Histogram
from prometheus_client import Gauge, Histogram, Counter
def cleanup_prometheus_files(is_main):
@@ -78,7 +78,7 @@ class SimpleCollector(Collector):
Metric: Prometheus Metric objects that are not excluded.
"""
for metric in self.base_registry.collect():
if metric.name not in self.exclude_names:
if not any(name.startswith(metric.name) for name in self.exclude_names):
yield metric
@@ -107,6 +107,8 @@ REQUEST_LATENCY_BUCKETS = [
]
class MetricsManager:
"""Prometheus Metrics Manager handles all metric updates """
@@ -118,6 +120,12 @@ class MetricsManager:
time_per_output_token: 'Histogram'
request_inference_time: 'Histogram'
request_queue_time: 'Histogram'
gpu_cache_usage_perc: 'Gauge'
generation_tokens_total: 'Counter'
request_prefill_time: 'Histogram'
request_decode_time: 'Histogram'
request_generation_tokens: 'Histogram'
request_success_total: 'Counter'
    # Define all metric configurations
METRICS = {
@@ -165,6 +173,49 @@ class MetricsManager:
'kwargs': {
'buckets': REQUEST_LATENCY_BUCKETS
}
},
'gpu_cache_usage_perc': {
'type': Gauge,
'name': 'fastdeploy:gpu_cache_usage_perc',
'description': 'GPU KV-cache usage. 1 means 100 percent usage',
'kwargs': {}
},
'generation_tokens_total': {
'type': Counter,
'name': 'fastdeploy:generation_tokens_total',
'description': 'Total number of generation tokens processed',
'kwargs': {}
},
'request_prefill_time': {
'type': Histogram,
'name': 'fastdeploy:request_prefill_time_seconds',
'description': 'Time spent in prefill phase (from preprocess start to preprocess end)',
'kwargs': {
'buckets': REQUEST_LATENCY_BUCKETS
}
},
'request_decode_time': {
'type': Histogram,
'name': 'fastdeploy:request_decode_time_seconds',
'description': 'Time spent in decode phase (from first token to last token)',
'kwargs': {
'buckets': REQUEST_LATENCY_BUCKETS
}
},
'request_generation_tokens': {
'type': Histogram,
'name': 'fastdeploy:request_generation_tokens',
'description': 'Number of generation tokens processed.',
'kwargs': {
'buckets': build_1_2_5_buckets(33792)
}
},
'request_success_total': {
'type': Counter,
'name': 'fastdeploy:request_success_total',
'description': 'Total number of successfully processed requests',
'kwargs': {}
}
}
@@ -184,6 +235,9 @@ class MetricsManager:
registry.register(getattr(self, metric_name))
if workers == 1:
registry.register(work_process_metrics.e2e_request_latency)
registry.register(work_process_metrics.request_params_max_tokens)
registry.register(work_process_metrics.prompt_tokens_total)
registry.register(work_process_metrics.request_prompt_tokens)
@classmethod
def get_excluded_metrics(cls) -> Set[str]: