[metrics] Add serveral observability metrics (#3868) (#4011)

* [metrics] Add serveral observability metrics (#3868) * Add several observability metrics * [wenxin-tools-584] 【可观测性】支持查看本节点的并发数、剩余block_size、排队请求数等信息 * adjust some metrics and md files * trigger ci * adjust ci file * trigger ci * trigger ci --------- Co-authored-by: K11OntheBoat <your_email@example.com> Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> * version adjust --------- Co-authored-by: K11OntheBoat <your_email@example.com> Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-10-05 16:48:03 +08:00 · 2025-09-10 10:59:57 +08:00
parent 187ccb0f04
commit 2ee91d7a96
12 changed files with 1026 additions and 7 deletions
--- a/docs/online_serving/metrics.md
+++ b/docs/online_serving/metrics.md
@@ -20,7 +20,12 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeplo
 | `fastdeploy:gpu_cache_usage_perc`            | Gauge     | GPU KV-cache usage rate          | Percentage    |
 | `fastdeploy:request_params_max_tokens`       | Histogram | Distribution of max_tokens for requests       | Count   |
 | `fastdeploy:request_success_total`           | Counter   | Number of successfully processed requests           | Count   |
-
+| `fastdeploy:cache_config_info`               | Gauge     | Information of the engine's CacheConfig             | Count   |
+| `fastdeploy:available_batch_size`            | Gauge     | Number of requests that can still be inserted during the Decode phase| Count   |
+| `fastdeploy:hit_req_rate`                    | Gauge     | Request-level prefix cache hit rate                 | Percentage   |
+| `fastdeploy:hit_token_rate`                  | Gauge     | Token-level prefix cache hit rate                   | Percentage   |
+| `fastdeploy:cpu_hit_token_rate`              | Gauge     | Token-level CPU prefix cache hit rate               | Percentage   |
+| `fastdeploy:gpu_hit_token_rate`              | Gauge     | Token-level GPU prefix cache hit rate               | Percentage   |
 ## Accessing Metrics

 - Access URL: `http://localhost:8000/metrics`
--- a/docs/zh/online_serving/metrics.md
+++ b/docs/zh/online_serving/metrics.md
@@ -20,7 +20,12 @@
 | `fastdeploy:gpu_cache_usage_perc`         | Gauge     | GPU KV-cache 使用率          | 百分比    |
 | `fastdeploy:request_params_max_tokens`    | Histogram | 请求的 max_tokens 分布       | 个   |
 | `fastdeploy:request_success_total`        | Counter   | 成功处理的请求个数           | 个   |
-
+| `fastdeploy:cache_config_info`            | Gauge     | 推理引擎的缓存配置信息        | 个   |
+| `fastdeploy:available_batch_size`         | Gauge     | Decode阶段还可以插入的请求数量 | 个   |
+| `fastdeploy:hit_req_rate`                 | Gauge     | 请求级别前缀缓存命中率        | 百分比   |
+| `fastdeploy:hit_token_rate`               | Gauge     | token级别前缀缓存命中率      | 百分比   |
+| `fastdeploy:cpu_hit_token_rate`           | Gauge     | token级别CPU前缀缓存命中率   | 百分比   |
+| `fastdeploy:gpu_hit_token_rate`           | Gauge     | token级别GPU前缀缓存命中率   | 百分比   |
 ## 指标访问

 - 访问地址：`http://localhost:8000/metrics`
--- a/fastdeploy/cache_manager/cache_metrics.py
+++ b/fastdeploy/cache_manager/cache_metrics.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """

+from fastdeploy.metrics.metrics import main_process_metrics
 from fastdeploy.utils import get_logger

 logger = get_logger("prefix_cache_manager", "prefix_cache_manager.log")
@@ -54,6 +55,11 @@ class CacheMetrics:
        self.cpu_hit_token_ratio = self.total_cpu_matched_token_num / self.total_token_num
        self.gpu_hit_token_ratio = self.total_gpu_matched_token_num / self.total_token_num

+        main_process_metrics.hit_req_rate.set(self.hit_req_ratio)
+        main_process_metrics.hit_token_rate.set(self.hit_token_ratio)
+        main_process_metrics.cpu_hit_token_rate.set(self.cpu_hit_token_ratio)
+        main_process_metrics.gpu_hit_token_rate.set(self.gpu_hit_token_ratio)
+
        logger.info(
            f"Metrics for all requests: req_count {self.req_count} hit_req_count {self.hit_req_count}"
            + f" hit_req_ratio {self.hit_req_ratio:.2f} hit_token_ratio {self.hit_token_ratio:.2f}"
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -165,6 +165,7 @@ class LLMEngine:
                self.cfg.guided_decoding_backend,
                disable_any_whitespace=self.cfg.disable_any_whitespace,
            )
+        main_process_metrics.set_cache_config_info(obj=self.cfg.cache_config)

    def start(self, api_server_pid=None):
        """
--- a/fastdeploy/engine/resource_manager.py
+++ b/fastdeploy/engine/resource_manager.py
@@ -318,7 +318,6 @@ class ResourceManager:
        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
        main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
        main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
-
        llm_logger.info(
            f"Number of allocated requests: {len(tasks)}, number of " f"running requests in worker: {self.real_bsz}"
        )
--- a/fastdeploy/metrics/metrics.py
+++ b/fastdeploy/metrics/metrics.py
@@ -169,7 +169,12 @@ class MetricsManager:
    send_cache_failed_num: "Counter"
    first_token_latency: "Gauge"
    infer_latency: "Gauge"
-
+    cache_config_info: "Gauge"
+    available_batch_size: "Gauge"
+    hit_req_rate: "Gauge"
+    hit_token_rate: "Gauge"
+    cpu_hit_token_rate: "Gauge"
+    gpu_hit_token_rate: "Gauge"
    # 定义所有指标配置
    METRICS = {
        "num_requests_running": {
@@ -359,6 +364,36 @@ class MetricsManager:
            "description": "Latest time to generate one token in seconds",
            "kwargs": {},
        },
+        "available_batch_size": {
+            "type": Gauge,
+            "name": "fastdeploy:available_batch_size",
+            "description": "Number of requests that can still be inserted during the Decode phase",
+            "kwargs": {},
+        },
+        "hit_req_rate": {
+            "type": Gauge,
+            "name": "fastdeploy:hit_req_rate",
+            "description": "Request-level prefix cache hit rate",
+            "kwargs": {},
+        },
+        "hit_token_rate": {
+            "type": Gauge,
+            "name": "fastdeploy:hit_token_rate",
+            "description": "Token-level prefix cache hit rate",
+            "kwargs": {},
+        },
+        "cpu_hit_token_rate": {
+            "type": Gauge,
+            "name": "fastdeploy:cpu_hit_token_rate",
+            "description": "Token-level CPU prefix cache hit rate",
+            "kwargs": {},
+        },
+        "gpu_hit_token_rate": {
+            "type": Gauge,
+            "name": "fastdeploy:gpu_hit_token_rate",
+            "description": "Token-level GPU prefix cache hit rate",
+            "kwargs": {},
+        },
    }
    SPECULATIVE_METRICS = {}

@@ -434,6 +469,26 @@ class MetricsManager:
                    ),
                )

+    def set_cache_config_info(self, obj) -> None:
+        if hasattr(self, "cache_config_info") and isinstance(self.cache_config_info, Gauge):
+            metrics_info = obj.metrics_info()
+            if metrics_info:
+                self.cache_config_info.labels(**metrics_info).set(1)
+            return
+
+        metrics_info = obj.metrics_info()
+        if not metrics_info:
+            return
+
+        self.cache_config_info = Gauge(
+            name="fastdeploy:cache_config_info",
+            documentation="Information of the engine's CacheConfig",
+            labelnames=list(metrics_info.keys()),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.cache_config_info.labels(**metrics_info).set(1)
+
    def register_speculative_metrics(self, registry: CollectorRegistry):
        """Register all speculative metrics to the specified registry"""
        for metric_name in self.SPECULATIVE_METRICS:
@@ -447,6 +502,8 @@ class MetricsManager:
        """Register all metrics to the specified registry"""
        for metric_name in self.METRICS:
            registry.register(getattr(self, metric_name))
+        if self.cache_config_info is not None:
+            registry.register(self.cache_config_info)
        if workers == 1:
            registry.register(work_process_metrics.e2e_request_latency)
            registry.register(work_process_metrics.request_params_max_tokens)
--- a/fastdeploy/output/token_processor.py
+++ b/fastdeploy/output/token_processor.py
@@ -284,6 +284,7 @@ class TokenProcessor:
        main_process_metrics.batch_size.set(
            self.resource_manager.max_num_seqs - self.resource_manager.available_batch()
        )
+        main_process_metrics.available_batch_size.set(self.resource_manager.available_batch())

        if task_id in self.tokens_counter:
            del self.tokens_counter[task_id]
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ tqdm
 pynvml
 uvicorn==0.29.0
 fastapi
-paddleformers
+paddleformers==0.1.2
 redis
 etcd3
 httpx
--- a/scripts/run_ci_xpu.sh
+++ b/scripts/run_ci_xpu.sh
@@ -16,7 +16,7 @@ python -m pip install -r requirements.txt
 echo "uninstall org"
 python -m pip uninstall paddlepaddle-xpu -y
 python -m pip uninstall fastdeploy-xpu -y
-python -m pip install paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 echo "build whl"
 bash build.sh || exit 1
 echo "pip others"
--- a/test/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py
+++ b/test/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py
@@ -417,6 +417,12 @@ def test_metrics_endpoint(metrics_url):
    gpu_cache_usage_perc_found = False
    request_params_max_tokens_sum_found = False
    request_success_total_found = False
+    cache_config_info_found = False
+    available_batch_size_found = False
+    hit_req_rate_found = False
+    hit_token_rate_found = False
+    cpu_hit_token_rate_found = False
+    gpu_hit_token_rate_found = False

    for line in metric_lines:
        if line.startswith("fastdeploy:num_requests_running"):
@@ -483,7 +489,30 @@ def test_metrics_endpoint(metrics_url):
            _, value = line.rsplit(" ", 1)
            assert float(value) >= 0, "request_success_total 值错误"
            request_success_total_found = True
-
+        elif line.startswith("fastdeploy:cache_config_info"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "cache_config_info 值错误"
+            cache_config_info_found = True
+        elif line.startswith("fastdeploy:available_batch_size"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "available_batch_size 值错误"
+            available_batch_size_found = True
+        elif line.startswith("fastdeploy:hit_req_rate"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "hit_req_rate 值错误"
+            hit_req_rate_found = True
+        elif line.startswith("fastdeploy:hit_token_rate"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "hit_token_rate 值错误"
+            hit_token_rate_found = True
+        elif line.startswith("fastdeploy:cpu_hit_token_rate"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "cpu_hit_token_rate 值错误"
+            cpu_hit_token_rate_found = True
+        elif line.startswith("fastdeploy:gpu_hit_token_rate"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "gpu_hit_token_rate 值错误"
+            gpu_hit_token_rate_found = True
    assert num_requests_running_found, "缺少 fastdeploy:num_requests_running 指标"
    assert num_requests_waiting_found, "缺少 fastdeploy:num_requests_waiting 指标"
    assert time_to_first_token_seconds_sum_found, "缺少 fastdeploy:time_to_first_token_seconds_sum 指标"
@@ -500,6 +529,12 @@ def test_metrics_endpoint(metrics_url):
    assert gpu_cache_usage_perc_found, "缺少 fastdeploy:gpu_cache_usage_perc 指标"
    assert request_params_max_tokens_sum_found, "缺少 fastdeploy:request_params_max_tokens_sum 指标"
    assert request_success_total_found, "缺少 fastdeploy:request_success_total 指标"
+    assert cache_config_info_found, "缺少 fastdeploy:cache_config_info 指标"
+    assert available_batch_size_found, "缺少 fastdeploy:available_batch_size 指标"
+    assert hit_req_rate_found, "缺少 fastdeploy:hit_req_rate 指标"
+    assert hit_token_rate_found, "缺少 fastdeploy:hit_token_rate 指标"
+    assert cpu_hit_token_rate_found, "缺少 fastdeploy:hit_token_rate 指标"
+    assert gpu_hit_token_rate_found, "缺少 fastdeploy:gpu_hit_token_rate 指标"


 # ==========================
--- a/tests/e2e/test_Qwen2-7B-Instruct_serving.py
+++ b/tests/e2e/test_Qwen2-7B-Instruct_serving.py
@@ -0,0 +1,818 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import concurrent.futures
+import json
+import os
+import re
+import shutil
+import signal
+import socket
+import subprocess
+import sys
+import time
+
+import openai
+import pytest
+import requests
+from jsonschema import validate
+
+# Read ports from environment variables; use default values if not set
+FD_API_PORT = int(os.getenv("FD_API_PORT", 8188))
+FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133))
+FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233))
+FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
+
+# List of ports to clean before and after tests
+PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT]
+
+
+def is_port_open(host: str, port: int, timeout=1.0):
+    """
+    Check if a TCP port is open on the given host.
+    Returns True if connection succeeds, False otherwise.
+    """
+    try:
+        with socket.create_connection((host, port), timeout):
+            return True
+    except Exception:
+        return False
+
+
+def kill_process_on_port(port: int):
+    """
+    Kill processes that are listening on the given port.
+    Uses `lsof` to find process ids and sends SIGKILL.
+    """
+    try:
+        output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip()
+        current_pid = os.getpid()
+        parent_pid = os.getppid()
+        for pid in output.splitlines():
+            pid = int(pid)
+            if pid in (current_pid, parent_pid):
+                print(f"Skip killing current process (pid={pid}) on port {port}")
+                continue
+            os.kill(pid, signal.SIGKILL)
+            print(f"Killed process on port {port}, pid={pid}")
+    except subprocess.CalledProcessError:
+        pass
+
+
+def clean_ports():
+    """
+    Kill all processes occupying the ports listed in PORTS_TO_CLEAN.
+    """
+    for port in PORTS_TO_CLEAN:
+        kill_process_on_port(port)
+    time.sleep(2)
+
+
+@pytest.fixture(scope="session", autouse=True)
+def setup_and_run_server():
+    """
+    Pytest fixture that runs once per test session:
+    - Cleans ports before tests
+    - Starts the API server as a subprocess
+    - Waits for server port to open (up to 30 seconds)
+    - Tears down server after all tests finish
+    """
+    print("Pre-test port cleanup...")
+    clean_ports()
+
+    print("log dir clean ")
+    if os.path.exists("log") and os.path.isdir("log"):
+        shutil.rmtree("log")
+
+    base_path = os.getenv("MODEL_PATH")
+    if base_path:
+        model_path = os.path.join(base_path, "Qwen2-7B-Instruct")
+    else:
+        model_path = "./Qwen2-7B-Instruct"
+
+    log_path = "server.log"
+    cmd = [
+        sys.executable,
+        "-m",
+        "fastdeploy.entrypoints.openai.api_server",
+        "--model",
+        model_path,
+        "--port",
+        str(FD_API_PORT),
+        "--tensor-parallel-size",
+        "1",
+        "--engine-worker-queue-port",
+        str(FD_ENGINE_QUEUE_PORT),
+        "--metrics-port",
+        str(FD_METRICS_PORT),
+        "--cache-queue-port",
+        str(FD_CACHE_QUEUE_PORT),
+        "--max-model-len",
+        "32768",
+        "--max-num-seqs",
+        "128",
+        "--quantization",
+        "wint8",
+    ]
+
+    # Start subprocess in new process group
+    with open(log_path, "w") as logfile:
+        process = subprocess.Popen(
+            cmd,
+            stdout=logfile,
+            stderr=subprocess.STDOUT,
+            start_new_session=True,  # Enables killing full group via os.killpg
+        )
+
+    # Wait up to 300 seconds for API server to be ready
+    for _ in range(300):
+        if is_port_open("127.0.0.1", FD_API_PORT):
+            print(f"API server is up on port {FD_API_PORT}")
+            break
+        time.sleep(1)
+    else:
+        print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...")
+        try:
+            os.killpg(process.pid, signal.SIGTERM)
+        except Exception as e:
+            print(f"Failed to kill process group: {e}")
+        raise RuntimeError(f"API server did not start on port {FD_API_PORT}")
+
+    yield  # Run tests
+
+    print("\n===== Post-test server cleanup... =====")
+    try:
+        os.killpg(process.pid, signal.SIGTERM)
+        clean_ports()
+        print(f"API server (pid={process.pid}) terminated")
+    except Exception as e:
+        print(f"Failed to terminate API server: {e}")
+
+
+@pytest.fixture(scope="session")
+def api_url(request):
+    """
+    Returns the API endpoint URL for chat completions.
+    """
+    return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions"
+
+
+@pytest.fixture(scope="session")
+def metrics_url(request):
+    """
+    Returns the metrics endpoint URL.
+    """
+    return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics"
+
+
+@pytest.fixture
+def headers():
+    """
+    Returns common HTTP request headers.
+    """
+    return {"Content-Type": "application/json"}
+
+
+@pytest.fixture
+def consistent_payload():
+    """
+    Returns a fixed payload for consistency testing,
+    including a fixed random seed and temperature.
+    """
+    return {
+        "messages": [{"role": "user", "content": "用一句话介绍 PaddlePaddle"}],
+        "temperature": 0.9,
+        "top_p": 0,  # fix top_p to reduce randomness
+        "seed": 13,  # fixed random seed
+    }
+
+
+# ==========================
+# JSON Schema for validating chat API responses
+# ==========================
+chat_response_schema = {
+    "type": "object",
+    "properties": {
+        "id": {"type": "string"},
+        "object": {"type": "string"},
+        "created": {"type": "number"},
+        "model": {"type": "string"},
+        "choices": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "message": {
+                        "type": "object",
+                        "properties": {
+                            "role": {"type": "string"},
+                            "content": {"type": "string"},
+                        },
+                        "required": ["role", "content"],
+                    },
+                    "index": {"type": "number"},
+                    "finish_reason": {"type": "string"},
+                },
+                "required": ["message", "index", "finish_reason"],
+            },
+        },
+    },
+    "required": ["id", "object", "created", "model", "choices"],
+}
+
+
+# ==========================
+# Helper function to calculate difference rate between two texts
+# ==========================
+def calculate_diff_rate(text1, text2):
+    """
+    Calculate the difference rate between two strings
+    based on the normalized Levenshtein edit distance.
+    Returns a float in [0,1], where 0 means identical.
+    """
+    if text1 == text2:
+        return 0.0
+
+    len1, len2 = len(text1), len(text2)
+    dp = [[0] * (len2 + 1) for _ in range(len1 + 1)]
+
+    for i in range(len1 + 1):
+        for j in range(len2 + 1):
+            if i == 0 or j == 0:
+                dp[i][j] = i + j
+            elif text1[i - 1] == text2[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1]
+            else:
+                dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
+
+    edit_distance = dp[len1][len2]
+    max_len = max(len1, len2)
+    return edit_distance / max_len if max_len > 0 else 0.0
+
+
+# ==========================
+# Valid prompt test cases for parameterized testing
+# ==========================
+valid_prompts = [
+    [{"role": "user", "content": "你好"}],
+    [{"role": "user", "content": "用一句话介绍 FastDeploy"}],
+]
+
+
+@pytest.mark.parametrize("messages", valid_prompts)
+def test_valid_chat(messages, api_url, headers):
+    """
+    Test valid chat requests.
+    """
+    resp = requests.post(api_url, headers=headers, json={"messages": messages})
+
+    assert resp.status_code == 200
+    validate(instance=resp.json(), schema=chat_response_schema)
+
+
+# ==========================
+# Consistency test for repeated runs with fixed payload
+# ==========================
+def test_consistency_between_runs(api_url, headers, consistent_payload):
+    """
+    Test that two runs with the same fixed input produce similar outputs.
+    """
+    # First request
+    resp1 = requests.post(api_url, headers=headers, json=consistent_payload)
+    assert resp1.status_code == 200
+    result1 = resp1.json()
+    content1 = result1["choices"][0]["message"]["content"]
+
+    # Second request
+    resp2 = requests.post(api_url, headers=headers, json=consistent_payload)
+    assert resp2.status_code == 200
+    result2 = resp2.json()
+    content2 = result2["choices"][0]["message"]["content"]
+
+    # Calculate difference rate
+    diff_rate = calculate_diff_rate(content1, content2)
+
+    # Verify that the difference rate is below the threshold
+    assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})"
+
+
+# ==========================
+# Invalid prompt tests
+# ==========================
+
+invalid_prompts = [
+    [],  # Empty array
+    [{}],  # Empty object
+    [{"role": "user"}],  # Missing content
+    [{"content": "hello"}],  # Missing role
+]
+
+
+@pytest.mark.parametrize("messages", invalid_prompts)
+def test_invalid_chat(messages, api_url, headers):
+    """
+    Test invalid chat inputs
+    """
+    resp = requests.post(api_url, headers=headers, json={"messages": messages})
+    assert resp.status_code >= 400, "Invalid request should return an error status code"
+
+
+# ==========================
+# Test for input exceeding context length
+# ==========================
+
+
+def test_exceed_context_length(api_url, headers):
+    """
+    Test case for inputs that exceed the model's maximum context length.
+    """
+    # Construct an overly long message
+    long_content = "你好，" * 20000
+
+    messages = [{"role": "user", "content": long_content}]
+
+    resp = requests.post(api_url, headers=headers, json={"messages": messages})
+
+    # Check if the response indicates a token limit error or server error (500)
+    try:
+        response_json = resp.json()
+    except Exception:
+        response_json = {}
+
+    # Check status code and response content
+    assert (
+        resp.status_code != 200 or "token" in json.dumps(response_json).lower()
+    ), f"Expected token limit error or similar, but got a normal response: {response_json}"
+
+
+# ==========================
+# Multi-turn Conversation Test
+# ==========================
+def test_multi_turn_conversation(api_url, headers):
+    """
+    Test whether multi-turn conversation context is effective.
+    """
+    messages = [
+        {"role": "user", "content": "你是谁？"},
+        {"role": "assistant", "content": "我是AI助手"},
+        {"role": "user", "content": "你能做什么？"},
+    ]
+    resp = requests.post(api_url, headers=headers, json={"messages": messages})
+    assert resp.status_code == 200
+    validate(instance=resp.json(), schema=chat_response_schema)
+
+
+# ==========================
+# Concurrent Performance Test
+# ==========================
+def test_concurrent_perf(api_url, headers):
+    """
+    Send concurrent requests to test stability and response time.
+    """
+    prompts = [{"role": "user", "content": "Introduce FastDeploy."}]
+
+    def send_request():
+        """
+        Send a single request
+        """
+        resp = requests.post(api_url, headers=headers, json={"messages": prompts})
+        assert resp.status_code == 200
+        return resp.elapsed.total_seconds()
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+        futures = [executor.submit(send_request) for _ in range(8)]
+        durations = [f.result() for f in futures]
+
+    print("\nResponse time for each request:", durations)
+
+
+# ==========================
+# Metrics Endpoint Test
+# ==========================
+
+
+def test_metrics_endpoint(metrics_url):
+    """
+    Test the metrics monitoring endpoint.
+    """
+    resp = requests.get(metrics_url, timeout=5)
+
+    assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}"
+    assert "text/plain" in resp.headers["Content-Type"], "Content-Type is not text/plain"
+
+    # Parse Prometheus metrics data
+    metrics_data = resp.text
+    lines = metrics_data.split("\n")
+
+    metric_lines = [line for line in lines if not line.startswith("#") and line.strip() != ""]
+
+    # 断言 具体值
+    num_requests_running_found = False
+    num_requests_waiting_found = False
+    time_to_first_token_seconds_sum_found = False
+    time_per_output_token_seconds_sum_found = False
+    e2e_request_latency_seconds_sum_found = False
+    request_inference_time_seconds_sum_found = False
+    request_queue_time_seconds_sum_found = False
+    request_prefill_time_seconds_sum_found = False
+    request_decode_time_seconds_sum_found = False
+    prompt_tokens_total_found = False
+    generation_tokens_total_found = False
+    request_prompt_tokens_sum_found = False
+    request_generation_tokens_sum_found = False
+    gpu_cache_usage_perc_found = False
+    request_params_max_tokens_sum_found = False
+    request_success_total_found = False
+    cache_config_info_found = False
+    available_batch_size_found = False
+    hit_req_rate_found = False
+    hit_token_rate_found = False
+    cpu_hit_token_rate_found = False
+    gpu_hit_token_rate_found = False
+
+    for line in metric_lines:
+        if line.startswith("fastdeploy:num_requests_running"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "num_requests_running 值错误"
+            num_requests_running_found = True
+        elif line.startswith("fastdeploy:num_requests_waiting"):
+            _, value = line.rsplit(" ", 1)
+            num_requests_waiting_found = True
+            assert float(value) >= 0, "num_requests_waiting 值错误"
+        elif line.startswith("fastdeploy:time_to_first_token_seconds_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "time_to_first_token_seconds_sum 值错误"
+            time_to_first_token_seconds_sum_found = True
+        elif line.startswith("fastdeploy:time_per_output_token_seconds_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "time_per_output_token_seconds_sum 值错误"
+            time_per_output_token_seconds_sum_found = True
+        elif line.startswith("fastdeploy:e2e_request_latency_seconds_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "e2e_request_latency_seconds_sum_found 值错误"
+            e2e_request_latency_seconds_sum_found = True
+        elif line.startswith("fastdeploy:request_inference_time_seconds_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "request_inference_time_seconds_sum 值错误"
+            request_inference_time_seconds_sum_found = True
+        elif line.startswith("fastdeploy:request_queue_time_seconds_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "request_queue_time_seconds_sum 值错误"
+            request_queue_time_seconds_sum_found = True
+        elif line.startswith("fastdeploy:request_prefill_time_seconds_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "request_prefill_time_seconds_sum 值错误"
+            request_prefill_time_seconds_sum_found = True
+        elif line.startswith("fastdeploy:request_decode_time_seconds_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "request_decode_time_seconds_sum 值错误"
+            request_decode_time_seconds_sum_found = True
+        elif line.startswith("fastdeploy:prompt_tokens_total"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "prompt_tokens_total 值错误"
+            prompt_tokens_total_found = True
+        elif line.startswith("fastdeploy:generation_tokens_total"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "generation_tokens_total 值错误"
+            generation_tokens_total_found = True
+        elif line.startswith("fastdeploy:request_prompt_tokens_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "request_prompt_tokens_sum 值错误"
+            request_prompt_tokens_sum_found = True
+        elif line.startswith("fastdeploy:request_generation_tokens_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "request_generation_tokens_sum 值错误"
+            request_generation_tokens_sum_found = True
+        elif line.startswith("fastdeploy:gpu_cache_usage_perc"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "gpu_cache_usage_perc 值错误"
+            gpu_cache_usage_perc_found = True
+        elif line.startswith("fastdeploy:request_params_max_tokens_sum"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "request_params_max_tokens_sum 值错误"
+            request_params_max_tokens_sum_found = True
+        elif line.startswith("fastdeploy:request_success_total"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "request_success_total 值错误"
+            request_success_total_found = True
+        elif line.startswith("fastdeploy:cache_config_info"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "cache_config_info 值错误"
+            cache_config_info_found = True
+        elif line.startswith("fastdeploy:available_batch_size"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "available_batch_size 值错误"
+            available_batch_size_found = True
+        elif line.startswith("fastdeploy:hit_req_rate"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "hit_req_rate 值错误"
+            hit_req_rate_found = True
+        elif line.startswith("fastdeploy:hit_token_rate"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "hit_token_rate 值错误"
+            hit_token_rate_found = True
+        elif line.startswith("fastdeploy:cpu_hit_token_rate"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "cpu_hit_token_rate 值错误"
+            cpu_hit_token_rate_found = True
+        elif line.startswith("fastdeploy:gpu_hit_token_rate"):
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, "gpu_hit_token_rate 值错误"
+            gpu_hit_token_rate_found = True
+    assert num_requests_running_found, "缺少 fastdeploy:num_requests_running 指标"
+    assert num_requests_waiting_found, "缺少 fastdeploy:num_requests_waiting 指标"
+    assert time_to_first_token_seconds_sum_found, "缺少 fastdeploy:time_to_first_token_seconds_sum 指标"
+    assert time_per_output_token_seconds_sum_found, "缺少 fastdeploy:time_per_output_token_seconds_sum 指标"
+    assert e2e_request_latency_seconds_sum_found, "缺少 fastdeploy:e2e_request_latency_seconds_sum_found 指标"
+    assert request_inference_time_seconds_sum_found, "缺少 fastdeploy:request_inference_time_seconds_sum 指标"
+    assert request_queue_time_seconds_sum_found, "缺少 fastdeploy:request_queue_time_seconds_sum 指标"
+    assert request_prefill_time_seconds_sum_found, "缺少 fastdeploy:request_prefill_time_seconds_sum 指标"
+    assert request_decode_time_seconds_sum_found, "缺少 fastdeploy:request_decode_time_seconds_sum 指标"
+    assert prompt_tokens_total_found, "缺少 fastdeploy:prompt_tokens_total 指标"
+    assert generation_tokens_total_found, "缺少 fastdeploy:generation_tokens_total 指标"
+    assert request_prompt_tokens_sum_found, "缺少 fastdeploy:request_prompt_tokens_sum 指标"
+    assert request_generation_tokens_sum_found, "缺少 fastdeploy:request_generation_tokens_sum 指标"
+    assert gpu_cache_usage_perc_found, "缺少 fastdeploy:gpu_cache_usage_perc 指标"
+    assert request_params_max_tokens_sum_found, "缺少 fastdeploy:request_params_max_tokens_sum 指标"
+    assert request_success_total_found, "缺少 fastdeploy:request_success_total 指标"
+    assert cache_config_info_found, "缺少 fastdeploy:cache_config_info 指标"
+    assert available_batch_size_found, "缺少 fastdeploy:available_batch_size 指标"
+    assert hit_req_rate_found, "缺少 fastdeploy:hit_req_rate 指标"
+    assert hit_token_rate_found, "缺少 fastdeploy:hit_token_rate 指标"
+    assert cpu_hit_token_rate_found, "缺少 fastdeploy:hit_token_rate 指标"
+    assert gpu_hit_token_rate_found, "缺少 fastdeploy:gpu_hit_token_rate 指标"
+
+
+# ==========================
+# OpenAI Client chat.completions Test
+# ==========================
+
+
+@pytest.fixture
+def openai_client():
+    ip = "0.0.0.0"
+    service_http_port = str(FD_API_PORT)
+    client = openai.Client(
+        base_url=f"http://{ip}:{service_http_port}/v1",
+        api_key="EMPTY_API_KEY",
+    )
+    return client
+
+
+# Non-streaming test
+def test_non_streaming_chat(openai_client):
+    """Test non-streaming chat functionality with the local service"""
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=1,
+        max_tokens=1024,
+        stream=False,
+    )
+
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "content")
+
+
+# Streaming test
+def test_streaming_chat(openai_client, capsys):
+    """Test streaming chat functionality with the local service"""
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+            {
+                "role": "assistant",
+                "content": "China(Beijing), France(Paris), Australia(Canberra).",
+            },
+            {"role": "user", "content": "OK, tell more."},
+        ],
+        temperature=1,
+        max_tokens=1024,
+        stream=True,
+    )
+
+    output = []
+    for chunk in response:
+        if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"):
+            output.append(chunk.choices[0].delta.content)
+    assert len(output) > 2
+
+
+# ==========================
+# OpenAI Client completions Test
+# ==========================
+
+
+def test_non_streaming(openai_client):
+    """Test non-streaming chat functionality with the local service"""
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=1024,
+        stream=False,
+    )
+
+    # Assertions to check the response structure
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+
+
+def test_streaming(openai_client, capsys):
+    """Test streaming functionality with the local service"""
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=1024,
+        stream=True,
+    )
+
+    # Collect streaming output
+    output = []
+    for chunk in response:
+        output.append(chunk.choices[0].text)
+    assert len(output) > 0
+
+
+def test_profile_reset_block_num():
+    """测试profile reset_block_num功能，与baseline diff不能超过5%"""
+    log_file = "./log/config.log"
+    baseline = 32562
+
+    if not os.path.exists(log_file):
+        pytest.fail(f"Log file not found: {log_file}")
+
+    with open(log_file, "r") as f:
+        log_lines = f.readlines()
+
+    target_line = None
+    for line in log_lines:
+        if "Reset block num" in line:
+            target_line = line.strip()
+            break
+
+    if target_line is None:
+        pytest.fail("日志中没有Reset block num信息")
+
+    match = re.search(r"total_block_num:(\d+)", target_line)
+    if not match:
+        pytest.fail(f"Failed to extract total_block_num from line: {target_line}")
+
+    try:
+        actual_value = int(match.group(1))
+    except ValueError:
+        pytest.fail(f"Invalid number format: {match.group(1)}")
+
+    lower_bound = baseline * (1 - 0.05)
+    upper_bound = baseline * (1 + 0.05)
+    print(f"Reset total_block_num: {actual_value}. baseline: {baseline}")
+
+    assert lower_bound <= actual_value <= upper_bound, (
+        f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内"
+        f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
+    )
+
+
+def test_prompt_token_ids_in_non_streaming_completion(openai_client):
+    """
+    Test cases for passing token ids through `prompt`/`prompt_token_ids` in non-streaming completion api
+    """
+    # Test case for passing a token id list in `prompt_token_ids`
+    response = openai_client.completions.create(
+        model="default",
+        prompt="",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
+        stream=False,
+    )
+    assert len(response.choices) == 1
+    assert response.usage.prompt_tokens == 9
+
+    # Test case for passing a batch of token id lists in `prompt_token_ids`
+    response = openai_client.completions.create(
+        model="default",
+        prompt="",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [[5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937], [1, 2, 3]]},
+        stream=False,
+    )
+    assert len(response.choices) == 2
+    assert response.usage.prompt_tokens == 9 + 3
+
+    # Test case for passing a token id list in `prompt`
+    response = openai_client.completions.create(
+        model="default",
+        prompt=[5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937],
+        temperature=1,
+        max_tokens=5,
+        stream=False,
+    )
+    assert len(response.choices) == 1
+    assert response.usage.prompt_tokens == 9
+
+    # Test case for passing a batch of token id lists in `prompt`
+    response = openai_client.completions.create(
+        model="default",
+        prompt=[[5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937], [1, 2, 3]],
+        temperature=1,
+        max_tokens=5,
+        stream=False,
+    )
+    assert len(response.choices) == 2
+    assert response.usage.prompt_tokens == 9 + 3
+
+
+def test_prompt_token_ids_in_streaming_completion(openai_client):
+    """
+    Test cases for passing token ids through `prompt`/`prompt_token_ids` in streaming completion api
+    """
+    # Test case for passing a token id list in `prompt_token_ids`
+    response = openai_client.completions.create(
+        model="default",
+        prompt="",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    sum_prompt_tokens = 0
+    for chunk in response:
+        if len(chunk.choices) > 0:
+            assert chunk.usage is None
+        else:
+            sum_prompt_tokens += chunk.usage.prompt_tokens
+    assert sum_prompt_tokens == 9
+
+    # Test case for passing a batch of token id lists in `prompt_token_ids`
+    response = openai_client.completions.create(
+        model="default",
+        prompt="",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [[5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937], [1, 2, 3]]},
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    sum_prompt_tokens = 0
+    for chunk in response:
+        if len(chunk.choices) > 0:
+            assert chunk.usage is None
+        else:
+            sum_prompt_tokens += chunk.usage.prompt_tokens
+    assert sum_prompt_tokens == 9 + 3
+
+    # Test case for passing a token id list in `prompt`
+    response = openai_client.completions.create(
+        model="default",
+        prompt=[5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937],
+        temperature=1,
+        max_tokens=5,
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    sum_prompt_tokens = 0
+    for chunk in response:
+        if len(chunk.choices) > 0:
+            assert chunk.usage is None
+        else:
+            sum_prompt_tokens += chunk.usage.prompt_tokens
+    assert sum_prompt_tokens == 9
+
+    # Test case for passing a batch of token id lists in `prompt`
+    response = openai_client.completions.create(
+        model="default",
+        prompt=[[5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937], [1, 2, 3]],
+        temperature=1,
+        max_tokens=5,
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    sum_prompt_tokens = 0
+    for chunk in response:
+        if len(chunk.choices) > 0:
+            assert chunk.usage is None
+        else:
+            sum_prompt_tokens += chunk.usage.prompt_tokens
+    assert sum_prompt_tokens == 9 + 3
--- a/tests/metrics/test_new_metrics.py
+++ b/tests/metrics/test_new_metrics.py
@@ -0,0 +1,92 @@
+import unittest
+from unittest.mock import MagicMock, patch
+
+from fastdeploy.cache_manager.cache_metrics import CacheMetrics
+from fastdeploy.output.token_processor import TokenProcessor
+
+
+class TestCoverageFix(unittest.TestCase):
+    @patch("fastdeploy.cache_manager.cache_metrics.main_process_metrics")
+    def test_cache_metrics_update_history(self, mock_main_process_metrics):
+        """
+        测试 CacheMetrics._update_history_hit_metrics 方法。
+
+        目标：确保 main_process_metrics 的 .set() 方法被正确调用，覆盖第 58-61 行。
+        """
+        print("\nRunning test for CacheMetrics._update_history_hit_metrics...")
+        metrics = CacheMetrics()
+
+        # 准备数据以避免除零错误
+        metrics.req_count = 20
+        metrics.hit_req_count = 10
+        metrics.total_token_num = 1000
+        metrics.total_cpu_matched_token_num = 250
+        metrics.total_gpu_matched_token_num = 350
+        metrics.matched_token_num = metrics.total_cpu_matched_token_num + metrics.total_gpu_matched_token_num
+
+        # 调用目标方法
+        metrics._update_history_hit_metrics()
+
+        # 断言 Prometheus 指标的 set 方法是否被正确的值调用
+        mock_main_process_metrics.hit_req_rate.set.assert_called_once_with(0.5)  # 10 / 20
+        mock_main_process_metrics.hit_token_rate.set.assert_called_once_with(0.6)  # 600 / 1000
+        mock_main_process_metrics.cpu_hit_token_rate.set.assert_called_once_with(0.25)  # 250 / 1000
+        mock_main_process_metrics.gpu_hit_token_rate.set.assert_called_once_with(0.35)  # 350 / 1000
+
+        print("Test for CacheMetrics passed.")
+
+    def setUp(self):
+        """为 TokenProcessor 测试设置通用的 mock 对象。"""
+        self.mock_cfg = MagicMock()
+        self.mock_cached_generated_tokens = MagicMock()
+        self.mock_engine_worker_queue = MagicMock()
+        self.mock_split_connector = MagicMock()
+        self.mock_resource_manager = MagicMock()
+
+        with patch("fastdeploy.output.token_processor.IPCSignal"):
+            self.processor = TokenProcessor(
+                cfg=self.mock_cfg,
+                cached_generated_tokens=self.mock_cached_generated_tokens,
+                engine_worker_queue=self.mock_engine_worker_queue,
+                split_connector=self.mock_split_connector,
+            )
+        self.processor.resource_manager = self.mock_resource_manager
+
+    # 使用 patch 来模拟 token_processor 模块中引用的 main_process_metrics
+    @patch("fastdeploy.output.token_processor.main_process_metrics")
+    def test_recycle_resources_updates_metrics(self, mock_main_process_metrics):
+        """
+        测试 TokenProcessor._recycle_resources 方法。
+
+        目标：确保 available_batch_size 等指标被更新，覆盖第 285 行左右的代码。
+        """
+        print("\nRunning test for TokenProcessor._recycle_resources (metric update)...")
+
+        # 1. 准备测试数据和 mock 行为
+        task_id = "request-456"
+        index = 0
+        mock_task = MagicMock()
+
+        # 配置 resource_manager 的 mock 返回值
+        self.mock_resource_manager.available_batch.return_value = 8
+        self.mock_resource_manager.total_block_number.return_value = 1024
+        self.mock_resource_manager.max_num_seqs = 16
+
+        # _recycle_resources 方法内部会操作这些列表/字典
+        self.mock_resource_manager.tasks_list = [mock_task]
+        self.mock_resource_manager.stop_flags = [False]
+
+        # 为了避免 del self.tokens_counter[task_id] 抛出 KeyError
+        self.processor.tokens_counter[task_id] = 5
+
+        # 调用目标方法
+        self.processor._recycle_resources(task_id=task_id, index=index, task=mock_task, result=None, is_prefill=False)
+
+        # 核心断言：验证 available_batch_size 指标是否被正确设置
+        mock_main_process_metrics.available_batch_size.set.assert_called_once_with(8)
+
+        print("Test for TokenProcessor passed.")
+
+
+if __name__ == "__main__":
+    unittest.main()