add e2e cases (#3476)

* add e2e cases * fix
2025-12-24 13:28:13 +08:00 · 2025-08-20 18:50:14 +08:00
parent 9ff2dfb162
commit e197894977
5 changed files with 2262 additions and 0 deletions
--- a/.github/workflows/_unit_test_coverage.yml
+++ b/.github/workflows/_unit_test_coverage.yml
@@ -150,6 +150,7 @@ jobs:

          python -m pip install coverage
          python -m pip install diff-cover
+          python -m pip install jsonschema aistudio_sdk==0.3.5
          python -m pip install ${fd_wheel_url}
          if [ -d "tests/plugins" ]; then
              cd tests/plugins
@@ -160,6 +161,7 @@ jobs:
          fi
          export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage
          export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc
+          export COVERAGE_PROCESS_START=/workspace/FastDeploy/scripts/.coveragerc
          TEST_EXIT_CODE=0
          bash scripts/coverage_run.sh || TEST_EXIT_CODE=8
          git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
--- a/scripts/.coveragerc
+++ b/scripts/.coveragerc
@@ -1,6 +1,7 @@
 [run]
 source = fastdeploy
 parallel = True
+concurrency = multiprocessing

 [paths]
 source =
--- a/test/e2e/test_EB_Lite_serving.py
+++ b/test/e2e/test_EB_Lite_serving.py
--- a/test/e2e/test_EB_VL_Lite_serving.py
+++ b/test/e2e/test_EB_VL_Lite_serving.py
@@ -0,0 +1,578 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import re
+import signal
+import socket
+import subprocess
+import sys
+import time
+
+import openai
+import pytest
+import requests
+
+# Read ports from environment variables; use default values if not set
+FD_API_PORT = int(os.getenv("FD_API_PORT", 8188))
+FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133))
+FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233))
+
+# List of ports to clean before and after tests
+PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT]
+
+
+def is_port_open(host: str, port: int, timeout: float = 1.0) -> bool:
+    """
+    Check whether a TCP connection to ``host:port`` can be established.
+
+    Args:
+        host: Host name or IP address to connect to.
+        port: TCP port number to probe.
+        timeout: Maximum number of seconds to wait for the connection.
+
+    Returns:
+        True if the connection succeeds, False if it is refused, times out
+        or the address cannot be resolved.
+    """
+    try:
+        with socket.create_connection((host, port), timeout=timeout):
+            return True
+    except OSError:
+        # Only network-level failures mean "port closed"; programming errors
+        # (e.g. a wrong argument type) surface instead of being hidden.
+        return False
+
+
+def kill_process_on_port(port: int):
+    """
+    Kill the processes that `lsof` reports as using the given port.
+
+    Every reported pid except the current process receives SIGKILL.
+    Nothing happens when no process uses the port or `lsof` is unavailable.
+
+    Args:
+        port: Port number whose users should be killed.
+    """
+    try:
+        # Argument list instead of a shell string: no shell parsing is involved.
+        output = subprocess.check_output(["lsof", "-t", f"-i:{port}"]).decode().strip()
+    except FileNotFoundError:
+        print(f"lsof is not available, skipping cleanup of port {port}")
+        return
+    except subprocess.CalledProcessError:
+        # lsof exits with a non-zero status when nothing uses the port.
+        return
+
+    own_pid = os.getpid()
+    for pid in output.splitlines():
+        # `lsof -i` also lists client sockets; never kill the test process itself.
+        if int(pid) == own_pid:
+            continue
+        try:
+            os.kill(int(pid), signal.SIGKILL)
+        except ProcessLookupError:
+            # The process exited between the lsof call and the kill.
+            continue
+        print(f"Killed process on port {port}, pid={pid}")
+
+
+def clean_ports():
+    """
+    Free every port listed in PORTS_TO_CLEAN by killing the processes using it.
+    """
+    for occupied_port in PORTS_TO_CLEAN:
+        kill_process_on_port(occupied_port)
+
+
+@pytest.fixture(scope="session", autouse=True)
+def setup_and_run_server():
+    """
+    Pytest fixture that runs once per test session:
+    - Cleans ports before tests
+    - Starts the API server as a subprocess (output goes to server.log)
+    - Waits for the server port to open (up to 10 minutes)
+    - Tears down the server's process group after all tests finish
+
+    Raises:
+        RuntimeError: If the server process exits during startup or the API
+            port does not open within the startup timeout.
+    """
+    print("Pre-test port cleanup...")
+    clean_ports()
+
+    base_path = os.getenv("MODEL_PATH")
+    if base_path:
+        model_path = os.path.join(base_path, "ernie-4_5-vl-28b-a3b-bf16-paddle")
+    else:
+        model_path = "./ernie-4_5-vl-28b-a3b-bf16-paddle"
+
+    log_path = "server.log"
+    limit_mm_str = json.dumps({"image": 100, "video": 100})
+
+    cmd = [
+        sys.executable,
+        "-m",
+        "fastdeploy.entrypoints.openai.api_server",
+        "--model",
+        model_path,
+        "--port",
+        str(FD_API_PORT),
+        "--tensor-parallel-size",
+        "2",
+        "--engine-worker-queue-port",
+        str(FD_ENGINE_QUEUE_PORT),
+        "--metrics-port",
+        str(FD_METRICS_PORT),
+        "--enable-mm",
+        "--max-model-len",
+        "32768",
+        "--max-num-batched-tokens",
+        "384",
+        "--max-num-seqs",
+        "128",
+        "--limit-mm-per-prompt",
+        limit_mm_str,
+        "--enable-chunked-prefill",
+        "--kv-cache-ratio",
+        "0.71",
+        "--quantization",
+        "wint4",
+        "--reasoning-parser",
+        "ernie-45-vl",
+    ]
+
+    # Start the subprocess in a new session so its whole process group can be signalled
+    with open(log_path, "w") as logfile:
+        process = subprocess.Popen(
+            cmd,
+            stdout=logfile,
+            stderr=subprocess.STDOUT,
+            start_new_session=True,  # Enables killing full group via os.killpg
+        )
+
+    # Wait up to 10 minutes for API server to be ready
+    startup_timeout_minutes = 10
+    for _ in range(startup_timeout_minutes * 60):
+        if is_port_open("127.0.0.1", FD_API_PORT):
+            print(f"API server is up on port {FD_API_PORT}")
+            break
+        if process.poll() is not None:
+            # The launcher already exited, so waiting any longer cannot succeed.
+            try:
+                os.killpg(process.pid, signal.SIGTERM)  # stop children left behind
+            except OSError as e:
+                print(f"Failed to kill process group: {e}")
+            raise RuntimeError(
+                f"API server exited during startup with code {process.returncode}, see {log_path}"
+            )
+        time.sleep(1)
+    else:
+        print(f"[TIMEOUT] API server failed to start in {startup_timeout_minutes} minutes. Cleaning up...")
+        try:
+            os.killpg(process.pid, signal.SIGTERM)
+        except OSError as e:
+            print(f"Failed to kill process group: {e}")
+        raise RuntimeError(f"API server did not start on port {FD_API_PORT}")
+
+    yield  # Run tests
+
+    print("\n===== Post-test server cleanup... =====")
+    try:
+        os.killpg(process.pid, signal.SIGTERM)
+        try:
+            # Reap the server so the ports are really free when the session ends.
+            process.wait(timeout=30)
+        except subprocess.TimeoutExpired:
+            os.killpg(process.pid, signal.SIGKILL)
+            process.wait()
+        print(f"API server (pid={process.pid}) terminated")
+    except OSError as e:
+        print(f"Failed to terminate API server: {e}")
+
+
+@pytest.fixture(scope="session")
+def api_url(request):
+    """
+    Returns the chat completions endpoint URL of the local API server.
+
+    The loopback address is used because 0.0.0.0 is a bind address and is
+    not a portable destination address for client connections.
+    """
+    return f"http://127.0.0.1:{FD_API_PORT}/v1/chat/completions"
+
+
+@pytest.fixture(scope="session")
+def metrics_url(request):
+    """
+    Returns the metrics endpoint URL of the local server.
+
+    The loopback address is used because 0.0.0.0 is a bind address and is
+    not a portable destination address for client connections.
+    """
+    return f"http://127.0.0.1:{FD_METRICS_PORT}/metrics"
+
+
+@pytest.fixture
+def headers():
+    """
+    Provides the HTTP headers shared by the raw `requests` calls (JSON body).
+    """
+    common_headers = {"Content-Type": "application/json"}
+    return common_headers
+
+
+@pytest.fixture
+def consistent_payload():
+    """
+    Builds the fixed request body used by the consistency test: one user
+    message (an image plus a text prompt) with pinned sampling parameters.
+    """
+    image_part = {
+        "type": "image_url",
+        "image_url": {
+            "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0",
+            "detail": "high",
+        },
+    }
+    text_part = {"type": "text", "text": "请描述图片内容"}
+    user_message = {"role": "user", "content": [image_part, text_part]}
+
+    payload = {
+        "messages": [user_message],
+        "temperature": 0.8,
+        "top_p": 0,  # fix top_p to reduce randomness
+        "seed": 13,  # fixed random seed
+    }
+    return payload
+
+
+# ==========================
+# Consistency test: compare the output for a fixed payload with a stored baseline
+# ==========================
+def test_consistency_between_runs(api_url, headers, consistent_payload):
+    """
+    Test that the response to the fixed payload equals the stored baseline.
+
+    The reasoning content and the answer are joined with "</think>" and
+    compared with the baseline file "ernie-4_5-vl-base-tp2", which is looked
+    up under MODEL_PATH when that variable is set, otherwise in the current
+    directory. The produced text is also written to "ernie-4_5-vl".
+    """
+    # request
+    resp1 = requests.post(api_url, headers=headers, json=consistent_payload)
+    assert resp1.status_code == 200
+    message = resp1.json()["choices"][0]["message"]
+    content1 = message["reasoning_content"] + "</think>" + message["content"]
+
+    # Dump the latest output. The file is overwritten so that it always holds
+    # exactly the text that was compared, not an accumulation of earlier runs.
+    file_res_temp = "ernie-4_5-vl"
+    with open(file_res_temp, "w", encoding="utf-8") as f_o:
+        f_o.write(content1)
+
+    # base result
+    base_path = os.getenv("MODEL_PATH")
+    if base_path:
+        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2")
+    else:
+        base_file = "ernie-4_5-vl-base-tp2"
+    # Explicit encoding: the text is not ASCII, so the locale default must not matter.
+    with open(base_file, "r", encoding="utf-8") as f:
+        content2 = f.read()
+
+    # Verify that result is same as the base result
+    assert content1 == content2
+
+
+# ==========================
+# OpenAI Client Chat Completion Test
+# ==========================
+
+
+@pytest.fixture
+def openai_client():
+    """
+    Returns an OpenAI client pointed at the local API server.
+
+    The loopback address is used because 0.0.0.0 is a bind address and is
+    not a portable destination address for client connections.
+    """
+    ip = "127.0.0.1"
+    service_http_port = str(FD_API_PORT)
+    client = openai.Client(
+        base_url=f"http://{ip}:{service_http_port}/v1",
+        api_key="EMPTY_API_KEY",
+    )
+    return client
+
+
+# Non-streaming test
+def test_non_streaming_chat(openai_client):
+    """Send one image+text request without streaming and check the response shape."""
+    # The system message is optional.
+    system_message = {"role": "system", "content": "You are a helpful AI assistant."}
+    user_message = {
+        "role": "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0",
+                    "detail": "high",
+                },
+            },
+            {"type": "text", "text": "请描述图片内容"},
+        ],
+    }
+
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[system_message, user_message],
+        temperature=1,
+        max_tokens=53,
+        stream=False,
+    )
+
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+    first_choice = response.choices[0]
+    assert hasattr(first_choice, "message")
+    assert hasattr(first_choice.message, "content")
+
+
+# Streaming test
+def test_streaming_chat(openai_client, capsys):
+    """Stream a multi-turn image+text conversation and check that several content chunks arrive."""
+    conversation = [
+        # The system message is optional.
+        {"role": "system", "content": "You are a helpful AI assistant."},
+        {"role": "user", "content": "List 3 countries and their capitals."},
+        {
+            "role": "assistant",
+            "content": "China(Beijing), France(Paris), Australia(Canberra).",
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0",
+                        "detail": "high",
+                    },
+                },
+                {"type": "text", "text": "请描述图片内容"},
+            ],
+        },
+    ]
+
+    stream = openai_client.chat.completions.create(
+        model="default",
+        messages=conversation,
+        temperature=1,
+        max_tokens=512,
+        stream=True,
+    )
+
+    # Keep the content of every chunk that exposes a delta with a content attribute.
+    output = [
+        chunk.choices[0].delta.content
+        for chunk in stream
+        if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content")
+    ]
+    assert len(output) > 2
+
+
+# ==========================
+# OpenAI Client additional chat/completions test
+# ==========================
+
+
+def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Check the return_token_ids option of non-streaming chat: token id lists are
+    returned when it is enabled and are None when it is disabled.
+    """
+    conversation = [
+        {"role": "system", "content": "You are a helpful AI assistant."},  # the system message is optional
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high",
+                    },
+                },
+                {"type": "text", "text": "请描述图片内容"},
+            ],
+        },
+    ]
+
+    def ask(return_token_ids):
+        """Send the same non-streaming request with the given return_token_ids flag."""
+        return openai_client.chat.completions.create(
+            model="default",
+            messages=conversation,
+            temperature=1,
+            max_tokens=53,
+            extra_body={"return_token_ids": return_token_ids},
+            stream=False,
+        )
+
+    # return_token_ids enabled
+    response = ask(True)
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
+    assert isinstance(response.choices[0].message.prompt_token_ids, list)
+    assert hasattr(response.choices[0].message, "completion_token_ids")
+    assert isinstance(response.choices[0].message.completion_token_ids, list)
+
+    # return_token_ids disabled
+    response = ask(False)
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
+    assert response.choices[0].message.prompt_token_ids is None
+    assert hasattr(response.choices[0].message, "completion_token_ids")
+    assert response.choices[0].message.completion_token_ids is None
+
+
+def test_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Check the return_token_ids option of streaming chat: when enabled, the first
+    chunk carries the prompt token ids and later chunks carry completion token
+    ids; when disabled, both fields are None on every chunk.
+    """
+    conversation = [
+        {"role": "system", "content": "You are a helpful AI assistant."},  # the system message is optional
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high",
+                    },
+                },
+                {"type": "text", "text": "请描述图片内容"},
+            ],
+        },
+    ]
+
+    def open_stream(return_token_ids):
+        """Send the same streaming request with the given return_token_ids flag."""
+        return openai_client.chat.completions.create(
+            model="default",
+            messages=conversation,
+            temperature=1,
+            max_tokens=53,
+            extra_body={"return_token_ids": return_token_ids},
+            stream=True,
+        )
+
+    # enable return_token_ids
+    for chunk_index, chunk in enumerate(open_stream(True)):
+        assert hasattr(chunk, "choices")
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
+        delta = chunk.choices[0].delta
+        if chunk_index == 0:
+            assert isinstance(delta.prompt_token_ids, list)
+            assert delta.completion_token_ids is None
+        else:
+            assert delta.prompt_token_ids is None
+            assert isinstance(delta.completion_token_ids, list)
+
+    # disable return_token_ids
+    for chunk in open_stream(False):
+        assert hasattr(chunk, "choices")
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
+        assert chunk.choices[0].delta.prompt_token_ids is None
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
+        assert chunk.choices[0].delta.completion_token_ids is None
+
+
+def test_chat_with_thinking(openai_client, capsys):
+    """
+    Test the enable_thinking and reasoning_max_tokens options of the local service.
+
+    Covers three requests: thinking enabled (non-streaming), thinking disabled
+    (non-streaming) and thinking enabled with a reasoning token limit (streaming).
+    """
+    # enable thinking, non-streaming
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=10,
+        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
+    )
+    assert response.choices[0].message.reasoning_content is not None
+
+    # disable thinking, non-streaming
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=10,
+        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
+    )
+    assert response.choices[0].message.reasoning_content is None
+    assert "</think>" not in response.choices[0].message.content
+
+    # enable thinking, streaming
+    reasoning_max_tokens = 3
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": reasoning_max_tokens,
+            "return_token_ids": True,
+        },
+        stream=True,
+        max_tokens=10,
+    )
+    # NOTE(review): both counters start at 1 while total_tokens starts at 0; presumably this
+    # accounts for tokens in chunks that match neither branch below — confirm against the server.
+    completion_tokens = reasoning_tokens = 1
+    total_tokens = 0
+    for chunk_id, chunk in enumerate(response):
+        if chunk_id == 0:  # the first chunk is an extra chunk
+            continue
+        delta_message = chunk.choices[0].delta
+        # Attribute the chunk's tokens to the answer or to the reasoning part,
+        # depending on which of the two text fields is non-empty.
+        if delta_message.content != "" and delta_message.reasoning_content == "":
+            completion_tokens += len(delta_message.completion_token_ids)
+        elif delta_message.reasoning_content != "" and delta_message.content == "":
+            reasoning_tokens += len(delta_message.completion_token_ids)
+        # Every chunk after the first counts towards the total.
+        total_tokens += len(delta_message.completion_token_ids)
+    assert completion_tokens + reasoning_tokens == total_tokens
+    assert reasoning_tokens <= reasoning_max_tokens
+
+
+def test_profile_reset_block_num():
+    """
+    Test the profile reset_block_num feature.
+
+    Reads the first "Reset block num" line from ./log/config.log, extracts
+    total_block_num and requires it to be within 5% of the baseline.
+    """
+    log_file = "./log/config.log"
+    baseline = 40000
+    tolerance = 0.05
+
+    if not os.path.exists(log_file):
+        pytest.fail(f"Log file not found: {log_file}")
+
+    # Stream the log and stop at the first matching line instead of loading the
+    # whole file; undecodable bytes elsewhere in the log must not break the test.
+    with open(log_file, "r", encoding="utf-8", errors="replace") as f:
+        target_line = next((line.strip() for line in f if "Reset block num" in line), None)
+
+    if target_line is None:
+        pytest.fail("日志中没有Reset block num信息")
+
+    match = re.search(r"total_block_num:(\d+)", target_line)
+    if not match:
+        pytest.fail(f"Failed to extract total_block_num from line: {target_line}")
+
+    # The `\d+` group can only contain digits, so the conversion cannot fail.
+    actual_value = int(match.group(1))
+
+    lower_bound = baseline * (1 - tolerance)
+    upper_bound = baseline * (1 + tolerance)
+    print(f"Reset total_block_num: {actual_value}. baseline: {baseline}")
+
+    assert lower_bound <= actual_value <= upper_bound, (
+        f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内. "
+        f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
+    )
--- a/test/e2e/test_Qwen2-7B-Instruct_serving.py
+++ b/test/e2e/test_Qwen2-7B-Instruct_serving.py
@@ -0,0 +1,641 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import concurrent.futures
+import json
+import os
+import re
+import signal
+import socket
+import subprocess
+import sys
+import time
+
+import openai
+import pytest
+import requests
+from jsonschema import validate
+
+# Read ports from environment variables; use default values if not set
+FD_API_PORT = int(os.getenv("FD_API_PORT", 8188))
+FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133))
+FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233))
+
+# List of ports to clean before and after tests
+PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT]
+
+
+def is_port_open(host: str, port: int, timeout: float = 1.0) -> bool:
+    """
+    Check whether a TCP connection to ``host:port`` can be established.
+
+    Args:
+        host: Host name or IP address to connect to.
+        port: TCP port number to probe.
+        timeout: Maximum number of seconds to wait for the connection.
+
+    Returns:
+        True if the connection succeeds, False if it is refused, times out
+        or the address cannot be resolved.
+    """
+    try:
+        with socket.create_connection((host, port), timeout=timeout):
+            return True
+    except OSError:
+        # Only network-level failures mean "port closed"; programming errors
+        # (e.g. a wrong argument type) surface instead of being hidden.
+        return False
+
+
+def kill_process_on_port(port: int):
+    """
+    Kill the processes that `lsof` reports as using the given port.
+
+    Every reported pid except the current process receives SIGKILL.
+    Nothing happens when no process uses the port or `lsof` is unavailable.
+
+    Args:
+        port: Port number whose users should be killed.
+    """
+    try:
+        # Argument list instead of a shell string: no shell parsing is involved.
+        output = subprocess.check_output(["lsof", "-t", f"-i:{port}"]).decode().strip()
+    except FileNotFoundError:
+        print(f"lsof is not available, skipping cleanup of port {port}")
+        return
+    except subprocess.CalledProcessError:
+        # lsof exits with a non-zero status when nothing uses the port.
+        return
+
+    own_pid = os.getpid()
+    for pid in output.splitlines():
+        # `lsof -i` also lists client sockets; never kill the test process itself.
+        if int(pid) == own_pid:
+            continue
+        try:
+            os.kill(int(pid), signal.SIGKILL)
+        except ProcessLookupError:
+            # The process exited between the lsof call and the kill.
+            continue
+        print(f"Killed process on port {port}, pid={pid}")
+
+
+def clean_ports():
+    """
+    Free every port listed in PORTS_TO_CLEAN by killing the processes using it.
+    """
+    for occupied_port in PORTS_TO_CLEAN:
+        kill_process_on_port(occupied_port)
+
+
+@pytest.fixture(scope="session", autouse=True)
+def setup_and_run_server():
+    """
+    Pytest fixture that runs once per test session:
+    - Cleans ports before tests
+    - Starts the API server as a subprocess (output goes to server.log)
+    - Waits for the server port to open (up to 5 minutes)
+    - Tears down the server's process group after all tests finish
+
+    Raises:
+        RuntimeError: If the server process exits during startup or the API
+            port does not open within the startup timeout.
+    """
+    print("Pre-test port cleanup...")
+    clean_ports()
+
+    base_path = os.getenv("MODEL_PATH")
+    if base_path:
+        model_path = os.path.join(base_path, "Qwen2-7B-Instruct")
+    else:
+        model_path = "./Qwen2-7B-Instruct"
+
+    log_path = "server.log"
+    cmd = [
+        sys.executable,
+        "-m",
+        "fastdeploy.entrypoints.openai.api_server",
+        "--model",
+        model_path,
+        "--port",
+        str(FD_API_PORT),
+        "--tensor-parallel-size",
+        "1",
+        "--engine-worker-queue-port",
+        str(FD_ENGINE_QUEUE_PORT),
+        "--metrics-port",
+        str(FD_METRICS_PORT),
+        "--max-model-len",
+        "32768",
+        "--max-num-seqs",
+        "128",
+        "--quantization",
+        "wint8",
+    ]
+
+    # Start the subprocess in a new session so its whole process group can be signalled
+    with open(log_path, "w") as logfile:
+        process = subprocess.Popen(
+            cmd,
+            stdout=logfile,
+            stderr=subprocess.STDOUT,
+            start_new_session=True,  # Enables killing full group via os.killpg
+        )
+
+    # Wait up to 300 seconds for API server to be ready
+    startup_timeout_minutes = 5
+    for _ in range(startup_timeout_minutes * 60):
+        if is_port_open("127.0.0.1", FD_API_PORT):
+            print(f"API server is up on port {FD_API_PORT}")
+            break
+        if process.poll() is not None:
+            # The launcher already exited, so waiting any longer cannot succeed.
+            try:
+                os.killpg(process.pid, signal.SIGTERM)  # stop children left behind
+            except OSError as e:
+                print(f"Failed to kill process group: {e}")
+            raise RuntimeError(
+                f"API server exited during startup with code {process.returncode}, see {log_path}"
+            )
+        time.sleep(1)
+    else:
+        print(f"[TIMEOUT] API server failed to start in {startup_timeout_minutes} minutes. Cleaning up...")
+        try:
+            os.killpg(process.pid, signal.SIGTERM)
+        except OSError as e:
+            print(f"Failed to kill process group: {e}")
+        raise RuntimeError(f"API server did not start on port {FD_API_PORT}")
+
+    yield  # Run tests
+
+    print("\n===== Post-test server cleanup... =====")
+    try:
+        os.killpg(process.pid, signal.SIGTERM)
+        try:
+            # Reap the server so the ports are really free when the session ends.
+            process.wait(timeout=30)
+        except subprocess.TimeoutExpired:
+            os.killpg(process.pid, signal.SIGKILL)
+            process.wait()
+        print(f"API server (pid={process.pid}) terminated")
+    except OSError as e:
+        print(f"Failed to terminate API server: {e}")
+
+
+@pytest.fixture(scope="session")
+def api_url(request):
+    """
+    Returns the chat completions endpoint URL of the local API server.
+
+    The loopback address is used because 0.0.0.0 is a bind address and is
+    not a portable destination address for client connections.
+    """
+    return f"http://127.0.0.1:{FD_API_PORT}/v1/chat/completions"
+
+
+@pytest.fixture(scope="session")
+def metrics_url(request):
+    """
+    Returns the metrics endpoint URL of the local server.
+
+    The loopback address is used because 0.0.0.0 is a bind address and is
+    not a portable destination address for client connections.
+    """
+    return f"http://127.0.0.1:{FD_METRICS_PORT}/metrics"
+
+
+@pytest.fixture
+def headers():
+    """
+    Provides the HTTP headers shared by the raw `requests` calls (JSON body).
+    """
+    common_headers = {"Content-Type": "application/json"}
+    return common_headers
+
+
+@pytest.fixture
+def consistent_payload():
+    """
+    Builds the fixed request body used by the consistency test: a single
+    user prompt with pinned sampling parameters.
+    """
+    user_message = {"role": "user", "content": "用一句话介绍 PaddlePaddle"}
+    payload = {
+        "messages": [user_message],
+        "temperature": 0.9,
+        "top_p": 0,  # fix top_p to reduce randomness
+        "seed": 13,  # fixed random seed
+    }
+    return payload
+
+
+# ==========================
+# JSON Schema for validating chat API responses
+# ==========================
+# Expected shape of a chat completion response: top-level metadata plus a
+# list of choices, each holding an assistant message.
+chat_response_schema = {
+    "type": "object",
+    "properties": {
+        "id": {"type": "string"},
+        "object": {"type": "string"},
+        "created": {"type": "number"},
+        "model": {"type": "string"},
+        "choices": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    # Every choice must carry a message with both a role and a content string.
+                    "message": {
+                        "type": "object",
+                        "properties": {
+                            "role": {"type": "string"},
+                            "content": {"type": "string"},
+                        },
+                        "required": ["role", "content"],
+                    },
+                    "index": {"type": "number"},
+                    "finish_reason": {"type": "string"},
+                },
+                "required": ["message", "index", "finish_reason"],
+            },
+        },
+    },
+    # All top-level fields are mandatory.
+    "required": ["id", "object", "created", "model", "choices"],
+}
+
+
+# ==========================
+# Helper function to calculate difference rate between two texts
+# ==========================
+def calculate_diff_rate(text1, text2):
+    """
+    Return the Levenshtein edit distance between the two strings divided by
+    the length of the longer one.
+
+    The result is a float in [0, 1]; 0.0 means the strings are identical.
+    """
+    if text1 == text2:
+        return 0.0
+
+    rows, cols = len(text1), len(text2)
+
+    # Only the previous row of the distance table is needed to build the next one.
+    previous = list(range(cols + 1))
+    for i, char1 in enumerate(text1, start=1):
+        current = [i]
+        for j, char2 in enumerate(text2, start=1):
+            if char1 == char2:
+                current.append(previous[j - 1])
+            else:
+                current.append(1 + min(previous[j], current[j - 1], previous[j - 1]))
+        previous = current
+
+    longest = max(rows, cols)
+    if longest > 0:
+        return previous[cols] / longest
+    return 0.0
+
+
+# ==========================
+# Valid prompt test cases for parameterized testing
+# ==========================
+valid_prompts = [
+    [{"role": "user", "content": "你好"}],
+    [{"role": "user", "content": "用一句话介绍 FastDeploy"}],
+]
+
+
+@pytest.mark.parametrize("messages", valid_prompts)
+def test_valid_chat(messages, api_url, headers):
+    """
+    Each valid prompt must be answered with HTTP 200 and a body that
+    matches the chat response schema.
+    """
+    payload = {"messages": messages}
+    reply = requests.post(api_url, headers=headers, json=payload)
+
+    assert reply.status_code == 200
+    validate(instance=reply.json(), schema=chat_response_schema)
+
+
+# ==========================
+# Consistency test for repeated runs with fixed payload
+# ==========================
+def test_consistency_between_runs(api_url, headers, consistent_payload):
+    """
+    Send the same fixed payload twice and require the two answers to differ
+    by less than 5% (normalized edit distance).
+    """
+    contents = []
+    for _ in range(2):
+        reply = requests.post(api_url, headers=headers, json=consistent_payload)
+        assert reply.status_code == 200
+        contents.append(reply.json()["choices"][0]["message"]["content"])
+
+    first_answer, second_answer = contents
+
+    # Compare the two answers and check the result against the threshold
+    diff_rate = calculate_diff_rate(first_answer, second_answer)
+    assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})"
+
+
+# ==========================
+# Invalid prompt tests
+# ==========================
+
+invalid_prompts = [
+    [],  # Empty array
+    [{}],  # Empty object
+    [{"role": "user"}],  # Missing content
+    [{"content": "hello"}],  # Missing role
+]
+
+
+@pytest.mark.parametrize("messages", invalid_prompts)
+def test_invalid_chat(messages, api_url, headers):
+    """
+    Each malformed message list must be rejected with an error status code.
+    """
+    payload = {"messages": messages}
+    reply = requests.post(api_url, headers=headers, json=payload)
+    assert reply.status_code >= 400, "Invalid request should return an error status code"
+
+
+# ==========================
+# Test for input exceeding context length
+# ==========================
+
+
+def test_exceed_context_length(api_url, headers):
+    """
+    Test case for inputs that exceed the model's maximum context length.
+
+    The request must either be rejected (non-200 status code) or produce a
+    JSON body that mentions "token".
+    """
+    # Construct an overly long message
+    long_content = "你好，" * 20000
+
+    messages = [{"role": "user", "content": long_content}]
+
+    resp = requests.post(api_url, headers=headers, json={"messages": messages})
+
+    # A body that is not JSON (e.g. a plain-text error) is treated as an empty
+    # payload. JSON decoding errors are ValueError subclasses, so nothing
+    # broader needs to be caught here.
+    try:
+        response_json = resp.json()
+    except ValueError:
+        response_json = {}
+
+    # NOTE(review): a successful completion whose body reports usage fields such as
+    # "prompt_tokens" would also contain "token" — confirm this check is strict enough.
+    assert (
+        resp.status_code != 200 or "token" in json.dumps(response_json).lower()
+    ), f"Expected token limit error or similar, but got a normal response: {response_json}"
+
+
+# ==========================
+# Multi-turn Conversation Test
+# ==========================
+def test_multi_turn_conversation(api_url, headers):
+    """
+    A conversation with earlier user and assistant turns must be answered
+    with HTTP 200 and a body that matches the chat response schema.
+    """
+    history = [
+        {"role": "user", "content": "你是谁？"},
+        {"role": "assistant", "content": "我是AI助手"},
+        {"role": "user", "content": "你能做什么？"},
+    ]
+    payload = {"messages": history}
+
+    reply = requests.post(api_url, headers=headers, json=payload)
+
+    assert reply.status_code == 200
+    validate(instance=reply.json(), schema=chat_response_schema)
+
+
+# ==========================
+# Concurrent Performance Test
+# ==========================
+def test_concurrent_perf(api_url, headers):
+    """
+    Fire eight identical chat requests from an eight-thread pool.
+
+    Every request must return status 200; the elapsed time reported for each
+    response is collected and printed.
+    """
+    request_count = 8
+    payload = {"messages": [{"role": "user", "content": "Introduce FastDeploy."}]}
+
+    def send_request():
+        """
+        Post the shared payload once and return the response's elapsed seconds.
+        """
+        reply = requests.post(api_url, headers=headers, json=payload)
+        assert reply.status_code == 200
+        return reply.elapsed.total_seconds()
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=request_count) as pool:
+        # Submit everything first so the requests overlap, then collect the
+        # results in submission order; result() re-raises a worker's failure.
+        pending = []
+        for _ in range(request_count):
+            pending.append(pool.submit(send_request))
+        durations = [task.result() for task in pending]
+
+    print("\nResponse time for each request:", durations)
+
+
+# ==========================
+# Metrics Endpoint Test
+# ==========================
+
+
+def test_metrics_endpoint(metrics_url):
+    """
+    Test the metrics monitoring endpoint.
+
+    The endpoint must answer 200 with a ``text/plain`` content type. Every
+    sample line (neither a ``#`` comment nor blank) that starts with one of
+    the expected ``fastdeploy:`` metric names must carry a value >= 0, and
+    each expected metric must appear at least once.
+    """
+    # Metric names, without the "fastdeploy:" prefix, that must be exported.
+    expected_metrics = (
+        "num_requests_running",
+        "num_requests_waiting",
+        "time_to_first_token_seconds_sum",
+        "time_per_output_token_seconds_sum",
+        "e2e_request_latency_seconds_sum",
+        "request_inference_time_seconds_sum",
+        "request_queue_time_seconds_sum",
+        "request_prefill_time_seconds_sum",
+        "request_decode_time_seconds_sum",
+        "prompt_tokens_total",
+        "generation_tokens_total",
+        "request_prompt_tokens_sum",
+        "request_generation_tokens_sum",
+        "gpu_cache_usage_perc",
+        "request_params_max_tokens_sum",
+        "request_success_total",
+    )
+
+    resp = requests.get(metrics_url, timeout=5)
+
+    assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}"
+    # .get() turns a missing header into an assertion failure, not a KeyError.
+    assert "text/plain" in resp.headers.get("Content-Type", ""), "Content-Type is not text/plain"
+
+    # Parse Prometheus metrics data: keep only sample lines, dropping "#"
+    # comment lines and blank lines.
+    metric_lines = [
+        line for line in resp.text.split("\n") if not line.startswith("#") and line.strip() != ""
+    ]
+
+    # Assert concrete values, remembering which expected metrics were seen.
+    found = set()
+    for line in metric_lines:
+        for name in expected_metrics:
+            if not line.startswith(f"fastdeploy:{name}"):
+                continue
+            # The sample value is the last space-separated field on the line.
+            _, value = line.rsplit(" ", 1)
+            assert float(value) >= 0, f"{name} 值错误"
+            found.add(name)
+            # No expected name is a prefix of another, so one match is enough.
+            break
+
+    for name in expected_metrics:
+        assert name in found, f"缺少 fastdeploy:{name} 指标"
+
+
+# ==========================
+# OpenAI Client chat.completions Test
+# ==========================
+
+
+@pytest.fixture
+def openai_client():
+    """Build an OpenAI client whose base URL is the ``/v1`` root on port ``FD_API_PORT``."""
+    ip = "0.0.0.0"
+    service_http_port = str(FD_API_PORT)
+    # NOTE(review): the API key looks like a placeholder; presumably the
+    # service under test does not validate it - confirm.
+    return openai.Client(
+        api_key="EMPTY_API_KEY",
+        base_url=f"http://{ip}:{service_http_port}/v1",
+    )
+
+
+# Non-streaming test
+def test_non_streaming_chat(openai_client):
+    """Request a non-streamed chat completion and check the response shape."""
+    conversation = [
+        {"role": "system", "content": "You are a helpful AI assistant."},
+        {"role": "user", "content": "List 3 countries and their capitals."},
+    ]
+
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=conversation,
+        temperature=1,
+        max_tokens=1024,
+        stream=False,
+    )
+
+    # Structural checks only: at least one choice exposing message.content.
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+    first_choice = response.choices[0]
+    assert hasattr(first_choice, "message")
+    assert hasattr(first_choice.message, "content")
+
+
+# Streaming test
+def test_streaming_chat(openai_client, capsys):
+    """
+    Test streaming chat functionality with the local service.
+
+    Streams a reply to a four-message conversation and requires more than two
+    chunks that actually carry content.
+    """
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+            {
+                "role": "assistant",
+                "content": "China(Beijing), France(Paris), Australia(Canberra).",
+            },
+            {"role": "user", "content": "OK, tell more."},
+        ],
+        temperature=1,
+        max_tokens=1024,
+        stream=True,
+    )
+
+    output = []
+    for chunk in response:
+        # Skip chunks that carry no choices instead of failing on choices[0].
+        if not chunk.choices:
+            continue
+        delta = getattr(chunk.choices[0], "delta", None)
+        content = getattr(delta, "content", None)
+        # An attribute that exists but is None is not generated text, so it
+        # must not count towards the chunk total.
+        if content is not None:
+            output.append(content)
+    assert len(output) > 2
+
+
+# ==========================
+# OpenAI Client completions Test
+# ==========================
+
+
+def test_non_streaming(openai_client):
+    """Request a non-streamed text completion and check that choices come back."""
+    request_args = {
+        "model": "default",
+        "prompt": "Hello, how are you?",
+        "temperature": 1,
+        "max_tokens": 1024,
+        "stream": False,
+    }
+    response = openai_client.completions.create(**request_args)
+
+    # The response must expose a non-empty ``choices`` collection.
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+
+
+def test_streaming(openai_client, capsys):
+    """Stream a text completion and require at least one chunk to arrive."""
+    stream = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=1024,
+        stream=True,
+    )
+
+    # Gather the text of the first choice of every streamed chunk.
+    pieces = [chunk.choices[0].text for chunk in stream]
+    assert len(pieces) > 0
+
+
+def test_profile_reset_block_num():
+    """
+    Test the profile reset_block_num feature against the recorded baseline.
+
+    Reads ``./log/config.log``, takes the first line containing
+    "Reset block num", extracts ``total_block_num:<digits>`` from it and
+    requires the value to lie within 5% of the baseline (bounds inclusive).
+    The test fails if the log file is missing, no such line exists, or the
+    line does not contain the ``total_block_num`` field.
+    """
+    log_file = "./log/config.log"
+    baseline = 32562
+
+    if not os.path.exists(log_file):
+        pytest.fail(f"Log file not found: {log_file}")
+
+    # Stream the log and stop at the first match rather than loading every
+    # line. Decode explicitly so the result does not depend on the runner's
+    # locale; undecodable bytes are replaced instead of aborting the test.
+    with open(log_file, "r", encoding="utf-8", errors="replace") as f:
+        target_line = next((line.strip() for line in f if "Reset block num" in line), None)
+
+    if target_line is None:
+        pytest.fail("日志中没有Reset block num信息")
+
+    match = re.search(r"total_block_num:(\d+)", target_line)
+    if not match:
+        pytest.fail(f"Failed to extract total_block_num from line: {target_line}")
+
+    # The capture group is \d+, so the text is always a valid integer literal.
+    actual_value = int(match.group(1))
+
+    lower_bound = baseline * (1 - 0.05)
+    upper_bound = baseline * (1 + 0.05)
+    print(f"Reset total_block_num: {actual_value}. baseline: {baseline}")
+
+    # The two message fragments are joined with ". " so they read as separate
+    # sentences in the failure output.
+    assert lower_bound <= actual_value <= upper_bound, (
+        f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内. "
+        f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
+    )