Sync v2.0 version of code to github repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

benchmarks/README.md Normal file
View File

@@ -0,0 +1,106 @@
### FastDeploy Serving Performance Benchmark Tool
#### Dataset:
Download the dataset locally with wget for performance testing:
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Open-source dataset (2,000 entries)</strong></td>
<td><code>https://fastdeploy.bj.bcebos.com/eb_query/filtered_sharedgpt_2000_input_1136_output_200_fd.json</code></td>
</tr>
</tbody>
</table>
#### Usage:
```bash
# Install dependencies
python -m pip install -r requirements.txt
```
##### Parameter Description
```bash
--backend openai-chat: backend interface used for benchmarking; "openai-chat" uses the chat/completions API
--model EB45T: model name; can be arbitrary, it only affects the name of the saved result file
--endpoint /v1/chat/completions: endpoint, used to build the request URL
--host 0.0.0.0: server IP address, used to build the request URL
--port 9812: server HTTP port, used to build the request URL
--dataset-name EBChat: dataset class; "EBChat" reads datasets converted to the FD format
--dataset-path ./eb45t_spv4_dataserver_1w_waigua_fd: path to the benchmark dataset
--hyperparameter-path EB45T.yaml: (optional) hyperparameter file; its contents are merged into each request payload (no hyperparameters are sent by default, see the sketch below)
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len: metrics reported in the performance results
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentiles reported for each metric in the results
--num-prompts 1: total number of requests to send
--max-concurrency 1: benchmark concurrency
--save-result: enable result saving; results are written to a JSON file
```
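For `--hyperparameter-path`, here is a minimal sketch (for illustration only, assuming PyYAML from requirements.txt and the `yaml/request_yaml/eb45t-32k.yaml` file used in the example commands) of how the yaml contents end up in each request payload, mirroring the `payload.update(...)` logic in `backend_request_func.py`:
```python
# Illustrative sketch: load a hyperparameter yaml and merge it into the
# request payload the way the benchmark's backend request functions do.
import yaml

with open("yaml/request_yaml/eb45t-32k.yaml", encoding="utf-8") as f:
    hyper_parameters = yaml.safe_load(f)  # e.g. {"top_p": 0.8, "temperature": 0.8, "metadata": {...}}

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "hello"}],
    "stream": True,
}
payload.update(hyper_parameters)  # hyperparameters extend / override the defaults
```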
##### Benchmarking the /v1/chat/completions API: single-request debugging
```bash
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 1 \
--max-concurrency 1 \
--save-result
```
##### Benchmarking the /v1/chat/completions API: full run with 100 concurrency and 2000 requests
```bash
# Save output to infer_log.txt
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```
##### Benchmarking the /v1/completions API
Change the endpoint to /v1/completions and the backend to openai to benchmark the /v1/completions API:
```bash
# Save output to infer_log.txt
python benchmark_serving.py \
--backend openai \
--model EB45T \
--endpoint /v1/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```

View File

@@ -0,0 +1,700 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/backend_request_func.py
import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional
import aiohttp
from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
"""Input for requesting LLMs via API"""
prompt: str
history_QA: Optional[dict]
hyper_parameters: dict
api_url: str
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
language: Optional[str] = None
@dataclass
class RequestFuncOutput:
"""Output for requesting LLMs via API"""
generated_text: str = ""
reasoning_content: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
arrival_time: list = field(default_factory=list) # arrival_time
itl: list = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
    prompt_tokens: int = 0  # number of input tokens returned by the inference side
error: str = ""
async def async_request_eb_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Chat Completions API URL must end with 'completions'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = {
"model": "default",
"messages": request_func_input.history_QA,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
},
}
        # Hyperparameters are passed in from the yaml file
payload.update(request_func_input.hyper_parameters)
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
output = RequestFuncOutput()
output.prompt_len = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
reason_content = choices[0]["delta"].get("reasoning_content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# cached_tokens
output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.generated_text += content or ""
output.reasoning_content += reason_content or ""
output.arrival_time.append(choices[0].get("arrival_time"))
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.prompt_tokens = usage.get(
"prompt_tokens")
most_recent_timestamp = timestamp
# output.generated_text = generated_text
if output.generated_text.strip() == "":
output.success = False
output.error = "No generated text found!"
else:
output.success = True
output.latency = most_recent_timestamp - st
else:
error_text = await response.text()
print("####error response:", error_text, "####payload:", payload)
output.error = error_text or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
        # Save the results of failed requests
if not output.success:
with open("error_output.txt", "a") as f:
f.write(str(output) + "\n")
if pbar:
pbar.update(1)
return output
async def async_request_eb_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": "default",
"prompt": request_func_input.prompt,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
},
}
        # Hyperparameters are passed in from the yaml file
payload.update(request_func_input.hyper_parameters)
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, chunk.usage)
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(choices[0].get("arrival_time"))
generated_text += text or ""
elif usage := data.get("usage"):
output.prompt_tokens = usage.get(
"prompt_tokens")
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using the TGI API"""
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
"ignore_eos_token": request_func_input.ignore_eos,
}
payload = {
"inputs": request_func_input.prompt,
"parameters": params,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
output.output_tokens = request_func_input.output_len
else:
output.output_tokens = None
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(data["arrival_time"])
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using TRT's llm_server"""
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
"temperature": 0.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using Deepspeed MII"""
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"prompt": request_func_input.prompt,
"max_tokens": request_func_input.output_len,
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
st = time.perf_counter()
try:
async with session.post(url=request_func_input.api_url,
json=payload) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
# "temperature": 0.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
#"stream_options": {
# "include_usage": True,
#},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using OpenAI"""
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
    assert api_url.endswith(
        ("transcriptions", "translations")
    ), ("OpenAI Chat Completions API URL must end with 'transcriptions' "
        "or 'translations'.")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
# Send audio file
def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
form = aiohttp.FormData()
form.add_field('file', f, content_type='audio/wav')
for key, value in payload.items():
form.add_field(key, str(value))
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
"lmdeploy": async_request_openai_completions,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_eb_openai_completions,
"openai-chat": async_request_eb_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_eb_openai_chat_completions)
]
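For reference, a minimal driver sketch (not part of the file above; the module name `backend_request_func` is assumed from the upstream file this was adapted from) showing how an entry of `ASYNC_REQUEST_FUNCS` can be called directly with a `RequestFuncInput`:
```python
# Hypothetical usage sketch: send a single streaming request through the
# "openai-chat" backend function and inspect the measured output.
import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput  # assumed module name

async def main():
    request_func = ASYNC_REQUEST_FUNCS["openai-chat"]
    request_input = RequestFuncInput(
        prompt="Introduce FastDeploy.",
        history_QA=[{"role": "user", "content": "Introduce FastDeploy."}],
        hyper_parameters={},  # merged into the payload via payload.update(...)
        api_url="http://0.0.0.0:9812/v1/chat/completions",
        prompt_len=0,
        output_len=128,
        model="EB45T",
    )
    output = await request_func(request_func_input=request_input)
    print(output.success, output.ttft, output.generated_text[:80])

asyncio.run(main())
```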

View File

@@ -0,0 +1,309 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py
import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Callable, Optional, Union
from PIL import Image
logger = logging.getLogger(__name__)
@dataclass
class SampleRequest:
"""
Represents a single inference request for benchmarking.
"""
prompt: Union[str, Any]
history_QA: Union[str, Any]
json_data: Optional[dict]
prompt_len: int
expected_output_len: int
class BenchmarkDataset(ABC):
"""BenchmarkDataset"""
DEFAULT_SEED = 0
IS_MULTIMODAL = False
def __init__(
self,
dataset_path: Optional[str] = None,
random_seed: int = DEFAULT_SEED,
hyperparameter_path: Optional[str] = None,
) -> None:
"""
Initialize the BenchmarkDataset with an optional dataset path and random
        seed.

        Args:
dataset_path (Optional[str]): Path to the dataset. If None, it
indicates that a default or random dataset might be used.
random_seed (int): Seed value for reproducible shuffling or
sampling. Defaults to DEFAULT_SEED.
"""
self.dataset_path = dataset_path
# Set the random seed, ensuring that a None value is replaced with the
# default seed.
self.random_seed = (random_seed
if random_seed is not None else self.DEFAULT_SEED)
self.data = None
self.hyperparameter_path = hyperparameter_path
self.hyperparameters = {}
def load_data(self) -> None:
"""
Load data from the dataset path into self.data.
This method must be overridden by subclasses since the method to load
data will vary depending on the dataset format and source.
Raises:
NotImplementedError: If a subclass does not implement this method.
"""
# TODO (jenniferzhao): add support for downloading data
raise NotImplementedError(
"load_data must be implemented in subclasses.")
@abstractmethod
def sample(self, num_requests: int) -> list[SampleRequest]:
"""
Abstract method to generate sample requests from the dataset.
Subclasses must override this method to implement dataset-specific logic
for generating a list of SampleRequest objects.
Args:
num_requests (int): The number of sample requests to generate.
Returns:
list[SampleRequest]: A list of sample requests generated from the
dataset.
"""
raise NotImplementedError("sample must be implemented in subclasses.")
def maybe_oversample_requests(self, requests: list[SampleRequest],
num_requests: int) -> None:
"""
Oversamples the list of requests if its size is less than the desired
number.
Args:
            requests (List[SampleRequest]): The current list of sampled
                requests.
            num_requests (int): The target number of requests.
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
additional = random.choices(requests,
k=num_requests - len(requests))
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.",
num_requests)
def is_valid_sequence(
prompt_len: int,
output_len: int,
min_len: int = 4,
max_prompt_len: int = 1024,
max_total_len: int = 2048,
skip_min_output_len_check: bool = False,
) -> bool:
"""
Validate a sequence based on prompt and output lengths.
Default pruning criteria are copied from the original `sample_hf_requests`
and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
from `sample_requests` in benchmark_throughput.py.
"""
# Check for invalid conditions
prompt_too_short = prompt_len < min_len
output_too_short = (not skip_min_output_len_check) and (output_len
< min_len)
prompt_too_long = prompt_len > max_prompt_len
combined_too_long = (prompt_len + output_len) > max_total_len
# Return True if none of the invalid conditions are met
return not (prompt_too_short or output_too_short or prompt_too_long
or combined_too_long)
def process_image(image: Any) -> Mapping[str, Any]:
"""
Process a single image input and return a multimedia content dictionary.
Supports three input types:
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image.
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
Raises:
ValueError: If the input is not a supported type.
"""
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
return {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
if isinstance(image, str):
image_url = (image if image.startswith(
("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
" or str or dictionary with raw image bytes.")
class EBDataset(BenchmarkDataset):
"""
    Implements the EB dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
temperature: float
repetition_penalty: float
frequency_penalty: float
presence_penalty: float
top_p: float
prompt_len: int
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
def sample(
self,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
prompt = entry["text"]
self.temperature = float(entry["temperature"])
self.repetition_penalty = float(entry["penalty_score"])
self.frequency_penalty = float(entry["frequency_score"])
self.presence_penalty = float(entry["presence_score"])
self.top_p = float(entry["topp"])
self.prompt_len = int(entry["input_token_num"])
new_output_len = int(entry["max_dec_len"])
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
            samples.append(
                SampleRequest(
                    json_data=None,  # EB text entries carry no extra request json
                    prompt=prompt,
                    prompt_len=self.prompt_len,
                    history_QA=[],
                    expected_output_len=new_output_len,
                ))
self.maybe_oversample_requests(samples, num_requests)
return samples
class EBChatDataset(BenchmarkDataset):
"""
    Implements the EB chat dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
prompt_len: int
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
def sample(
self,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
json_data = entry
prompt = entry["messages"][-1].get("content", "")
history_QA = entry.get("messages", [])
new_output_len = int(entry.get("max_tokens", 12288))
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
json_data=json_data,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
expected_output_len=new_output_len,
))
self.maybe_oversample_requests(samples, num_requests)
return samples
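As an illustration (not part of the file above; the module name `benchmark_dataset` is assumed), a sketch of the JSONL line format `EBChatDataset` reads and how a single request is sampled from it:
```python
# Hypothetical usage sketch: write one line in the FD chat format, then load
# it with EBChatDataset and draw a single SampleRequest from it.
import json
import tempfile

from benchmark_dataset import EBChatDataset  # assumed module name

line = {
    "messages": [{"role": "user", "content": "Introduce FastDeploy."}],
    "max_tokens": 256,
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    f.write(json.dumps(line) + "\n")
    dataset_path = f.name

dataset = EBChatDataset(dataset_path=dataset_path)
sample = dataset.sample(num_requests=1)[0]
print(sample.prompt, sample.expected_output_len)  # "Introduce FastDeploy." 256
```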

File diff suppressed because it is too large

View File

@@ -0,0 +1,90 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_utils.py
import argparse
import json
import math
import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
"""
    Save the benchmark results in the format used by the PyTorch OSS benchmark,
    with one metric per record.
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
"""InfEncoder"""
def clear_inf(self, o: Any):
"""clear_inf"""
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
"""iterencode"""
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
"""write_to_json"""
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)
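A short usage sketch (not part of the file above; the module name `benchmark_utils` is assumed); note that `convert_to_pytorch_benchmark_format` returns an empty list unless `SAVE_TO_PYTORCH_BENCHMARK_FORMAT` is set in the environment:
```python
# Hypothetical usage sketch: convert a metric dict into PyTorch OSS benchmark
# records and write them to disk with the inf-safe JSON encoder.
import argparse
import os

from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json  # assumed module name

os.environ["SAVE_TO_PYTORCH_BENCHMARK_FORMAT"] = "1"
args = argparse.Namespace(model="EB45T", tensor_parallel_size=8)
records = convert_to_pytorch_benchmark_format(
    args=args,
    metrics={"mean_ttft_ms": [123.4, 110.2]},
    extra_info={"tensor_parallel_size": 8},
)
write_to_json("pytorch_benchmark.json", records)
```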

View File

@@ -0,0 +1,5 @@
aiohttp
tqdm
numpy
Pillow
pyyaml

View File

@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,5 @@
max_model_len: 131072
max_num_seqs: 40
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4

View File

@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,10 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 32
kv_cache_ratio: 0.5
tensor_parallel_size: 1
quantization: wint4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
quantization: block_wise_fp8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

View File

@@ -0,0 +1,11 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

View File

@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_prefix_caching: true
enable_chunked_prefill: true

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 1
data_parallel_size: 8
num_gpu_blocks_override: 1024
cache_queue_port: 55663
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 1
data_parallel_size: 8
splitwise_role: prefill
cache_queue_port: 55664
engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
quantization: wint4

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.7
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: False
enable_prefix_caching: False
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: False
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 40
tensor_parallel_size: 4
quantization: wint4
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 160
tensor_parallel_size: 8
quantization: wint4
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,8 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
swap_space: 200
cache_queue_port: 55664

View File

@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 80
tensor_parallel_size: 8
quantization: wint8
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,9 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_batched_tokens: 68304
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8
swap_space: 100
cache_queue_port: 55664

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.95
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,4 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 4

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
min_tokens: 1
max_tokens: 131071
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 1.5

View File

@@ -0,0 +1,8 @@
top_p: 0.95
temperature: 0.6
metadata:
min_tokens: 1
max_tokens: 32767
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 32
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint4
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 8
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint8
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1