Sync v2.0 version of code to github repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

benchmarks/README.md Normal file
View File

@@ -0,0 +1,106 @@
### FastDeploy Serving Performance Benchmark Tool
#### Dataset:
Download the dataset locally with wget for performance testing:
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Open-source dataset (2,000 entries)</strong></td>
<td><code>https://fastdeploy.bj.bcebos.com/eb_query/filtered_sharedgpt_2000_input_1136_output_200_fd.json</code></td>
</tr>
</tbody>
</table>
#### Usage:
```bash
# Install dependencies
python -m pip install -r requirements.txt
```
##### Parameter Description
```bash
--backend openai-chat: backend interface used for benchmarking; "openai-chat" uses the chat/completions API
--model EB45T: model name; can be arbitrary, it only affects the name of the saved result file
--endpoint /v1/chat/completions: endpoint, used to build the request URL
--host 0.0.0.0: server IP address, used to build the request URL
--port 9812: server HTTP port, used to build the request URL
--dataset-name EBChat: dataset class; "EBChat" reads datasets converted to the FD format
--dataset-path ./eb45t_spv4_dataserver_1w_waigua_fd: path to the benchmark dataset
--hyperparameter-path EB45T.yaml: (optional) hyperparameter file; its contents are merged into each request payload (no hyperparameters are sent by default, see the sketch below)
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len: metrics reported in the performance results
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentiles reported for each metric in the results
--num-prompts 1: total number of requests to send
--max-concurrency 1: benchmark concurrency
--save-result: enable result saving; results are written to a JSON file
```
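For `--hyperparameter-path`, here is a minimal sketch (for illustration only, assuming PyYAML from requirements.txt and the `yaml/request_yaml/eb45t-32k.yaml` file used in the example commands) of how the yaml contents end up in each request payload, mirroring the `payload.update(...)` logic in `backend_request_func.py`:
```python
# Illustrative sketch: load a hyperparameter yaml and merge it into the
# request payload the way the benchmark's backend request functions do.
import yaml

with open("yaml/request_yaml/eb45t-32k.yaml", encoding="utf-8") as f:
    hyper_parameters = yaml.safe_load(f)  # e.g. {"top_p": 0.8, "temperature": 0.8, "metadata": {...}}

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "hello"}],
    "stream": True,
}
payload.update(hyper_parameters)  # hyperparameters extend / override the defaults
```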
##### Benchmarking the /v1/chat/completions API: single-request debugging
```bash
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 1 \
--max-concurrency 1 \
--save-result
```
##### Benchmarking the /v1/chat/completions API: full run with 100 concurrency and 2000 requests
```bash
# Save output to infer_log.txt
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```
##### Benchmarking the /v1/completions API
Change the endpoint to /v1/completions and the backend to openai to benchmark the /v1/completions API:
```bash
# Save output to infer_log.txt
python benchmark_serving.py \
--backend openai \
--model EB45T \
--endpoint /v1/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```

View File

@@ -0,0 +1,700 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/backend_request_func.py
import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional
import aiohttp
from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
"""Input for requesting LLMs via API"""
prompt: str
history_QA: Optional[dict]
hyper_parameters: dict
api_url: str
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
language: Optional[str] = None
@dataclass
class RequestFuncOutput:
"""Output for requesting LLMs via API"""
generated_text: str = ""
reasoning_content: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
arrival_time: list = field(default_factory=list) # arrival_time
itl: list = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
    prompt_tokens: int = 0  # number of input tokens returned by the inference side
error: str = ""
async def async_request_eb_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Chat Completions API URL must end with 'completions'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = {
"model": "default",
"messages": request_func_input.history_QA,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
},
}
        # Hyperparameters are passed in from the yaml file
payload.update(request_func_input.hyper_parameters)
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
output = RequestFuncOutput()
output.prompt_len = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
reason_content = choices[0]["delta"].get("reasoning_content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# cached_tokens
output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.generated_text += content or ""
output.reasoning_content += reason_content or ""
output.arrival_time.append(choices[0].get("arrival_time"))
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.prompt_tokens = usage.get(
"prompt_tokens")
most_recent_timestamp = timestamp
# output.generated_text = generated_text
if output.generated_text.strip() == "":
output.success = False
output.error = "No generated text found!"
else:
output.success = True
output.latency = most_recent_timestamp - st
else:
error_text = await response.text()
print("####error response:", error_text, "####payload:", payload)
output.error = error_text or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
        # Save the results of failed requests
if not output.success:
with open("error_output.txt", "a") as f:
f.write(str(output) + "\n")
if pbar:
pbar.update(1)
return output
async def async_request_eb_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": "default",
"prompt": request_func_input.prompt,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
},
}
        # Hyperparameters are passed in from the yaml file
payload.update(request_func_input.hyper_parameters)
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, chunk.usage)
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(choices[0].get("arrival_time"))
generated_text += text or ""
elif usage := data.get("usage"):
output.prompt_tokens = usage.get(
"prompt_tokens")
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using the TGI API"""
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
"ignore_eos_token": request_func_input.ignore_eos,
}
payload = {
"inputs": request_func_input.prompt,
"parameters": params,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
output.output_tokens = request_func_input.output_len
else:
output.output_tokens = None
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(data["arrival_time"])
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using TRT's llm_server"""
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
"temperature": 0.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using Deepspeed MII"""
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"prompt": request_func_input.prompt,
"max_tokens": request_func_input.output_len,
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
st = time.perf_counter()
try:
async with session.post(url=request_func_input.api_url,
json=payload) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
# "temperature": 0.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
#"stream_options": {
# "include_usage": True,
#},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using OpenAI"""
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
    assert api_url.endswith(
        ("transcriptions", "translations")
    ), ("OpenAI Chat Completions API URL must end with 'transcriptions' "
        "or 'translations'.")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
# Send audio file
def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
form = aiohttp.FormData()
form.add_field('file', f, content_type='audio/wav')
for key, value in payload.items():
form.add_field(key, str(value))
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
"lmdeploy": async_request_openai_completions,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_eb_openai_completions,
"openai-chat": async_request_eb_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_eb_openai_chat_completions)
]
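For reference, a minimal driver sketch (not part of the file above; the module name `backend_request_func` is assumed from the upstream file this was adapted from) showing how an entry of `ASYNC_REQUEST_FUNCS` can be called directly with a `RequestFuncInput`:
```python
# Hypothetical usage sketch: send a single streaming request through the
# "openai-chat" backend function and inspect the measured output.
import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput  # assumed module name

async def main():
    request_func = ASYNC_REQUEST_FUNCS["openai-chat"]
    request_input = RequestFuncInput(
        prompt="Introduce FastDeploy.",
        history_QA=[{"role": "user", "content": "Introduce FastDeploy."}],
        hyper_parameters={},  # merged into the payload via payload.update(...)
        api_url="http://0.0.0.0:9812/v1/chat/completions",
        prompt_len=0,
        output_len=128,
        model="EB45T",
    )
    output = await request_func(request_func_input=request_input)
    print(output.success, output.ttft, output.generated_text[:80])

asyncio.run(main())
```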

View File

@@ -0,0 +1,309 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py
import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Callable, Optional, Union
from PIL import Image
logger = logging.getLogger(__name__)
@dataclass
class SampleRequest:
"""
Represents a single inference request for benchmarking.
"""
prompt: Union[str, Any]
history_QA: Union[str, Any]
json_data: Optional[dict]
prompt_len: int
expected_output_len: int
class BenchmarkDataset(ABC):
"""BenchmarkDataset"""
DEFAULT_SEED = 0
IS_MULTIMODAL = False
def __init__(
self,
dataset_path: Optional[str] = None,
random_seed: int = DEFAULT_SEED,
hyperparameter_path: Optional[str] = None,
) -> None:
"""
Initialize the BenchmarkDataset with an optional dataset path and random
        seed.

        Args:
dataset_path (Optional[str]): Path to the dataset. If None, it
indicates that a default or random dataset might be used.
random_seed (int): Seed value for reproducible shuffling or
sampling. Defaults to DEFAULT_SEED.
"""
self.dataset_path = dataset_path
# Set the random seed, ensuring that a None value is replaced with the
# default seed.
self.random_seed = (random_seed
if random_seed is not None else self.DEFAULT_SEED)
self.data = None
self.hyperparameter_path = hyperparameter_path
self.hyperparameters = {}
def load_data(self) -> None:
"""
Load data from the dataset path into self.data.
This method must be overridden by subclasses since the method to load
data will vary depending on the dataset format and source.
Raises:
NotImplementedError: If a subclass does not implement this method.
"""
# TODO (jenniferzhao): add support for downloading data
raise NotImplementedError(
"load_data must be implemented in subclasses.")
@abstractmethod
def sample(self, num_requests: int) -> list[SampleRequest]:
"""
Abstract method to generate sample requests from the dataset.
Subclasses must override this method to implement dataset-specific logic
for generating a list of SampleRequest objects.
Args:
num_requests (int): The number of sample requests to generate.
Returns:
list[SampleRequest]: A list of sample requests generated from the
dataset.
"""
raise NotImplementedError("sample must be implemented in subclasses.")
def maybe_oversample_requests(self, requests: list[SampleRequest],
num_requests: int) -> None:
"""
Oversamples the list of requests if its size is less than the desired
number.
Args:
            requests (List[SampleRequest]): The current list of sampled
                requests.
            num_requests (int): The target number of requests.
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
additional = random.choices(requests,
k=num_requests - len(requests))
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.",
num_requests)
def is_valid_sequence(
prompt_len: int,
output_len: int,
min_len: int = 4,
max_prompt_len: int = 1024,
max_total_len: int = 2048,
skip_min_output_len_check: bool = False,
) -> bool:
"""
Validate a sequence based on prompt and output lengths.
Default pruning criteria are copied from the original `sample_hf_requests`
and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
from `sample_requests` in benchmark_throughput.py.
"""
# Check for invalid conditions
prompt_too_short = prompt_len < min_len
output_too_short = (not skip_min_output_len_check) and (output_len
< min_len)
prompt_too_long = prompt_len > max_prompt_len
combined_too_long = (prompt_len + output_len) > max_total_len
# Return True if none of the invalid conditions are met
return not (prompt_too_short or output_too_short or prompt_too_long
or combined_too_long)
def process_image(image: Any) -> Mapping[str, Any]:
"""
Process a single image input and return a multimedia content dictionary.
Supports three input types:
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image.
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
Raises:
ValueError: If the input is not a supported type.
"""
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
return {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
if isinstance(image, str):
image_url = (image if image.startswith(
("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
" or str or dictionary with raw image bytes.")
class EBDataset(BenchmarkDataset):
"""
    Implements the EB dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
temperature: float
repetition_penalty: float
frequency_penalty: float
presence_penalty: float
top_p: float
prompt_len: int
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
def sample(
self,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
prompt = entry["text"]
self.temperature = float(entry["temperature"])
self.repetition_penalty = float(entry["penalty_score"])
self.frequency_penalty = float(entry["frequency_score"])
self.presence_penalty = float(entry["presence_score"])
self.top_p = float(entry["topp"])
self.prompt_len = int(entry["input_token_num"])
new_output_len = int(entry["max_dec_len"])
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
            samples.append(
                SampleRequest(
                    json_data=None,  # EB text entries carry no extra request json
                    prompt=prompt,
                    prompt_len=self.prompt_len,
                    history_QA=[],
                    expected_output_len=new_output_len,
                ))
self.maybe_oversample_requests(samples, num_requests)
return samples
class EBChatDataset(BenchmarkDataset):
"""
    Implements the EB chat dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
prompt_len: int
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
def sample(
self,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
json_data = entry
prompt = entry["messages"][-1].get("content", "")
history_QA = entry.get("messages", [])
new_output_len = int(entry.get("max_tokens", 12288))
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
json_data=json_data,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
expected_output_len=new_output_len,
))
self.maybe_oversample_requests(samples, num_requests)
return samples
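As an illustration (not part of the file above; the module name `benchmark_dataset` is assumed), a sketch of the JSONL line format `EBChatDataset` reads and how a single request is sampled from it:
```python
# Hypothetical usage sketch: write one line in the FD chat format, then load
# it with EBChatDataset and draw a single SampleRequest from it.
import json
import tempfile

from benchmark_dataset import EBChatDataset  # assumed module name

line = {
    "messages": [{"role": "user", "content": "Introduce FastDeploy."}],
    "max_tokens": 256,
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    f.write(json.dumps(line) + "\n")
    dataset_path = f.name

dataset = EBChatDataset(dataset_path=dataset_path)
sample = dataset.sample(num_requests=1)[0]
print(sample.prompt, sample.expected_output_len)  # "Introduce FastDeploy." 256
```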

File diff suppressed because it is too large

View File

@@ -0,0 +1,90 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_utils.py
import argparse
import json
import math
import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
"""
    Save the benchmark results in the format used by the PyTorch OSS benchmark,
    with one metric per record.
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
"""InfEncoder"""
def clear_inf(self, o: Any):
"""clear_inf"""
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
"""iterencode"""
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
"""write_to_json"""
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)
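A short usage sketch (not part of the file above; the module name `benchmark_utils` is assumed); note that `convert_to_pytorch_benchmark_format` returns an empty list unless `SAVE_TO_PYTORCH_BENCHMARK_FORMAT` is set in the environment:
```python
# Hypothetical usage sketch: convert a metric dict into PyTorch OSS benchmark
# records and write them to disk with the inf-safe JSON encoder.
import argparse
import os

from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json  # assumed module name

os.environ["SAVE_TO_PYTORCH_BENCHMARK_FORMAT"] = "1"
args = argparse.Namespace(model="EB45T", tensor_parallel_size=8)
records = convert_to_pytorch_benchmark_format(
    args=args,
    metrics={"mean_ttft_ms": [123.4, 110.2]},
    extra_info={"tensor_parallel_size": 8},
)
write_to_json("pytorch_benchmark.json", records)
```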

View File

@@ -0,0 +1,5 @@
aiohttp
tqdm
numpy
Pillow
pyyaml

View File

@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,5 @@
max_model_len: 131072
max_num_seqs: 40
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4

View File

@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,10 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 32
kv_cache_ratio: 0.5
tensor_parallel_size: 1
quantization: wint4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
quantization: block_wise_fp8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

View File

@@ -0,0 +1,11 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

View File

@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_prefix_caching: true
enable_chunked_prefill: true

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 1
data_parallel_size: 8
num_gpu_blocks_override: 1024
cache_queue_port: 55663
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 1
data_parallel_size: 8
splitwise_role: prefill
cache_queue_port: 55664
engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
quantization: wint4

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.7
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: False
enable_prefix_caching: False
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: False
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 40
tensor_parallel_size: 4
quantization: wint4
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 160
tensor_parallel_size: 8
quantization: wint4
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,8 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
swap_space: 200
cache_queue_port: 55664

View File

@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 80
tensor_parallel_size: 8
quantization: wint8
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,9 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_batched_tokens: 68304
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8
swap_space: 100
cache_queue_port: 55664

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.95
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,4 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 4

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
min_tokens: 1
max_tokens: 131071
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 1.5

View File

@@ -0,0 +1,8 @@
top_p: 0.95
temperature: 0.6
metadata:
min_tokens: 1
max_tokens: 32767
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 32
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint4
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 8
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint8
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1