mirror of https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00

Sync v2.0 version of code to github repo

benchmarks/README.md (new file, 106 lines)
@@ -0,0 +1,106 @@

### FastDeploy Serving Performance Benchmark Tool

#### Dataset

Download the dataset locally with wget for performance testing:

<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Open-source dataset (2,000 samples)</strong></td>
<td><code>https://fastdeploy.bj.bcebos.com/eb_query/filtered_sharedgpt_2000_input_1136_output_200_fd.json</code></td>
</tr>
</tbody>
</table>
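
Each line of the FD-format file is a standalone JSON object. Judging from the EBChat loader in `benchmark_dataset.py` below, a record carries a `messages` list and an optional `max_tokens`; the values here are illustrative:

```json
{"messages": [{"role": "user", "content": "Introduce FastDeploy."}], "max_tokens": 12288}
```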

#### Usage

```bash
# Install dependencies
python -m pip install -r requirements.txt
```

##### Parameters

```bash
--backend openai-chat: backend interface to benchmark; "openai-chat" uses the chat/completions endpoint
--model EB45T: model name; can be anything, it only affects the name of the saved result file
--endpoint /v1/chat/completions: endpoint, used to build the request URL
--host 0.0.0.0: server IP address, used to build the request URL
--port 9812: server HTTP port, used to build the request URL
--dataset-name EBChat: dataset class; "EBChat" reads datasets saved in FD format
--dataset-path ./eb45t_spv4_dataserver_1w_waigua_fd: path to the benchmark dataset
--hyperparameter-path EB45T.yaml: (optional) hyperparameter file, merged into each request payload; no hyperparameters by default
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len: set of metrics shown in the results
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentiles shown for each metric in the results
--num-prompts 1: total number of requests to send
--max-concurrency 1: benchmark concurrency
--save-result: enable result saving; results are written to a JSON file
```
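
The hyperparameter file is plain YAML whose keys are merged verbatim into each request payload (`payload.update(...)` in `backend_request_func.py`). A minimal sketch; these keys are illustrative, not a required schema:

```yaml
# Illustrative hyperparameters; every key here is merged into the request payload.
temperature: 0.8
top_p: 0.8
max_tokens: 12288
```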

##### Debugging a single request against /v1/chat/completions

```bash
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 1 \
--max-concurrency 1 \
--save-result
```

##### Full benchmark against /v1/chat/completions: 100 concurrency, 2000 requests

```bash
# Save logs to infer_log.txt
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```

##### Benchmarking /v1/completions

Set --endpoint to /v1/completions and --backend to openai to benchmark the /v1/completions endpoint:

```bash
# Save logs to infer_log.txt
python benchmark_serving.py \
--backend openai \
--model EB45T \
--endpoint /v1/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```

benchmarks/backend_request_func.py (new file, 700 lines)
@@ -0,0 +1,700 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/backend_request_func.py


import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional

import aiohttp
from tqdm.asyncio import tqdm


AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    """Input for requesting LLMs via API"""
    prompt: str
    history_QA: Optional[dict]
    hyper_parameters: dict
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
    language: Optional[str] = None


@dataclass
class RequestFuncOutput:
    """Output for requesting LLMs via API"""
    generated_text: str = ""
    reasoning_content: str = ""
    success: bool = False
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    arrival_time: list = field(default_factory=list)  # arrival times of streamed chunks
    itl: list = field(default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latency
    prompt_len: int = 0
    prompt_tokens: int = 0  # input token count reported by the inference side
    error: str = ""
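

# `tpot` above is described as the average next-token latency. The actual
# aggregation lives in benchmark_serving.py (its diff is suppressed below),
# so the following is an assumed sketch of the usual definition, not the
# repository's code.
def mean_tpot(latency: float, ttft: float, output_tokens: int) -> float:
    """Assumed definition: average latency per output token after the first."""
    if output_tokens <= 1:
        return 0.0
    return (latency - ttft) / (output_tokens - 1)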


async def async_request_eb_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM via the EB OpenAI-compatible chat/completions API"""
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        # NOTE: `content` is built here but not attached to the payload;
        # "messages" carries history_QA instead.
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": "default",
            "messages": request_func_input.history_QA,
            "stream": True,
            "stream_options": {
                "include_usage": True,
                "continuous_usage_stats": True
            },
        }
        # Hyperparameters are supplied via the YAML file
        payload.update(request_func_input.hyper_parameters)

        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = 0

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                content = choices[0]["delta"].get("content")
                                reason_content = choices[0]["delta"].get(
                                    "reasoning_content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft
                                    # Number of cached tokens reported with
                                    # the first chunk.
                                    output.prompt_len = data["usage"][
                                        "prompt_tokens_details"]["cached_tokens"]

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                output.generated_text += content or ""
                                output.reasoning_content += reason_content or ""
                                output.arrival_time.append(
                                    choices[0].get("arrival_time"))
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                                output.prompt_tokens = usage.get(
                                    "prompt_tokens")

                            most_recent_timestamp = timestamp

                    if output.generated_text.strip() == "":
                        output.success = False
                        output.error = "No generated text found!"
                    else:
                        output.success = True
                        output.latency = most_recent_timestamp - st
                else:
                    error_text = await response.text()
                    print("####error response:", error_text,
                          "####payload:", payload)
                    output.error = error_text or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    # Save the results of failed requests
    if not output.success:
        with open("error_output.txt", "a") as f:
            f.write(str(output) + "\n")
    if pbar:
        pbar.update(1)
    return output
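

# Illustrative usage sketch (not invoked anywhere in the benchmark): drive the
# chat request function directly. The endpoint, port, and field values below
# are examples, not required settings.
async def _demo_chat_request() -> RequestFuncOutput:
    req = RequestFuncInput(
        prompt="Hello",
        history_QA=[{"role": "user", "content": "Hello"}],
        hyper_parameters={},  # or keys loaded from a hyperparameter YAML
        api_url="http://0.0.0.0:9812/v1/chat/completions",
        prompt_len=0,
        output_len=128,
        model="EB45T",
    )
    return await async_request_eb_openai_chat_completions(req)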


async def async_request_eb_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM via the EB OpenAI-compatible completions API"""
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": "default",
            "prompt": request_func_input.prompt,
            "stream": True,
            "stream_options": {
                "include_usage": True,
                "continuous_usage_stats": True
            },
        }
        # Hyperparameters are supplied via the YAML file
        payload.update(request_func_input.hyper_parameters)

        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            data = json.loads(chunk)

                            # NOTE: Some completion APIs may send a final
                            # usage summary response without a token, so we
                            # check that a token was actually generated.
                            if choices := data.get("choices"):
                                # Note that text could be empty here,
                                # e.g. for special tokens.
                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
                                    first_chunk_received = True
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                output.arrival_time.append(
                                    choices[0].get("arrival_time"))
                                generated_text += text or ""
                            elif usage := data.get("usage"):
                                output.prompt_tokens = usage.get(
                                    "prompt_tokens")
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT. "
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using the TGI API"""
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        params = {
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        if request_func_input.ignore_eos:
            output.output_tokens = request_func_input.output_len
        else:
            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        chunk_bytes = chunk_bytes.decode("utf-8")

                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
                        chunk = chunk_bytes.removeprefix("data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp
                        output.arrival_time.append(data["arrival_time"])

                    output.latency = most_recent_timestamp - st
                    output.success = True
                    output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using TRT's llm_server"""
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        if request_func_input.ignore_eos:
            payload["min_length"] = request_func_input.output_len
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data:")

                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = timestamp - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True

                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using DeepSpeed-MII"""
    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:

        payload = {
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    if "choices" in parsed_resp:
                        output.generated_text = parsed_resp["choices"][0][
                            "text"]
                        output.success = True
                    elif "text" in parsed_resp:
                        output.generated_text = parsed_resp["text"][0]
                        output.success = True
                    else:
                        output.error = ("Unexpected response format: "
                                        "neither 'choices' nor 'text' found")
                        output.success = False
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using the OpenAI completions API"""
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": (request_func_input.model_name
                      if request_func_input.model_name
                      else request_func_input.model),
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos

        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            data = json.loads(chunk)

                            # NOTE: Some completion APIs may send a final
                            # usage summary response without a token, so we
                            # check that a token was actually generated.
                            if choices := data.get("choices"):
                                # Note that text could be empty here,
                                # e.g. for special tokens.
                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
                                    first_chunk_received = True
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += text or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT. "
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_audio(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using the OpenAI audio API"""
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("transcriptions", "translations")
    ), ("OpenAI audio API URL must end with 'transcriptions' "
        "or 'translations'.")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        payload = {
            "model": (request_func_input.model_name
                      if request_func_input.model_name
                      else request_func_input.model),
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "language": "en",
            # Flattened due to multipart/form-data
            "stream_include_usage": True,
            "stream_continuous_usage_stats": True
        }
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        # Send the audio file as an in-memory WAV buffer.
        def to_bytes(y, sr):
            buffer = io.BytesIO()
            soundfile.write(buffer, y, sr, format="WAV")
            buffer.seek(0)
            return buffer

        with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
            form = aiohttp.FormData()
            form.add_field('file', f, content_type='audio/wav')
            for key, value in payload.items():
                form.add_field(key, str(value))

            output = RequestFuncOutput()
            output.prompt_len = request_func_input.prompt_len

            generated_text = ""
            ttft = 0.0
            st = time.perf_counter()
            most_recent_timestamp = st
            try:
                async with session.post(url=api_url,
                                        data=form,
                                        headers=headers) as response:
                    if response.status == 200:
                        async for chunk_bytes in response.content:
                            chunk_bytes = chunk_bytes.strip()
                            if not chunk_bytes:
                                continue

                            chunk = chunk_bytes.decode("utf-8").removeprefix(
                                "data: ")
                            if chunk != "[DONE]":
                                timestamp = time.perf_counter()
                                data = json.loads(chunk)

                                if choices := data.get("choices"):
                                    content = choices[0]["delta"].get(
                                        "content")
                                    # First token
                                    if ttft == 0.0:
                                        ttft = timestamp - st
                                        output.ttft = ttft

                                    # Decoding phase
                                    else:
                                        output.itl.append(
                                            timestamp - most_recent_timestamp)

                                    generated_text += content or ""
                                elif usage := data.get("usage"):
                                    output.output_tokens = usage.get(
                                        "completion_tokens")

                                most_recent_timestamp = timestamp

                        output.generated_text = generated_text
                        output.success = True
                        output.latency = most_recent_timestamp - st
                    else:
                        output.error = response.reason or ""
                        output.success = False
            except Exception:
                output.success = False
                exc_info = sys.exc_info()
                output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_eb_openai_completions,
    "openai-chat": async_request_eb_openai_chat_completions,
    "openai-audio": async_request_openai_audio,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
}

OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_eb_openai_chat_completions)
]
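
# Illustrative dispatch sketch: a --backend value from the README maps to a
# request function through the registry above, e.g.
#
#   request_func = ASYNC_REQUEST_FUNCS["openai-chat"]
#   # -> async_request_eb_openai_chat_completions, awaited once per request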

benchmarks/benchmark_dataset.py (new file, 309 lines)
@@ -0,0 +1,309 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py


import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Callable, Optional, Union

from PIL import Image


logger = logging.getLogger(__name__)


@dataclass
class SampleRequest:
    """
    Represents a single inference request for benchmarking.
    """

    prompt: Union[str, Any]
    history_QA: Union[str, Any]
    json_data: Optional[dict]
    prompt_len: int
    expected_output_len: int


class BenchmarkDataset(ABC):
    """BenchmarkDataset"""
    DEFAULT_SEED = 0
    IS_MULTIMODAL = False

    def __init__(
        self,
        dataset_path: Optional[str] = None,
        random_seed: int = DEFAULT_SEED,
        hyperparameter_path: Optional[str] = None,
    ) -> None:
        """
        Initialize the BenchmarkDataset with an optional dataset path and
        random seed.

        Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
                indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
                sampling. Defaults to DEFAULT_SEED.
        """
        self.dataset_path = dataset_path
        # Set the random seed, ensuring that a None value is replaced with the
        # default seed.
        self.random_seed = (random_seed
                            if random_seed is not None else self.DEFAULT_SEED)
        self.data = None
        self.hyperparameter_path = hyperparameter_path
        self.hyperparameters = {}

    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.

        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.

        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError(
            "load_data must be implemented in subclasses.")

    @abstractmethod
    def sample(self, num_requests: int) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.

        Subclasses must override this method to implement dataset-specific
        logic for generating a list of SampleRequest objects.

        Args:
            num_requests (int): The number of sample requests to generate.

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(self, requests: list[SampleRequest],
                                  num_requests: int) -> None:
        """
        Oversample the list of requests if its size is less than the desired
        number.

        Args:
            requests (List[SampleRequest]): The current list of sampled
                requests.
            num_requests (int): The target number of requests.
        """
        if len(requests) < num_requests:
            random.seed(self.random_seed)
            additional = random.choices(requests,
                                        k=num_requests - len(requests))
            requests.extend(additional)
            logger.info("Oversampled requests to reach %d total samples.",
                        num_requests)


def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
) -> bool:
    """
    Validate a sequence based on prompt and output lengths.

    Default pruning criteria are copied from the original
    `sample_hf_requests` and `sample_sharegpt_requests` functions in
    benchmark_serving.py, as well as from `sample_requests` in
    benchmark_throughput.py.
    """
    # Check for invalid conditions
    prompt_too_short = prompt_len < min_len
    output_too_short = (not skip_min_output_len_check) and (output_len
                                                            < min_len)
    prompt_too_long = prompt_len > max_prompt_len
    combined_too_long = (prompt_len + output_len) > max_total_len

    # Return True if none of the invalid conditions are met
    return not (prompt_too_short or output_too_short or prompt_too_long
                or combined_too_long)
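

# A few worked checks against the default limits above (values illustrative):
#
#   is_valid_sequence(prompt_len=100, output_len=200)    # True
#   is_valid_sequence(prompt_len=2000, output_len=200)   # False: prompt > 1024
#   is_valid_sequence(prompt_len=1000, output_len=1500)  # False: total > 2048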


def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

    Supports three input types:

    1. Dictionary with raw image bytes: expects a dict with a 'bytes' key
       containing raw image data, and loads the bytes as a PIL.Image.Image.

    2. PIL.Image.Image input: converts the image to RGB, saves it as a JPEG
       in memory, encodes the JPEG data as a base64 string, and returns a
       dictionary with the image as a base64 data URL.

    3. String input: treats the string as a URL or local file path, prepends
       "file://" if the string doesn't start with "http://" or "file://",
       and returns a dictionary with the image URL.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(image, dict) and 'bytes' in image:
        image = Image.open(BytesIO(image['bytes']))
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
        with io.BytesIO() as image_data:
            image.save(image_data, format="JPEG")
            image_base64 = base64.b64encode(
                image_data.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            },
        }

    if isinstance(image, str):
        image_url = (image if image.startswith(
            ("http://", "file://")) else f"file://{image}")
        return {"type": "image_url", "image_url": {"url": image_url}}

    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
                     " or str or dictionary with raw image bytes.")
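

# For the string case the returned mapping is easy to see directly:
#
#   process_image("http://example.com/cat.jpg")
#   # -> {"type": "image_url", "image_url": {"url": "http://example.com/cat.jpg"}}
#   process_image("/data/cat.jpg")
#   # -> {"type": "image_url", "image_url": {"url": "file:///data/cat.jpg"}}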


class EBDataset(BenchmarkDataset):
    """
    Implements the EB dataset: loads FD-format records from a JSON-lines
    file and generates one sample request per record.
    """

    temperature: float
    repetition_penalty: float
    frequency_penalty: float
    presence_penalty: float
    top_p: float
    prompt_len: int

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = [json.loads(i.strip()) for i in f.readlines()]

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
            prompt = entry["text"]
            self.temperature = float(entry["temperature"])
            self.repetition_penalty = float(entry["penalty_score"])
            self.frequency_penalty = float(entry["frequency_score"])
            self.presence_penalty = float(entry["presence_score"])
            self.top_p = float(entry["topp"])
            self.prompt_len = int(entry["input_token_num"])
            new_output_len = int(entry["max_dec_len"])

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, None)
            samples.append(
                SampleRequest(
                    json_data=None,
                    prompt=prompt,
                    prompt_len=self.prompt_len,
                    history_QA=[],
                    expected_output_len=new_output_len,
                ))

        self.maybe_oversample_requests(samples, num_requests)
        return samples


class EBChatDataset(BenchmarkDataset):
    """
    Implements the EB chat dataset: loads FD-format chat records from a
    JSON-lines file and generates one sample request per conversation.
    """
    prompt_len: int

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = [json.loads(i.strip()) for i in f.readlines()]

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
            json_data = entry
            prompt = entry["messages"][-1].get("content", "")
            history_QA = entry.get("messages", [])
            new_output_len = int(entry.get("max_tokens", 12288))

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, None)
            samples.append(
                SampleRequest(
                    json_data=json_data,
                    prompt=prompt,
                    prompt_len=0,
                    history_QA=history_QA,
                    expected_output_len=new_output_len,
                ))

        self.maybe_oversample_requests(samples, num_requests)
        return samples
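

# Illustrative usage sketch (not invoked by the benchmark): load the README's
# FD-format chat dataset and draw two sample requests.
def _demo_sample_requests() -> list:
    dataset = EBChatDataset(
        dataset_path="./filtered_sharedgpt_2000_input_1136_output_200_fd.json")
    return dataset.sample(num_requests=2)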

benchmarks/benchmark_serving.py (new file, 1141 lines)
File diff suppressed because it is too large.

benchmarks/benchmark_utils.py (new file, 90 lines)
@@ -0,0 +1,90 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_utils.py


import argparse
import json
import math
import os
from typing import Any


def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by the PyTorch OSS
    benchmark, with one metric per record:
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get(
            "tensor_parallel_size")
        # Save the tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"][
                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]

        records.append(record)

    return records
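

# Illustrative call (argument values are examples): with the
# SAVE_TO_PYTORCH_BENCHMARK_FORMAT environment variable set, this returns one
# record per metric, tagged with the model name and the full args namespace.
#
#   records = convert_to_pytorch_benchmark_format(
#       args=argparse.Namespace(model="EB45T", tensor_parallel_size=8),
#       metrics={"ttft_ms": [12.3, 11.8]},
#       extra_info={"max_concurrency": 100},
#   )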


class InfEncoder(json.JSONEncoder):
    """JSON encoder that serializes infinite floats as the string "inf"."""

    def clear_inf(self, o: Any):
        """Recursively replace float('inf') values with the string "inf"."""
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        """iterencode"""
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    """Write benchmark records to a JSON file, tolerating inf values."""
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)
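
# Illustrative round trip (file name is an example): infinite floats survive
# serialization as the string "inf".
#
#   write_to_json("results.json", [{"p99_ttft": float("inf"), "mean_tpot": 0.02}])
#   # results.json -> [{"p99_ttft": "inf", "mean_tpot": 0.02}]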

benchmarks/requirements.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
aiohttp
tqdm
numpy
Pillow
pyyaml

benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3
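
These YAML files are flat key-value launch configs for the serving deployments being benchmarked. pyyaml is already in benchmarks/requirements.txt, so a minimal sketch for inspecting one (path as above) is:

```python
import yaml

# Load a benchmark config; the keys mirror the YAML entries above.
with open("benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml") as f:
    cfg = yaml.safe_load(f)
print(cfg["max_model_len"], cfg["tensor_parallel_size"])  # 131072 8
```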

benchmarks/yaml/eb45-128k-wint4-p800-tp8.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 131072
max_num_seqs: 40
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4

benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-21b-a3b-32k-bf16.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

benchmarks/yaml/eb45-21b-a3b-32k-wint4-a10.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 32
kv_cache_ratio: 0.5
tensor_parallel_size: 1
quantization: wint4

benchmarks/yaml/eb45-21b-a3b-32k-wint4.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint4

benchmarks/yaml/eb45-21b-a3b-32k-wint8.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint8

benchmarks/yaml/eb45-32k-bf16-a30-tp1.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

benchmarks/yaml/eb45-32k-blockwise-fp8-h800-tp8.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
quantization: block_wise_fp8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

benchmarks/yaml/eb45-32k-tensorwise-fp8-h800-tp8.yaml (new file, 11 lines)
@@ -0,0 +1,11 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

benchmarks/yaml/eb45-32k-w4a8c8-a800-tp4.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml (new file, 15 lines)
@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

benchmarks/yaml/eb45-32k-wint2-h20-tp1.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_prefix_caching: true
enable_chunked_prefill: true

benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

benchmarks/yaml/eb45-32k-wint4-h800-dp8_decode.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 1
data_parallel_size: 8
num_gpu_blocks_override: 1024
cache_queue_port: 55663
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 1
data_parallel_size: 8
splitwise_role: prefill
cache_queue_port: 55664
engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
quantization: wint4

benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.7
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: False
enable_prefix_caching: False
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: False
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

benchmarks/yaml/eb45-32k-wint4-p800-tp4.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 40
tensor_parallel_size: 4
quantization: wint4
gpu_memory_utilization: 0.9

benchmarks/yaml/eb45-32k-wint4-p800-tp8.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 160
tensor_parallel_size: 8
quantization: wint4
gpu_memory_utilization: 0.9

benchmarks/yaml/eb45-32k-wint4-prefixcache-a800-tp4.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
swap_space: 200
cache_queue_port: 55664

benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml (new file, 15 lines)
@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8

benchmarks/yaml/eb45-32k-wint8-p800-tp8.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 80
tensor_parallel_size: 8
quantization: wint8
gpu_memory_utilization: 0.9

benchmarks/yaml/eb45-32k-wint8-prefixcache-a800-tp8.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_batched_tokens: 68304
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8
swap_space: 100
cache_queue_port: 55664

benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint4-h800-tp8.yaml (new file, 11 lines)
@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint4-tp4.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.95
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml (new file, 11 lines)
@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint8-tp4.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
||||
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
||||
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True
5
benchmarks/yaml/eb45t_21b-32k-bf16-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
6
benchmarks/yaml/eb45t_21b-32k-wint4-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
enable_static_graph_inference: True
5
benchmarks/yaml/qwen2_7b-32k-bf16-a30-tp1-static.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
5
benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
4
benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1.yaml
Normal file
@@ -0,0 +1,4 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
6
benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
enable_static_graph_inference: True
5
benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
5
benchmarks/yaml/qwen2_7b-32k-wint8-h800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True
5
benchmarks/yaml/qwen3_30b-32k-bf16-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
6
benchmarks/yaml/qwen3_30b-32k-wint4-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True
5
benchmarks/yaml/qwen3dot6b-32k-bf16-a30-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1
5
benchmarks/yaml/qwen3dot6b-32k-bf16-a800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1
5
benchmarks/yaml/qwen3dot6b-32k-bf16-h800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1
6
benchmarks/yaml/qwen3dot6b-32k-wint8-a30-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1
6
benchmarks/yaml/qwen3dot6b-32k-wint8-a800-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1
6
benchmarks/yaml/qwen3dot6b-32k-wint8-h800-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1
6
benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 4
6
benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 4
5
benchmarks/yaml/qwen3moe30b-32k-bf16-a800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1
5
benchmarks/yaml/qwen3moe30b-32k-bf16-h800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1
6
benchmarks/yaml/qwen3moe30b-32k-wint4-a800-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1
6
benchmarks/yaml/qwen3moe30b-32k-wint4-h800-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1
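The file names above follow a `<model>-<context>-<quant>-<gpu>-tp<N>[...].yaml` convention (a few variants omit the GPU segment), so a benchmark driver could resolve a config programmatically. A hedged sketch; `find_config` is a hypothetical helper, not part of this diff:

```python
# Illustrative only: pick a server config by the naming convention used in
# benchmarks/yaml/. Assumes the 32k context segment seen in these files.
from pathlib import Path

def find_config(model: str, quant: str, gpu: str, tp: int,
                root: str = "benchmarks/yaml") -> Path:
    """Locate a YAML config matching <model>-32k-<quant>-<gpu>-tp<N>*.yaml."""
    pattern = f"{model}-32k-{quant}-{gpu}-tp{tp}*.yaml"
    matches = sorted(Path(root).glob(pattern))
    if not matches:
        raise FileNotFoundError(pattern)
    return matches[0]

# e.g. benchmarks/yaml/qwen3moe30b-32k-wint4-h800-tp1.yaml
print(find_config("qwen3moe30b", "wint4", "h800", 1))
```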
8
benchmarks/yaml/request_yaml/eb45-128k.yaml
Normal file
@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
  min_tokens: 1
max_tokens: 131071
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0
8
benchmarks/yaml/request_yaml/eb45-32k.yaml
Normal file
@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
  min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0
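These request_yaml files hold per-request sampling hyperparameters that get merged into each benchmark request payload. A minimal sketch of that merge against the chat endpoint; the model label and host/port follow the benchmark examples, and the exact payload shape the server expects is an assumption:

```python
# Illustrative only: fold a request_yaml hyperparameter file into a
# /v1/chat/completions payload before sending it.
import requests
import yaml

with open("benchmarks/yaml/request_yaml/eb45-32k.yaml") as f:
    hyperparameters = yaml.safe_load(f)

payload = {
    "model": "EB45T",  # arbitrary label, as in the benchmark commands
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
}
payload.update(hyperparameters)  # adds top_p, temperature, max_tokens, metadata, ...

resp = requests.post("http://0.0.0.0:9812/v1/chat/completions",
                     json=payload, stream=True)
```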
8
benchmarks/yaml/request_yaml/qwen2-32k.yaml
Normal file
@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
  min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0
8
benchmarks/yaml/request_yaml/qwen3-32k.yaml
Normal file
@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
  min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 1.5
8
benchmarks/yaml/request_yaml/x1-32k.yaml
Normal file
@@ -0,0 +1,8 @@
top_p: 0.95
temperature: 0.6
metadata:
  min_tokens: 1
max_tokens: 32767
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0
6
benchmarks/yaml/x1-32k-wint4-h800-tp8.yaml
Normal file
@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1
6
benchmarks/yaml/x1-32k-wint4-p800-tp4.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 32
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint4
reasoning_parser: ernie-x1
6
benchmarks/yaml/x1-32k-wint4-p800-tp8.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4
reasoning_parser: ernie-x1
10
benchmarks/yaml/x1-32k-wint4-prefixcache-h800-tp8.yaml
Normal file
@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1
6
benchmarks/yaml/x1-32k-wint8-h800-tp8.yaml
Normal file
@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1
6
benchmarks/yaml/x1-32k-wint8-p800-tp4.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 8
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint8
reasoning_parser: ernie-x1
6
benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml
Normal file
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
reasoning_parser: ernie-x1
10
benchmarks/yaml/x1-32k-wint8-prefixcache-h800-tp8.yaml
Normal file
@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1
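Since these configs repeat a small set of numeric fields, a quick lint over the directory can catch out-of-range values before a benchmark run. Illustrative only; the bounds below are assumptions for sanity-checking, not constraints defined by FastDeploy:

```python
# Illustrative lint for the configs added above: ratio-type fields should
# stay within (0, 1] and tensor_parallel_size should be a positive integer.
from pathlib import Path
import yaml

RATIO_KEYS = ("gpu_memory_utilization", "kv_cache_ratio")

for path in sorted(Path("benchmarks/yaml").rglob("*.yaml")):
    cfg = yaml.safe_load(path.read_text())
    for key in RATIO_KEYS:
        if key in cfg and not (0 < cfg[key] <= 1):
            print(f"{path}: {key}={cfg[key]} out of range")
    tp = cfg.get("tensor_parallel_size")
    if tp is not None and (not isinstance(tp, int) or tp < 1):
        print(f"{path}: bad tensor_parallel_size={tp}")
```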