mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
polish code with new pre-commit rule (#2923)
@@ -131,4 +131,4 @@ python benchmarks/benchmark_mtp.py \
--s_itl-base-model: decode latency of the base model, obtained with the benchmarking tool described above; the values correspond one-to-one with batch-size
--dataset-name: dataset class to use; set it to "EBChat" to read a dumped FD-format dataset
--dataset-path: path to the test dataset
```
```
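A hypothetical invocation combining the flags documented above might look like the sketch below; the host, port, concurrency, base-model latency value, and dataset file name are illustrative assumptions, not values taken from this page.

```
# Sketch only: flags other than --s_itl-base-model, --dataset-name and
# --dataset-path are assumed, and all numbers are placeholders.
python benchmarks/benchmark_mtp.py \
    --host 127.0.0.1 \
    --port 8000 \
    --max-concurrency 16 \
    --s_itl-base-model 20.5 \
    --dataset-name EBChat \
    --dataset-path ./fd_chat_dataset.jsonl
```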
@@ -29,13 +29,13 @@ from typing import Optional
import aiohttp
from tqdm.asyncio import tqdm


AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
"""Input for requesting LLMs via API"""

no: int
prompt: str
history_QA: Optional[dict]
@@ -55,6 +55,7 @@ class RequestFuncInput:
@dataclass
class RequestFuncOutput:
"""Output for requesting LLMs via API"""

no: int = 0
generated_text: str = ""
reasoning_content: str = ""
@@ -66,7 +67,7 @@ class RequestFuncOutput:
itl: list = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
prompt_tokens: int = 0 # number of input tokens reported by the inference side
error: str = ""
@@ -76,12 +77,9 @@ async def async_request_eb_openai_chat_completions(
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Chat Completions API URL must end with 'completions'."
assert api_url.endswith(("completions", "profile")), "OpenAI Chat Completions API URL must end with 'completions'."

async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
@@ -91,7 +89,7 @@ async def async_request_eb_openai_chat_completions(
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
"continuous_usage_stats": True,
},
}
# hyperparameters are passed in via yaml
@@ -99,8 +97,8 @@ async def async_request_eb_openai_chat_completions(

if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos

print("payload:{}".format(json.dumps(payload, ensure_ascii=False)))

print(f"payload:{json.dumps(payload, ensure_ascii=False)}")

headers = {
"Content-Type": "application/json",
@@ -115,16 +113,14 @@ async def async_request_eb_openai_chat_completions(
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue

chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
@@ -138,22 +134,20 @@ async def async_request_eb_openai_chat_completions(
|
||||
ttft = timestamp - st
|
||||
output.ttft = ttft
|
||||
# cached_tokens
|
||||
output.prompt_len = data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
|
||||
|
||||
output.prompt_len = (
|
||||
data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
|
||||
)
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
|
||||
output.generated_text += content or ""
|
||||
output.reasoning_content += reason_content or ""
|
||||
output.arrival_time.append(choices[0].get("arrival_time", timestamp))
|
||||
elif usage := data.get("usage", {}):
|
||||
output.output_tokens = usage.get(
|
||||
"completion_tokens", 0)
|
||||
output.prompt_tokens = usage.get(
|
||||
"prompt_tokens", 0)
|
||||
output.output_tokens = usage.get("completion_tokens", 0)
|
||||
output.prompt_tokens = usage.get("prompt_tokens", 0)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
@@ -166,7 +160,12 @@ async def async_request_eb_openai_chat_completions(
|
||||
output.latency = most_recent_timestamp - st
|
||||
else:
|
||||
error_text = await response.text()
|
||||
print("####error response:", error_text, "####payload:", payload)
|
||||
print(
|
||||
"####error response:",
|
||||
error_text,
|
||||
"####payload:",
|
||||
payload,
|
||||
)
|
||||
output.error = error_text or ""
|
||||
output.success = False
|
||||
except Exception:
|
||||
@@ -194,15 +193,14 @@ async def async_request_eb_openai_completions(
|
||||
("completions", "profile")
|
||||
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
|
||||
payload = {
|
||||
"model": request_func_input.model,
|
||||
"prompt": request_func_input.prompt,
|
||||
"stream": True,
|
||||
"stream_options": {
|
||||
"include_usage": True,
|
||||
"continuous_usage_stats": True
|
||||
"continuous_usage_stats": True,
|
||||
},
|
||||
}
|
||||
# hyperparameters are passed in via yaml
|
||||
@@ -210,12 +208,12 @@ async def async_request_eb_openai_completions(
|
||||
|
||||
if request_func_input.ignore_eos:
|
||||
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||
|
||||
|
||||
print("payload:", json.dumps(payload, ensure_ascii=False))
|
||||
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||
"Content-Type": "application/json"
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
output = RequestFuncOutput()
|
||||
@@ -227,8 +225,7 @@ async def async_request_eb_openai_completions(
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(url=api_url, json=payload,
|
||||
headers=headers) as response:
|
||||
async with session.post(url=api_url, json=payload, headers=headers) as response:
|
||||
if response.status == 200:
|
||||
first_chunk_received = False
|
||||
async for chunk_bytes in response.content:
|
||||
@@ -236,8 +233,7 @@ async def async_request_eb_openai_completions(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||
if chunk != "[DONE]":
|
||||
# print("####chunk:", chunk, chunk.usage)
|
||||
timestamp = time.perf_counter()
|
||||
@@ -250,7 +246,7 @@ async def async_request_eb_openai_completions(
|
||||
# Note that text could be empty here
|
||||
# e.g. for special tokens
|
||||
text = choices[0].get("text")
|
||||
|
||||
|
||||
# First token
|
||||
if not first_chunk_received:
|
||||
first_chunk_received = True
|
||||
@@ -259,26 +255,23 @@ async def async_request_eb_openai_completions(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
|
||||
generated_text += text or ""
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
output.arrival_time.append(choices[0].get("arrival_time", timestamp))
|
||||
elif usage := data.get("usage"):
|
||||
output.prompt_tokens = usage.get(
|
||||
"prompt_tokens")
|
||||
output.output_tokens = usage.get(
|
||||
"completion_tokens")
|
||||
output.prompt_tokens = usage.get("prompt_tokens")
|
||||
output.output_tokens = usage.get("completion_tokens")
|
||||
if first_chunk_received:
|
||||
output.success = True
|
||||
else:
|
||||
output.success = False
|
||||
output.error = (
|
||||
"Never received a valid chunk to calculate TTFT."
|
||||
"This response will be marked as failed!")
|
||||
|
||||
"Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
|
||||
)
|
||||
|
||||
output.generated_text = generated_text
|
||||
output.latency = most_recent_timestamp - st
|
||||
|
||||
@@ -294,8 +287,8 @@ async def async_request_eb_openai_completions(
|
||||
output.success = False
|
||||
exc_info = sys.exc_info()
|
||||
output.error = "".join(traceback.format_exception(*exc_info))
|
||||
|
||||
print("final_output:{}".format(output))
|
||||
|
||||
print(f"final_output:{output}")
|
||||
|
||||
if pbar:
|
||||
pbar.update(1)
|
||||
@@ -310,8 +303,7 @@ async def async_request_tgi(
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("generate_stream")
|
||||
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
|
||||
params = {
|
||||
"max_new_tokens": request_func_input.output_len,
|
||||
"do_sample": True,
|
||||
@@ -358,8 +350,7 @@ async def async_request_tgi(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
output.arrival_time.append(data["arrival_time"])
|
||||
@@ -388,8 +379,7 @@ async def async_request_trt_llm(
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("generate_stream")
|
||||
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
|
||||
payload = {
|
||||
"accumulate_tokens": True,
|
||||
"text_input": request_func_input.prompt,
|
||||
@@ -414,8 +404,7 @@ async def async_request_trt_llm(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data:")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
|
||||
|
||||
data = json.loads(chunk)
|
||||
output.generated_text += data["text_output"]
|
||||
@@ -427,8 +416,7 @@ async def async_request_trt_llm(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
@@ -453,8 +441,7 @@ async def async_request_deepspeed_mii(
|
||||
pbar: Optional[tqdm] = None,
|
||||
) -> RequestFuncOutput:
|
||||
"""Request an LLM using Deepspeed MII"""
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
|
||||
|
||||
payload = {
|
||||
"prompt": request_func_input.prompt,
|
||||
@@ -472,19 +459,16 @@ async def async_request_deepspeed_mii(
|
||||
|
||||
st = time.perf_counter()
|
||||
try:
|
||||
async with session.post(url=request_func_input.api_url,
|
||||
json=payload) as response:
|
||||
async with session.post(url=request_func_input.api_url, json=payload) as response:
|
||||
if response.status == 200:
|
||||
parsed_resp = await response.json()
|
||||
output.latency = time.perf_counter() - st
|
||||
if "choices" in parsed_resp:
|
||||
output.generated_text = parsed_resp["choices"][0][
|
||||
"text"]
|
||||
output.generated_text = parsed_resp["choices"][0]["text"]
|
||||
elif "text" in parsed_resp:
|
||||
output.generated_text = parsed_resp["text"][0]
|
||||
else:
|
||||
output.error = ("Unexpected response format: "
|
||||
"neither 'choices' nor 'text' found")
|
||||
output.error = "Unexpected response format: " "neither 'choices' nor 'text' found"
|
||||
output.success = False
|
||||
output.success = True
|
||||
else:
|
||||
@@ -510,26 +494,22 @@ async def async_request_openai_completions(
|
||||
("completions", "profile")
|
||||
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
|
||||
payload = {
|
||||
"model": request_func_input.model_name \
|
||||
if request_func_input.model_name else request_func_input.model,
|
||||
"model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
|
||||
"prompt": request_func_input.prompt,
|
||||
# "temperature": 0.0,
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"logprobs": request_func_input.logprobs,
|
||||
"stream": True,
|
||||
#"stream_options": {
|
||||
# "stream_options": {
|
||||
# "include_usage": True,
|
||||
#},
|
||||
# },
|
||||
}
|
||||
if request_func_input.ignore_eos:
|
||||
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
@@ -538,8 +518,7 @@ async def async_request_openai_completions(
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(url=api_url, json=payload,
|
||||
headers=headers) as response:
|
||||
async with session.post(url=api_url, json=payload, headers=headers) as response:
|
||||
if response.status == 200:
|
||||
first_chunk_received = False
|
||||
async for chunk_bytes in response.content:
|
||||
@@ -547,8 +526,7 @@ async def async_request_openai_completions(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||
if chunk != "[DONE]":
|
||||
# print("####chunk:", chunk, type(chunk))
|
||||
data = json.loads(chunk)
|
||||
@@ -569,21 +547,19 @@ async def async_request_openai_completions(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
generated_text += text or ""
|
||||
elif usage := data.get("usage"):
|
||||
output.output_tokens = usage.get(
|
||||
"completion_tokens")
|
||||
output.output_tokens = usage.get("completion_tokens")
|
||||
if first_chunk_received:
|
||||
output.success = True
|
||||
else:
|
||||
output.success = False
|
||||
output.error = (
|
||||
"Never received a valid chunk to calculate TTFT."
|
||||
"This response will be marked as failed!")
|
||||
"Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
|
||||
)
|
||||
output.generated_text = generated_text
|
||||
output.latency = most_recent_timestamp - st
|
||||
else:
|
||||
@@ -606,25 +582,24 @@ async def async_request_openai_audio(
|
||||
"""Request an LLM using OpenAI"""
|
||||
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||
import soundfile
|
||||
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(
|
||||
("transcriptions", "translations"
|
||||
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
|
||||
("transcriptions", "translations")
|
||||
), "OpenAI Chat Completions API URL must end with 'transcriptions' "
|
||||
"or `translations`."
|
||||
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
|
||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||
payload = {
|
||||
"model": request_func_input.model_name \
|
||||
if request_func_input.model_name else request_func_input.model,
|
||||
"model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
|
||||
"temperature": 0.0,
|
||||
"max_completion_tokens": request_func_input.output_len,
|
||||
"stream": True,
|
||||
"language": "en",
|
||||
# Flattened due to multipart/form-data
|
||||
"stream_include_usage": True,
|
||||
"stream_continuous_usage_stats": True
|
||||
"stream_continuous_usage_stats": True,
|
||||
}
|
||||
if request_func_input.extra_body:
|
||||
payload.update(request_func_input.extra_body)
|
||||
@@ -639,9 +614,9 @@ async def async_request_openai_audio(
|
||||
buffer.seek(0)
|
||||
return buffer
|
||||
|
||||
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
|
||||
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
|
||||
form = aiohttp.FormData()
|
||||
form.add_field('file', f, content_type='audio/wav')
|
||||
form.add_field("file", f, content_type="audio/wav")
|
||||
for key, value in payload.items():
|
||||
form.add_field(key, str(value))
|
||||
|
||||
@@ -653,24 +628,20 @@ async def async_request_openai_audio(
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(url=api_url,
|
||||
data=form,
|
||||
headers=headers) as response:
|
||||
async with session.post(url=api_url, data=form, headers=headers) as response:
|
||||
if response.status == 200:
|
||||
async for chunk_bytes in response.content:
|
||||
chunk_bytes = chunk_bytes.strip()
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||
if chunk != "[DONE]":
|
||||
timestamp = time.perf_counter()
|
||||
data = json.loads(chunk)
|
||||
|
||||
if choices := data.get("choices"):
|
||||
content = choices[0]["delta"].get(
|
||||
"content")
|
||||
content = choices[0]["delta"].get("content")
|
||||
# First token
|
||||
if ttft == 0.0:
|
||||
ttft = timestamp - st
|
||||
@@ -678,13 +649,11 @@ async def async_request_openai_audio(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(
|
||||
timestamp - most_recent_timestamp)
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
|
||||
generated_text += content or ""
|
||||
elif usage := data.get("usage"):
|
||||
output.output_tokens = usage.get(
|
||||
"completion_tokens")
|
||||
output.output_tokens = usage.get("completion_tokens")
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
@@ -718,8 +687,11 @@ ASYNC_REQUEST_FUNCS = {
|
||||
}
|
||||
|
||||
OPENAI_COMPATIBLE_BACKENDS = [
|
||||
k for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||
if v in (async_request_openai_completions,
|
||||
async_request_eb_openai_chat_completions)
|
||||
k
|
||||
for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||
if v
|
||||
in (
|
||||
async_request_openai_completions,
|
||||
async_request_eb_openai_chat_completions,
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@@ -26,9 +26,9 @@ from abc import ABC, abstractmethod
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from typing import Any, Callable, Optional, Union
|
||||
from PIL import Image
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -38,6 +38,7 @@ class SampleRequest:
|
||||
"""
|
||||
Represents a single inference request for benchmarking.
|
||||
"""
|
||||
|
||||
no: int
|
||||
prompt: Union[str, Any]
|
||||
history_QA: Union[str, Any]
|
||||
@@ -48,6 +49,7 @@ class SampleRequest:
|
||||
|
||||
class BenchmarkDataset(ABC):
|
||||
"""BenchmarkDataset"""
|
||||
|
||||
DEFAULT_SEED = 0
|
||||
IS_MULTIMODAL = False
|
||||
|
||||
@@ -68,8 +70,7 @@ class BenchmarkDataset(ABC):
|
||||
self.dataset_path = dataset_path
|
||||
# Set the random seed, ensuring that a None value is replaced with the
|
||||
# default seed.
|
||||
self.random_seed = (random_seed
|
||||
if random_seed is not None else self.DEFAULT_SEED)
|
||||
self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
|
||||
self.data = None
|
||||
self.hyperparameter_path = hyperparameter_path
|
||||
self.hyperparameters = {}
|
||||
@@ -85,8 +86,7 @@ class BenchmarkDataset(ABC):
|
||||
NotImplementedError: If a subclass does not implement this method.
|
||||
"""
|
||||
# TODO (jenniferzhao): add support for downloading data
|
||||
raise NotImplementedError(
|
||||
"load_data must be implemented in subclasses.")
|
||||
raise NotImplementedError("load_data must be implemented in subclasses.")
|
||||
|
||||
@abstractmethod
|
||||
def sample(self, num_requests: int) -> list[SampleRequest]:
|
||||
@@ -105,8 +105,7 @@ class BenchmarkDataset(ABC):
|
||||
"""
|
||||
raise NotImplementedError("sample must be implemented in subclasses.")
|
||||
|
||||
def maybe_oversample_requests(self, requests: list[SampleRequest],
|
||||
num_requests: int) -> None:
|
||||
def maybe_oversample_requests(self, requests: list[SampleRequest], num_requests: int) -> None:
|
||||
"""
|
||||
Oversamples the list of requests if its size is less than the desired
|
||||
number.
|
||||
@@ -117,11 +116,9 @@ class BenchmarkDataset(ABC):
|
||||
"""
|
||||
if len(requests) < num_requests:
|
||||
random.seed(self.random_seed)
|
||||
additional = random.choices(requests,
|
||||
k=num_requests - len(requests))
|
||||
additional = random.choices(requests, k=num_requests - len(requests))
|
||||
requests.extend(additional)
|
||||
logger.info("Oversampled requests to reach %d total samples.",
|
||||
num_requests)
|
||||
logger.info("Oversampled requests to reach %d total samples.", num_requests)
|
||||
|
||||
|
||||
def is_valid_sequence(
|
||||
@@ -141,14 +138,12 @@ def is_valid_sequence(
|
||||
"""
|
||||
# Check for invalid conditions
|
||||
prompt_too_short = prompt_len < min_len
|
||||
output_too_short = (not skip_min_output_len_check) and (output_len
|
||||
< min_len)
|
||||
output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
|
||||
prompt_too_long = prompt_len > max_prompt_len
|
||||
combined_too_long = (prompt_len + output_len) > max_total_len
|
||||
|
||||
# Return True if none of the invalid conditions are met
|
||||
return not (prompt_too_short or output_too_short or prompt_too_long
|
||||
or combined_too_long)
|
||||
return not (prompt_too_short or output_too_short or prompt_too_long or combined_too_long)
|
||||
|
||||
|
||||
def process_image(image: Any) -> Mapping[str, Any]:
|
||||
@@ -171,28 +166,25 @@ def process_image(image: Any) -> Mapping[str, Any]:
|
||||
Raises:
|
||||
ValueError: If the input is not a supported type.
|
||||
"""
|
||||
if isinstance(image, dict) and 'bytes' in image:
|
||||
image = Image.open(BytesIO(image['bytes']))
|
||||
if isinstance(image, dict) and "bytes" in image:
|
||||
image = Image.open(BytesIO(image["bytes"]))
|
||||
if isinstance(image, Image.Image):
|
||||
image = image.convert("RGB")
|
||||
with io.BytesIO() as image_data:
|
||||
image.save(image_data, format="JPEG")
|
||||
image_base64 = base64.b64encode(
|
||||
image_data.getvalue()).decode("utf-8")
|
||||
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
|
||||
return {
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{image_base64}"
|
||||
},
|
||||
"image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
|
||||
}
|
||||
|
||||
if isinstance(image, str):
|
||||
image_url = (image if image.startswith(
|
||||
("http://", "file://")) else f"file://{image}")
|
||||
image_url = image if image.startswith(("http://", "file://")) else f"file://{image}"
|
||||
return {"type": "image_url", "image_url": {"url": image_url}}
|
||||
|
||||
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
|
||||
" or str or dictionary with raw image bytes.")
|
||||
raise ValueError(
|
||||
f"Invalid image input {image}. Must be a PIL.Image.Image" " or str or dictionary with raw image bytes."
|
||||
)
|
||||
|
||||
|
||||
class EBDataset(BenchmarkDataset):
|
||||
@@ -243,8 +235,7 @@ class EBDataset(BenchmarkDataset):
|
||||
new_output_len = int(entry["max_dec_len"])
|
||||
|
||||
if enable_multimodal_chat:
|
||||
prompt = self.apply_multimodal_chat_transformation(
|
||||
prompt, None)
|
||||
prompt = self.apply_multimodal_chat_transformation(prompt, None)
|
||||
samples.append(
|
||||
SampleRequest(
|
||||
no=cnt,
|
||||
@@ -252,17 +243,20 @@ class EBDataset(BenchmarkDataset):
|
||||
prompt_len=self.prompt_len,
|
||||
history_QA=[],
|
||||
expected_output_len=new_output_len,
|
||||
))
|
||||
)
|
||||
)
|
||||
cnt += 1
|
||||
|
||||
self.maybe_oversample_requests(samples, num_requests)
|
||||
return samples
|
||||
|
||||
|
||||
class EBChatDataset(BenchmarkDataset):
|
||||
"""
|
||||
Implements the ShareGPT dataset. Loads data from a JSON file and generates
|
||||
sample requests based on conversation turns.
|
||||
"""
|
||||
|
||||
prompt_len: int
|
||||
|
||||
def __init__(self, **kwargs) -> None:
|
||||
@@ -296,8 +290,7 @@ class EBChatDataset(BenchmarkDataset):
|
||||
new_output_len = int(entry.get("max_tokens", 12288))
|
||||
|
||||
if enable_multimodal_chat:
|
||||
prompt = self.apply_multimodal_chat_transformation(
|
||||
prompt, None)
|
||||
prompt = self.apply_multimodal_chat_transformation(prompt, None)
|
||||
samples.append(
|
||||
SampleRequest(
|
||||
no=cnt,
|
||||
@@ -306,9 +299,9 @@ class EBChatDataset(BenchmarkDataset):
|
||||
prompt_len=0,
|
||||
history_QA=history_QA,
|
||||
expected_output_len=new_output_len,
|
||||
))
|
||||
)
|
||||
)
|
||||
cnt += 1
|
||||
|
||||
self.maybe_oversample_requests(samples, num_requests)
|
||||
return samples
|
||||
|
||||
|
||||
@@ -18,28 +18,16 @@ import argparse
|
||||
import asyncio
|
||||
import contextlib
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
from typing import Union
|
||||
|
||||
import openai
|
||||
import yaml
|
||||
from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest
|
||||
from benchmark_dataset import EBChatDataset, EBDataset
|
||||
from benchmark_serving import benchmark
|
||||
|
||||
|
||||
def prepare_input_requests(
|
||||
num_prompts: int, dataset_name: str, dataset_path: str
|
||||
) -> Union[EBDataset, EBChatDataset]:
|
||||
def prepare_input_requests(num_prompts: int, dataset_name: str, dataset_path: str) -> Union[EBDataset, EBChatDataset]:
|
||||
dataset_mapping = {
|
||||
"EB": lambda: EBDataset(dataset_path=dataset_path).sample(
|
||||
num_requests=num_prompts
|
||||
),
|
||||
"EBChat": lambda: EBChatDataset(dataset_path=dataset_path).sample(
|
||||
num_requests=num_prompts
|
||||
),
|
||||
"EB": lambda: EBDataset(dataset_path=dataset_path).sample(num_requests=num_prompts),
|
||||
"EBChat": lambda: EBChatDataset(dataset_path=dataset_path).sample(num_requests=num_prompts),
|
||||
}
|
||||
|
||||
try:
|
||||
@@ -104,24 +92,27 @@ def calculate_speedup(acceptance_rate, draft_token_step, t_ori, t_mtp):
|
||||
def main(args):
|
||||
base_url = f"http://{args.host}:{args.port}"
|
||||
|
||||
input_requests = prepare_input_requests(
|
||||
args.num_prompts, args.dataset_name, args.dataset_path
|
||||
)
|
||||
input_requests = prepare_input_requests(args.num_prompts, args.dataset_name, args.dataset_path)
|
||||
|
||||
if len(args.max_concurrency) != len(args.s_itl_base_model):
|
||||
raise ValueError(f"--max_concurrency should be same length as --s_itl_base_model")
|
||||
raise ValueError("--max_concurrency should be same length as --s_itl_base_model")
|
||||
|
||||
for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
|
||||
# Warmup
|
||||
print("Starting warmup...")
|
||||
with open(os.devnull, "w") as f:
|
||||
with contextlib.redirect_stdout(f):
|
||||
send_one_batch(base_url, max_concurrency, input_requests[0:max_concurrency], True)
|
||||
send_one_batch(
|
||||
base_url,
|
||||
max_concurrency,
|
||||
input_requests[0:max_concurrency],
|
||||
True,
|
||||
)
|
||||
|
||||
# Benchmark
|
||||
record = send_one_batch(base_url, max_concurrency, input_requests, False)
|
||||
|
||||
metric_header = f"Speed up"
|
||||
metric_header = "Speed up"
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
|
||||
for draft_token_step in args.draft_token_steps:
|
||||
speedup = calculate_speedup(
|
||||
@@ -130,11 +121,7 @@ def main(args):
|
||||
s_itl,
|
||||
record["mean_s_itl_ms"],
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Speed up on {draft_token_step} steps draft", speedup
|
||||
)
|
||||
)
|
||||
print("{:<40} {:<10.2f}".format(f"Speed up on {draft_token_step} steps draft", speedup))
|
||||
print("=" * 50)
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large.
@@ -24,9 +24,11 @@ import os
|
||||
from typing import Any
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
metrics: dict[str, list],
|
||||
extra_info: dict[str, Any]) -> list:
|
||||
def convert_to_pytorch_benchmark_format(
|
||||
args: argparse.Namespace,
|
||||
metrics: dict[str, list],
|
||||
extra_info: dict[str, Any],
|
||||
) -> list:
|
||||
"""
|
||||
Save the benchmark results in the format used by PyTorch OSS benchmark with
one metric per record
|
||||
@@ -54,12 +56,10 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
},
|
||||
}
|
||||
|
||||
tp = record["benchmark"]["extra_info"]["args"].get(
|
||||
"tensor_parallel_size")
|
||||
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
|
||||
# Save tensor_parallel_size parameter if it's part of the metadata
|
||||
if not tp and "tensor_parallel_size" in extra_info:
|
||||
record["benchmark"]["extra_info"]["args"][
|
||||
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
|
||||
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = extra_info["tensor_parallel_size"]
|
||||
|
||||
records.append(record)
|
||||
|
||||
@@ -68,6 +68,7 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
|
||||
class InfEncoder(json.JSONEncoder):
|
||||
"""InfEncoder"""
|
||||
|
||||
def clear_inf(self, o: Any):
|
||||
"""clear_inf"""
|
||||
if isinstance(o, dict):
|
||||
@@ -87,4 +88,3 @@ def write_to_json(filename: str, records: list) -> None:
|
||||
"""write_to_json"""
|
||||
with open(filename, "w") as f:
|
||||
json.dump(records, f, cls=InfEncoder)
|
||||
|
||||
|
||||
@@ -25,32 +25,32 @@ import os
|
||||
import random
|
||||
import time
|
||||
import warnings
|
||||
import yaml
|
||||
import requests
|
||||
import copy
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
from collections.abc import AsyncGenerator, Iterable
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
from backend_request_func import (ASYNC_REQUEST_FUNCS,
|
||||
OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
import requests
|
||||
import yaml
|
||||
from backend_request_func import (
|
||||
ASYNC_REQUEST_FUNCS,
|
||||
OPENAI_COMPATIBLE_BACKENDS,
|
||||
RequestFuncInput,
|
||||
RequestFuncOutput,
|
||||
)
|
||||
from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from tqdm.asyncio import tqdm
|
||||
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
|
||||
from benchmark_dataset import (SampleRequest, EBDataset, EBChatDataset)
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkMetrics:
|
||||
"""Class containing all metrics that are used in this script"""
|
||||
|
||||
completed: int
|
||||
total_input: int
|
||||
total_output: int
|
||||
@@ -133,8 +133,7 @@ async def get_request(
|
||||
input_requests: Iterable[SampleRequest] = iter(input_requests)
|
||||
|
||||
# Calculate scale parameter theta to maintain the desired request_rate.
|
||||
assert burstiness > 0, (
|
||||
f"A positive burstiness factor is expected, but given {burstiness}.")
|
||||
assert burstiness > 0, f"A positive burstiness factor is expected, but given {burstiness}."
|
||||
theta = 1.0 / (request_rate * burstiness)
|
||||
|
||||
for request in input_requests:
|
||||
@@ -160,7 +159,7 @@ def calculate_metrics(
|
||||
) -> tuple[BenchmarkMetrics, list[int]]:
|
||||
"""Calculates various performance metrics based on the inputs and outputs."""
|
||||
input_lens: list[int] = []
|
||||
infer_input_lens: list[int] = []  # number of input tokens on the inference side
|
||||
actual_output_lens: list[int] = []
|
||||
total_input = 0
|
||||
completed = 0
|
||||
@@ -210,8 +209,9 @@ def calculate_metrics(
|
||||
s_e2els.append(outputs[i].arrival_time[-1])
|
||||
# exclude the first token when computing decode speed
|
||||
if len(outputs[i].arrival_time) > 2:
|
||||
s_decodes.append((outputs[i].output_tokens - 1) /
|
||||
(outputs[i].arrival_time[-1] - outputs[i].arrival_time[1]))
|
||||
s_decodes.append(
|
||||
(outputs[i].output_tokens - 1) / (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1])
|
||||
)
|
||||
completed += 1
|
||||
else:
|
||||
actual_output_lens.append(0)
|
||||
@@ -224,16 +224,13 @@ def calculate_metrics(
|
||||
|
||||
if "ttft" in goodput_config_dict:
|
||||
valid_metrics.append(ttfts)
|
||||
slo_values.append(goodput_config_dict["ttft"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "tpot" in goodput_config_dict:
|
||||
valid_metrics.append(all_tpots)
|
||||
slo_values.append(goodput_config_dict["tpot"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "e2el" in goodput_config_dict:
|
||||
valid_metrics.append(e2els)
|
||||
slo_values.append(goodput_config_dict["e2el"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
|
||||
for req_metric in zip(*valid_metrics):
|
||||
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||
@@ -242,9 +239,9 @@ def calculate_metrics(
|
||||
|
||||
if completed == 0:
|
||||
warnings.warn(
|
||||
"All requests failed. This is likely due to a misconfiguration "
|
||||
"on the benchmark arguments.",
|
||||
stacklevel=2)
|
||||
"All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.",
|
||||
stacklevel=2,
|
||||
)
|
||||
metrics = BenchmarkMetrics(
|
||||
completed=completed,
|
||||
total_input=total_input,
|
||||
@@ -253,64 +250,50 @@ def calculate_metrics(
|
||||
request_goodput=good_completed / dur_s,
|
||||
output_throughput=sum(actual_output_lens) / dur_s,
|
||||
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
||||
mean_s_decode=np.mean(s_decodes or 0) *
|
||||
1, # ttfts is empty if streaming is not supported by backend
|
||||
mean_s_decode=np.mean(s_decodes or 0) * 1, # ttfts is empty if streaming is not supported by backend
|
||||
std_s_decode=np.std(s_decodes or 0) * 1,
|
||||
median_s_decode=np.median(s_decodes or 0) * 1,
|
||||
percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1)
|
||||
for p in selected_percentiles],
|
||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
||||
1000, # ttfts is empty if streaming is not supported by backend
|
||||
percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) for p in selected_percentiles],
|
||||
mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
|
||||
std_ttft_ms=np.std(ttfts or 0) * 1000,
|
||||
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
||||
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
mean_s_ttft_ms=np.mean(s_ttfts or 0) *
|
||||
1000, # ttfts is empty if streaming is not supported by backend
|
||||
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles],
|
||||
mean_s_ttft_ms=np.mean(s_ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
|
||||
std_s_ttft_ms=np.std(s_ttfts or 0) * 1000,
|
||||
median_s_ttft_ms=np.median(s_ttfts or 0) * 1000,
|
||||
percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) for p in selected_percentiles],
|
||||
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
||||
std_tpot_ms=np.std(tpots or 0) * 1000,
|
||||
median_tpot_ms=np.median(tpots or 0) * 1000,
|
||||
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles],
|
||||
mean_itl_ms=np.mean(itls or 0) * 1000,
|
||||
std_itl_ms=np.std(itls or 0) * 1000,
|
||||
median_itl_ms=np.median(itls or 0) * 1000,
|
||||
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles],
|
||||
mean_s_itl_ms=np.mean(s_itls or 0) * 1000,
|
||||
std_s_itl_ms=np.std(s_itls or 0) * 1000,
|
||||
median_s_itl_ms=np.median(s_itls or 0) * 1000,
|
||||
percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) for p in selected_percentiles],
|
||||
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||
std_e2el_ms=np.std(e2els or 0) * 1000,
|
||||
median_e2el_ms=np.median(e2els or 0) * 1000,
|
||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles],
|
||||
mean_s_e2el_ms=np.mean(s_e2els or 0) * 1000,
|
||||
std_s_e2el_ms=np.std(s_e2els or 0) * 1000,
|
||||
median_s_e2el_ms=np.median(s_e2els or 0) * 1000,
|
||||
percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000) for p in selected_percentiles],
|
||||
mean_input_len=np.mean(input_lens or 0) * 1,
|
||||
std_input_len=np.std(input_lens or 0) * 1,
|
||||
median_input_len=np.median(input_lens or 0) * 1,
|
||||
percentiles_input_len=[(p, np.percentile(input_lens or 0, p))
|
||||
for p in selected_percentiles],
|
||||
percentiles_input_len=[(p, np.percentile(input_lens or 0, p)) for p in selected_percentiles],
|
||||
mean_s_input_len=np.mean(infer_input_lens or 0) * 1,
|
||||
std_s_input_len=np.std(infer_input_lens or 0) * 1,
|
||||
median_s_input_len=np.median(infer_input_lens or 0) * 1,
|
||||
percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p))
|
||||
for p in selected_percentiles],
|
||||
percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p)) for p in selected_percentiles],
|
||||
mean_output_len=np.mean(actual_output_lens or 0) * 1,
|
||||
std_output_len=np.std(actual_output_lens or 0) * 1,
|
||||
median_output_len=np.median(actual_output_lens or 0) * 1,
|
||||
percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p))
|
||||
for p in selected_percentiles],
|
||||
percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p)) for p in selected_percentiles],
|
||||
)
|
||||
|
||||
return metrics, actual_output_lens
|
||||
@@ -351,20 +334,22 @@ async def benchmark(
|
||||
|
||||
if lora_modules:
|
||||
# For each input request, choose a LoRA module at random.
|
||||
lora_modules = iter(
|
||||
[random.choice(lora_modules) \
|
||||
for _ in range(len(input_requests))])
|
||||
lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))])
|
||||
|
||||
if profile:
|
||||
print("Starting profiler...")
|
||||
profile_input = RequestFuncInput(model=model_id,
|
||||
model_name=model_name,
|
||||
prompt=test_prompt,
|
||||
api_url=base_url + "/start_profile",
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
ignore_eos=ignore_eos,
|
||||
extra_body=extra_body)
|
||||
test_prompt = None
|
||||
test_output_len = None
|
||||
profile_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
model_name=model_name,
|
||||
prompt=test_prompt,
|
||||
api_url=base_url + "/start_profile",
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
ignore_eos=ignore_eos,
|
||||
extra_body=extra_body,
|
||||
)
|
||||
profile_output = await request_func(request_func_input=profile_input)
|
||||
if profile_output.success:
|
||||
print("Profiler started")
|
||||
@@ -384,19 +369,16 @@ async def benchmark(
|
||||
# and it will simplify the code in limited_request_func.
|
||||
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
# if max_concurrency else contextlib.nullcontext())
|
||||
semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
if max_concurrency else None)
|
||||
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
||||
|
||||
async def limited_request_func(request_func_input, pbar):
|
||||
if semaphore is None:
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)
|
||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||
async with semaphore:
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)
|
||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||
|
||||
benchmark_start_time = time.perf_counter()
|
||||
|
||||
|
||||
print(f"开始时间:{datetime.now()}")
|
||||
tasks: list[asyncio.Task] = []
|
||||
async for request in get_request(input_requests, request_rate, burstiness):
|
||||
@@ -409,25 +391,26 @@ async def benchmark(
|
||||
req_lora_module = next(lora_modules)
|
||||
req_model_id, req_model_name = req_lora_module, req_lora_module
|
||||
|
||||
request_func_input = RequestFuncInput(model=req_model_id,
|
||||
model_name=req_model_name,
|
||||
prompt=prompt,
|
||||
prompt_len=0,
|
||||
history_QA=history_QA,
|
||||
hyper_parameters=hyper_parameters,
|
||||
api_url=api_url,
|
||||
output_len=output_len,
|
||||
logprobs=logprobs,
|
||||
ignore_eos=ignore_eos,
|
||||
extra_body=extra_body)
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
limited_request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)))
|
||||
request_func_input = RequestFuncInput(
|
||||
model=req_model_id,
|
||||
model_name=req_model_name,
|
||||
prompt=prompt,
|
||||
prompt_len=0,
|
||||
history_QA=history_QA,
|
||||
hyper_parameters=hyper_parameters,
|
||||
api_url=api_url,
|
||||
output_len=output_len,
|
||||
logprobs=logprobs,
|
||||
ignore_eos=ignore_eos,
|
||||
extra_body=extra_body,
|
||||
)
|
||||
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
|
||||
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||
print(f"完成时间:{datetime.now()}")
|
||||
if profile:
|
||||
print("Stopping profiler...")
|
||||
test_output_len = None
|
||||
test_output_len = None
|
||||
profile_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=test_prompt,
|
||||
@@ -454,22 +437,16 @@ async def benchmark(
|
||||
)
|
||||
print("Benchmark complete!!!")
|
||||
|
||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
||||
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
|
||||
benchmark_duration))
|
||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
||||
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
||||
print("{:<40} {:<10}".format("Total generated tokens:",
|
||||
metrics.total_output))
|
||||
print("{:<40} {:<10.3f}".format("Request throughput (req/s):",
|
||||
metrics.request_throughput))
|
||||
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
||||
print("{:<40} {:<10.3f}".format("Request throughput (req/s):", metrics.request_throughput))
|
||||
if goodput_config_dict:
|
||||
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||
metrics.request_goodput))
|
||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||
metrics.output_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
||||
metrics.total_token_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput))
|
||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput))
|
||||
|
||||
result = {
|
||||
"duration": benchmark_duration,
|
||||
@@ -477,8 +454,7 @@ async def benchmark(
|
||||
"total_input_tokens": metrics.total_input,
|
||||
"total_output_tokens": metrics.total_output,
|
||||
"request_throughput": metrics.request_throughput,
|
||||
"request_goodput:":
|
||||
metrics.request_goodput if goodput_config_dict else None,
|
||||
"request_goodput:": (metrics.request_goodput if goodput_config_dict else None),
|
||||
"output_throughput": metrics.output_throughput,
|
||||
"total_token_throughput": metrics.total_token_throughput,
|
||||
"input_lens": [output.prompt_len for output in outputs],
|
||||
@@ -491,7 +467,6 @@ async def benchmark(
|
||||
"reasoning_contents": [output.reasoning_content for output in outputs],
|
||||
"errors": [output.error for output in outputs],
|
||||
}
|
||||
quick_result = copy.deepcopy(result)
|
||||
|
||||
def process_one_metric(
|
||||
# E.g., "ttft"
|
||||
@@ -505,24 +480,25 @@ async def benchmark(
|
||||
# metric.
|
||||
if metric_attribute_name not in selected_percentile_metrics:
|
||||
return
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
|
||||
print("{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_name} (ms):",
|
||||
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
|
||||
print("{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_name} (ms):",
|
||||
getattr(metrics, f"median_{metric_attribute_name}_ms")))
|
||||
result[f"mean_{metric_attribute_name}_ms"] = getattr(
|
||||
metrics, f"mean_{metric_attribute_name}_ms")
|
||||
result[f"median_{metric_attribute_name}_ms"] = getattr(
|
||||
metrics, f"median_{metric_attribute_name}_ms")
|
||||
result[f"std_{metric_attribute_name}_ms"] = getattr(
|
||||
metrics, f"std_{metric_attribute_name}_ms")
|
||||
for p, value in getattr(metrics,
|
||||
f"percentiles_{metric_attribute_name}_ms"):
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_name} (ms):",
|
||||
getattr(metrics, f"mean_{metric_attribute_name}_ms"),
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_name} (ms):",
|
||||
getattr(metrics, f"median_{metric_attribute_name}_ms"),
|
||||
)
|
||||
)
|
||||
result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms")
|
||||
result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms")
|
||||
result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms")
|
||||
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
|
||||
p_word = str(int(p)) if int(p) == p else str(p)
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
|
||||
value))
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
|
||||
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
|
||||
|
||||
def process_one_length(
|
||||
@@ -537,31 +513,31 @@ async def benchmark(
|
||||
# metric.
|
||||
if metric_attribute_name not in selected_percentile_metrics:
|
||||
return
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
|
||||
print("{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_name}:",
|
||||
getattr(metrics, f"mean_{metric_attribute_name}")))
|
||||
print("{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_name}:",
|
||||
getattr(metrics, f"median_{metric_attribute_name}")))
|
||||
result[f"mean_{metric_attribute_name}"] = getattr(
|
||||
metrics, f"mean_{metric_attribute_name}")
|
||||
result[f"median_{metric_attribute_name}"] = getattr(
|
||||
metrics, f"median_{metric_attribute_name}")
|
||||
result[f"std_{metric_attribute_name}"] = getattr(
|
||||
metrics, f"std_{metric_attribute_name}")
|
||||
for p, value in getattr(metrics,
|
||||
f"percentiles_{metric_attribute_name}"):
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_name}:",
|
||||
getattr(metrics, f"mean_{metric_attribute_name}"),
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_name}:",
|
||||
getattr(metrics, f"median_{metric_attribute_name}"),
|
||||
)
|
||||
)
|
||||
result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}")
|
||||
result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}")
|
||||
result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}")
|
||||
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"):
|
||||
p_word = str(int(p)) if int(p) == p else str(p)
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:",
|
||||
value))
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value))
|
||||
result[f"p{p_word}_{metric_attribute_name}"] = value
|
||||
|
||||
process_one_length("s_decode", "Decode", "解码速度(tok/s)")
|
||||
process_one_metric("ttft", "TTFT", "Time to First Token")
|
||||
process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
|
||||
process_one_metric("tpot", "TPOT",
|
||||
"Time per Output Token (excl. 1st token)")
|
||||
process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
|
||||
process_one_metric("itl", "ITL", "Inter-token Latency")
|
||||
process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
|
||||
process_one_metric("e2el", "E2EL", "End-to-end Latency")
|
||||
@@ -581,6 +557,7 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
|
||||
"""
|
||||
Quick evaluation
|
||||
"""
|
||||
|
||||
def process_quick_metric(
|
||||
metric_attribute_name: str,
|
||||
metric_name: str,
|
||||
@@ -588,7 +565,7 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
|
||||
):
|
||||
if metric_attribute_name not in selected_percentile_metrics:
|
||||
return
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
|
||||
mean_value = getattr(metrics, f"mean_{metric_attribute_name}_ms")
|
||||
print("{:<40} {:<10.2f}".format(f"Mean {metric_name} (ms):", mean_value))
|
||||
quick_result[f"mean_{metric_attribute_name}_ms"] = mean_value
|
||||
@@ -600,17 +577,17 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
|
||||
):
|
||||
if metric_attribute_name not in selected_percentile_metrics:
|
||||
return
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
|
||||
mean_value = getattr(metrics, f"mean_{metric_attribute_name}")
|
||||
print("{:<40} {:<10.2f}".format(f"Mean {metric_name}:", mean_value))
|
||||
quick_result[f"mean_{metric_attribute_name}"] = mean_value
|
||||
|
||||
print("\n\n\n")
|
||||
print("{s:{c}^{n}}".format(s=' Benchmark Quick Summary ', n=50, c='='))
|
||||
print("{s:{c}^{n}}".format(s=" Benchmark Quick Summary ", n=50, c="="))
|
||||
process_quick_length("s_decode", "Decode", "解码速度(tok/s)")
|
||||
process_quick_metric("ttft", "TTFT", "Time to First Token")
|
||||
process_quick_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
|
||||
process_quick_metric("tpot", "TPOT",
|
||||
"Time per Output Token (excl. 1st token)")
|
||||
process_quick_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
|
||||
process_quick_metric("itl", "ITL", "Inter-token Latency")
|
||||
process_quick_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
|
||||
process_quick_metric("e2el", "E2EL", "End-to-end Latency")
|
||||
@@ -633,12 +610,14 @@ def check_goodput_args(args):
|
||||
raise ValueError(
|
||||
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
||||
"The service level objective name should be one of "
|
||||
f"{str(VALID_NAMES)}. ")
|
||||
f"{VALID_NAMES!s}. "
|
||||
)
|
||||
if slo_val < 0:
|
||||
raise ValueError(
|
||||
f"Invalid value found, {slo_name}: {slo_val}. "
|
||||
"The service level objective value should be "
|
||||
"non-negative.")
|
||||
"non-negative."
|
||||
)
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
@@ -652,37 +631,43 @@ def parse_goodput(slo_pairs):
|
||||
except ValueError as err:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Invalid format found for service level objectives. "
|
||||
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
'Specify service level objectives for goodput as "KEY:VALUE" '
|
||||
"pairs, where the key is a metric name, and the value is a "
|
||||
"number in milliseconds.") from err
|
||||
"number in milliseconds."
|
||||
) from err
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
results: dict[str, Any],
|
||||
file_name: str) -> None:
|
||||
def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any], file_name: str) -> None:
|
||||
"""Save the benchmarking results to PyTorch Benchmark Format JSON file"""
|
||||
metrics = [
|
||||
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
|
||||
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
|
||||
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
|
||||
"median_ttft_ms",
|
||||
"mean_ttft_ms",
|
||||
"std_ttft_ms",
|
||||
"p99_ttft_ms",
|
||||
"mean_tpot_ms",
|
||||
"median_tpot_ms",
|
||||
"std_tpot_ms",
|
||||
"p99_tpot_ms",
|
||||
"median_itl_ms",
|
||||
"mean_itl_ms",
|
||||
"std_itl_ms",
|
||||
"p99_itl_ms",
|
||||
]
|
||||
# These raw data might be useful, but they are rather big. They can be added
|
||||
# later if needed
|
||||
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
|
||||
pt_records = convert_to_pytorch_benchmark_format(
|
||||
args=args,
|
||||
metrics={k: [results[k]]
|
||||
for k in metrics},
|
||||
extra_info={
|
||||
k: results[k]
|
||||
for k in results if k not in metrics and k not in ignored_metrics
|
||||
})
|
||||
metrics={k: [results[k]] for k in metrics},
|
||||
extra_info={k: results[k] for k in results if k not in metrics and k not in ignored_metrics},
|
||||
)
|
||||
if pt_records:
|
||||
# Don't use json suffix here as we don't want CI to pick it up
|
||||
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
|
||||
write_to_json(pt_file, pt_records)
|
||||
|
||||
|
||||
def check_health(api_base_url: str) -> bool:
|
||||
health_url = api_base_url.rstrip("/") + "/health"
|
||||
try:
|
||||
@@ -697,6 +682,7 @@ def check_health(api_base_url: str) -> bool:
|
||||
print(f"[HEALTH] Failed to connect to {health_url}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
"""Main entry point"""
print(args)
@@ -707,7 +693,6 @@ def main(args: argparse.Namespace):
model_id = args.model
model_name = args.served_model_name
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
tokenizer_mode = args.tokenizer_mode

if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
@@ -717,23 +702,17 @@ def main(args: argparse.Namespace):
base_url = f"http://{args.host}:{args.port}"

if args.dataset_name is None:
raise ValueError(
"Please specify '--dataset-name' and the corresponding "
"'--dataset-path' if required.")
raise ValueError("Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.")

# For datasets that follow a similar structure, use a mapping.
dataset_mapping = {
"EB":
lambda: EBDataset(random_seed=args.seed,
dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
"EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
),
"EBChat":
lambda: EBChatDataset(random_seed=args.seed,
dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
"EBChat": lambda: EBChatDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
),
}

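dataset_mapping stores one zero-argument lambda per dataset name, so only the dataset the user asked for is actually constructed and sampled. The same pattern in isolation, with a stand-in class instead of EBDataset/EBChatDataset:

```python
import random


class DemoDataset:
    """Stand-in for EBDataset/EBChatDataset; only the lazy-mapping pattern is the point here."""

    def __init__(self, random_seed, dataset_path):
        self.rng = random.Random(random_seed)
        self.dataset_path = dataset_path

    def sample(self, num_requests, output_len=None):
        return [{"prompt": f"request-{i}", "output_len": output_len} for i in range(num_requests)]


dataset_mapping = {
    "EB": lambda: DemoDataset(random_seed=0, dataset_path="data.jsonl").sample(num_requests=2, output_len=64),
    "EBChat": lambda: DemoDataset(random_seed=0, dataset_path="chat.jsonl").sample(num_requests=2, output_len=64),
}

input_requests = dataset_mapping["EBChat"]()  # only the selected dataset is built
print(input_requests)
```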
@@ -751,15 +730,14 @@ def main(args: argparse.Namespace):
"top_p": args.top_p,
"top_k": args.top_k,
"min_p": args.min_p,
"temperature": args.temperature
}.items() if v is not None
"temperature": args.temperature,
}.items()
if v is not None
}

# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
raise ValueError(
"Sampling parameters are only supported by openai-compatible "
"backends.")
raise ValueError("Sampling parameters are only supported by openai-compatible " "backends.")

if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0  # Default to greedy decoding.
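The comprehension above simply drops every sampling parameter the user left unset, so only explicit values are forwarded to the backend, and temperature falls back to greedy decoding. The same logic in isolation, with placeholder argument values:

```python
# Placeholder values standing in for args.top_p, args.top_k, args.min_p, args.temperature.
args_top_p, args_top_k, args_min_p, args_temperature = 0.8, None, None, None

sampling_params = {
    k: v
    for k, v in {
        "top_p": args_top_p,
        "top_k": args_top_k,
        "min_p": args_min_p,
        "temperature": args_temperature,
    }.items()
    if v is not None
}

if "temperature" not in sampling_params:
    sampling_params["temperature"] = 0.0  # Default to greedy decoding.

print(sampling_params)  # {'top_p': 0.8, 'temperature': 0.0}
```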
@@ -790,15 +768,14 @@ def main(args: argparse.Namespace):
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
extra_body=sampling_params,
))
)
)

# Save config and results to json
if args.save_result:
@@ -819,22 +796,23 @@ def main(args: argparse.Namespace):
kvstring = item.split("=")
result_json[kvstring[0].strip()] = kvstring[1].strip()
else:
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format."
)
raise ValueError("Invalid metadata format. Please use KEY=VALUE format.")

if not args.save_detailed:
# Remove fields with too many data points
for field in [
"input_lens", "output_lens", "ttfts", "itls",
"generated_texts", "errors"
"input_lens",
"output_lens",
"ttfts",
"itls",
"generated_texts",
"errors",
]:
if field in result_json:
del result_json[field]

# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["request_rate"] = args.request_rate if args.request_rate < float("inf") else "inf"
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency

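Within the hunk above, each --metadata item is split as KEY=VALUE into the result JSON before the traffic fields are recorded; only the error message was collapsed onto one line. A small stand-alone version of that metadata parsing (parse_metadata is an illustrative name):

```python
def parse_metadata(items):
    """Turn ['version=0.1', 'gpu=A100'] into {'version': '0.1', 'gpu': 'A100'}."""
    result = {}
    for item in items:
        if "=" in item:
            key, value = item.split("=", 1)
            result[key.strip()] = value.strip()
        else:
            raise ValueError("Invalid metadata format. Please use KEY=VALUE format.")
    return result


print(parse_metadata(["version=0.1", "gpu=A100"]))
```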
@@ -843,21 +821,19 @@ def main(args: argparse.Namespace):

# Save to file
base_model_id = model_id.split("/")[-1]
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
if args.max_concurrency is not None else "")
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
max_concurrency_str = f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else ""
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w", encoding='utf-8') as outfile:
with open(file_name, "w", encoding="utf-8") as outfile:
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)

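The result-file name is only rewrapped here; it still concatenates backend, request rate, optional concurrency, model id, and timestamp. Evaluated with placeholder values:

```python
import os

# All values below are placeholders standing in for the parsed CLI arguments.
backend = "openai-chat"
request_rate = float("inf")
max_concurrency = 8
base_model_id = "org/my-model".split("/")[-1]
current_dt = "20250101-000000"
result_dir = "results"

max_concurrency_str = f"-concurrency{max_concurrency}" if max_concurrency is not None else ""
file_name = f"{backend}-{request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"
print(os.path.join(result_dir, file_name))
# results/openai-chat-infqps-concurrency8-my-model-20250101-000000.json
```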
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the online serving throughput.")
|
||||
parser = FlexibleArgumentParser(description="Benchmark the online serving throughput.")
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
type=str,
|
||||
@@ -883,18 +859,29 @@ if __name__ == "__main__":
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
default="sharegpt",
|
||||
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "EB", "EBChat"],
|
||||
choices=[
|
||||
"sharegpt",
|
||||
"burstgpt",
|
||||
"sonnet",
|
||||
"random",
|
||||
"hf",
|
||||
"EB",
|
||||
"EBChat",
|
||||
],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
)
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the sharegpt/sonnet dataset. "
|
||||
"Or the huggingface dataset ID if using HF dataset.")
|
||||
parser.add_argument("--hyperparameter-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the hyperparameter. ")
|
||||
parser.add_argument(
|
||||
"--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hyperparameter-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the hyperparameter. ",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-concurrency",
|
||||
type=int,
|
||||
@@ -906,7 +893,8 @@ if __name__ == "__main__":
|
||||
"initiated, this argument will control how many are actually allowed "
|
||||
"to execute at a time. This means that when used in combination, the "
|
||||
"actual request rate may be lower than specified with --request-rate, "
|
||||
"if the server is not processing requests fast enough to keep up.")
|
||||
"if the server is not processing requests fast enough to keep up.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
@@ -917,7 +905,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--tokenizer",
|
||||
type=str,
|
||||
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
help="Name or path of the tokenizer, if not using the default tokenizer.",
|
||||
)
|
||||
parser.add_argument("--use-beam-search", action="store_true")
|
||||
parser.add_argument(
|
||||
@@ -930,11 +918,13 @@ if __name__ == "__main__":
|
||||
"--logprobs",
|
||||
type=int,
|
||||
default=None,
|
||||
help=("Number of logprobs-per-token to compute & return as part of "
|
||||
"the request. If unspecified, then either (1) if beam search "
|
||||
"is disabled, no logprobs are computed & a single dummy "
|
||||
"logprob is returned for each token; or (2) if beam search "
|
||||
"is enabled 1 logprob per token is computed"),
|
||||
help=(
|
||||
"Number of logprobs-per-token to compute & return as part of "
|
||||
"the request. If unspecified, then either (1) if beam search "
|
||||
"is disabled, no logprobs are computed & a single dummy "
|
||||
"logprob is returned for each token; or (2) if beam search "
|
||||
"is enabled 1 logprob per token is computed"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--request-rate",
|
||||
@@ -971,8 +961,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
action="store_true",
|
||||
help="Use Torch Profiler. The endpoint must be launched with "
|
||||
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
|
||||
help="Use Torch Profiler. The endpoint must be launched with " "VLLM_TORCH_PROFILER_DIR to enable profiler.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save-result",
|
||||
@@ -1013,35 +1002,38 @@ if __name__ == "__main__":
|
||||
"--ignore-eos",
|
||||
action="store_true",
|
||||
help="Set ignore_eos flag when sending the benchmark request."
|
||||
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
|
||||
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--percentile-metrics",
|
||||
type=str,
|
||||
default="ttft,tpot,itl",
|
||||
help="Comma-separated list of selected metrics to report percentils. "
|
||||
"This argument specifies the metrics to report percentiles. "
|
||||
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
|
||||
"Default value is \"ttft,tpot,itl\".")
|
||||
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
|
||||
'Default value is "ttft,tpot,itl".',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--metric-percentiles",
|
||||
type=str,
|
||||
default="99",
|
||||
help="Comma-separated list of percentiles for selected metrics. "
|
||||
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
|
||||
"Default value is \"99\". "
|
||||
"Use \"--percentile-metrics\" to select metrics.",
|
||||
'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
|
||||
'Default value is "99". '
|
||||
'Use "--percentile-metrics" to select metrics.',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--goodput",
|
||||
nargs="+",
|
||||
required=False,
|
||||
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
help='Specify service level objectives for goodput as "KEY:VALUE" '
|
||||
"pairs, where the key is a metric name, and the value is in "
|
||||
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
|
||||
'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
|
||||
"separated by spaces. Allowed request level metric names are "
|
||||
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
|
||||
'"ttft", "tpot", "e2el". For more context on the definition of '
|
||||
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
|
||||
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
|
||||
"and the blog: https://hao-ai-lab.github.io/blogs/distserve",
|
||||
)
|
||||
|
||||
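Both percentile options above are plain comma-separated strings, parsed with the same split/float pattern already shown in the benchmark call (`selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")]`). A short sketch of turning such selections into numbers, using numpy here purely for illustration rather than as the script's own implementation:

```python
import numpy as np

percentile_metrics = "ttft,tpot,itl".split(",")
selected_percentiles = [float(p) for p in "25,50,99".split(",")]

# Placeholder per-request latencies in milliseconds.
latencies_ms = {
    "ttft": [31.0, 40.5, 52.3, 120.0],
    "tpot": [9.8, 10.2, 11.0, 15.4],
    "itl": [8.0, 9.5, 10.1, 14.2],
}

for metric in percentile_metrics:
    for p in selected_percentiles:
        value = float(np.percentile(latencies_ms[metric], p))
        print(f"p{p:g} {metric}: {value:.2f} ms")
```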
# group for dataset specific arguments
sonnet_group = parser.add_argument_group("sonnet dataset options")
@@ -1069,8 +1061,8 @@ if __name__ == "__main__":
"--sharegpt-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output length "
"from the ShareGPT dataset.")
help="Output length for each request. Overrides the output length " "from the ShareGPT dataset.",
)

random_group = parser.add_argument_group("random dataset options")
random_group.add_argument(
@@ -1098,29 +1090,24 @@ if __name__ == "__main__":
"--random-prefix-len",
type=int,
default=0,
help=("Number of fixed prefix tokens before the random context "
"in a request. "
"The total input length is the sum of `random-prefix-len` and "
"a random "
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."),
help=(
"Number of fixed prefix tokens before the random context "
"in a request. "
"The total input length is the sum of `random-prefix-len` and "
"a random "
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."
),
)

hf_group = parser.add_argument_group("hf dataset options")
hf_group.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
hf_group.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
hf_group.add_argument("--hf-subset", type=str, default=None, help="Subset of the HF dataset.")
hf_group.add_argument("--hf-split", type=str, default=None, help="Split of the HF dataset.")
hf_group.add_argument(
"--hf-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output lengths "
"from the sampled HF dataset.",
help="Output length for each request. Overrides the output lengths " "from the sampled HF dataset.",
)

sampling_group = parser.add_argument_group("sampling parameters")
@@ -1128,52 +1115,58 @@ if __name__ == "__main__":
"--top-p",
type=float,
default=None,
help="Top-p sampling parameter. Only has effect on openai-compatible "
"backends.")
help="Top-p sampling parameter. Only has effect on openai-compatible " "backends.",
)
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
help="Top-k sampling parameter. Only has effect on openai-compatible "
"backends.")
help="Top-k sampling parameter. Only has effect on openai-compatible " "backends.",
)
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
help="Min-p sampling parameter. Only has effect on openai-compatible "
"backends.")
help="Min-p sampling parameter. Only has effect on openai-compatible " "backends.",
)
sampling_group.add_argument(
"--temperature",
type=float,
default=None,
help="Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
"decoding (i.e. temperature==0.0).")
"decoding (i.e. temperature==0.0).",
)

parser.add_argument(
'--tokenizer-mode',
"--tokenizer-mode",
type=str,
default="auto",
choices=['auto', 'slow', 'mistral', 'custom'],
choices=["auto", "slow", "mistral", "custom"],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
'always use the slow tokenizer. \n* '
"always use the slow tokenizer. \n* "
'"mistral" will always use the `mistral_common` tokenizer. \n*'
'"custom" will use --tokenizer to select the preregistered tokenizer.')
'"custom" will use --tokenizer to select the preregistered tokenizer.',
)

parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ")
parser.add_argument(
"--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ",
)

parser.add_argument("--lora-modules",
nargs='+',
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.")
parser.add_argument(
"--lora-modules",
nargs="+",
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.",
)

args = parser.parse_args()

@@ -7,4 +7,4 @@ tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl
reasoning_parser: ernie-45-vl

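This hunk and the ones that follow only append a trailing newline to each benchmark YAML config; none of the settings change. If you want to inspect one of these configs from Python, a plain yaml.safe_load is enough (the file name below is a placeholder):

```python
import yaml  # provided by the PyYAML package

with open("benchmark_config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Keys mirror the diffs below, e.g. quantization, tensor_parallel_size, max_num_batched_tokens.
print(config.get("quantization"), config.get("tensor_parallel_size"))
```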
@@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674"
|
||||
pd_comm_port: "2334"
|
||||
max_num_batched_tokens: 384
|
||||
max_num_partial_prefills: 3
|
||||
max_long_partial_prefills: 3
|
||||
max_long_partial_prefills: 3
|
||||
|
||||
@@ -9,4 +9,4 @@ cache_queue_port: 55664
|
||||
engine_worker_queue_port: 6677
|
||||
cache_transfer_protocol: "rdma,ipc"
|
||||
rdma_comm_ports: "7675,7676,7677,7678"
|
||||
pd_comm_port: "2333"
|
||||
pd_comm_port: "2333"
|
||||
|
||||
@@ -10,4 +10,4 @@ engine_worker_queue_port: 6677
|
||||
num_gpu_blocks_override: 1024
|
||||
cache_transfer_protocol: "rdma"
|
||||
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
|
||||
pd_comm_port: "2334"
|
||||
pd_comm_port: "2334"
|
||||
|
||||
@@ -10,4 +10,4 @@ splitwise_role: decode
|
||||
engine_worker_queue_port: 6678
|
||||
cache_transfer_protocol: "rdma,ipc"
|
||||
rdma_comm_ports: "7671,7672,7673,7674"
|
||||
pd_comm_port: "2334"
|
||||
pd_comm_port: "2334"
|
||||
|
||||
@@ -9,4 +9,4 @@ cache_queue_port: 55664
|
||||
engine_worker_queue_port: 6677
|
||||
cache_transfer_protocol: "rdma,ipc"
|
||||
rdma_comm_ports: "7675,7676,7677,7678"
|
||||
pd_comm_port: "2333"
|
||||
pd_comm_port: "2333"
|
||||
|
||||
@@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674"
|
||||
pd_comm_port: "2334"
|
||||
max_num_batched_tokens: 384
|
||||
max_num_partial_prefills: 3
|
||||
max_long_partial_prefills: 3
|
||||
max_long_partial_prefills: 3
|
||||
|
||||
@@ -9,4 +9,4 @@ cache_queue_port: 55664
|
||||
engine_worker_queue_port: 6677
|
||||
cache_transfer_protocol: "rdma,ipc"
|
||||
rdma_comm_ports: "7675,7676,7677,7678"
|
||||
pd_comm_port: "2333"
|
||||
pd_comm_port: "2333"
|
||||
|
||||
@@ -3,4 +3,4 @@ max_num_seqs: 75
|
||||
gpu_memory_utilization: 0.85
|
||||
kv_cache_ratio: 0.75
|
||||
quantization: wint4
|
||||
tensor_parallel_size: 4
|
||||
tensor_parallel_size: 4
|
||||
|
||||
@@ -3,4 +3,4 @@ max_num_seqs: 25
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.75
|
||||
quantization: wint8
|
||||
tensor_parallel_size: 4
|
||||
tensor_parallel_size: 4
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
metadata:
|
||||
min_tokens: 32
|
||||
max_tokens: 33
|
||||
max_tokens: 33
|
||||
|
||||
@@ -5,4 +5,4 @@ metadata:
|
||||
max_tokens: 12288
|
||||
repetition_penalty: 1.05
|
||||
frequency_penalty: 0
|
||||
presence_penalty: 0
|
||||
presence_penalty: 0
|
||||
|
||||
@@ -5,4 +5,4 @@ metadata:
|
||||
max_tokens: 12288
|
||||
repetition_penalty: 1.0
|
||||
frequency_penalty: 0
|
||||
presence_penalty: 1.5
|
||||
presence_penalty: 1.5
|
||||
|
||||
@@ -8,4 +8,4 @@ frequency_penalty: 0
|
||||
presence_penalty: 0
|
||||
skip_special_tokens: false
|
||||
chat_template_kwargs:
|
||||
enable_thinking: true
|
||||
enable_thinking: true
|
||||
|
||||
@@ -3,4 +3,4 @@ max_num_seqs: 64
|
||||
gpu_memory_utilization: 0.9
|
||||
tensor_parallel_size: 8
|
||||
quantization: wint8
|
||||
reasoning_parser: ernie-x1
|
||||
reasoning_parser: ernie-x1
|
||||
|
||||