update benchmark tools (#4416)

Zhang Yulong, 2025-10-15 11:15:25 +08:00 (committed by GitHub)
parent 4b647d17de · commit c4f866c457
2 changed files with 37 additions and 4 deletions

View File

@@ -58,10 +58,12 @@ class RequestFuncOutput:
     """Output for requesting LLMs via API"""
     no: int = 0
+    request_id: str = ""
     generated_text: str = ""
     reasoning_content: str = ""
     success: bool = False
     latency: float = 0.0
+    end_timestamp: float = 0.0  # Timestamp at which the model finished responding (seconds, perf_counter-based)
     output_tokens: int = 0
     ttft: float = 0.0  # Time to first token
     arrival_time: list = field(default_factory=list)  # arrival_time
@@ -154,6 +156,8 @@ async def async_request_eb_openai_chat_completions(
                     most_recent_timestamp = timestamp
                 # output.generated_text = generated_text
+                # When the stream ends, record the timestamp at which the last chunk was received
+                output.end_timestamp = most_recent_timestamp
                 if output.generated_text.strip() == "":
                     output.success = False
                     output.error = "No generated text found!"
@@ -170,6 +174,7 @@ async def async_request_eb_openai_chat_completions(
                     )
                     output.error = error_text or ""
                     output.success = False
+                    output.request_id = data.get("id", "")
         except Exception:
             output.success = False
             exc_info = sys.exc_info()
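The two fields added above, request_id and end_timestamp, are populated per request while the response stream is consumed. Below is a minimal, runnable sketch of that timing pattern; fake_stream is a hypothetical stand-in for the streamed chat-completions body and is not part of the benchmark code.

import asyncio
import time
from dataclasses import dataclass


@dataclass
class RequestFuncOutput:
    # Trimmed-down stand-in for the dataclass in this diff; only the
    # fields relevant to end-of-stream timing are kept.
    request_id: str = ""
    end_timestamp: float = 0.0  # perf_counter-based arrival time of the last chunk


async def fake_stream():
    # Hypothetical stand-in for a streamed chat-completions response body.
    for chunk in (b"Hello", b" ", b"world"):
        await asyncio.sleep(0.01)
        yield chunk


async def consume_stream() -> RequestFuncOutput:
    output = RequestFuncOutput(request_id="demo-request")
    most_recent_timestamp = time.perf_counter()
    async for _chunk in fake_stream():
        # Updated on every chunk, like most_recent_timestamp in the diff.
        most_recent_timestamp = time.perf_counter()
    # After the loop, the last update is the final chunk's arrival time.
    output.end_timestamp = most_recent_timestamp
    return output


print(asyncio.run(consume_stream()).end_timestamp)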

View File

@@ -150,7 +150,7 @@ async def get_request(
 def calculate_metrics(
-    input_requests: list[SampleRequest],
+    # input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
     dur_s: float,
     selected_percentiles: list[float],
@@ -395,6 +395,7 @@ async def benchmark(
     print(f"Traffic request rate: {request_rate}")
     print(f"Burstiness factor: {burstiness} ({distribution})")
     print(f"Maximum request concurrency: {max_concurrency}")
+    print(f"Drop ratio: {args.drop_ratio}")
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
@@ -443,6 +444,8 @@ async def benchmark(
         tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
     outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
+    outputs.sort(key=lambda x: x.end_timestamp)
     if profile:
         print("Stopping profiler...")
         profile_input = RequestFuncInput(
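The added sort matters because asyncio.gather returns results in task submission order, not completion order, while the drop-ratio trimming in the next hunk assumes outputs ordered by completion time. A small illustrative sketch, where the fake_request coroutine is hypothetical:

import asyncio
import random
import time


async def fake_request(i: int) -> tuple[int, float]:
    # Hypothetical request: finishes after a random delay and reports
    # its (index, end_timestamp) pair.
    await asyncio.sleep(random.uniform(0.01, 0.05))
    return (i, time.perf_counter())


async def main() -> None:
    outputs = await asyncio.gather(*(fake_request(i) for i in range(5)))
    # gather preserves submission order 0..4 even though completion
    # times are shuffled; sorting recovers completion order.
    outputs.sort(key=lambda x: x[1])
    print([i for i, _ in outputs])


asyncio.run(main())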
@@ -460,11 +463,30 @@ async def benchmark(
     if pbar is not None:
         pbar.close()
-    benchmark_duration = time.perf_counter() - benchmark_start_time
-    print("benchmark_duration:", benchmark_duration)
+    drop_ratio = args.drop_ratio
+    if 0.0 < drop_ratio < 1:
+        # Per drop_ratio, drop half of the requests from the head and half from the tail; they are excluded from the benchmark statistics
+        n = len(outputs)
+        drop_count = int(n * drop_ratio)
+        half = drop_count // 2
+        if half > 0:
+            outputs = outputs[half : n - half]
+        # Compute the total duration from the timestamps at which each request's last chunk was received
+        if len(outputs) >= 2:
+            benchmark_duration = outputs[-1].end_timestamp - outputs[0].end_timestamp
+        else:
+            benchmark_duration = 0.0
+        print(f"Requests before dropping: {n}")
+        print(f"Requests after dropping: {len(outputs)}")
+        print(f"benchmark_duration: {benchmark_duration}")
+    else:
+        benchmark_duration = time.perf_counter() - benchmark_start_time
+        print(f"benchmark_duration: {benchmark_duration}")
     metrics, actual_output_lens = calculate_metrics(
-        input_requests=input_requests,
+        # input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
         # tokenizer=tokenizer,
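A worked example of the trimming above, with invented end timestamps: for n = 10 and drop_ratio = 0.4, drop_count = 4 and half = 2, so the two earliest and two latest completions are excluded before the duration is measured.

# Standalone sketch of the head/tail trimming; the timestamps are made up.
end_timestamps = [1.0, 1.25, 1.5, 2.0, 2.25, 2.5, 2.75, 3.0, 3.5, 4.0]  # already sorted

drop_ratio = 0.4
n = len(end_timestamps)
drop_count = int(n * drop_ratio)  # 4
half = drop_count // 2            # 2
if half > 0:
    end_timestamps = end_timestamps[half : n - half]  # keeps indices 2..7

# Duration between the first and last surviving completions.
duration = end_timestamps[-1] - end_timestamps[0] if len(end_timestamps) >= 2 else 0.0
print(n, len(end_timestamps), duration)  # 10 6 1.5

With the default drop_ratio of 0.0 (or any value outside the open interval (0, 1)), the trimming branch is skipped and benchmark_duration falls back to the wall-clock measurement, so existing runs behave as before.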
@@ -1081,6 +1103,12 @@ if __name__ == "__main__":
         action="store_true",
         help="shuffle dataset",
     )
+    parser.add_argument(
+        "--drop-ratio",
+        type=float,
+        default=0.0,
+        help="Fraction of outputs to drop, half from the head and half from the tail, before computing metrics. Must be in [0, 1).",
+    )
     parser.add_argument(
         "--trust-remote-code",
         action="store_true",