Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-20 15:25:22 +08:00)
update benchmark tools (#4416)
@@ -58,10 +58,12 @@ class RequestFuncOutput:
     """Output for requesting LLMs via API"""
 
     no: int = 0
+    request_id: str = ""
     generated_text: str = ""
     reasoning_content: str = ""
     success: bool = False
     latency: float = 0.0
+    end_timestamp: float = 0.0  # timestamp at which the model has fully returned (seconds, perf_counter based)
     output_tokens: int = 0
     ttft: float = 0.0  # Time to first token
     arrival_time: list = field(default_factory=list)  # arrival_time
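This hunk adds two fields to the per-request output record: request_id, filled from the server-assigned id of the request, and end_timestamp, the perf_counter reading taken when the last streamed chunk arrives. A minimal sketch of the dataclass as it stands after the change (fields not shown in the diff are omitted):

```python
from dataclasses import dataclass, field


@dataclass
class RequestFuncOutput:
    """Output for requesting LLMs via API"""

    no: int = 0
    request_id: str = ""        # new: id returned by the server for this request
    generated_text: str = ""
    reasoning_content: str = ""
    success: bool = False
    latency: float = 0.0
    end_timestamp: float = 0.0  # new: time.perf_counter() reading at the last chunk
    output_tokens: int = 0
    ttft: float = 0.0           # time to first token
    arrival_time: list = field(default_factory=list)
```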
@@ -154,6 +156,8 @@ async def async_request_eb_openai_chat_completions(
                        most_recent_timestamp = timestamp
 
                # output.generated_text = generated_text
+                # When streaming ends, record the timestamp at which the last chunk was received
+                output.end_timestamp = most_recent_timestamp
                if output.generated_text.strip() == "":
                    output.success = False
                    output.error = "No generated text found!"
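The streaming loop already refreshes most_recent_timestamp on every chunk; the new lines copy the final value into output.end_timestamp once the stream is exhausted. A hedged, self-contained sketch of that pattern (the chunk iterator below is a stand-in, not the benchmark's actual aiohttp handling):

```python
import asyncio
import time


async def fake_stream():
    # Stand-in for the real SSE chunk iterator used by the benchmark.
    for piece in ["Hello", ", ", "world"]:
        await asyncio.sleep(0.01)
        yield piece


async def consume_stream():
    st = time.perf_counter()
    most_recent_timestamp = st
    ttft = 0.0
    text = ""
    async for chunk in fake_stream():
        timestamp = time.perf_counter()
        if ttft == 0.0:
            ttft = timestamp - st          # time to first token
        text += chunk
        most_recent_timestamp = timestamp  # refreshed on every chunk
    # When streaming ends, the last chunk's timestamp marks completion.
    end_timestamp = most_recent_timestamp
    return text, ttft, end_timestamp


print(asyncio.run(consume_stream()))
```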
@@ -170,6 +174,7 @@ async def async_request_eb_openai_chat_completions(
                    )
                    output.error = error_text or ""
                    output.success = False
+            output.request_id = data.get("id", "")
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
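The other addition stores the response's id on the output, using data.get("id", "") so a missing field degrades to an empty string. Illustrative values only; the payload shape below mirrors an OpenAI-style chat completion chunk:

```python
import json

data = json.loads(
    '{"id": "chatcmpl-123", "object": "chat.completion.chunk",'
    ' "choices": [{"index": 0, "delta": {"content": "Hi"}}]}'
)
request_id = data.get("id", "")  # "" when the server sends no id
print(request_id)                # chatcmpl-123
```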
@@ -150,7 +150,7 @@ async def get_request(
 
 
 def calculate_metrics(
-    input_requests: list[SampleRequest],
+    # input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
     dur_s: float,
     selected_percentiles: list[float],
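With input_requests commented out of the signature (the call site is updated to match in a later hunk), the metric computation presumably leans on the outputs alone, e.g. the per-request output_tokens already recorded on RequestFuncOutput. A rough sketch of that idea, not the real calculate_metrics:

```python
def summarize(outputs, dur_s):
    # Assumes each output carries success and output_tokens, as in the dataclass above.
    completed = [o for o in outputs if o.success]
    total_output_tokens = sum(o.output_tokens for o in completed)
    return {
        "completed": len(completed),
        "output_throughput_tok_s": total_output_tokens / dur_s if dur_s > 0 else 0.0,
    }
```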
@@ -395,6 +395,7 @@ async def benchmark(
     print(f"Traffic request rate: {request_rate}")
     print(f"Burstiness factor: {burstiness} ({distribution})")
     print(f"Maximum request concurrency: {max_concurrency}")
+    print(f"Drop ratio: {args.drop_ratio}")
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
@@ -443,6 +444,8 @@ async def benchmark(
         tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
     outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
 
+    outputs.sort(key=lambda x: x.end_timestamp)
+
     if profile:
         print("Stopping profiler...")
         profile_input = RequestFuncInput(
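asyncio.gather returns results in task submission order, not completion order, so the outputs are re-sorted by end_timestamp; the head/tail trimming in the next hunk depends on this ordering. A toy illustration:

```python
from dataclasses import dataclass


@dataclass
class Out:
    no: int
    end_timestamp: float


outputs = [Out(0, 12.8), Out(1, 10.4), Out(2, 11.9)]
outputs.sort(key=lambda x: x.end_timestamp)
print([o.no for o in outputs])  # [1, 2, 0], i.e. ordered by completion time
```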
@@ -460,11 +463,30 @@ async def benchmark(
     if pbar is not None:
         pbar.close()
 
-    benchmark_duration = time.perf_counter() - benchmark_start_time
-    print("benchmark_duration:", benchmark_duration)
+    drop_ratio = args.drop_ratio
+    if 0.0 < drop_ratio < 1:
+        # Per drop_ratio, discard half of the dropped requests from the head and half from the tail; they are excluded from the benchmark statistics
+        n = len(outputs)
+        drop_count = int(n * drop_ratio)
+        half = drop_count // 2
+        if half > 0:
+            outputs = outputs[half : n - half]
+
+        # Compute the total duration from the timestamps of the last received chunks
+        if len(outputs) >= 2:
+            benchmark_duration = outputs[-1].end_timestamp - outputs[0].end_timestamp
+        else:
+            benchmark_duration = 0.0
+
+        print(f"Count before dropping: {n}")
+        print(f"Count after dropping: {len(outputs)}")
+        print(f"benchmark_duration: {benchmark_duration} seconds")
+    else:
+        benchmark_duration = time.perf_counter() - benchmark_start_time
+        print(f"benchmark_duration: {benchmark_duration} seconds")
 
     metrics, actual_output_lens = calculate_metrics(
-        input_requests=input_requests,
+        # input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
         # tokenizer=tokenizer,
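The trimming discards drop_ratio of the requests, half from the head and half from the tail of the completion-ordered list, presumably to exclude warm-up and ramp-down effects from the steady-state measurement; the duration is then the span between the first and last surviving end_timestamp. A worked example of the arithmetic with illustrative numbers:

```python
n = 100                            # completed requests, sorted by end_timestamp
drop_ratio = 0.1
drop_count = int(n * drop_ratio)   # 10
half = drop_count // 2             # 5 dropped from the head, 5 from the tail
lo, hi = half, n - half            # keep indices 5..94, i.e. outputs[5:95]
print(drop_count, half, lo, hi)    # 10 5 5 95
# benchmark_duration = outputs[94].end_timestamp - outputs[5].end_timestamp
```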
@@ -1081,6 +1103,12 @@ if __name__ == "__main__":
         action="store_true",
         help="shuffle dataset",
     )
+    parser.add_argument(
+        "--drop-ratio",
+        type=float,
+        default=0.0,
+        help="Drop ratio of the outputs. [0, 1)",
+    )
     parser.add_argument(
         "--trust-remote-code",
         action="store_true",
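The new --drop-ratio flag defaults to 0.0, which leaves the wall-clock duration path untouched; only values in (0, 1) trigger the trimming above, and argparse itself does not range-check the input. A minimal sketch of the flag in isolation (the real parser defines many more arguments):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--drop-ratio",
    type=float,
    default=0.0,
    help="Drop ratio of the outputs. [0, 1)",
)
args = parser.parse_args(["--drop-ratio", "0.1"])
print(args.drop_ratio)  # 0.1
```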