mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Benchmark] Update benchmark (#5496)
* update benchmark * update benchmark
This commit is contained in:
@@ -318,6 +318,7 @@ async def benchmark(
|
||||
selected_percentiles: list[float],
|
||||
ignore_eos: bool,
|
||||
debug: bool,
|
||||
pd_metrics: bool,
|
||||
goodput_config_dict: dict[str, float],
|
||||
max_concurrency: Optional[int],
|
||||
lora_modules: Optional[Iterable[str]],
|
||||
@@ -352,6 +353,7 @@ async def benchmark(
|
||||
logprobs=logprobs,
|
||||
ignore_eos=ignore_eos,
|
||||
debug=debug,
|
||||
pd_metrics=pd_metrics,
|
||||
extra_body=extra_body,
|
||||
response_format=response_format,
|
||||
random_flag=random_flag,
|
||||
@@ -446,6 +448,7 @@ async def benchmark(
|
||||
output_len=output_len,
|
||||
logprobs=logprobs,
|
||||
debug=debug,
|
||||
pd_metrics=pd_metrics,
|
||||
ignore_eos=ignore_eos,
|
||||
extra_body=extra_body,
|
||||
response_format=response_format,
|
||||
@@ -548,6 +551,7 @@ async def benchmark(
|
||||
"generated_texts": [output.generated_text for output in outputs],
|
||||
"reasoning_contents": [output.reasoning_content for output in outputs],
|
||||
"errors": [output.error for output in outputs],
|
||||
"metrics": [output.metrics for output in outputs],
|
||||
}
|
||||
|
||||
def process_one_metric(
|
||||
@@ -583,6 +587,49 @@ async def benchmark(
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
|
||||
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
|
||||
|
||||
def process_pd_metrics(model_outputs, metric_key):
|
||||
# 收集所有该 metric 的数值
|
||||
values = []
|
||||
percentiles = []
|
||||
for p in args.metric_percentiles.split(","):
|
||||
p = p.strip()
|
||||
if p:
|
||||
percentiles.append(float(p))
|
||||
for item in model_outputs:
|
||||
metrics = item.metrics
|
||||
if metrics.get(metric_key, None) is not None:
|
||||
values.append(metrics[metric_key])
|
||||
|
||||
if not values:
|
||||
print(f"[WARN] metric_key '{metric_key}' not found in outputs.")
|
||||
return
|
||||
|
||||
arr = np.array(values) * 1000 # 秒 -> 毫秒
|
||||
|
||||
print("{s:{c}^{n}}".format(s=metric_key, n=50, c="-"))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_key} (ms):",
|
||||
np.mean(arr),
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_key} (ms):",
|
||||
np.median(arr),
|
||||
)
|
||||
)
|
||||
for p in percentiles:
|
||||
v = np.percentile(arr, p)
|
||||
print("{:<40} {:<10.2f}".format(f"P{str(int(p)) if int(p) == p else str(p)} {metric_key} (ms):", v))
|
||||
# print(f"P{str(int(p)) if int(p) == p else str(p)} {metric_key} (ms): {v:10.2f}")
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Successful {metric_key}:",
|
||||
len(arr),
|
||||
)
|
||||
)
|
||||
|
||||
def process_one_length(
|
||||
# E.g., "ttft"
|
||||
metric_attribute_name: str,
|
||||
@@ -624,6 +671,19 @@ async def benchmark(
|
||||
process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
|
||||
process_one_metric("e2el", "E2EL", "End-to-end Latency")
|
||||
process_one_metric("s_e2el", "S_E2EL", "Infer End-to-end Latency")
|
||||
if any(item.metrics for item in outputs):
|
||||
process_pd_metrics(outputs, "prefill_cost_time")
|
||||
process_pd_metrics(outputs, "prefill_prepare_cost_time")
|
||||
process_pd_metrics(outputs, "preprocess_cost_time")
|
||||
process_pd_metrics(outputs, "cache_in_scheduler_cost_time")
|
||||
process_pd_metrics(outputs, "ask_decode_resource_cost_time")
|
||||
process_pd_metrics(outputs, "prefill_first_token_infer_cost_time")
|
||||
process_pd_metrics(outputs, "wait_sending_cache_cost_time")
|
||||
process_pd_metrics(outputs, "decode_preallocate_cost_time")
|
||||
process_pd_metrics(outputs, "decode_prepare_cost_time")
|
||||
process_pd_metrics(outputs, "decode_second_token_infer_cost_time")
|
||||
process_pd_metrics(outputs, "first_token_transmission_cost_time")
|
||||
process_pd_metrics(outputs, "second_token_transmission_cost_time")
|
||||
process_one_length("input_len", "Cached Tokens", "Cached Tokens")
|
||||
process_one_length("s_input_len", "Input Length", "Infer Input Length")
|
||||
process_one_length("output_len", "Output Length", "Output Length")
|
||||
@@ -941,6 +1001,7 @@ def main(args: argparse.Namespace):
|
||||
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
|
||||
ignore_eos=args.ignore_eos,
|
||||
debug=args.debug,
|
||||
pd_metrics=args.pd_metrics,
|
||||
goodput_config_dict=goodput_config_dict,
|
||||
max_concurrency=args.max_concurrency,
|
||||
lora_modules=args.lora_modules,
|
||||
@@ -1129,6 +1190,11 @@ if __name__ == "__main__":
|
||||
action="store_true",
|
||||
help="shuffle dataset",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pd-metrics",
|
||||
action="store_true",
|
||||
help="请求时增加PD分离参数,metrics: True",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--drop-ratio",
|
||||
type=float,
|
||||
|
||||
Reference in New Issue
Block a user