From 70aa7423f8ce0e339e9273fd82b5270d5f9dac00 Mon Sep 17 00:00:00 2001 From: ophilia-lee <58770600+ophilia-lee@users.noreply.github.com> Date: Mon, 27 Oct 2025 18:52:56 +0800 Subject: [PATCH] =?UTF-8?q?benchmark=E5=B7=A5=E5=85=B7=E9=80=82=E9=85=8DSG?= =?UTF-8?q?Lang=E6=A1=86=E6=9E=B6=20(#4607)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * benchmark工具适配SGLang框架 * benchmark工具适配SGLang框架 * benchmark工具适配SGLang框架 --- benchmarks/backend_request_func.py | 14 +++++++++----- benchmarks/benchmark_serving.py | 2 +- .../{vLLM_default.yaml => request.yaml} | 14 +++++++------- 3 files changed, 17 insertions(+), 13 deletions(-) rename benchmarks/yaml/request_yaml/{vLLM_default.yaml => request.yaml} (53%) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 837d9df91..596804331 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -128,13 +128,13 @@ async def async_request_eb_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": - # print("####chunk:", chunk, type(chunk)) + #print("####chunk:", chunk, type(chunk)) timestamp = time.perf_counter() data = json.loads(chunk) if request_id == "None" and "id" in data: request_id = data["id"] - + if choices := data.get("choices"): content = choices[0]["delta"].get("content") reason_content = choices[0]["delta"].get("reasoning_content") @@ -143,9 +143,12 @@ async def async_request_eb_openai_chat_completions( ttft = timestamp - st output.ttft = ttft # cached_tokens - output.prompt_len = ( - data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0) - ) + if data["usage"] and data["usage"].get("prompt_tokens_details", {}): + output.prompt_len = ( + data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0) + ) + else: + output.prompt_len = 0 # Decoding phase else: @@ -157,6 +160,7 @@ async def async_request_eb_openai_chat_completions( elif usage := data.get("usage", {}): output.output_tokens = usage.get("completion_tokens", 0) output.prompt_tokens = usage.get("prompt_tokens", 0) + most_recent_timestamp = timestamp diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index ce072555f..fb13301c4 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -635,7 +635,7 @@ def benchmark_metrics( goodput_config_dict = check_goodput_args(args) metrics, actual_output_lens = calculate_metrics( - input_requests=input_requests, + # input_requests=input_requests, outputs=outputs, dur_s=benchmark_duration, selected_percentiles=selected_percentiles, diff --git a/benchmarks/yaml/request_yaml/vLLM_default.yaml b/benchmarks/yaml/request_yaml/request.yaml similarity index 53% rename from benchmarks/yaml/request_yaml/vLLM_default.yaml rename to benchmarks/yaml/request_yaml/request.yaml index a6385823b..9fc603354 100644 --- a/benchmarks/yaml/request_yaml/vLLM_default.yaml +++ b/benchmarks/yaml/request_yaml/request.yaml @@ -1,11 +1,11 @@ -top_p: 1.0 -temperature: 1.0 -metadata: - min_tokens: 1 -max_tokens: 30721 +top_p: 0.8 +temperature: 0.8 +max_tokens: 12288 repetition_penalty: 1.0 frequency_penalty: 0 presence_penalty: 0 -skip_special_tokens: false +metadata: + enable_thinking: false + min_tokens: 1 chat_template_kwargs: - enable_thinking: true + enable_thinking: false