From 99258e19c8756fc0e0a5156dbb293b5a3e87c43b Mon Sep 17 00:00:00 2001
From: ophilia-lee <58770600+ophilia-lee@users.noreply.github.com>
Date: Tue, 23 Dec 2025 19:46:23 +0800
Subject: [PATCH] =?UTF-8?q?[Benchmark]=E6=94=AF=E6=8C=81Completions?=
 =?UTF-8?q?=E6=8E=A5=E5=8F=A3=20(#5700)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* benchmark工具支持受限解码场景指定response_format

* Update backend_request_func.py

output.success 的判断兼容思考内容超长被截断、回复内容为空的情况：仅当回复内容与思考内容均为空时才判定为失败

* Update benchmark_serving.py

更新benchmark_metrics

* 支持Completions接口

* 支持Completions接口

* 支持Completions接口

* [Benchmark]支持Completions接口

* [Benchmark]支持Completions接口

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
---
 .github/pull_request_template.md   |  2 +-
 benchmarks/backend_request_func.py |  3 ++-
 benchmarks/benchmark_dataset.py    | 17 ++++++++++-------
 benchmarks/benchmark_serving.py    |  1 +
 tests/ce/server/test_prompt_ids.py |  7 +++----
 5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index ba1712559..7f3c72aa8 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -8,7 +8,7 @@
 
 > :bulb: If this PR is a Cherry Pick, the PR title needs to follow the format by adding the [Cherry-Pick] label at the very beginning and appending the original PR ID at the end. For example, [Cherry-Pick][CI] Add check trigger and logic(#5191)
 
-> :bulb: 如若此PR是Cherry Pick，PR标题需遵循格式，在最开始加上[Cherry-Pick]标签，以及最后面加上原PR ID，例如[Cherry-Pick][CI] Add check trigger and logic(#5191) 
+> :bulb: 如若此PR是Cherry Pick，PR标题需遵循格式，在最开始加上[Cherry-Pick]标签，以及最后面加上原PR ID，例如[Cherry-Pick][CI] Add check trigger and logic(#5191)
 
 ## Modifications
 
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 40b719e06..d68180fc1 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -273,7 +273,8 @@ async def async_request_eb_openai_chat_completions(
                     # 新增metrics统计，计算首token过滤空包
                     output.metrics = metrics_summary(metrics_list, token_timestamps[1:])
 
-                    if output.generated_text.strip() == "":
+                    # 兼容思考内容超长截断的情况，此时回复内容为空
+                    if output.generated_text.strip() == "" and output.reasoning_content.strip() == "":
                         output.success = False
                         output.reasoning_tokens = output.output_tokens
                         output.error = "No generated text found!"
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 8c35867ad..ab7c8deb3 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -233,20 +233,23 @@ class EBDataset(BenchmarkDataset):
         for entry in self.data:
             if len(samples) >= num_requests:
                 break
+            json_data = entry
+
             prompt = entry["text"]
-            self.temperature = float(entry["temperature"])
-            self.repetition_penalty = float(entry["penalty_score"])
-            self.frequency_penalty = float(entry["frequency_score"])
-            self.presence_penalty = float(entry["presence_score"])
-            self.top_p = float(entry["topp"])
-            self.prompt_len = int(entry["input_token_num"])
-            new_output_len = int(entry["max_dec_len"])
+            self.temperature = float(entry.get("temperature", 1))
+            self.repetition_penalty = float(entry.get("penalty_score", 0))
+            self.frequency_penalty = float(entry.get("frequency_score", 0))
+            self.presence_penalty = float(entry.get("presence_score", 0))
+            self.top_p = float(entry.get("topp", 1))
+            self.prompt_len = int(entry.get("input_token_num", 0))
+            new_output_len = int(entry.get("max_dec_len", 0))
 
             if enable_multimodal_chat:
                 prompt = self.apply_multimodal_chat_transformation(prompt, None)
             samples.append(
                 SampleRequest(
                     no=cnt,
+                    json_data=json_data,
                     prompt=prompt,
                     prompt_len=self.prompt_len,
                     history_QA=[],
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index e48b65b4b..a87c8f8ac 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1233,6 +1233,7 @@ if __name__ == "__main__":
         type=str,
         default="EBChat",
         choices=[
+            "EB",
             "EBChat",
             "random",
         ],
diff --git a/tests/ce/server/test_prompt_ids.py b/tests/ce/server/test_prompt_ids.py
index e49b974c6..be05d4c18 100644
--- a/tests/ce/server/test_prompt_ids.py
+++ b/tests/ce/server/test_prompt_ids.py
@@ -24,9 +24,9 @@ def test_incremental_image_reasoning_consistency():
                         "type": "image_url",
                         "image_url": {
                             "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"
-                        }
+                        },
                     },
-                    {"type": "text", "text": "图中的文物属于哪个年代？"}
+                    {"type": "text", "text": "图中的文物属于哪个年代？"},
                 ],
             }
         ],
@@ -69,10 +69,9 @@ def test_incremental_image_reasoning_consistency():
                         "type": "image_url",
                         "image_url": {
                             "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"
-                        }
+                        },
                     },
                 ],
-
             }
         ],
         "prompt_token_ids": tokens1,