diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 002257f2a..a00c8e005 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -58,10 +58,12 @@ class RequestFuncOutput:
     """Output for requesting LLMs via API"""
 
     no: int = 0
+    request_id: str = ""
     generated_text: str = ""
     reasoning_content: str = ""
     success: bool = False
     latency: float = 0.0
+    end_timestamp: float = 0.0  # Timestamp at which the model finished responding (seconds, perf_counter based)
     output_tokens: int = 0
     ttft: float = 0.0  # Time to first token
     arrival_time: list = field(default_factory=list)  # arrival_time
@@ -154,6 +156,8 @@ async def async_request_eb_openai_chat_completions(
                         most_recent_timestamp = timestamp
 
                 # output.generated_text = generated_text
+                # Once the stream ends, record the timestamp of the last received chunk
+                output.end_timestamp = most_recent_timestamp
                 if output.generated_text.strip() == "":
                     output.success = False
                     output.error = "No generated text found!"
@@ -170,6 +174,7 @@ async def async_request_eb_openai_chat_completions(
                 )
                 output.error = error_text or ""
                 output.success = False
+                output.request_id = data.get("id", "")
     except Exception:
         output.success = False
         exc_info = sys.exc_info()
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 47c31e7e4..da5ff402a 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -150,7 +150,7 @@ async def get_request(
 
 
 def calculate_metrics(
-    input_requests: list[SampleRequest],
+    # input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
     dur_s: float,
     selected_percentiles: list[float],
@@ -395,6 +395,7 @@ async def benchmark(
     print(f"Traffic request rate: {request_rate}")
     print(f"Burstiness factor: {burstiness} ({distribution})")
     print(f"Maximum request concurrency: {max_concurrency}")
+    print(f"Drop ratio: {args.drop_ratio}")
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
@@ -443,6 +444,8 @@ async def benchmark(
         tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
     outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
 
+    outputs.sort(key=lambda x: x.end_timestamp)
+
     if profile:
         print("Stopping profiler...")
         profile_input = RequestFuncInput(
@@ -460,11 +463,30 @@ async def benchmark(
     if pbar is not None:
         pbar.close()
 
-    benchmark_duration = time.perf_counter() - benchmark_start_time
-    print("benchmark_duration:", benchmark_duration)
+    drop_ratio = args.drop_ratio
+    if 0.0 < drop_ratio < 1.0:
+        # Per drop_ratio, drop half of the requests from the head and half from the tail; they are excluded from the benchmark statistics
+        n = len(outputs)
+        drop_count = int(n * drop_ratio)
+        half = drop_count // 2
+        if half > 0:
+            outputs = outputs[half : n - half]
+
+        # Compute the total duration from the timestamps of the last received chunks
+        if len(outputs) >= 2:
+            benchmark_duration = outputs[-1].end_timestamp - outputs[0].end_timestamp
+        else:
+            benchmark_duration = 0.0
+
+        print(f"Requests before dropping: {n}")
+        print(f"Requests after dropping: {len(outputs)}")
+        print(f"benchmark_duration: {benchmark_duration} seconds")
+    else:
+        benchmark_duration = time.perf_counter() - benchmark_start_time
+        print(f"benchmark_duration: {benchmark_duration} seconds")
 
     metrics, actual_output_lens = calculate_metrics(
-        input_requests=input_requests,
+        # input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
         # tokenizer=tokenizer,
@@ -1081,6 +1103,12 @@ if __name__ == "__main__":
         action="store_true",
         help="shuffle dataset",
     )
+    parser.add_argument(
+        "--drop-ratio",
+        type=float,
+        default=0.0,
+        help="Fraction of completed requests to drop, split between the earliest and latest, before computing metrics. Range: [0, 1).",
+    )
     parser.add_argument(
         "--trust-remote-code",
         action="store_true",
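
For readers of the patch, the sketch below is a minimal standalone illustration of the trimming logic added to benchmark(): sort completed requests by end_timestamp, drop drop_ratio of them split evenly between head and tail, and measure the window spanned by the remaining completions. Only the end_timestamp field and the drop-ratio semantics come from the diff; CompletedRequest and trimmed_duration are hypothetical names used for the example.

from dataclasses import dataclass


@dataclass
class CompletedRequest:
    # Stand-in for RequestFuncOutput; only the field the trimming logic reads.
    end_timestamp: float  # perf_counter-based time the last chunk arrived


def trimmed_duration(outputs: list[CompletedRequest], drop_ratio: float) -> tuple[list[CompletedRequest], float]:
    """Sort by completion time, drop drop_ratio of requests (half head, half tail),
    and measure the duration between the first and last remaining completions."""
    outputs = sorted(outputs, key=lambda o: o.end_timestamp)
    if 0.0 < drop_ratio < 1.0:
        half = int(len(outputs) * drop_ratio) // 2
        if half > 0:
            outputs = outputs[half : len(outputs) - half]
    duration = outputs[-1].end_timestamp - outputs[0].end_timestamp if len(outputs) >= 2 else 0.0
    return outputs, duration


# Ten requests completing one second apart; drop_ratio=0.2 removes one from each end,
# so the measured window runs from the 2nd to the 9th completion.
reqs = [CompletedRequest(end_timestamp=float(t)) for t in range(10)]
kept, dur = trimmed_duration(reqs, drop_ratio=0.2)
print(len(kept), dur)  # -> 8 7.0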
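With the flag wired through, passing for example --drop-ratio 0.1 sorts completed requests by end_timestamp, discards roughly the earliest 5% and latest 5% of them, and computes benchmark_duration from the first and last remaining completions instead of from benchmark_start_time, so warm-up and drain-down traffic no longer skew the reported throughput.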