From d682c97dd318361da7e30f3fe63cbfe77477e1af Mon Sep 17 00:00:00 2001 From: xiegegege <46314656+xiegegege@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:38:36 +0800 Subject: [PATCH] [benchmark]add lite-vl and x1 yaml (#4130) --- benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml | 1 + benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml | 1 + benchmarks/yaml/eb45-32k-wint2-tp4.yaml | 5 +++++ benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml | 1 + benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml | 1 + benchmarks/yaml/eb45-vl-128k-wint4-h800-tp8.yaml | 11 +++++++++++ benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml | 2 +- benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml | 2 +- benchmarks/yaml/eb45-vl-lite-32k-bf16-a800-tp1.yaml | 9 +++++++++ benchmarks/yaml/eb45-vl-lite-32k-wint4-a800-tp1.yaml | 10 ++++++++++ benchmarks/yaml/eb45-vl-lite-32k-wint8-a800-tp1.yaml | 10 ++++++++++ benchmarks/yaml/request_yaml/eb45-vl-128k.yaml | 1 + benchmarks/yaml/request_yaml/eb45-vl-32k.yaml | 1 + .../yaml/request_yaml/{x1.yaml => x1-128k.yaml} | 2 +- benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml | 6 ++++++ 15 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 benchmarks/yaml/eb45-32k-wint2-tp4.yaml create mode 100644 benchmarks/yaml/eb45-vl-128k-wint4-h800-tp8.yaml create mode 100644 benchmarks/yaml/eb45-vl-lite-32k-bf16-a800-tp1.yaml create mode 100644 benchmarks/yaml/eb45-vl-lite-32k-wint4-a800-tp1.yaml create mode 100644 benchmarks/yaml/eb45-vl-lite-32k-wint8-a800-tp1.yaml create mode 100644 benchmarks/yaml/request_yaml/eb45-vl-128k.yaml create mode 100644 benchmarks/yaml/request_yaml/eb45-vl-32k.yaml rename benchmarks/yaml/request_yaml/{x1.yaml => x1-128k.yaml} (86%) create mode 100644 benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml diff --git a/benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml b/benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml index 280f8e336..3667361e0 100644 --- a/benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml +++ b/benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml 
@@ -6,3 +6,4 @@ tensor_parallel_size: 8 max_num_batched_tokens: 4096 max_num_partial_prefills: 3 max_long_partial_prefills: 3 +quantization: wint4 diff --git a/benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml b/benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml index 280f8e336..bc458d1a5 100644 --- a/benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml +++ b/benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml @@ -6,3 +6,4 @@ tensor_parallel_size: 8 max_num_batched_tokens: 4096 max_num_partial_prefills: 3 max_long_partial_prefills: 3 +quantization: wint8 diff --git a/benchmarks/yaml/eb45-32k-wint2-tp4.yaml b/benchmarks/yaml/eb45-32k-wint2-tp4.yaml new file mode 100644 index 000000000..c82ea744d --- /dev/null +++ b/benchmarks/yaml/eb45-32k-wint2-tp4.yaml @@ -0,0 +1,5 @@ +max_model_len: 32768 +max_num_seqs: 256 +kv_cache_ratio: 0.75 +tensor_parallel_size: 4 +gpu_memory_utilization: 0.9 diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml index 985ef7a34..34de7cd76 100644 --- a/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml +++ b/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml @@ -13,3 +13,4 @@ pd_comm_port: "2334" max_num_batched_tokens: 384 max_num_partial_prefills: 3 max_long_partial_prefills: 3 +quantization: wint4 diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml index 2831838fd..cf4b4a51d 100644 --- a/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml +++ b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml @@ -10,3 +10,4 @@ engine_worker_queue_port: 6677 cache_transfer_protocol: "rdma,ipc" rdma_comm_ports: "7675,7676,7677,7678" pd_comm_port: "2333" +quantization: wint4 diff --git a/benchmarks/yaml/eb45-vl-128k-wint4-h800-tp8.yaml b/benchmarks/yaml/eb45-vl-128k-wint4-h800-tp8.yaml new file mode 100644 index 000000000..0c5f04494 --- /dev/null +++ b/benchmarks/yaml/eb45-vl-128k-wint4-h800-tp8.yaml @@ -0,0 +1,11 @@ +enable_mm: True +max_model_len: 131072 +max_num_seqs: 56 
+gpu_memory_utilization: 0.8 +kv_cache_ratio: 0.8 +tensor_parallel_size: 8 +quantization: wint4 +limit_mm_per_prompt: '{"image": 100, "video": 100}' +enable_chunked_prefill: True +max_num_batched_tokens: 384 +reasoning_parser: ernie-45-vl diff --git a/benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml b/benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml index 3c803e662..75e2df417 100644 --- a/benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml +++ b/benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml @@ -1,7 +1,7 @@ enable_mm: True max_model_len: 32768 max_num_seqs: 36 -gpu_memory_utilization: 0.95 +gpu_memory_utilization: 0.9 kv_cache_ratio: 0.8 tensor_parallel_size: 8 quantization: wint8 diff --git a/benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml b/benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml index ff9611f5d..41d7f1869 100644 --- a/benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml +++ b/benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml @@ -1,7 +1,7 @@ enable_mm: True max_model_len: 32768 max_num_seqs: 36 -gpu_memory_utilization: 0.8 +gpu_memory_utilization: 0.85 kv_cache_ratio: 0.8 tensor_parallel_size: 8 quantization: wint8 diff --git a/benchmarks/yaml/eb45-vl-lite-32k-bf16-a800-tp1.yaml b/benchmarks/yaml/eb45-vl-lite-32k-bf16-a800-tp1.yaml new file mode 100644 index 000000000..2a1b9148e --- /dev/null +++ b/benchmarks/yaml/eb45-vl-lite-32k-bf16-a800-tp1.yaml @@ -0,0 +1,9 @@ +enable_mm: True +max_model_len: 32768 +max_num_seqs: 128 +gpu_memory_utilization: 0.9 +kv_cache_ratio: 0.71 +tensor_parallel_size: 1 +enable_chunked_prefill: True +max_num_batched_tokens: 384 +reasoning_parser: ernie-45-vl diff --git a/benchmarks/yaml/eb45-vl-lite-32k-wint4-a800-tp1.yaml b/benchmarks/yaml/eb45-vl-lite-32k-wint4-a800-tp1.yaml new file mode 100644 index 000000000..ffa5ceac3 --- /dev/null +++ b/benchmarks/yaml/eb45-vl-lite-32k-wint4-a800-tp1.yaml @@ -0,0 +1,10 @@ +enable_mm: True +max_model_len: 32768 +max_num_seqs: 128 +gpu_memory_utilization: 0.9 +kv_cache_ratio: 0.71 
+tensor_parallel_size: 1 +enable_chunked_prefill: True +max_num_batched_tokens: 384 +quantization: wint4 +reasoning_parser: ernie-45-vl diff --git a/benchmarks/yaml/eb45-vl-lite-32k-wint8-a800-tp1.yaml b/benchmarks/yaml/eb45-vl-lite-32k-wint8-a800-tp1.yaml new file mode 100644 index 000000000..7a0d4a0c4 --- /dev/null +++ b/benchmarks/yaml/eb45-vl-lite-32k-wint8-a800-tp1.yaml @@ -0,0 +1,10 @@ +enable_mm: True +max_model_len: 32768 +max_num_seqs: 128 +gpu_memory_utilization: 0.9 +kv_cache_ratio: 0.71 +tensor_parallel_size: 1 +enable_chunked_prefill: True +max_num_batched_tokens: 384 +quantization: wint8 +reasoning_parser: ernie-45-vl diff --git a/benchmarks/yaml/request_yaml/eb45-vl-128k.yaml b/benchmarks/yaml/request_yaml/eb45-vl-128k.yaml new file mode 100644 index 000000000..2c6a5eb74 --- /dev/null +++ b/benchmarks/yaml/request_yaml/eb45-vl-128k.yaml @@ -0,0 +1 @@ +max_tokens: 131071 diff --git a/benchmarks/yaml/request_yaml/eb45-vl-32k.yaml b/benchmarks/yaml/request_yaml/eb45-vl-32k.yaml new file mode 100644 index 000000000..e2fb432b9 --- /dev/null +++ b/benchmarks/yaml/request_yaml/eb45-vl-32k.yaml @@ -0,0 +1 @@ +max_tokens: 12288 diff --git a/benchmarks/yaml/request_yaml/x1.yaml b/benchmarks/yaml/request_yaml/x1-128k.yaml similarity index 86% rename from benchmarks/yaml/request_yaml/x1.yaml rename to benchmarks/yaml/request_yaml/x1-128k.yaml index 73dc6a900..e02e466c7 100644 --- a/benchmarks/yaml/request_yaml/x1.yaml +++ b/benchmarks/yaml/request_yaml/x1-128k.yaml @@ -2,7 +2,7 @@ top_p: 0.95 temperature: 0.6 metadata: min_tokens: 1 -max_tokens: 65535 +max_tokens: 131071 repetition_penalty: 1.0 frequency_penalty: 0 presence_penalty: 0 diff --git a/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml b/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml new file mode 100644 index 000000000..c7629fd63 --- /dev/null +++ b/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml @@ -0,0 +1,6 @@ +tensor_parallel_size: 1 +max_model_len: 131072 +max_num_seqs: 32 +quantization: wint8 
+reasoning_parser: ernie_x1 +tool_call_parser: ernie_x1