From 97e340eb14c0e35d2f9e834afdde9385f751eb54 Mon Sep 17 00:00:00 2001
From: xiegegege <46314656+xiegegege@users.noreply.github.com>
Date: Mon, 15 Dec 2025 15:25:14 +0800
Subject: [PATCH] [CE]add pd router and wint4 tp4 config (#5554)

---
Review notes (this region, after the three-dash line and before the first
"diff --git", is not part of the commit message or of the applied change):
 * Adds three new benchmark YAML configs: one expert-parallel wint4 config and
   a decode/prefill pair distinguished by "splitwise_role". The pair uses
   distinct cache_queue_port, engine_worker_queue_port, rdma_comm_ports and
   pd_comm_port values.
 * NOTE(review): eb45-32k-wint4-ep4-tp4.yaml sets max_model_len: 8192 and
   tensor_parallel_size: 1 (with data_parallel_size: 4), which does not match
   the "32k" / "tp4" in its filename. Confirm this is intended.

 benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml           |  7 +++++++
 .../yaml/eb45-32k-wint4-tp4_decode_router.yaml        | 16 ++++++++++++++++
 .../yaml/eb45-32k-wint4-tp4_prefill_router.yaml       | 13 +++++++++++++
 3 files changed, 36 insertions(+)
 create mode 100644 benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml
 create mode 100644 benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml
 create mode 100644 benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml

diff --git a/benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml b/benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml
new file mode 100644
index 000000000..d05375caa
--- /dev/null
+++ b/benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml
@@ -0,0 +1,7 @@
+num_gpu_blocks_override: 1024
+max_model_len: 8192
+max_num_seqs: 64
+data_parallel_size: 4
+tensor_parallel_size: 1
+enable_expert_parallel: True
+quantization: wint4
diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml
new file mode 100644
index 000000000..34de7cd76
--- /dev/null
+++ b/benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml
@@ -0,0 +1,16 @@
+max_model_len: 32768
+max_num_seqs: 256
+gpu_memory_utilization: 0.9
+kv_cache_ratio: 0.8
+tensor_parallel_size: 4
+cache_queue_port: 55663
+enable_chunked_prefill: True
+splitwise_role: decode
+engine_worker_queue_port: 6678
+cache_transfer_protocol: "rdma,ipc"
+rdma_comm_ports: "7671,7672,7673,7674"
+pd_comm_port: "2334"
+max_num_batched_tokens: 384
+max_num_partial_prefills: 3
+max_long_partial_prefills: 3
+quantization: wint4
diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml
new file mode 100644
index 000000000..cf4b4a51d
--- /dev/null
+++ b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml
@@ -0,0 +1,13 @@
+max_model_len: 32768
+max_num_seqs: 16
+gpu_memory_utilization: 0.9
+kv_cache_ratio: 0.9
+tensor_parallel_size: 4
+splitwise_role: prefill
+enable_prefix_caching: True
+cache_queue_port: 55664
+engine_worker_queue_port: 6677
+cache_transfer_protocol: "rdma,ipc"
+rdma_comm_ports: "7675,7676,7677,7678"
+pd_comm_port: "2333"
+quantization: wint4