From 97e340eb14c0e35d2f9e834afdde9385f751eb54 Mon Sep 17 00:00:00 2001
From: xiegegege <46314656+xiegegege@users.noreply.github.com>
Date: Mon, 15 Dec 2025 15:25:14 +0800
Subject: [PATCH] [CE]add pd router and wint4 tp4 config (#5554)

---
 benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml      |  7 +++++++
 .../yaml/eb45-32k-wint4-tp4_decode_router.yaml   | 16 ++++++++++++++++
 .../yaml/eb45-32k-wint4-tp4_prefill_router.yaml  | 13 +++++++++++++
 3 files changed, 36 insertions(+)
 create mode 100644 benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml
 create mode 100644 benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml
 create mode 100644 benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml

diff --git a/benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml b/benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml
new file mode 100644
index 000000000..d05375caa
--- /dev/null
+++ b/benchmarks/yaml/eb45-32k-wint4-ep4-tp4.yaml
@@ -0,0 +1,7 @@
+num_gpu_blocks_override: 1024
+max_model_len: 8192  # NOTE(review): file name says "32k" but this is 8192 - confirm which is intended
+max_num_seqs: 64
+data_parallel_size: 4  # presumably the "ep4" of the file name (with enable_expert_parallel below) - verify
+tensor_parallel_size: 1  # NOTE(review): file name says "tp4" but this is 1 - confirm which is intended
+enable_expert_parallel: True
+quantization: wint4  # matches the "wint4" in the file name
diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml
new file mode 100644
index 000000000..34de7cd76
--- /dev/null
+++ b/benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml
@@ -0,0 +1,16 @@
+max_model_len: 32768
+max_num_seqs: 256
+gpu_memory_utilization: 0.9
+kv_cache_ratio: 0.8
+tensor_parallel_size: 4
+cache_queue_port: 55663
+enable_chunked_prefill: True  # NOTE(review): set here but absent from the prefill-role config - confirm intended
+splitwise_role: decode  # decode-role config; the *_prefill_router.yaml file in this directory sets "prefill"
+engine_worker_queue_port: 6678
+cache_transfer_protocol: "rdma,ipc"  # comma-separated list kept as one quoted string
+rdma_comm_ports: "7671,7672,7673,7674"  # 4 ports = tensor_parallel_size; presumably one per rank - verify
+pd_comm_port: "2334"  # quoted, so YAML loads a string (the other *_port keys here load as integers)
+max_num_batched_tokens: 384
+max_num_partial_prefills: 3
+max_long_partial_prefills: 3
+quantization: wint4
diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml
new file mode 100644
index 000000000..cf4b4a51d
--- /dev/null
+++ b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml
@@ -0,0 +1,13 @@
+max_model_len: 32768
+max_num_seqs: 16
+gpu_memory_utilization: 0.9
+kv_cache_ratio: 0.9
+tensor_parallel_size: 4
+splitwise_role: prefill  # prefill-role config; the *_decode_router.yaml file in this directory sets "decode"
+enable_prefix_caching: True
+cache_queue_port: 55664
+engine_worker_queue_port: 6677
+cache_transfer_protocol: "rdma,ipc"  # comma-separated list kept as one quoted string
+rdma_comm_ports: "7675,7676,7677,7678"  # 4 ports = tensor_parallel_size; presumably one per rank - verify
+pd_comm_port: "2333"  # quoted, so YAML loads a string (the other *_port keys here load as integers)
+quantization: wint4