diff --git a/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml b/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml
index 974c2eaf7..c71c247ee 100644
--- a/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml
+++ b/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml
@@ -1,6 +1,6 @@
 max_model_len: 32768
 max_num_seqs: 96
-gpu_memory_utilization: 0.85
+gpu_memory_utilization: 0.8
 kv_cache_ratio: 0.71
 tensor_parallel_size: 4
 quantization: wint4
diff --git a/benchmarks/yaml/qwen3-235b-32k-fp8-tp1-dp4_decode.yaml b/benchmarks/yaml/qwen3-235b-32k-fp8-tp1-dp4_decode.yaml
new file mode 100644
index 000000000..28acf9994
--- /dev/null
+++ b/benchmarks/yaml/qwen3-235b-32k-fp8-tp1-dp4_decode.yaml
@@ -0,0 +1,13 @@
+max_model_len: 32768
+max_num_seqs: 32
+data_parallel_size: 4
+tensor_parallel_size: 1
+enable_expert_parallel: True
+enable_prefix_caching: False
+splitwise_role: decode
+cache_transfer_protocol: "rdma"
+rdma_comm_ports: "7671,7672,7673,7674"
+pd_comm_port: "2335"
+engine_worker_queue_port: "4582,4583,4584,4585"
+graph_optimization_config:
+  use_cudagraph: False
diff --git a/benchmarks/yaml/qwen3-235b-32k-fp8-tp1-dp4_prefill.yaml b/benchmarks/yaml/qwen3-235b-32k-fp8-tp1-dp4_prefill.yaml
new file mode 100644
index 000000000..d9e879e72
--- /dev/null
+++ b/benchmarks/yaml/qwen3-235b-32k-fp8-tp1-dp4_prefill.yaml
@@ -0,0 +1,13 @@
+max_model_len: 32768
+max_num_seqs: 32
+data_parallel_size: 4
+tensor_parallel_size: 1
+enable_expert_parallel: True
+enable_prefix_caching: False
+splitwise_role: prefill
+cache_transfer_protocol: "rdma"
+rdma_comm_ports: "7675,7676,7677,7678"
+pd_comm_port: "2334"
+engine_worker_queue_port: "4368,4369,4360,4361"
+graph_optimization_config:
+  use_cudagraph: False