[PD Disaggregation] support DP via v1 router and decouple DP and EP (#5197)

* [fix] support DP via v1 router and decouple DP and EP

* [fix] fix scripts

* [fix] reset model path

* [fix] make DP use get_output_ep, fix router port type, update scripts

* [merge] merge with latest code

* [chore] remove some debug log

* [fix] fix code style check

* [fix] fix test_multi_api_server for log_dir name

* [chore] reduce logs

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Yonghua Li
2025-12-04 15:38:43 +08:00
committed by GitHub
parent 5cd17fd662
commit f4119d51b4
15 changed files with 394 additions and 146 deletions

View File

@@ -545,6 +545,7 @@ class ParallelConfig:
self.tensor_parallel_size = 1 # TP degree
self.expert_parallel_rank = 0 # EP rank ID
self.expert_parallel_size = 1 # EP degree
self.data_parallel_rank = 0 # DP rank ID
self.data_parallel_size = 1 # DP degree
self.enable_expert_parallel = False
self.enable_chunked_moe = False
@@ -1887,7 +1888,11 @@ class FDConfig:
engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[
self.parallel_config.local_data_parallel_id
]
-connector_port = self.cache_config.pd_comm_port[0] if self.cache_config.pd_comm_port else None
+connector_port = (
+self.cache_config.pd_comm_port[self.parallel_config.local_data_parallel_id]
+if self.cache_config.pd_comm_port
+else None
+)
self.disaggregate_info = {}
if self.scheduler_config.splitwise_role != "mixed":