[PD Disaggregation] support DP via v1 router and decouple DP and EP (#5197)

* [fix] support DP via v1 router and decouple DP and EP
* [fix] fix scripts
* [fix] reset model path
* [fix] dp use get_output_ep, fix router port type, update scripts
* [merge] merge with latest code
* [chore] remove some debug log
* [fix] fix code style check
* [fix] fix test_multi_api_server for log_dir name
* [chore] reduce logs
* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-12-24 13:28:13 +08:00 · 2025-12-04 15:38:43 +08:00
parent 5cd17fd662
commit f4119d51b4
15 changed files with 394 additions and 146 deletions
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -545,6 +545,7 @@ class ParallelConfig:
        self.tensor_parallel_size = 1  # TP degree
        self.expert_parallel_rank = 0  # EP rank ID
        self.expert_parallel_size = 1  # EP degree
+        self.data_parallel_rank = 0  # DP rank ID
        self.data_parallel_size = 1  # DP degree
        self.enable_expert_parallel = False
        self.enable_chunked_moe = False
@@ -1887,7 +1888,11 @@ class FDConfig:
            engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[
                self.parallel_config.local_data_parallel_id
            ]
-        connector_port = self.cache_config.pd_comm_port[0] if self.cache_config.pd_comm_port else None
+        connector_port = (
+            self.cache_config.pd_comm_port[self.parallel_config.local_data_parallel_id]
+            if self.cache_config.pd_comm_port
+            else None
+        )

        self.disaggregate_info = {}
        if self.scheduler_config.splitwise_role != "mixed":