[Optimize] Support and robustness for tpN in PD (#4595)

* [Optimize] Support and robustness for tpN in PD

* fix

* fix

* support dpM tpN for cache messager

* fix

* fix token counter

* fix bug from merging develop

* fix bug

* make cache messager robust for v0
Author: chenjian
Date: 2025-11-03 15:38:31 +08:00
Committed by: GitHub
Parent: 7b35488779
Commit: 25498efcf3

9 changed files with 452 additions and 197 deletions


@@ -177,7 +177,7 @@ class PaddleDisWorkerProc:
         self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
         if self.parallel_config.data_parallel_size > 1 and not envs.FD_ENABLE_MULTI_API_SERVER:
             launched_expert_service_signal_data = np.zeros(
-                shape=[min(self.parallel_config.data_parallel_size, self.max_chips_per_node)], dtype=np.int32
+                shape=[self.parallel_config.data_parallel_size // self.fd_config.nnode], dtype=np.int32
             )
             self.launched_expert_service_signal = IPCSignal(
                 name="launched_expert_service_signal",
@@ -186,7 +186,12 @@ class PaddleDisWorkerProc:
                 suffix=self.parallel_config.engine_pid,
                 create=False,
             )
-            while self.launched_expert_service_signal.value[self.local_rank % self.max_chips_per_node] == 0:
+            while (
+                self.launched_expert_service_signal.value[
+                    self.parallel_config.local_data_parallel_id % self.max_chips_per_node
+                ]
+                == 0
+            ):
                 pass
         # init worker_ready_signal
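
The polling index changes for the same reason: with tensor parallelism, tp_size workers share one DP group, so indexing the per-group array by local_rank would send most ranks to the wrong slot. A hedged sketch of the old and new schemes, assuming TP ranks of a group sit contiguously on a node (that layout is my assumption, not stated in the diff):

MAX_CHIPS_PER_NODE = 8  # mirrors self.max_chips_per_node on non-Iluvatar hardware

def old_index(local_rank: int) -> int:
    # Pre-fix: one slot per rank; off-target once tp_size > 1, because the
    # signal array is now sized per DP group (see the first hunk).
    return local_rank % MAX_CHIPS_PER_NODE

def new_index(local_rank: int, tp_size: int) -> int:
    # Post-fix: every rank of a DP group polls the same slot.
    local_dp_id = local_rank // tp_size  # assumed contiguous TP layout
    return local_dp_id % MAX_CHIPS_PER_NODE

# tp=4 on one 8-chip node: ranks 0-3 form DP group 0, ranks 4-7 form group 1.
for rank in range(8):
    print(rank, old_index(rank), new_index(rank, tp_size=4))
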
@@ -568,7 +573,7 @@ class PaddleDisWorkerProc:
             is_server=False,
             num_client=self.parallel_config.tensor_parallel_size,
             client_id=self.parallel_config.tensor_parallel_rank,
-            local_data_parallel_id=self.parallel_config.data_parallel_rank,
+            local_data_parallel_id=self.parallel_config.local_data_parallel_id,
         )

     def load_model(self) -> None:
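
The last hunk keys the task-queue client on the node-local DP id instead of the global data-parallel rank; the two diverge as soon as DP groups span more than one node. An illustrative mapping, assuming DP groups are assigned to nodes in contiguous blocks (this helper is hypothetical, not part of the PR):

def local_data_parallel_id(data_parallel_rank: int, dp_size: int, nnode: int) -> int:
    # Node-local id of a DP group under contiguous block assignment.
    groups_per_node = dp_size // nnode
    return data_parallel_rank % groups_per_node

# dp=4 over 2 nodes: global ranks 0..3 map to node-local ids [0, 1, 0, 1], so a
# per-node queue addressed by the global rank would miss for ranks 2 and 3.
print([local_data_parallel_id(r, dp_size=4, nnode=2) for r in range(4)])
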