[Sync Code] develop to release/2.0.3 (#2873)

* [LLM] support send batch data and aggregate data (#2860)

* [LLM] support send batch data and aggregate data

* [LLM] fix ci bugs

* [LLM] fix ci bugs

* [LLM] fix ci bugs

* [LLM] fix ci bugs

* [LLM] update

* [LLM] Update Multinode Deployment (#2830)

* [LLM] fix multinode bugs

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] fix ci bugs

* Update fastdeploy/engine/args_utils.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* [LLM] update random port

* [LLM] update random port

* [LLM] fix ci bugs

* fix ci bugs

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Jiang-Jia-Jun
2025-07-16 23:44:26 +08:00
committed by GitHub
parent 63d6e7ce06
commit 09d0073fdc
18 changed files with 375 additions and 264 deletions

View File

@@ -136,9 +136,9 @@ class PaddleDisWorkerProc():
model_weights_status:
"""
# init worker_ready_signal
max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
array_size = min(
max_chips_per_node, self.parallel_config.tensor_parallel_size *
self.max_chips_per_node, self.parallel_config.tensor_parallel_size *
self.parallel_config.expert_parallel_size)
workers_ready = np.zeros(shape=[array_size], dtype=np.int32)
self.worker_ready_signal = IPCSignal(
@@ -148,10 +148,10 @@ class PaddleDisWorkerProc():
suffix=self.parallel_config.engine_pid,
create=False)
self.worker_ready_signal.value[self.local_rank %
max_chips_per_node] = 1
self.max_chips_per_node] = 1
# init worker_healthy_live_signal
workers_alive = np.zeros(shape=[self.ranks], dtype=np.int32)
workers_alive = np.zeros(shape=[array_size], dtype=np.int32)
self.worker_healthy_live_signal = IPCSignal(
name="worker_healthy_live_signal",
array=workers_alive,
@@ -205,7 +205,7 @@ class PaddleDisWorkerProc():
Tmp loop function for ep utill DP is supported
"""
while True:
self.worker_healthy_live_signal.value[self.local_rank] = int(
self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(
time.time())
if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks(