mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
[LLM] Update Multinode Deployment (#2830)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* [LLM] fix multinode bugs * [LLM] update multinode deployment * [LLM] update multinode deployment * [LLM] update multinode deployment * [LLM] update multinode deployment * [LLM] update multinode deployment * [LLM] fix ci bugs * Update fastdeploy/engine/args_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * [LLM] update random port * [LLM] update random port * [LLM] fix ci bugs * fix ci bugs --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -136,9 +136,9 @@ class PaddleDisWorkerProc():
|
||||
model_weights_status:
|
||||
"""
|
||||
# init worker_ready_signal
|
||||
max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
|
||||
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
|
||||
array_size = min(
|
||||
max_chips_per_node, self.parallel_config.tensor_parallel_size *
|
||||
self.max_chips_per_node, self.parallel_config.tensor_parallel_size *
|
||||
self.parallel_config.expert_parallel_size)
|
||||
workers_ready = np.zeros(shape=[array_size], dtype=np.int32)
|
||||
self.worker_ready_signal = IPCSignal(
|
||||
@@ -148,10 +148,10 @@ class PaddleDisWorkerProc():
|
||||
suffix=self.parallel_config.engine_pid,
|
||||
create=False)
|
||||
self.worker_ready_signal.value[self.local_rank %
|
||||
max_chips_per_node] = 1
|
||||
self.max_chips_per_node] = 1
|
||||
|
||||
# init worker_healthy_live_signal
|
||||
workers_alive = np.zeros(shape=[self.ranks], dtype=np.int32)
|
||||
workers_alive = np.zeros(shape=[array_size], dtype=np.int32)
|
||||
self.worker_healthy_live_signal = IPCSignal(
|
||||
name="worker_healthy_live_signal",
|
||||
array=workers_alive,
|
||||
@@ -205,7 +205,7 @@ class PaddleDisWorkerProc():
|
||||
Tmp loop function for ep utill DP is supported
|
||||
"""
|
||||
while True:
|
||||
self.worker_healthy_live_signal.value[self.local_rank] = int(
|
||||
self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(
|
||||
time.time())
|
||||
|
||||
if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks(
|
||||
|
Reference in New Issue
Block a user