This reverts commit b0f1e0eef4.
@@ -143,7 +143,7 @@ class PaddleDisWorkerProc():
         # Initialize task queue
         task_address = (self.parallel_config.pod_ip,
                         self.parallel_config.engine_worker_queue_port)
-        self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
+
         self.task_queue = TaskQueue(
             address=task_address,
             is_server=False,
@@ -162,6 +162,7 @@ class PaddleDisWorkerProc():
             model_weights_status:
         """
         # init worker_ready_signal
+        self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
         array_size = min(
             self.max_chips_per_node, self.parallel_config.tensor_parallel_size *
             self.parallel_config.expert_parallel_size)
@@ -182,9 +183,9 @@ class PaddleDisWorkerProc():
             array=workers_alive,
             dtype=np.int32,
             suffix=self.parallel_config.engine_pid,
-            create=False,
-        )
-        self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(time.time())
+            create=False)
+        self.worker_healthy_live_signal.value[self.local_rank % 8] = int(
+            time.time())
 
         # init model_weights_status
         workers_model_weights = np.zeros(shape=[1], dtype=np.int32)
@@ -270,7 +271,8 @@ class PaddleDisWorkerProc():
             paddle.distributed.barrier()
 
             self.insert_step = False
-            self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(time.time())
+            self.worker_healthy_live_signal.value[self.local_rank] = int(
+                time.time())
 
             # The first worker detects whether there are tasks in the task queue
             if self.local_rank % mp_num_per_node == 0:
@@ -386,7 +388,7 @@ class PaddleDisWorkerProc():
                 suffix=self.parallel_config.engine_pid,
                 create=False)
             self.get_profile_block_num_signal.value[
-                self.local_rank % self.max_chips_per_node] = num_blocks_local
+                self.local_rank] = num_blocks_local
 
             # Wait all worker send the signal
             while np.any(self.get_profile_block_num_signal.value <= 0):
@@ -394,7 +396,7 @@ class PaddleDisWorkerProc():
             num_blocks_global = self.get_profile_block_num_signal.value.min(
             ).item()
             self.get_profile_block_num_signal.value[
-                self.local_rank % self.max_chips_per_node] = num_blocks_global
+                self.local_rank] = num_blocks_global
         else:
             num_blocks_global = self.fd_config.parallel_config.total_block_num
         # NOTE(liuzichang): Too big num_blocks_global will lead to error 700
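
All six hunks touch the same shared-memory signalling code: each worker stamps a heartbeat slot with int(time.time()) (indexed by local_rank % 8, or plain local_rank, after the revert), and each worker publishes its locally profiled KV-cache block count, which is then min-reduced once every rank has reported. The sketch below illustrates that pattern in isolation. It is not the FastDeploy implementation: plain NumPy arrays stand in for the shared signal buffers, and the parallel sizes, ranks and block counts are made-up demo values.

# Illustrative sketch only. Plain NumPy arrays stand in for the shared-memory
# signal buffers used by the worker process; the configuration values below
# are hypothetical, not FastDeploy defaults.
import time

import numpy as np

max_chips_per_node = 8                 # the revert returns to the hard-coded 8
tensor_parallel_size = 4               # hypothetical parallel configuration
expert_parallel_size = 1
array_size = min(max_chips_per_node,
                 tensor_parallel_size * expert_parallel_size)

# Heartbeat: each worker stamps its own slot with the current wall-clock time.
worker_healthy_live_signal = np.zeros(shape=[array_size], dtype=np.int32)

# Block-number negotiation: each worker publishes its locally profiled
# KV-cache block count, then everyone adopts the global minimum.
get_profile_block_num_signal = np.zeros(shape=[array_size], dtype=np.int32)


def heartbeat(local_rank: int) -> None:
    # After the revert the slot index is local_rank % 8 rather than
    # local_rank % max_chips_per_node.
    worker_healthy_live_signal[local_rank % 8] = int(time.time())


def publish_local_blocks(local_rank: int, num_blocks_local: int) -> None:
    get_profile_block_num_signal[local_rank] = num_blocks_local


def reduce_to_global_min(local_rank: int) -> int:
    # The real worker spins here until every rank has written a positive value.
    while np.any(get_profile_block_num_signal <= 0):
        time.sleep(0.01)
    num_blocks_global = get_profile_block_num_signal.min().item()
    get_profile_block_num_signal[local_rank] = num_blocks_global
    return num_blocks_global


if __name__ == "__main__":
    for rank, blocks in enumerate([1200, 1180, 1250, 1210]):
        heartbeat(rank)
        publish_local_blocks(rank, blocks)
    print(reduce_to_global_min(0))     # -> 1180, the per-node minimum

In the actual worker each rank runs in its own OS process and the buffers live in shared memory, so the busy-wait in reduce_to_global_min is what synchronizes the ranks before the minimum block count is adopted.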