use dist.all_reduce(min) to sync num_blocks_local (#2933)

* pre-commit all files check

* reduce min num_blocks_local

* fix nranks=1

* pre-commit when commit-msg
Yuanle Liu authored 2025-07-21 16:23:36 +08:00 · committed by GitHub
parent 67990e0572 · commit 2f74e93d7e
9 changed files with 71 additions and 66 deletions
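The core of the change: instead of every worker writing its profiled block count into a per-rank IPCSignal array and polling for the minimum, each rank now joins a collective dist.all_reduce with ReduceOp.MIN, so all ranks end up holding the same (smallest) num_blocks_local, and the collective is skipped entirely when there is only one rank (the "fix nranks=1" bullet). Below is a minimal sketch of that pattern; it is not FastDeploy source. The script name, the profile_num_blocks() stub, and the launch command are illustrative assumptions; only the paddle.full / dist.all_reduce(op=ReduceOp.MIN) calls mirror the diff.

# sync_blocks_sketch.py -- illustrative sketch (not FastDeploy code) of syncing a
# per-rank block count with all_reduce(MIN), guarded for single-rank runs.
import paddle
import paddle.distributed as dist


def profile_num_blocks() -> int:
    # Hypothetical stand-in for the memory-profiling step that yields
    # num_blocks_local on each rank; here ranks simply report different numbers.
    return 1000 + dist.get_rank() * 7


def sync_num_blocks(num_blocks_local: int, nranks: int) -> int:
    if num_blocks_local <= 0:
        raise ValueError(
            "The number of KV-cache blocks must be positive; "
            "increase gpu_memory_utilization or decrease max_num_batched_tokens"
        )
    if nranks > 1:
        # Wrap the scalar in a 1-element tensor so it can join the collective,
        # then take the minimum over all ranks; every rank gets the same result.
        blocks = paddle.full(shape=[1], fill_value=num_blocks_local, dtype="int32")
        dist.all_reduce(blocks, op=dist.ReduceOp.MIN)
        num_blocks_local = int(blocks.item())
    # With nranks == 1 the local value is already the global minimum, so no
    # collective is issued (the single-rank case this commit fixes).
    return num_blocks_local


if __name__ == "__main__":
    nranks = dist.get_world_size()
    if nranks > 1:
        dist.init_parallel_env()
    blocks = sync_num_blocks(profile_num_blocks(), nranks)
    if dist.get_rank() == 0:
        print(f"synced num_blocks: {blocks}")

Launched with python -m paddle.distributed.launch --gpus 0,1 sync_blocks_sketch.py, every rank computes the same minimum, which is what lets a single rank publish the value downstream.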

@@ -375,42 +375,33 @@ class PaddleDisWorkerProc:
logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------")
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")
logger.info(f"self.fd_config.parallel_config.do_profile:{self.fd_config.parallel_config.do_profile}")
# 3. Send IPCSignal
get_profile_block_num = np.zeros(shape=[self.ranks], dtype=np.int32)
self.get_profile_block_num_signal = IPCSignal(
name="get_profile_block_num",
array=get_profile_block_num,
dtype=np.int32,
suffix=self.parallel_config.engine_pid,
create=False,
)
self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_local
# Wait all worker send the signal
while np.any(self.get_profile_block_num_signal.value <= 0):
time.sleep(0.01)
num_blocks_global = self.get_profile_block_num_signal.value.min().item()
if num_blocks_global < 0:
logger.error(
"The total number of blocks cannot be less than zero."
"Please increase gpu_memory_utilization"
"Or decrease max_num_batched_tokens(max model length) "
)
if num_blocks_local <= 0:
raise ValueError(
"The total number of blocks cannot be less than zero."
"Please increase gpu_memory_utilization"
"Or decrease max_num_batched_tokens(max model length) "
)
self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_global
if self.ranks > 1:
num_blocks_local = paddle.full(shape=[1], fill_value=num_blocks_local, dtype="int32")
dist.all_reduce(num_blocks_local, op=dist.ReduceOp.MIN)
num_blocks_local = num_blocks_local.item()
if self.local_rank == 0:
# 3. Send IPCSignal
get_profile_block_num = np.zeros(shape=[1], dtype=np.int32)
self.get_profile_block_num_signal = IPCSignal(
name="get_profile_block_num",
array=get_profile_block_num,
dtype=np.int32,
suffix=self.parallel_config.engine_pid,
create=False,
)
self.get_profile_block_num_signal.value[0] = num_blocks_local
else:
num_blocks_global = self.fd_config.parallel_config.total_block_num
# NOTE(liuzichang): Too big num_blocks_global will lead to error 700
num_blocks_local = self.fd_config.parallel_config.total_block_num
# 4. Updata share inputs
self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)
self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_local)
def init_device(self) -> None:
"""Initialize device and Construct model runner"""