use dist.all_reduce(min) to sync num_blocks_local (#2933)
* pre-commit all files check
* reduce min num_blocks_local
* fix nranks=1
* pre-commit when commit-msg
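The core of the change is replacing a shared-memory rendezvous with a single collective: every rank contributes the block count it measured during memory profiling, and `dist.all_reduce` with `ReduceOp.MIN` leaves all ranks holding the smallest value, so no rank allocates more KV-cache blocks than the most memory-constrained rank can hold. Below is a minimal, self-contained sketch of that pattern (here `dist` is `paddle.distributed`, as in the diff; the group size and the per-rank `num_blocks_local` values are made up for illustration):

```python
# Minimal sketch of the all_reduce(MIN) synchronization used in this commit.
# Launch with: python -m paddle.distributed.launch --gpus "0,1" this_file.py
# The per-rank block counts below are illustrative, not FastDeploy's values.
import paddle
import paddle.distributed as dist

dist.init_parallel_env()

# Pretend each rank profiled a different amount of free memory.
num_blocks_local = 1000 + 100 * dist.get_rank()  # hypothetical per-rank result

# Same steps as the commit: wrap the scalar in a tensor, reduce with MIN,
# then unwrap it so every rank holds the global minimum.
t = paddle.full(shape=[1], fill_value=num_blocks_local, dtype="int32")
dist.all_reduce(t, op=dist.ReduceOp.MIN)
num_blocks_local = t.item()

print(f"rank {dist.get_rank()}: num_blocks_local = {num_blocks_local}")
```

Note the `if self.ranks > 1` guard in the diff: with a single rank there is no process group to reduce over, which is the "fix nranks=1" item in the commit message.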
```diff
@@ -375,42 +375,33 @@ class PaddleDisWorkerProc:
             logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------")
             logger.info(f"------- num_blocks_local:{num_blocks_local} --------")
 
             logger.info(f"self.fd_config.parallel_config.do_profile:{self.fd_config.parallel_config.do_profile}")
 
-            # 3. Send IPCSignal
-            get_profile_block_num = np.zeros(shape=[self.ranks], dtype=np.int32)
-            self.get_profile_block_num_signal = IPCSignal(
-                name="get_profile_block_num",
-                array=get_profile_block_num,
-                dtype=np.int32,
-                suffix=self.parallel_config.engine_pid,
-                create=False,
-            )
-            self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_local
-
-            # Wait all worker send the signal
-            while np.any(self.get_profile_block_num_signal.value <= 0):
-                time.sleep(0.01)
-            num_blocks_global = self.get_profile_block_num_signal.value.min().item()
-
-            if num_blocks_global < 0:
-                logger.error(
-                    "The total number of blocks cannot be less than zero."
-                    "Please increase gpu_memory_utilization"
-                    "Or decrease max_num_batched_tokens(max model length) "
-                )
-            self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_global
+            if num_blocks_local <= 0:
+                raise ValueError(
+                    "The total number of blocks cannot be less than zero."
+                    "Please increase gpu_memory_utilization"
+                    "Or decrease max_num_batched_tokens(max model length) "
+                )
+
+            if self.ranks > 1:
+                num_blocks_local = paddle.full(shape=[1], fill_value=num_blocks_local, dtype="int32")
+                dist.all_reduce(num_blocks_local, op=dist.ReduceOp.MIN)
+                num_blocks_local = num_blocks_local.item()
+
+            if self.local_rank == 0:
+                # 3. Send IPCSignal
+                get_profile_block_num = np.zeros(shape=[1], dtype=np.int32)
+                self.get_profile_block_num_signal = IPCSignal(
+                    name="get_profile_block_num",
+                    array=get_profile_block_num,
+                    dtype=np.int32,
+                    suffix=self.parallel_config.engine_pid,
+                    create=False,
+                )
+                self.get_profile_block_num_signal.value[0] = num_blocks_local
         else:
-            num_blocks_global = self.fd_config.parallel_config.total_block_num
+            # NOTE(liuzichang): Too big num_blocks_global will lead to error 700
+            num_blocks_local = self.fd_config.parallel_config.total_block_num
         # 4. Updata share inputs
-        self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)
+        self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_local)
 
     def init_device(self) -> None:
         """Initialize device and Construct model runner"""
```
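Before this change, the same agreement was reached through shared memory: each rank wrote its local count into its own slot of the `get_profile_block_num` IPC array, busy-waited until every slot was positive, and then took the minimum. The sketch below mimics that removed rendezvous with a plain `multiprocessing.Array` rather than FastDeploy's `IPCSignal` (the rank count, slot layout, and block values here are illustrative):

```python
# Sketch of the rendezvous the commit removes, mimicked with a shared array
# instead of FastDeploy's IPCSignal. Each "rank" writes its local block count
# into its own slot, then spins until every slot is positive and takes the
# minimum -- the same result dist.all_reduce(op=ReduceOp.MIN) now computes
# in one collective call.
import time
import multiprocessing as mp

import numpy as np


def worker(rank: int, shared) -> None:
    num_blocks_local = 1000 + 100 * rank  # hypothetical profiling result
    slots = np.frombuffer(shared.get_obj(), dtype=np.int32)
    slots[rank] = num_blocks_local

    # Busy-wait until all workers have reported,
    # like the old `while np.any(... <= 0)` loop.
    while np.any(slots <= 0):
        time.sleep(0.01)
    num_blocks_global = int(slots.min())
    print(f"rank {rank}: num_blocks_global = {num_blocks_global}")


if __name__ == "__main__":
    ranks = 4
    shared = mp.Array("i", [0] * ranks)  # one slot per rank, like shape=[self.ranks]
    procs = [mp.Process(target=worker, args=(r, shared)) for r in range(ranks)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
```

With the collective doing this work, the IPC array shrinks from `shape=[self.ranks]` to `shape=[1]`, the busy-wait disappears, and only rank 0 publishes the agreed value back to the engine process.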