use dist.all_reduce(min) to sync num_blocks_local (#2933)

* pre-commit all files check

* reduce min num_blocks_local

* fix nranks=1

* pre-commit when commit-msg
Yuanle Liu authored 2025-07-21 16:23:36 +08:00 · committed by GitHub
parent 67990e0572 · commit 2f74e93d7e
9 changed files with 71 additions and 66 deletions
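The core of the change: instead of every worker writing its profiled block count into a per-rank IPCSignal array and polling for the minimum, each rank now joins a collective dist.all_reduce with ReduceOp.MIN, so all ranks end up holding the same (smallest) num_blocks_local, and the collective is skipped entirely when there is only one rank (the "fix nranks=1" bullet). Below is a minimal sketch of that pattern; it is not FastDeploy source. The script name, the profile_num_blocks() stub, and the launch command are illustrative assumptions; only the paddle.full / dist.all_reduce(op=ReduceOp.MIN) calls mirror the diff.

# sync_blocks_sketch.py -- illustrative sketch (not FastDeploy code) of syncing a
# per-rank block count with all_reduce(MIN), guarded for single-rank runs.
import paddle
import paddle.distributed as dist


def profile_num_blocks() -> int:
    # Hypothetical stand-in for the memory-profiling step that yields
    # num_blocks_local on each rank; here ranks simply report different numbers.
    return 1000 + dist.get_rank() * 7


def sync_num_blocks(num_blocks_local: int, nranks: int) -> int:
    if num_blocks_local <= 0:
        raise ValueError(
            "The number of KV-cache blocks must be positive; "
            "increase gpu_memory_utilization or decrease max_num_batched_tokens"
        )
    if nranks > 1:
        # Wrap the scalar in a 1-element tensor so it can join the collective,
        # then take the minimum over all ranks; every rank gets the same result.
        blocks = paddle.full(shape=[1], fill_value=num_blocks_local, dtype="int32")
        dist.all_reduce(blocks, op=dist.ReduceOp.MIN)
        num_blocks_local = int(blocks.item())
    # With nranks == 1 the local value is already the global minimum, so no
    # collective is issued (the single-rank case this commit fixes).
    return num_blocks_local


if __name__ == "__main__":
    nranks = dist.get_world_size()
    if nranks > 1:
        dist.init_parallel_env()
    blocks = sync_num_blocks(profile_num_blocks(), nranks)
    if dist.get_rank() == 0:
        print(f"synced num_blocks: {blocks}")

Launched with python -m paddle.distributed.launch --gpus 0,1 sync_blocks_sketch.py, every rank computes the same minimum, which is what lets a single rank publish the value downstream.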

@@ -375,42 +375,33 @@ class PaddleDisWorkerProc:
logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------")
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")
logger.info(f"self.fd_config.parallel_config.do_profile:{self.fd_config.parallel_config.do_profile}")
# 3. Send IPCSignal
get_profile_block_num = np.zeros(shape=[self.ranks], dtype=np.int32)
self.get_profile_block_num_signal = IPCSignal(
name="get_profile_block_num",
array=get_profile_block_num,
dtype=np.int32,
suffix=self.parallel_config.engine_pid,
create=False,
)
self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_local
# Wait all worker send the signal
while np.any(self.get_profile_block_num_signal.value <= 0):
time.sleep(0.01)
num_blocks_global = self.get_profile_block_num_signal.value.min().item()
if num_blocks_global < 0:
logger.error(
"The total number of blocks cannot be less than zero."
"Please increase gpu_memory_utilization"
"Or decrease max_num_batched_tokens(max model length) "
)
if num_blocks_local <= 0:
raise ValueError(
"The total number of blocks cannot be less than zero."
"Please increase gpu_memory_utilization"
"Or decrease max_num_batched_tokens(max model length) "
)
self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_global
if self.ranks > 1:
num_blocks_local = paddle.full(shape=[1], fill_value=num_blocks_local, dtype="int32")
dist.all_reduce(num_blocks_local, op=dist.ReduceOp.MIN)
num_blocks_local = num_blocks_local.item()
if self.local_rank == 0:
# 3. Send IPCSignal
get_profile_block_num = np.zeros(shape=[1], dtype=np.int32)
self.get_profile_block_num_signal = IPCSignal(
name="get_profile_block_num",
array=get_profile_block_num,
dtype=np.int32,
suffix=self.parallel_config.engine_pid,
create=False,
)
self.get_profile_block_num_signal.value[0] = num_blocks_local
else:
num_blocks_global = self.fd_config.parallel_config.total_block_num
# NOTE(liuzichang): Too big num_blocks_global will lead to error 700
num_blocks_local = self.fd_config.parallel_config.total_block_num
# 4. Updata share inputs
self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)
self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_local)
def init_device(self) -> None:
"""Initialize device and Construct model runner"""