Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
Modified to support custom all reduce by default (#3538)
@@ -278,7 +278,7 @@ class ParallelConfig:
         self.disable_any_whitespace: bool = True
         self.pod_ip: str = None
         # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
-        self.enable_custom_all_reduce: bool = False
+        self.disable_custom_all_reduce: bool = False
         for key, value in args.items():
             if hasattr(self, key):
                 setattr(self, key, value)
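A minimal, self-contained sketch of the pattern in this hunk (the class below is an illustrative stand-in, not the full FastDeploy ParallelConfig): the new flag defaults to False, so custom all-reduce is on unless the caller explicitly passes disable_custom_all_reduce=True, which the hasattr/setattr loop then copies onto the config.

    class ParallelConfigSketch:
        def __init__(self, args: dict):
            # Default False -> custom all-reduce kernel enabled by default.
            self.disable_custom_all_reduce: bool = False
            # Same pattern as the hunk: only known attributes are overridden.
            for key, value in args.items():
                if hasattr(self, key):
                    setattr(self, key, value)

    cfg = ParallelConfigSketch({"disable_custom_all_reduce": True})
    print(cfg.disable_custom_all_reduce)  # True -> fall back to NCCL dist.all_reduce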
@@ -188,7 +188,7 @@ class EngineArgs:
     Flag to enable prefix caching.
     """

-    enable_custom_all_reduce: bool = False
+    disable_custom_all_reduce: bool = False
     """
     Flag to enable the custom all-reduce kernel.
     """
@@ -571,10 +571,10 @@ class EngineArgs:
             help="Degree of tensor parallelism.",
         )
         parallel_group.add_argument(
-            "--enable-custom-all-reduce",
+            "--disable-custom-all-reduce",
             action="store_true",
-            default=EngineArgs.enable_custom_all_reduce,
-            help="Flag to enable custom all-reduce.",
+            default=EngineArgs.disable_custom_all_reduce,
+            help="Flag to disable custom all-reduce.",
         )
         parallel_group.add_argument(
             "--max-num-seqs",
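A hedged sketch of the new CLI semantics using the standard-library argparse (the parser construction below is illustrative, not FastDeploy's actual EngineArgs wiring): because the option uses action="store_true" with a False default, custom all-reduce stays enabled unless --disable-custom-all-reduce is passed.

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--disable-custom-all-reduce",
        action="store_true",
        default=False,  # mirrors EngineArgs.disable_custom_all_reduce
        help="Flag to disable custom all-reduce.",
    )

    print(parser.parse_args([]).disable_custom_all_reduce)  # False: custom all-reduce stays on
    print(parser.parse_args(["--disable-custom-all-reduce"]).disable_custom_all_reduce)  # True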
@@ -947,10 +947,6 @@ class EngineArgs:
         early_stop_cfg = self.create_early_stop_config()
         early_stop_cfg.update_enable_early_stop(self.enable_early_stop)

-        assert not (
-            self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce
-        ), "enable_custom_all_reduce must be used with tensor_parallel_size>1"
-
         assert is_port_available(
             "0.0.0.0", self.engine_worker_queue_port
         ), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use."
@@ -1118,7 +1118,7 @@ class LLMEngine:
             "do_profile": self.do_profile,
             "dynamic_load_weight": self.cfg.load_config.dynamic_load_weight,
             "disable_any_whitespace": self.cfg.disable_any_whitespace,
-            "enable_custom_all_reduce": self.cfg.parallel_config.enable_custom_all_reduce,
+            "disable_custom_all_reduce": self.cfg.parallel_config.disable_custom_all_reduce,
             "enable_logprob": self.cfg.model_config.enable_logprob,
         }
         for worker_flag, value in worker_append_flag.items():
@@ -69,7 +69,7 @@ class GpuWorker(WorkerBase):
         gc.collect()
         paddle.device.cuda.empty_cache()
         if (
-            self.parallel_config.enable_custom_all_reduce
+            not self.parallel_config.disable_custom_all_reduce
             and self.parallel_config.tensor_parallel_size > 1
             and paddle.is_compiled_with_cuda()
         ):
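A hedged restatement of the gate above as a standalone predicate (parameter names are illustrative stand-ins for the parallel_config fields): with the inverted flag, the custom all-reduce path is taken by default and is skipped only when it is explicitly disabled, tensor parallelism is not in use, or the Paddle build lacks CUDA.

    def use_custom_all_reduce(disable_custom_all_reduce: bool,
                              tensor_parallel_size: int,
                              compiled_with_cuda: bool) -> bool:
        # Mirrors the condition in GpuWorker after this change.
        return (
            not disable_custom_all_reduce
            and tensor_parallel_size > 1
            and compiled_with_cuda
        )

    assert use_custom_all_reduce(False, 8, True)        # default: enabled under TP
    assert not use_custom_all_reduce(True, 8, True)     # explicit opt-out
    assert not use_custom_all_reduce(False, 1, True)    # no tensor parallelism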
@@ -516,7 +516,7 @@ def parse_args():
        help="enable prefix cache",
    )
    parser.add_argument(
-        "--enable_custom_all_reduce",
+        "--disable_custom_all_reduce",
        action="store_true",
        help="enable custom all-reduce",
    )