Modified to support custom all-reduce by default (#3538)

This commit is contained in:
zhink
2025-08-22 16:59:05 +08:00
committed by GitHub
parent 27666ee586
commit df7c31012b
15 changed files with 18 additions and 30 deletions

View File

@@ -188,7 +188,7 @@ class EngineArgs:
Flag to enable prefix caching.
"""
enable_custom_all_reduce: bool = False
disable_custom_all_reduce: bool = False
"""
Flag to enable the custom all-reduce kernel.
"""
@@ -571,10 +571,10 @@ class EngineArgs:
help="Degree of tensor parallelism.",
)
parallel_group.add_argument(
"--enable-custom-all-reduce",
"--disable-custom-all-reduce",
action="store_true",
default=EngineArgs.enable_custom_all_reduce,
help="Flag to enable custom all-reduce.",
default=EngineArgs.disable_custom_all_reduce,
help="Flag to disable custom all-reduce.",
)
parallel_group.add_argument(
"--max-num-seqs",
@@ -947,10 +947,6 @@ class EngineArgs:
early_stop_cfg = self.create_early_stop_config()
early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
assert not (
self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce
), "enable_custom_all_reduce must be used with tensor_parallel_size>1"
assert is_port_available(
"0.0.0.0", self.engine_worker_queue_port
), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use."

View File

@@ -1118,7 +1118,7 @@ class LLMEngine:
"do_profile": self.do_profile,
"dynamic_load_weight": self.cfg.load_config.dynamic_load_weight,
"disable_any_whitespace": self.cfg.disable_any_whitespace,
"enable_custom_all_reduce": self.cfg.parallel_config.enable_custom_all_reduce,
"disable_custom_all_reduce": self.cfg.parallel_config.disable_custom_all_reduce,
"enable_logprob": self.cfg.model_config.enable_logprob,
}
for worker_flag, value in worker_append_flag.items():