[Feature] support custom all-reduce (#2758)

* [Feature] support custom all-reduce

* add vLLM-adapted implementation
zhink
2025-07-09 16:00:27 +08:00
committed by GitHub
parent be21ef5047
commit b89180f1cd
16 changed files with 1194 additions and 2 deletions


@@ -147,6 +147,12 @@ class EngineArgs:
"""
Flag to enable prefix caching.
"""
enable_custom_all_reduce: bool = False
"""
Flag to enable the custom all-reduce kernel.
"""
engine_worker_queue_port: int = 8002
"""
Port for worker queue communication.
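Together with the plumbing further down, the new field can also be set programmatically rather than via the CLI. A minimal usage sketch, assuming the import path and model directory shown here (both are hypothetical, not taken from this commit):

# Hypothetical usage sketch; import path and model path are assumptions.
from fastdeploy.engine.args_utils import EngineArgs

args = EngineArgs(
    model="/path/to/model",          # placeholder model directory
    tensor_parallel_size=2,          # custom all-reduce requires TP > 1
    enable_custom_all_reduce=True,   # the flag added in this commit
)
config = args.create_engine_config()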
@@ -421,6 +427,10 @@ class EngineArgs:
                                    type=int,
                                    default=EngineArgs.tensor_parallel_size,
                                    help="Degree of tensor parallelism.")
        parallel_group.add_argument("--enable-custom-all-reduce",
                                    action='store_true',
                                    default=EngineArgs.enable_custom_all_reduce,
                                    help="Flag to enable custom all-reduce.")
        parallel_group.add_argument(
            "--max-num-seqs",
            type=int,
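On the command line the switch is a plain store_true flag. A self-contained sketch of the same argparse pattern, standalone for illustration (the defaults here are stand-ins, not the engine's real ones):

import argparse

# Mirrors the pattern in the hunk above.
parser = argparse.ArgumentParser()
parallel_group = parser.add_argument_group("parallelism")
parallel_group.add_argument("--tensor-parallel-size", type=int, default=1,
                            help="Degree of tensor parallelism.")
parallel_group.add_argument("--enable-custom-all-reduce",
                            action='store_true',
                            default=False,
                            help="Flag to enable custom all-reduce.")

# e.g. a launch line ending in: --tensor-parallel-size 2 --enable-custom-all-reduce
args = parser.parse_args(["--tensor-parallel-size", "2",
                          "--enable-custom-all-reduce"])
assert args.enable_custom_all_reduce and args.tensor_parallel_size == 2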
@@ -733,6 +743,7 @@ class EngineArgs:
            tensor_parallel_size=self.tensor_parallel_size,
            enable_expert_parallel=self.enable_expert_parallel,
            data_parallel_size=self.data_parallel_size,
            enable_custom_all_reduce=self.enable_custom_all_reduce,
        )

    def create_engine_config(self) -> Config:
@@ -755,6 +766,9 @@ class EngineArgs:
        assert not (self.use_cudagraph and self.enable_prefix_caching), \
            "Prefix caching cannot be used with CUDA graph"
        assert not (self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce), \
            "enable_custom_all_reduce must be used with tensor_parallel_size > 1"

        return Config(
            model_name_or_path=self.model,
            model_config=model_cfg,
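The new assertion only sanity-checks the flag against the parallelism degree. A standalone sketch of the same check and how it fails (the wrapper function name is hypothetical):

# Standalone sketch of the validation above; the function name is hypothetical.
def check_custom_all_reduce(tensor_parallel_size: int,
                            enable_custom_all_reduce: bool) -> None:
    assert not (tensor_parallel_size <= 1 and enable_custom_all_reduce), \
        "enable_custom_all_reduce must be used with tensor_parallel_size > 1"

check_custom_all_reduce(2, True)       # fine: TP > 1
check_custom_all_reduce(1, False)      # fine: flag off
try:
    check_custom_all_reduce(1, True)   # rejected: flag needs TP > 1
except AssertionError as err:
    print(err)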
@@ -784,4 +798,5 @@ class EngineArgs:
            max_capture_batch_size=self.max_capture_batch_size,
            guided_decoding_backend=self.guided_decoding_backend,
            disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
            enable_custom_all_reduce=self.enable_custom_all_reduce,
        )
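For context, the runtime effect of the flag lives in the CUDA sources this commit adds and follows the vLLM-adapted pattern the commit message mentions: a custom peer-to-peer kernel for small tensors within the tensor-parallel group, with a fallback to the framework's default all-reduce otherwise. A rough, purely illustrative sketch of that dispatch idea; every name and the size cutoff below are assumptions, not this commit's API:

from typing import Callable, List

# Assumed cutoff; real implementations gate the custom kernel on tensor size.
_MAX_CUSTOM_AR_BYTES = 8 * 1024 * 1024

def select_all_reduce(enable_custom_all_reduce: bool,
                      tensor_parallel_size: int,
                      custom_kernel: Callable[[List[float]], List[float]],
                      default_kernel: Callable[[List[float]], List[float]]
                      ) -> Callable[[List[float]], List[float]]:
    """Choose an all-reduce implementation once, at engine start-up."""
    use_custom = enable_custom_all_reduce and tensor_parallel_size > 1

    def all_reduce(buf: List[float]) -> List[float]:
        # Small buffers take the custom peer-to-peer path; large ones
        # (or single-GPU runs) fall back to the default implementation.
        if use_custom and len(buf) * 4 <= _MAX_CUSTOM_AR_BYTES:
            return custom_kernel(buf)
        return default_kernel(buf)

    return all_reduce

reduce_fn = select_all_reduce(True, 2,
                              custom_kernel=lambda b: b,    # stand-in
                              default_kernel=lambda b: b)   # stand-in
assert reduce_fn([1.0, 2.0]) == [1.0, 2.0]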