Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
Modified to support custom all reduce by default (#3538)
@@ -278,7 +278,7 @@ class ParallelConfig:
         self.disable_any_whitespace: bool = True
         self.pod_ip: str = None
         # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
-        self.enable_custom_all_reduce: bool = False
+        self.disable_custom_all_reduce: bool = False
         for key, value in args.items():
             if hasattr(self, key):
                 setattr(self, key, value)
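A minimal, self-contained sketch of the pattern in this hunk (the class below is an illustrative stand-in, not the full FastDeploy ParallelConfig): the new flag defaults to False, so custom all-reduce is on unless the caller explicitly passes disable_custom_all_reduce=True, which the hasattr/setattr loop then copies onto the config.

    class ParallelConfigSketch:
        def __init__(self, args: dict):
            # Default False -> custom all-reduce kernel enabled by default.
            self.disable_custom_all_reduce: bool = False
            # Same pattern as the hunk: only known attributes are overridden.
            for key, value in args.items():
                if hasattr(self, key):
                    setattr(self, key, value)

    cfg = ParallelConfigSketch({"disable_custom_all_reduce": True})
    print(cfg.disable_custom_all_reduce)  # True -> fall back to NCCL dist.all_reduce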
@@ -188,7 +188,7 @@ class EngineArgs:
     Flag to enable prefix caching.
     """

-    enable_custom_all_reduce: bool = False
+    disable_custom_all_reduce: bool = False
     """
     Flag to enable the custom all-reduce kernel.
     """
@@ -571,10 +571,10 @@ class EngineArgs:
             help="Degree of tensor parallelism.",
         )
         parallel_group.add_argument(
-            "--enable-custom-all-reduce",
+            "--disable-custom-all-reduce",
             action="store_true",
-            default=EngineArgs.enable_custom_all_reduce,
-            help="Flag to enable custom all-reduce.",
+            default=EngineArgs.disable_custom_all_reduce,
+            help="Flag to disable custom all-reduce.",
         )
         parallel_group.add_argument(
             "--max-num-seqs",
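A hedged sketch of the new CLI semantics using the standard-library argparse (the parser construction below is illustrative, not FastDeploy's actual EngineArgs wiring): because the option uses action="store_true" with a False default, custom all-reduce stays enabled unless --disable-custom-all-reduce is passed.

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--disable-custom-all-reduce",
        action="store_true",
        default=False,  # mirrors EngineArgs.disable_custom_all_reduce
        help="Flag to disable custom all-reduce.",
    )

    print(parser.parse_args([]).disable_custom_all_reduce)  # False: custom all-reduce stays on
    print(parser.parse_args(["--disable-custom-all-reduce"]).disable_custom_all_reduce)  # True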
@@ -947,10 +947,6 @@ class EngineArgs:
         early_stop_cfg = self.create_early_stop_config()
         early_stop_cfg.update_enable_early_stop(self.enable_early_stop)

-        assert not (
-            self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce
-        ), "enable_custom_all_reduce must be used with tensor_parallel_size>1"
-
         assert is_port_available(
             "0.0.0.0", self.engine_worker_queue_port
         ), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use."
@@ -1118,7 +1118,7 @@ class LLMEngine:
             "do_profile": self.do_profile,
             "dynamic_load_weight": self.cfg.load_config.dynamic_load_weight,
             "disable_any_whitespace": self.cfg.disable_any_whitespace,
-            "enable_custom_all_reduce": self.cfg.parallel_config.enable_custom_all_reduce,
+            "disable_custom_all_reduce": self.cfg.parallel_config.disable_custom_all_reduce,
             "enable_logprob": self.cfg.model_config.enable_logprob,
         }
         for worker_flag, value in worker_append_flag.items():
@@ -69,7 +69,7 @@ class GpuWorker(WorkerBase):
         gc.collect()
         paddle.device.cuda.empty_cache()
         if (
-            self.parallel_config.enable_custom_all_reduce
+            not self.parallel_config.disable_custom_all_reduce
             and self.parallel_config.tensor_parallel_size > 1
             and paddle.is_compiled_with_cuda()
         ):
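A hedged restatement of the gate above as a standalone predicate (parameter names are illustrative stand-ins for the parallel_config fields): with the inverted flag, the custom all-reduce path is taken by default and is skipped only when it is explicitly disabled, tensor parallelism is not in use, or the Paddle build lacks CUDA.

    def use_custom_all_reduce(disable_custom_all_reduce: bool,
                              tensor_parallel_size: int,
                              compiled_with_cuda: bool) -> bool:
        # Mirrors the condition in GpuWorker after this change.
        return (
            not disable_custom_all_reduce
            and tensor_parallel_size > 1
            and compiled_with_cuda
        )

    assert use_custom_all_reduce(False, 8, True)        # default: enabled under TP
    assert not use_custom_all_reduce(True, 8, True)     # explicit opt-out
    assert not use_custom_all_reduce(False, 1, True)    # no tensor parallelism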
@@ -516,7 +516,7 @@ def parse_args():
        help="enable prefix cache",
    )
    parser.add_argument(
-        "--enable_custom_all_reduce",
+        "--disable_custom_all_reduce",
        action="store_true",
        help="enable custom all-reduce",
    )