mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Optimization] default compile rdma, reduce cudagraph buffer size in mm, fix some config bug (#5121)
* default compile rdma, reduce cudagraph buffer size in mm, fix some config logic * update * update * fix bug * enhance rdma compile * fix
This commit is contained in:
@@ -902,6 +902,12 @@ class GraphOptimizationConfig:
|
||||
draft_capture_sizes.append(max_capture_size)
|
||||
self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
|
||||
|
||||
def filter_capture_size(self, tp_size: int = 1):
    """Drop CUDA graph capture sizes not divisible by the tensor-parallel size.

    When TSP (sequence-parallel MoE) is in use, every captured batch size must
    be a multiple of ``tp_size``; other sizes are filtered out in place.
    """
    kept_sizes = []
    for size in self.cudagraph_capture_sizes:
        # keep only sizes that split evenly across tensor-parallel ranks
        if size % tp_size == 0:
            kept_sizes.append(size)
    self.cudagraph_capture_sizes = kept_sizes
|
||||
|
||||
def to_json_string(self):
|
||||
"""
|
||||
Convert speculative_config to json string.
|
||||
@@ -1628,7 +1634,15 @@ class FDConfig:
|
||||
if self.device_config is not None and self.device_config.device_type != "cuda":
|
||||
self.graph_opt_config.use_cudagraph = False
|
||||
logger.info(f"CUDAGraph only support on GPU, current device type is {self.device_config.device_type}!")
|
||||
|
||||
if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph:
|
||||
if self.scheduler_config.max_num_seqs < self.parallel_config.tensor_parallel_size:
|
||||
self.parallel_config.use_sequence_parallel_moe = False
|
||||
logger.info(
|
||||
"Warning: sequence parallel moe do not support max_num_seqs < tensor_parallel_size when cudagraph enabled. We set use_sequence_parallel_moe to False."
|
||||
)
|
||||
else:
|
||||
# It will hang when real batch_size < tp_size
|
||||
self.graph_opt_config.filter_capture_size(tp_size=self.parallel_config.tensor_parallel_size)
|
||||
if self.model_config.enable_mm and self.graph_opt_config.use_cudagraph:
|
||||
self.cache_config.enable_prefix_caching = False
|
||||
logger.info("Multi-modal models do not support prefix caching when using CUDAGraph!")
|
||||
|
||||
Reference in New Issue
Block a user