mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Executor] Default use CUDAGraph (#3594)
* add start intercept * Adjustment GraphOptConfig * pre-commit * default use cudagraph * set default value * default use cuda graph * pre-commit * fix test case bug * disable rl * fix moba attention * only support gpu * Temporarily disable PD Disaggregation * set max_num_seqs of test case as 1 * set max_num_seqs and temperature * fix max_num_batched_tokens bug * close cuda graph * success run wint2 * profile run with max_num_batched_tokens * 1.add c++ memchecker 2.success run wint2 * update a800 yaml * update docs * 1. delete check 2. fix plas attn test case * default use use_unique_memory_pool * add try-except for warmup * ban mtp, mm, rl * fix test case mock * fix ci bug * fix form_model_get_output_topp0 bug * fix ci bug * refine deepseek ci * refine code * Disable PD * fix sot yaml
This commit is contained in:
@@ -785,7 +785,7 @@ class GraphOptimizationConfig:
|
||||
"""
|
||||
self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
|
||||
""" Number of warmup runs for SOT warmup. """
|
||||
self.use_cudagraph: bool = False
|
||||
self.use_cudagraph: bool = True
|
||||
"""Sizes to capture cudagraph.
|
||||
- None (default): capture sizes are inferred from llm config.
|
||||
- list[int]: capture sizes are specified as given."""
|
||||
@@ -821,7 +821,7 @@ class GraphOptimizationConfig:
|
||||
""" Record maps mapped from real shape to captured size to reduce runtime overhead """
|
||||
self.real_shape_to_captured_size: dict[int, int] = None
|
||||
""" Whether to use shared memory pool for multi capture_size """
|
||||
self.use_unique_memory_pool: bool = False
|
||||
self.use_unique_memory_pool: bool = True
|
||||
|
||||
# CINN Config ...
|
||||
if args is not None:
|
||||
@@ -908,22 +908,6 @@ class GraphOptimizationConfig:
|
||||
len(self.cudagraph_capture_sizes) > 0
|
||||
), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
|
||||
|
||||
def update_use_cudagraph(self, argument: bool):
|
||||
"""
|
||||
Unified user specifies the use_cudagraph parameter through two methods,
|
||||
'--use-cudagraph' and '--graph-optimization-config'
|
||||
"""
|
||||
if self.use_cudagraph is None:
|
||||
# User only set '--use-cudagraph'
|
||||
self.use_cudagraph = argument
|
||||
else:
|
||||
# User both set '--use-cudagraph' and '--graph-optimization-config'
|
||||
if self.use_cudagraph is False and argument is True:
|
||||
raise ValueError(
|
||||
"Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
|
||||
)
|
||||
argument = self.use_cudagraph
|
||||
|
||||
|
||||
class PlasAttentionConfig:
|
||||
def __init__(
|
||||
@@ -1525,6 +1509,26 @@ class FDConfig:
|
||||
else:
|
||||
self.structured_outputs_config.guided_decoding_backend = "xgrammar"
|
||||
|
||||
# Adjustment GraphOptConfig
|
||||
if (
|
||||
(self.speculative_config is not None and self.speculative_config.method is not None)
|
||||
or (self.model_config is not None and self.model_config.enable_mm is True)
|
||||
or (self.load_config is not None and self.load_config.dynamic_load_weight is True)
|
||||
or (self.scheduler_config.splitwise_role != "mixed")
|
||||
):
|
||||
self.graph_opt_config.use_cudagraph = False
|
||||
logger.info(
|
||||
"CUDAGraph does not support to be started together with SpeculativeDecode and MultiModel temporarily, but has been automatically closed!"
|
||||
)
|
||||
if self.load_config is not None and self.load_config.dynamic_load_weight is True:
|
||||
self.graph_opt_config.graph_opt_level = 0
|
||||
logger.info(
|
||||
"Static Graph does not support to be started together with RL Training, and automatically switch to dynamic graph!"
|
||||
)
|
||||
if self.device_config is not None and self.device_config.device_type != "cuda":
|
||||
self.graph_opt_config.use_cudagraph = False
|
||||
logger.info(f"CUDAGraph only support on GPU, current device type is {self.device_config.device_type}!")
|
||||
|
||||
if self.scheduler_config.splitwise_role == "mixed":
|
||||
self.model_config.moe_phase = MoEPhase(phase="prefill")
|
||||
elif self.scheduler_config.splitwise_role == "prefill":
|
||||
@@ -1628,6 +1632,21 @@ class FDConfig:
|
||||
if self.scheduler_config is not None:
|
||||
self.scheduler_config.check()
|
||||
|
||||
# Check graph optimization config
|
||||
if self.graph_opt_config.use_cudagraph:
|
||||
if self.speculative_config is not None:
|
||||
assert (
|
||||
self.speculative_config.method is None
|
||||
), "CUDAGraph does not support the simultaneous use of Speculative Decoding"
|
||||
if self.model_config is not None:
|
||||
assert (
|
||||
self.model_config.enable_mm is not True
|
||||
), "CUDAGraph cannot be applied to multimodal model temporarily"
|
||||
if self.graph_opt_config.graph_opt_level > 0 or self.graph_opt_config.use_cudagraph:
|
||||
if self.load_config is not None:
|
||||
assert (
|
||||
self.load_config.dynamic_load_weight is False
|
||||
), "Static graph cannot be used in RL scene temporarily"
|
||||
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER) == 1:
|
||||
assert (
|
||||
int(envs.FD_DISABLED_RECOVER) == 0
|
||||
|
||||
Reference in New Issue
Block a user