[Executor] Default use CUDAGraph (#3594)

* add start intercept

* Adjustment GraphOptConfig

* pre-commit

* default use cudagraph

* set default value

* default use cuda graph

* pre-commit

* fix test case bug

* disable rl

* fix moba attention

* only support gpu

* Temporarily disable PD Disaggregation

* set max_num_seqs of test case as 1

* set max_num_seqs and temperature

* fix max_num_batched_tokens bug

* close cuda graph

* success run wint2

* profile run with max_num_batched_tokens

* 1.add c++ memchecker 2.success run wint2

* updatee a800 yaml

* update docs

* 1. delete check 2. fix plas attn test case

* default use use_unique_memory_pool

* add try-except for warmup

* ban mtp, mm, rl

* fix test case mock

* fix ci bug

* fix form_model_get_output_topp0 bug

* fix ci bug

* refine deepseek ci

* refine code

* Disable PD

* fix sot yaml
This commit is contained in:
RAM
2025-10-21 14:25:45 +08:00
committed by GitHub
parent 99564349a7
commit 775edcc09a
32 changed files with 417 additions and 144 deletions

View File

@@ -785,7 +785,7 @@ class GraphOptimizationConfig:
"""
self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
""" Number of warmup runs for SOT warmup. """
self.use_cudagraph: bool = False
self.use_cudagraph: bool = True
"""Sizes to capture cudagraph.
- None (default): capture sizes are inferred from llm config.
- list[int]: capture sizes are specified as given."""
@@ -821,7 +821,7 @@ class GraphOptimizationConfig:
""" Record maps mapped from real shape to captured size to reduce runtime overhead """
self.real_shape_to_captured_size: dict[int, int] = None
""" Whether to use shared memory pool for multi capture_size """
self.use_unique_memory_pool: bool = False
self.use_unique_memory_pool: bool = True
# CINN Config ...
if args is not None:
@@ -908,22 +908,6 @@ class GraphOptimizationConfig:
len(self.cudagraph_capture_sizes) > 0
), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
def update_use_cudagraph(self, argument: bool):
"""
Unified user specifies the use_cudagraph parameter through two methods,
'--use-cudagraph' and '--graph-optimization-config'
"""
if self.use_cudagraph is None:
# User only set '--use-cudagraph'
self.use_cudagraph = argument
else:
# User both set '--use-cudagraph' and '--graph-optimization-config'
if self.use_cudagraph is False and argument is True:
raise ValueError(
"Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
)
argument = self.use_cudagraph
class PlasAttentionConfig:
def __init__(
@@ -1525,6 +1509,26 @@ class FDConfig:
else:
self.structured_outputs_config.guided_decoding_backend = "xgrammar"
# Adjustment GraphOptConfig
if (
(self.speculative_config is not None and self.speculative_config.method is not None)
or (self.model_config is not None and self.model_config.enable_mm is True)
or (self.load_config is not None and self.load_config.dynamic_load_weight is True)
or (self.scheduler_config.splitwise_role != "mixed")
):
self.graph_opt_config.use_cudagraph = False
logger.info(
"CUDAGraph does not support to be started together with SpeculativeDecode and MultiModel temporarily, but has been automatically closed!"
)
if self.load_config is not None and self.load_config.dynamic_load_weight is True:
self.graph_opt_config.graph_opt_level = 0
logger.info(
"Static Graph does not support to be started together with RL Training, and automatically switch to dynamic graph!"
)
if self.device_config is not None and self.device_config.device_type != "cuda":
self.graph_opt_config.use_cudagraph = False
logger.info(f"CUDAGraph only support on GPU, current device type is {self.device_config.device_type}!")
if self.scheduler_config.splitwise_role == "mixed":
self.model_config.moe_phase = MoEPhase(phase="prefill")
elif self.scheduler_config.splitwise_role == "prefill":
@@ -1628,6 +1632,21 @@ class FDConfig:
if self.scheduler_config is not None:
self.scheduler_config.check()
# Check graph optimization config
if self.graph_opt_config.use_cudagraph:
if self.speculative_config is not None:
assert (
self.speculative_config.method is None
), "CUDAGraph does not support the simultaneous use of Speculative Decoding"
if self.model_config is not None:
assert (
self.model_config.enable_mm is not True
), "CUDAGraph cannot be applied to multimodal model temporarily"
if self.graph_opt_config.graph_opt_level > 0 or self.graph_opt_config.use_cudagraph:
if self.load_config is not None:
assert (
self.load_config.dynamic_load_weight is False
), "Static graph cannot be used in RL scene temporarily"
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER) == 1:
assert (
int(envs.FD_DISABLED_RECOVER) == 0