[Executor] Default use CUDAGraph (#3594)

* add start intercept
* Adjustment GraphOptConfig
* pre-commit
* default use cudagraph
* set default value
* default use cuda graph
* pre-commit
* fix test case bug
* disable rl
* fix moba attention
* only support gpu
* Temporarily disable PD Disaggregation
* set max_num_seqs of test case as 1
* set max_num_seqs and temperature
* fix max_num_batched_tokens bug
* close cuda graph
* success run wint2
* profile run with max_num_batched_tokens
* 1.add c++ memchecker 2.success run wint2
* update a800 yaml
* update docs
* 1. delete check 2. fix plas attn test case
* default use use_unique_memory_pool
* add try-except for warmup
* ban mtp, mm, rl
* fix test case mock
* fix ci bug
* fix form_model_get_output_topp0 bug
* fix ci bug
* refine deepseek ci
* refine code
* Disable PD
* fix sot yaml
2025-12-24 13:28:13 +08:00 · 2025-10-21 14:25:45 +08:00
parent 99564349a7
commit 775edcc09a
32 changed files with 417 additions and 144 deletions
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -785,7 +785,7 @@ class GraphOptimizationConfig:
        """
        self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
        """  Number of warmup runs for SOT warmup. """
-        self.use_cudagraph: bool = False
+        self.use_cudagraph: bool = True
        """Sizes to capture cudagraph.
        - None (default): capture sizes are inferred from llm config.
        - list[int]: capture sizes are specified as given."""
@@ -821,7 +821,7 @@ class GraphOptimizationConfig:
        """ Record maps mapped from real shape to captured size to reduce runtime overhead """
        self.real_shape_to_captured_size: dict[int, int] = None
        """ Whether to use shared memory pool for multi capture_size """
-        self.use_unique_memory_pool: bool = False
+        self.use_unique_memory_pool: bool = True

        # CINN Config ...
        if args is not None:
@@ -908,22 +908,6 @@ class GraphOptimizationConfig:
                len(self.cudagraph_capture_sizes) > 0
            ), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."

-    def update_use_cudagraph(self, argument: bool):
-        """
-        Unified user specifies the use_cudagraph parameter through two methods,
-        '--use-cudagraph' and '--graph-optimization-config'
-        """
-        if self.use_cudagraph is None:
-            # User only set '--use-cudagraph'
-            self.use_cudagraph = argument
-        else:
-            # User both set '--use-cudagraph' and '--graph-optimization-config'
-            if self.use_cudagraph is False and argument is True:
-                raise ValueError(
-                    "Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
-                )
-            argument = self.use_cudagraph
-

 class PlasAttentionConfig:
    def __init__(
@@ -1525,6 +1509,26 @@ class FDConfig:
            else:
                self.structured_outputs_config.guided_decoding_backend = "xgrammar"

+        # Adjustment GraphOptConfig
+        if (
+            (self.speculative_config is not None and self.speculative_config.method is not None)
+            or (self.model_config is not None and self.model_config.enable_mm is True)
+            or (self.load_config is not None and self.load_config.dynamic_load_weight is True)
+            or (self.scheduler_config.splitwise_role != "mixed")
+        ):
+            self.graph_opt_config.use_cudagraph = False
+            logger.info(
+                "CUDAGraph does not support to be started together with SpeculativeDecode and MultiModel temporarily, but has been automatically closed!"
+            )
+        if self.load_config is not None and self.load_config.dynamic_load_weight is True:
+            self.graph_opt_config.graph_opt_level = 0
+            logger.info(
+                "Static Graph does not support to be started together with RL Training, and automatically switch to dynamic graph!"
+            )
+        if self.device_config is not None and self.device_config.device_type != "cuda":
+            self.graph_opt_config.use_cudagraph = False
+            logger.info(f"CUDAGraph only support on GPU, current device type is {self.device_config.device_type}!")
+
        if self.scheduler_config.splitwise_role == "mixed":
            self.model_config.moe_phase = MoEPhase(phase="prefill")
        elif self.scheduler_config.splitwise_role == "prefill":
@@ -1628,6 +1632,21 @@ class FDConfig:
        if self.scheduler_config is not None:
            self.scheduler_config.check()

+        # Check graph optimization config
+        if self.graph_opt_config.use_cudagraph:
+            if self.speculative_config is not None:
+                assert (
+                    self.speculative_config.method is None
+                ), "CUDAGraph does not support the simultaneous use of Speculative Decoding"
+            if self.model_config is not None:
+                assert (
+                    self.model_config.enable_mm is not True
+                ), "CUDAGraph cannot be applied to multimodal model temporarily"
+        if self.graph_opt_config.graph_opt_level > 0 or self.graph_opt_config.use_cudagraph:
+            if self.load_config is not None:
+                assert (
+                    self.load_config.dynamic_load_weight is False
+                ), "Static graph cannot be used in RL scene temporarily"
        if int(envs.ENABLE_V1_KVCACHE_SCHEDULER) == 1:
            assert (
                int(envs.FD_DISABLED_RECOVER) == 0