[Executor] CUDA Graph support padding batch (#2844)

* CUDA graph: support padding batch * Integrate the startup parameters for the graph optimization backend and provide support for user-defined capture sizes. * Do not insert max_num_seqs when the user specifies a capture list * Support setting the graph optimization config from a YAML file * Update CUDA graph CI * Fix CI bug * Fix CI bug
2025-10-05 08:37:06 +08:00 · 2025-07-16 10:49:01 +08:00
parent 61b3997b85
commit 0fad10b35a
30 changed files with 291 additions and 225 deletions
--- a/fastdeploy/rl/rollout_config.py
+++ b/fastdeploy/rl/rollout_config.py
@@ -53,9 +53,6 @@ class RolloutModelConfig:
        enable_expert_parallell: bool = False,
        ori_vocab_size: int = None,
        quantization: str = "None",
-        enable_static_graph_inference: bool = False,
-        use_cudagraph: bool = False,
-        max_capture_batch_size: int = 64,
        guided_decoding_backend: str = "off",
        disable_any_whitespace: bool = True,
        enable_logprob: bool = False,
@@ -95,9 +92,6 @@ class RolloutModelConfig:
        self.enable_expert_parallell = enable_expert_parallell
        self.ori_vocab_size = ori_vocab_size
        self.quantization = quantization
-        self.enable_static_graph_inference = enable_static_graph_inference
-        self.use_cudagraph = use_cudagraph
-        self.max_capture_batch_size = max_capture_batch_size
        self.guided_decoding_backend = guided_decoding_backend
        self.disable_any_whitespace = disable_any_whitespace
        self.enable_logprob = enable_logprob