[Executor] CUDA Graph support padding batch (#2844)

* cuda graph support padding batch * Integrate the startup parameters for the graph optimization backend and provide support for user - defined capture sizes. * Do not insert max_num_seqs when the user specifies a capture list * Support set graph optimization config from YAML file * update cuda graph ci * fix ci bug * fix ci bug
2025-10-06 00:57:33 +08:00 · 2025-07-16 10:49:01 +08:00
parent 61b3997b85
commit 0fad10b35a
30 changed files with 291 additions and 225 deletions
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -1026,7 +1026,7 @@ class LLMEngine(object):
            f" --speculative_model_name_or_path {self.cfg.speculative_config.model_name_or_path}"
            f" --speculative_model_quantization {self.cfg.speculative_config.quantization}"
            f" --speculative_benchmark_mode {self.cfg.speculative_config.benchmark_mode}"
-            f" --max_capture_batch_size {self.cfg.max_capture_batch_size}"
+            f" --graph_optimiaztion_config '{self.cfg.graph_optimization_config.to_json_string()}'"
            f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
            f" --load_strategy {self.cfg.model_config.load_strategy}"
            f" --enable_mm {self.cfg.enable_mm}")
@@ -1041,9 +1041,6 @@ class LLMEngine(object):
            self.cfg.cache_config.enable_chunked_prefill,
            "do_profile": self.do_profile,
            "dynamic_load_weight": self.cfg.model_config.dynamic_load_weight,
-            "enable_static_graph_inference":
-            self.cfg.enable_static_graph_inference,
-            "use_cudagraph": self.cfg.use_cudagraph,
            "disable_any_whitespace": self.cfg.disable_any_whitespace,
            "enable-custom-all-reduce": self.cfg.parallel_config.enable_custom_all_reduce,
            "enable_logprob": self.cfg.enable_logprob,