[Executor] CUDA Graph support padding batch (#2844)

* cuda graph support padding batch

* Integrate the startup parameters for the graph optimization backend and provide support for user-defined capture sizes.

* Do not insert max_num_seqs when the user specifies a capture list

* Support setting the graph optimization config from a YAML file

* update cuda graph ci

* fix ci bug

* fix ci bug
This commit is contained in:
RAM
2025-07-16 10:49:01 +08:00
committed by GitHub
parent 61b3997b85
commit 0fad10b35a
30 changed files with 291 additions and 225 deletions

View File

@@ -53,9 +53,6 @@ class RolloutModelConfig:
enable_expert_parallell: bool = False,
ori_vocab_size: int = None,
quantization: str = "None",
enable_static_graph_inference: bool = False,
use_cudagraph: bool = False,
max_capture_batch_size: int = 64,
guided_decoding_backend: str = "off",
disable_any_whitespace: bool = True,
enable_logprob: bool = False,
@@ -95,9 +92,6 @@ class RolloutModelConfig:
self.enable_expert_parallell = enable_expert_parallell
self.ori_vocab_size = ori_vocab_size
self.quantization = quantization
self.enable_static_graph_inference = enable_static_graph_inference
self.use_cudagraph = use_cudagraph
self.max_capture_batch_size = max_capture_batch_size
self.guided_decoding_backend = guided_decoding_backend
self.disable_any_whitespace = disable_any_whitespace
self.enable_logprob = enable_logprob