[Executor] CUDA Graph support padding batch (#2844)

* CUDA Graph: support padding the batch to a captured size
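
A minimal sketch of the padding idea, with illustrative names (select_padded_batch_size is not the PR's actual API): a decode batch is padded up to the smallest batch size for which a CUDA graph was captured, and falls back to eager execution when no captured size fits.

import bisect

def select_padded_batch_size(batch_size, capture_sizes):
    """Return the smallest captured batch size that can hold batch_size.

    capture_sizes must be sorted ascending; returns None (run eagerly)
    when the batch exceeds the largest captured size.
    """
    idx = bisect.bisect_left(capture_sizes, batch_size)
    return capture_sizes[idx] if idx < len(capture_sizes) else None

# With graphs captured at sizes [1, 2, 4, 8]:
assert select_padded_batch_size(3, [1, 2, 4, 8]) == 4     # pad 3 -> 4
assert select_padded_batch_size(9, [1, 2, 4, 8]) is None  # no fit: run eagerly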

* Integrate the startup parameters for the graph optimization backend and support user-defined capture sizes.

* Do not insert max_num_seqs into the capture list when the user specifies one
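
A hedged sketch of how the two bullets above could fit together (resolve_capture_sizes and its default schedule are assumptions, not the PR's code): max_num_seqs is appended only to the auto-generated default list, while a user-specified capture list is taken verbatim.

def resolve_capture_sizes(user_capture_list, max_num_seqs):
    """Build the final CUDA graph capture list (hypothetical helper)."""
    if user_capture_list:
        # User-specified list: use as given, do not insert max_num_seqs.
        return sorted(set(user_capture_list))
    # Auto-generated default schedule (illustrative), capped by max_num_seqs.
    sizes = [1, 2, 4] + list(range(8, max_num_seqs, 8)) + [max_num_seqs]
    return sorted({s for s in sizes if s <= max_num_seqs})

print(resolve_capture_sizes([], 20))          # -> [1, 2, 4, 8, 16, 20]
print(resolve_capture_sizes([2, 6, 12], 20))  # -> [2, 6, 12]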

* Support setting the graph optimization config from a YAML file
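
A minimal sketch of loading such a config (the key names graph_optimization_config, use_cudagraph, and cudagraph_capture_sizes are assumptions about the schema, not confirmed by this commit):

import yaml  # PyYAML

example_yaml = """
graph_optimization_config:          # assumed key names
  use_cudagraph: true
  cudagraph_capture_sizes: [1, 2, 4, 8, 16]
"""

cfg = yaml.safe_load(example_yaml)["graph_optimization_config"]
print(cfg["use_cudagraph"])            # True
print(cfg["cudagraph_capture_sizes"])  # [1, 2, 4, 8, 16]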

* Update CUDA graph CI

* Fix CI bug

* Fix CI bug
Author: RAM
Date: 2025-07-16 10:49:01 +08:00
Committed by: GitHub
Parent: 61b3997b85
Commit: 0fad10b35a
30 changed files with 291 additions and 225 deletions

@@ -748,10 +748,6 @@ class GCUModelRunner(ModelRunnerBase):
         # 3. Prepare lora
         # 4. Run model
-        is_decode_batch = not ((self.share_inputs["seq_lens_this_time"]
-                                > 1).sum() > 0)
-        self.forward_meta.step_use_cudagraph = is_decode_batch and in_capturing
-        self.forward_meta.is_decode_batch = is_decode_batch
         model_output = self.model(
             ids_remove_padding=self.share_inputs["ids_remove_padding"],
             forward_meta=self.forward_meta)
@@ -979,10 +975,6 @@ class GCUModelRunner(ModelRunnerBase):
         # 2. Padding inputs for cuda graph
         # 3. Execute model
-        is_decode_batch = not ((self.share_inputs["seq_lens_this_time"]
-                                > 1).sum() > 0)
-        self.forward_meta.step_use_cudagraph = self.use_cudagraph and is_decode_batch
-        self.forward_meta.is_decode_batch = is_decode_batch
         model_output = self.model(
             ids_remove_padding=self.share_inputs["ids_remove_padding"],
             forward_meta=self.forward_meta)
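
The lines removed above appear to have gated CUDA graph replay on the batch being pure decode. A minimal sketch of the padding step that removes the need for an exact-size match (pad_decode_batch and the dummy-slot convention are assumptions for illustration, not the runner's actual code):

def pad_decode_batch(seq_lens_this_time, padded_size):
    """Extend a pure-decode batch with dummy slots (seq_len == 0) so its
    size matches a captured CUDA graph size; outputs for the dummy slots
    are discarded after replay."""
    assert padded_size >= len(seq_lens_this_time)
    return seq_lens_this_time + [0] * (padded_size - len(seq_lens_this_time))

# A 3-request decode batch replayed through the graph captured at size 4:
print(pad_decode_batch([1, 1, 1], 4))  # -> [1, 1, 1, 0]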