[Executor] CUDA Graph support padding batch (#2844)

* CUDA Graph: support padding the batch to a captured size
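
A minimal sketch of the padding idea, with illustrative names (select_padded_batch_size is not the PR's actual API): a decode batch is padded up to the smallest batch size for which a CUDA graph was captured, and falls back to eager execution when no captured size fits.

import bisect

def select_padded_batch_size(batch_size, capture_sizes):
    """Return the smallest captured batch size that can hold batch_size.

    capture_sizes must be sorted ascending; returns None (run eagerly)
    when the batch exceeds the largest captured size.
    """
    idx = bisect.bisect_left(capture_sizes, batch_size)
    return capture_sizes[idx] if idx < len(capture_sizes) else None

# With graphs captured at sizes [1, 2, 4, 8]:
assert select_padded_batch_size(3, [1, 2, 4, 8]) == 4     # pad 3 -> 4
assert select_padded_batch_size(9, [1, 2, 4, 8]) is None  # no fit: run eagerly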

* Integrate the startup parameters for the graph optimization backend and support user-defined capture sizes.

* Do not insert max_num_seqs into the capture list when the user specifies one
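
A hedged sketch of how the two bullets above could fit together (resolve_capture_sizes and its default schedule are assumptions, not the PR's code): max_num_seqs is appended only to the auto-generated default list, while a user-specified capture list is taken verbatim.

def resolve_capture_sizes(user_capture_list, max_num_seqs):
    """Build the final CUDA graph capture list (hypothetical helper)."""
    if user_capture_list:
        # User-specified list: use as given, do not insert max_num_seqs.
        return sorted(set(user_capture_list))
    # Auto-generated default schedule (illustrative), capped by max_num_seqs.
    sizes = [1, 2, 4] + list(range(8, max_num_seqs, 8)) + [max_num_seqs]
    return sorted({s for s in sizes if s <= max_num_seqs})

print(resolve_capture_sizes([], 20))          # -> [1, 2, 4, 8, 16, 20]
print(resolve_capture_sizes([2, 6, 12], 20))  # -> [2, 6, 12]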

* Support setting the graph optimization config from a YAML file
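
A minimal sketch of loading such a config (the key names graph_optimization_config, use_cudagraph, and cudagraph_capture_sizes are assumptions about the schema, not confirmed by this commit):

import yaml  # PyYAML

example_yaml = """
graph_optimization_config:          # assumed key names
  use_cudagraph: true
  cudagraph_capture_sizes: [1, 2, 4, 8, 16]
"""

cfg = yaml.safe_load(example_yaml)["graph_optimization_config"]
print(cfg["use_cudagraph"])            # True
print(cfg["cudagraph_capture_sizes"])  # [1, 2, 4, 8, 16]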

* Update CUDA graph CI

* Fix CI bug

* Fix CI bug
Author: RAM
Date: 2025-07-16 10:49:01 +08:00
Committed by: GitHub
Parent: 61b3997b85
Commit: 0fad10b35a
30 changed files with 291 additions and 225 deletions

@@ -748,10 +748,6 @@ class GCUModelRunner(ModelRunnerBase):
         # 3. Prepare lora
         # 4. Run model
-        is_decode_batch = not ((self.share_inputs["seq_lens_this_time"]
-                                > 1).sum() > 0)
-        self.forward_meta.step_use_cudagraph = is_decode_batch and in_capturing
-        self.forward_meta.is_decode_batch = is_decode_batch
         model_output = self.model(
             ids_remove_padding=self.share_inputs["ids_remove_padding"],
             forward_meta=self.forward_meta)
@@ -979,10 +975,6 @@ class GCUModelRunner(ModelRunnerBase):
         # 2. Padding inputs for cuda graph
         # 3. Execute model
-        is_decode_batch = not ((self.share_inputs["seq_lens_this_time"]
-                                > 1).sum() > 0)
-        self.forward_meta.step_use_cudagraph = self.use_cudagraph and is_decode_batch
-        self.forward_meta.is_decode_batch = is_decode_batch
         model_output = self.model(
             ids_remove_padding=self.share_inputs["ids_remove_padding"],
             forward_meta=self.forward_meta)
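
The lines removed above appear to have gated CUDA graph replay on the batch being pure decode. A minimal sketch of the padding step that removes the need for an exact-size match (pad_decode_batch and the dummy-slot convention are assumptions for illustration, not the runner's actual code):

def pad_decode_batch(seq_lens_this_time, padded_size):
    """Extend a pure-decode batch with dummy slots (seq_len == 0) so its
    size matches a captured CUDA graph size; outputs for the dummy slots
    are discarded after replay."""
    assert padded_size >= len(seq_lens_this_time)
    return seq_lens_this_time + [0] * (padded_size - len(seq_lens_this_time))

# A 3-request decode batch replayed through the graph captured at size 4:
print(pad_decode_batch([1, 1, 1], 4))  # -> [1, 1, 1, 0]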