[Executor] CUDA Graph support padding batch (#2844)

* CUDA graph: support padding the batch to a captured size

* Integrate the startup parameters for the graph optimization backend and support user-defined capture sizes.

* Do not insert max_num_seqs into the capture list when the user specifies one

* Support setting the graph optimization config from a YAML file (see the sketch after this list)

* Update the CUDA graph CI

* Fix CI bug

* Fix CI bug
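
The capture-list behavior described above can be summarized in a short sketch. This is a hedged illustration, not FastDeploy's actual API: resolve_capture_sizes, padded_batch_size, and the default power-of-two schedule are hypothetical stand-ins for whatever the graph optimization backend actually does.

from typing import Optional

def resolve_capture_sizes(
    max_num_seqs: int,
    user_capture_sizes: Optional[list[int]] = None,
) -> list[int]:
    """Build the sorted list of batch sizes to capture as CUDA graphs."""
    if user_capture_sizes is not None:
        # Honor the user-defined list as-is: do NOT insert max_num_seqs.
        return sorted(set(user_capture_sizes))
    # Assumed default schedule: powers of two up to max_num_seqs, plus the cap.
    sizes = [s for s in (1, 2, 4, 8, 16, 32, 64, 128) if s <= max_num_seqs]
    if max_num_seqs not in sizes:
        sizes.append(max_num_seqs)
    return sizes

def padded_batch_size(real_batch_size: int, capture_sizes: list[int]) -> Optional[int]:
    """Smallest captured size >= the real batch size; None means run eagerly."""
    for size in capture_sizes:  # ascending order
        if size >= real_batch_size:
            return size
    return None

sizes = resolve_capture_sizes(max_num_seqs=64, user_capture_sizes=[1, 8, 32])
assert padded_batch_size(5, sizes) == 8       # decode batch of 5 is padded up to 8
assert padded_batch_size(33, sizes) is None   # larger than any captured size: eager

Padding up to the next captured size is what lets one static CUDA graph serve a range of real batch sizes; the trade-off is a small amount of wasted compute on the padded slots.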
Author: RAM
Date: 2025-07-16 10:49:01 +08:00
Committed by: GitHub
Parent: 61b3997b85
Commit: 0fad10b35a
30 changed files with 291 additions and 225 deletions


@@ -17,11 +17,11 @@
 import logging
 from dataclasses import dataclass
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, Optional
-from fastdeploy.model_executor.layers.attention import AttentionBackend
+from typing import Optional
+import paddle
+from fastdeploy.model_executor.layers.attention import AttentionBackend
 logger = logging.getLogger(__name__)
@@ -64,8 +64,6 @@ class ForwardMeta():
     # Use cuda graph in this step or not. Used to avoid run cuda graph when in dummy run or prefill stage.
     step_use_cudagraph: bool = False
     # Batch type flag
     is_decode_batch: bool = False
     # Attention backend object
     attn_backend: AttentionBackend = None
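
For context, here is a hedged sketch of how a model runner might consume these ForwardMeta flags. GraphRunner, its graphs dict, and inputs.batch_size are illustrative names, not classes from this repository.

class GraphRunner:
    def __init__(self, capture_sizes: list[int]):
        self.capture_sizes = sorted(capture_sizes)
        self.graphs = {}  # batch size -> graph captured during warmup

    def forward(self, forward_meta, model, inputs):
        # Replay only for pure-decode steps that opted into the graph;
        # dummy runs and prefill steps keep step_use_cudagraph False.
        if forward_meta.step_use_cudagraph and forward_meta.is_decode_batch:
            size = next((s for s in self.capture_sizes if s >= inputs.batch_size), None)
            if size is not None and size in self.graphs:
                # Inputs were already padded to `size`; replay the static graph.
                return self.graphs[size].replay()
        # Fallback: eager execution for prefill, dummy runs, or uncaptured sizes.
        return model(inputs, forward_meta)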