[Executor] CUDA Graph support padding batch (#2844)

* CUDA graph: support padding the batch to a captured size

* Integrate the startup parameters for the graph optimization backend and support user-defined capture sizes.

* Do not insert max_num_seqs into the capture list when the user specifies one

* Support setting the graph optimization config from a YAML file (see the sketch after this list)

* Update the CUDA graph CI

* Fix CI bug

* Fix CI bug
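
The capture-list behavior described above can be summarized in a short sketch. This is a hedged illustration, not FastDeploy's actual API: resolve_capture_sizes, padded_batch_size, and the default power-of-two schedule are hypothetical stand-ins for whatever the graph optimization backend actually does.

from typing import Optional

def resolve_capture_sizes(
    max_num_seqs: int,
    user_capture_sizes: Optional[list[int]] = None,
) -> list[int]:
    """Build the sorted list of batch sizes to capture as CUDA graphs."""
    if user_capture_sizes is not None:
        # Honor the user-defined list as-is: do NOT insert max_num_seqs.
        return sorted(set(user_capture_sizes))
    # Assumed default schedule: powers of two up to max_num_seqs, plus the cap.
    sizes = [s for s in (1, 2, 4, 8, 16, 32, 64, 128) if s <= max_num_seqs]
    if max_num_seqs not in sizes:
        sizes.append(max_num_seqs)
    return sizes

def padded_batch_size(real_batch_size: int, capture_sizes: list[int]) -> Optional[int]:
    """Smallest captured size >= the real batch size; None means run eagerly."""
    for size in capture_sizes:  # ascending order
        if size >= real_batch_size:
            return size
    return None

sizes = resolve_capture_sizes(max_num_seqs=64, user_capture_sizes=[1, 8, 32])
assert padded_batch_size(5, sizes) == 8       # decode batch of 5 is padded up to 8
assert padded_batch_size(33, sizes) is None   # larger than any captured size: eager

Padding up to the next captured size is what lets one static CUDA graph serve a range of real batch sizes; the trade-off is a small amount of wasted compute on the padded slots.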
Author: RAM
Date: 2025-07-16 10:49:01 +08:00
Committed by: GitHub
Parent: 61b3997b85
Commit: 0fad10b35a
30 changed files with 291 additions and 225 deletions


@@ -17,11 +17,11 @@
 import logging
 from dataclasses import dataclass
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, Optional
-from fastdeploy.model_executor.layers.attention import AttentionBackend
+from typing import Optional
+import paddle
+from fastdeploy.model_executor.layers.attention import AttentionBackend
 logger = logging.getLogger(__name__)
@@ -64,8 +64,6 @@ class ForwardMeta():
     # Use cuda graph in this step or not. Used to avoid run cuda graph when in dummy run or prefill stage.
     step_use_cudagraph: bool = False
     # Batch type flag
     is_decode_batch: bool = False
     # Attention backend object
     attn_backend: AttentionBackend = None
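
For context, here is a hedged sketch of how a model runner might consume these ForwardMeta flags. GraphRunner, its graphs dict, and inputs.batch_size are illustrative names, not classes from this repository.

class GraphRunner:
    def __init__(self, capture_sizes: list[int]):
        self.capture_sizes = sorted(capture_sizes)
        self.graphs = {}  # batch size -> graph captured during warmup

    def forward(self, forward_meta, model, inputs):
        # Replay only for pure-decode steps that opted into the graph;
        # dummy runs and prefill steps keep step_use_cudagraph False.
        if forward_meta.step_use_cudagraph and forward_meta.is_decode_batch:
            size = next((s for s in self.capture_sizes if s >= inputs.batch_size), None)
            if size is not None and size in self.graphs:
                # Inputs were already padded to `size`; replay the static graph.
                return self.graphs[size].replay()
        # Fallback: eager execution for prefill, dummy runs, or uncaptured sizes.
        return model(inputs, forward_meta)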