[Executor] CUDA Graph support padding batch (#2844)
* cuda graph support padding batch
* Integrate the startup parameters for the graph optimization backend and provide support for user-defined capture sizes.
* Do not insert max_num_seqs when the user specifies a capture list.
* Support setting the graph optimization config from a YAML file.
* update cuda graph ci
* fix ci bug
* fix ci bug
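CUDA graphs are captured ahead of time for a fixed set of batch sizes, so "padding batch" means rounding a runtime batch size up to the nearest captured size, falling back to eager execution when no captured graph is large enough. A minimal sketch of that lookup, under assumed names (pad_batch_to_capture_size and capture_sizes are illustrative, not FastDeploy's API):

import bisect
from typing import Optional

def pad_batch_to_capture_size(batch_size: int, capture_sizes: list[int]) -> Optional[int]:
    # Return the smallest captured batch size that can hold `batch_size`,
    # or None when the batch exceeds every captured graph (eager fallback).
    sizes = sorted(capture_sizes)
    idx = bisect.bisect_left(sizes, batch_size)
    return sizes[idx] if idx < len(sizes) else None

# Example: graphs captured for batch sizes 1, 2, 4, 8, 16.
assert pad_batch_to_capture_size(3, [1, 2, 4, 8, 16]) == 4      # pad 3 up to 4
assert pad_batch_to_capture_size(16, [1, 2, 4, 8, 16]) == 16    # exact match
assert pad_batch_to_capture_size(17, [1, 2, 4, 8, 16]) is None  # no graph fits

The third bullet above implies that max_num_seqs is appended to the default capture list as its largest entry, while a user-supplied capture list is taken as-is.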
@@ -17,11 +17,11 @@
 import logging
 from dataclasses import dataclass
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, Optional
-from fastdeploy.model_executor.layers.attention import AttentionBackend
+from typing import Optional
 
 import paddle
 
+from fastdeploy.model_executor.layers.attention import AttentionBackend
 
 logger = logging.getLogger(__name__)
@@ -64,8 +64,6 @@ class ForwardMeta():
 
     # Use cuda graph in this step or not. Used to avoid run cuda graph when in dummy run or prefill stage.
     step_use_cudagraph: bool = False
-    # Batch type flag
-    is_decode_batch: bool = False
 
     # Attention backend object
    attn_backend: AttentionBackend = None
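For context on the step_use_cudagraph flag kept in the hunk above, here is a hedged sketch (the runner logic and names are hypothetical, not FastDeploy's code) of how a model runner might set it so that graph replay is skipped for dummy runs and prefill steps, as the inline comment describes:

from dataclasses import dataclass

@dataclass
class StepMeta:
    # Mirrors the ForwardMeta flag above: replay a CUDA graph this step or not.
    step_use_cudagraph: bool = False

def prepare_step(meta: StepMeta, cudagraph_enabled: bool,
                 is_dummy_run: bool, is_prefill: bool) -> None:
    # Replay only on real decode steps: dummy (warm-up) runs and prefill
    # steps have shapes that no captured graph matches.
    meta.step_use_cudagraph = cudagraph_enabled and not (is_dummy_run or is_prefill)

meta = StepMeta()
prepare_step(meta, cudagraph_enabled=True, is_dummy_run=False, is_prefill=False)
assert meta.step_use_cudagraph  # decode step -> graph replay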