From bab779011c61462e2d5fc518cc84c6d2e4036764 Mon Sep 17 00:00:00 2001
From: lizhenyun01 <1500424927@qq.com>
Date: Wed, 24 Sep 2025 21:32:04 +0800
Subject: [PATCH] [CudaGraph] support cudagraph use shared pool (#4199)

* support cudagraph use shared pool

* add envs

* change CUDAGRAPH_POOL_ID to int

* change CUDAGRAPH_POOL_ID to use_memory_pool

* unify use_unique_memory_pool

* fix use_unique_memory_pool
---
 fastdeploy/config.py                               |  3 +++
 .../cudagraph_piecewise_backend.py                 | 16 +++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 58bb79735..d906bbaef 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -588,6 +588,9 @@ class GraphOptimizationConfig:
         Thus this flag cannot be used together with splitting_ops."""
         self.full_cuda_graph: bool = True
 
+        """ Whether to use a shared memory pool across all capture sizes """
+        self.use_unique_memory_pool: bool = False
+
         self.max_capture_size: int = None
         self.real_shape_to_captured_size: dict[int, int] = None
         # CINN Config ...
diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
index 8bc73d701..79ff9ea0e 100644
--- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
+++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -20,6 +20,7 @@ from typing import Callable, Dict, List, Optional
 
 import paddle.jit.dy2static.utils as jit_utils
 import paddle.nn.layer
+from paddle.base.core import CUDAGraph
 from paddle.device.cuda import graphs
 
 from fastdeploy import envs
@@ -85,17 +86,14 @@ class Dy2StCudaGraphManager:
 class CudaGraphPiecewiseBackend:
     """Manage the capture and replay of CUDA graphs at the subgraph level."""
 
-    def __init__(
-        self,
-        fd_config: FDConfig,
-        runnable: Callable,
-    ):
+    def __init__(self, fd_config: FDConfig, runnable: Callable):
         self.fd_config = fd_config
         self.runnable = runnable
         self.cudagraph_capture_sizes = fd_config.graph_opt_config.cudagraph_capture_sizes
         self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
         self.real_shape_to_captured_size = fd_config.graph_opt_config.real_shape_to_captured_size
-
+        if self.fd_config.graph_opt_config.use_unique_memory_pool:
+            self.unique_memory_pool_id = CUDAGraph.gen_new_memory_pool_id()
         self._create_entry_dict()
 
         self.cuda_graph_manager = None
@@ -168,7 +166,11 @@ class CudaGraphPiecewiseBackend:
             input_addresses = [x.data_ptr() for (_, x) in kwargs.items() if isinstance(x, paddle.Tensor)]
             entry.input_addresses = input_addresses
 
-            new_grpah = graphs.CUDAGraph()
+            new_grpah = (
+                graphs.CUDAGraph(pool_id=self.unique_memory_pool_id)
+                if self.fd_config.graph_opt_config.use_unique_memory_pool
+                else graphs.CUDAGraph()
+            )
             paddle.device.synchronize()
 
             # Capture
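
For reviewers, a minimal sketch of the capture flow this patch enables: with
use_unique_memory_pool on, every captured graph allocates from one shared pool
instead of a private per-graph pool. It assumes a CUDA build of Paddle whose
CUDAGraph constructor accepts pool_id, as the hunks above use; the capture
sizes and the toy forward pass are illustrative placeholders, not code from
this PR.

import paddle
from paddle.base.core import CUDAGraph
from paddle.device.cuda import graphs

# One pool id for the whole backend, mirroring the patched
# CudaGraphPiecewiseBackend.__init__ above.
shared_pool_id = CUDAGraph.gen_new_memory_pool_id()

captured = {}
for batch_size in (1, 2, 4):  # stand-ins for cudagraph_capture_sizes
    x = paddle.ones([batch_size, 8])  # input buffer allocated before capture
    graph = graphs.CUDAGraph(pool_id=shared_pool_id)
    paddle.device.synchronize()  # drain pending work, as the patched code does
    graph.capture_begin()
    y = x * 2.0  # placeholder for the real piecewise subgraph
    graph.capture_end()
    captured[batch_size] = (graph, x, y)

# A replay reuses the pool-backed buffers recorded at capture time; sharing
# one pool lets the captured graphs overlap allocations instead of each
# holding a disjoint memory reserve.
captured[4][0].replay()

The design choice mirrors the PR title: without a shared pool, each capture
size pins its own allocation pool, so memory cost grows with the number of
captured shapes; one backend-wide pool lets replays of different sizes draw
from the same reserve.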