From 870364b5479ff70e278aa697ce448d9bc055f179 Mon Sep 17 00:00:00 2001 From: RAM Date: Wed, 24 Sep 2025 19:45:22 +0800 Subject: [PATCH] [CUDAGraph]CUDA Graph support unique memory pool (#4230) * cuda graph use unique memory pool * fix custom device import bug * refine code * refine code * refine code --- fastdeploy/config.py | 5 +++++ .../graph_optimization/cudagraph_piecewise_backend.py | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 6f3e361b2..1918b1431 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -841,8 +841,13 @@ class GraphOptimizationConfig: Now don't support capture both decode-only and prefill-only""" self.full_cuda_graph: bool = True + """ Maximum CUDA Graph capture size """ self.max_capture_size: int = None + """ Records the mapping from real shape to captured size, to reduce runtime overhead """ self.real_shape_to_captured_size: dict[int, int] = None + """ Whether to use a single shared memory pool across all capture sizes """ + self.use_unique_memory_pool: bool = False + # CINN Config ... 
if args is not None: for key, value in args.items(): diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py index 8e488d57e..8c64fe3cd 100644 --- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py +++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py @@ -96,6 +96,13 @@ class CudaGraphPiecewiseBackend: self.cudagraph_capture_sizes = fd_config.graph_opt_config.cudagraph_capture_sizes self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups self.real_shape_to_captured_size = fd_config.graph_opt_config.real_shape_to_captured_size + self.unique_memory_pool_id = None + if self.fd_config.graph_opt_config.use_unique_memory_pool: + # TODO(gongshaotian): Optimize code + if paddle.is_compiled_with_cuda(): + from paddle.base.core import CUDAGraph + + self.unique_memory_pool_id = CUDAGraph.gen_new_memory_pool_id() self._create_entry_dict() @@ -169,7 +176,7 @@ class CudaGraphPiecewiseBackend: input_addresses = [x.data_ptr() for (_, x) in kwargs.items() if isinstance(x, paddle.Tensor)] entry.input_addresses = input_addresses - new_grpah = graphs.CUDAGraph() + new_grpah = graphs.CUDAGraph(pool_id=self.unique_memory_pool_id) paddle.device.synchronize() # Capture