[Executor] CUDA Graph support padding batch (#2844)

* cuda graph support padding batch

* Integrate the startup parameters for the graph optimization backend and provide support for user-defined capture sizes.

* Do not insert max_num_seqs when the user specifies a capture list

* Support setting the graph optimization config from a YAML file

* update cuda graph ci

* fix ci bug

* fix ci bug
This commit is contained in:
RAM
2025-07-16 10:49:01 +08:00
committed by GitHub
parent 61b3997b85
commit 0fad10b35a
30 changed files with 291 additions and 225 deletions

View File

@@ -68,16 +68,20 @@ class CudaGraphPiecewiseBackend:
self.concrete_size_entries[shape] = ConcreteSizeEntry(
runtime_bs=shape)
logger.debug("[CUDA GRAPH] Created all batch size entry ")
logger.info(
f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, "
"Created all batch sizes entry."
)
def __call__(self, **kwargs):
# Get batch size
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
batch_size = ids_remove_padding.shape[0]
padding_batch_size = self.batch_size_to_captured_size[batch_size]
logger.debug((
f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
f"The padded batch size is :{padding_batch_size}"))
logger.debug(
f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, "
f"The padded batch size is :{padding_batch_size}"
)
entry = self.concrete_size_entries.get(padding_batch_size)
assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list."
@@ -96,10 +100,10 @@ class CudaGraphPiecewiseBackend:
for n in range(entry.num_finished_warmup, self.warm_up_size):
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
logger.debug((
"[CUDA GRAPH] Warm up for batch size ",
f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
))
logger.debug(
f"[CUDA GRAPH] Warm up for batch size {padding_batch_size}, "
f"finished ({n+1}/{entry.num_finished_warmup}) times"
)
# Store input addresses for debug
input_addresses = [