[BugFix] Keep batch size 0 out of CUDA graph capture to save memory (#5426)

Author: 周周周
Date: 2025-12-08 16:47:44 +08:00
Committed by: GitHub
parent d1bd40d44c
commit 438c9f785a
3 changed files with 11 additions and 12 deletions


@@ -1577,9 +1577,6 @@ class FDConfig:
         self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
         self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)
-        if self.parallel_config.use_ep:
-            self.graph_opt_config.cudagraph_capture_sizes += [0]
         self.tokenizer = tokenizer
         self.ips = ips
         self.tool_parser = tool_parser
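
Under expert parallelism, the old code captured an extra CUDA graph for batch size 0 so that ranks with no local tokens could still replay a graph; dropping it saves the memory of that extra captured graph, and the empty case is handled at runtime instead (see the next two files). A minimal sketch of the effect, using a hypothetical helper rather than FastDeploy's actual API:

def capture_sizes(max_capture_size: int, use_ep: bool, include_zero: bool) -> list:
    # Hypothetical bucketing: powers of two up to the cap.
    sizes = [s for s in (1, 2, 4, 8, 16, 32, 64, 128, 256) if s <= max_capture_size]
    if use_ep and include_zero:
        sizes.append(0)  # old behavior: one extra graph captured just for empty batches
    return sizes

print(capture_sizes(128, use_ep=True, include_zero=True))   # old: [1, 2, ..., 128, 0]
print(capture_sizes(128, use_ep=True, include_zero=False))  # new: no size-0 graph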


@@ -207,6 +207,10 @@ class CustomAllreduce:
     def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]:
         """The main allreduce API that provides support for cuda graph."""
+        if input.shape[0] == 0:
+            return input
         if self.capturing:
             lib = cuda_wrapper.CudaRTLibrary()
             stream = paddle.device.current_stream()
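
The guard added here short-circuits the collective for empty inputs: an all-reduce over zero rows is a no-op, and returning early keeps size-0 shapes out of the capture and replay paths below it. A standalone sketch of the same pattern, with reduce_fn standing in for the real collective (custom kernel or paddle.distributed.all_reduce):

import paddle

def all_reduce_with_empty_guard(x: paddle.Tensor, reduce_fn) -> paddle.Tensor:
    if x.shape[0] == 0:  # nothing to reduce across ranks
        return x
    return reduce_fn(x)  # real path: dispatch to the actual collective

# Single-process usage with an identity "reduction":
out = all_reduce_with_empty_guard(paddle.zeros([0, 8]), reduce_fn=lambda t: t)
print(out.shape)  # [0, 8]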


@@ -1020,14 +1020,10 @@ class GPUModelRunner(ModelRunnerBase):
         """
         # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
         max_dec_len = expected_decode_len + 1
-        if batch_size == 0:
-            # Note(ZKK): divided by 0 is invalid, here we give a input_length = 1
-            input_length = 1
-        else:
-            input_length = min(
-                num_tokens // (1 if capture_prefill else batch_size),
-                self.model_config.max_model_len - max_dec_len,
-            )
+        input_length = min(
+            num_tokens // (1 if capture_prefill else batch_size),
+            self.model_config.max_model_len - max_dec_len,
+        )
         # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
         # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
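
Since batch size 0 no longer reaches the dummy run, the divide-by-zero guard could be deleted outright. The surviving formula, extracted into a standalone function with hypothetical sample values:

def dummy_input_length(num_tokens, batch_size, capture_prefill, max_model_len, expected_decode_len):
    max_dec_len = expected_decode_len + 1
    # batch_size == 0 never reaches this path anymore, so dividing is safe.
    return min(num_tokens // (1 if capture_prefill else batch_size),
               max_model_len - max_dec_len)

print(dummy_input_length(num_tokens=2048, batch_size=8, capture_prefill=False,
                         max_model_len=4096, expected_decode_len=32))  # -> 256
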
@@ -1490,7 +1486,9 @@
         # When support capture both prefill-only and decode-only, this will use [only_prefill_use_cudagraph or only_decode_use_cudagraph]
         self.forward_meta.step_use_cudagraph = (
-            only_prefill_use_cudagraph if self.cudagraph_only_prefill else only_decode_use_cudagraph
+            only_prefill_use_cudagraph
+            if self.cudagraph_only_prefill
+            else only_decode_use_cudagraph and self.forward_meta.ids_remove_padding.shape[0] > 0
         )
         # Set forward_meta.is_dummy_or_profile_run to True to skip init_kv_signal_per_query for attention backends
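
The decode branch now additionally requires a non-empty batch (ids_remove_padding holds at least one token), so an EP rank that receives no local requests falls back to eager execution instead of replaying a graph captured for size 0. A sketch of the gate as a standalone function (names mirror the diff; the real logic lives on forward_meta in GPUModelRunner):

def step_use_cudagraph(only_prefill_use_cudagraph: bool,
                       only_decode_use_cudagraph: bool,
                       cudagraph_only_prefill: bool,
                       num_tokens: int) -> bool:
    if cudagraph_only_prefill:
        return only_prefill_use_cudagraph
    # New condition: an empty batch (num_tokens == 0, e.g. an EP rank with no
    # local requests) runs eagerly instead of replaying a CUDA graph.
    return only_decode_use_cudagraph and num_tokens > 0

print(step_use_cudagraph(False, True, False, num_tokens=0))   # False: eager path
print(step_use_cudagraph(False, True, False, num_tokens=16))  # True: graph replay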