From 94c57e4175aad618bcc5e1ea644b9fcffd9140a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com>
Date: Fri, 5 Dec 2025 20:19:01 +0800
Subject: [PATCH] [BugFix] Remove _execute_empty_input (#5396)

In EP (Expert Parallelism) mode an idle worker must still execute part of
the model whenever other runners have data, so the dedicated
_execute_empty_input path is removed. Instead, 0 is registered as a
cudagraph capture size under EP, the dummy run guards input_length against
a batch_size of 0, and the idle-worker early return is moved to after
model execution.
---
 fastdeploy/config.py                  |  3 +++
 fastdeploy/worker/gpu_model_runner.py | 27 ++++++++++++++++-----------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index f1eb23852..a820e8d94 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1577,6 +1577,9 @@ class FDConfig:
         self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
         self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)
 
+        if self.parallel_config.use_ep:
+            self.graph_opt_config.cudagraph_capture_sizes = [0] + self.graph_opt_config.cudagraph_capture_sizes
+
         self.tokenizer = tokenizer
         self.ips = ips
         self.tool_parser = tool_parser
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 94c7a0b3f..9b550f104 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1020,10 +1020,14 @@ class GPUModelRunner(ModelRunnerBase):
         """
         # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
         max_dec_len = expected_decode_len + 1
-        input_length = min(
-            num_tokens // (1 if capture_prefill else batch_size),
-            self.model_config.max_model_len - max_dec_len,
-        )
+        if batch_size == 0:
+            # NOTE(ZKK): dividing by a batch_size of 0 is invalid, so fall back to input_length = 1.
+            input_length = 1
+        else:
+            input_length = min(
+                num_tokens // (1 if capture_prefill else batch_size),
+                self.model_config.max_model_len - max_dec_len,
+            )
 
         # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
         # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
@@ -2223,13 +2227,6 @@ class GPUModelRunner(ModelRunnerBase):
             for proc in self.sampling_metadata.logits_processors:
                 proc.update_state(self.share_inputs)
 
-        # NOTE(wufeisheng): If `not_need_stop`` is False, it means the current worker is in an idle state.
-        # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
-        # when there is data on other runner, the current runner is required to execute part of the model.
-        if not self.not_need_stop():
-            self._execute_empty_input(self.forward_meta)
-            return None
-
         # 2. Padding inputs for cuda graph
         self.padding_cudagraph_inputs()
 
@@ -2245,6 +2242,14 @@ class GPUModelRunner(ModelRunnerBase):
             self.forward_meta.ids_remove_padding,
             self.forward_meta,
         )
+
+        # NOTE(wufeisheng): If `not_need_stop` is False, the current worker is in an idle state.
+        # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
+        # when there is data on another runner, the current runner is still required to execute part of
+        # the model, but it does not need to run the code below.
+        if not self.not_need_stop():
+            return None
+
         if self.use_cudagraph:
            model_output = model_output[: self.real_token_num]
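
A minimal standalone sketch of the batch_size == 0 guard added to the dummy
run above. compute_input_length is a hypothetical helper and the numbers are
made up; it only mirrors the patched arithmetic, it is not FastDeploy code:

    # Hypothetical repro of the guard, not the FastDeploy API.
    def compute_input_length(num_tokens: int, batch_size: int,
                             max_model_len: int, max_dec_len: int,
                             capture_prefill: bool = False) -> int:
        if batch_size == 0:
            # An idle EP rank now runs the model on an empty batch;
            # num_tokens // batch_size would raise ZeroDivisionError,
            # so the patch falls back to an input_length of 1.
            return 1
        return min(
            num_tokens // (1 if capture_prefill else batch_size),
            max_model_len - max_dec_len,
        )

    print(compute_input_length(0, 0, 8192, 33))     # 1   (idle EP rank)
    print(compute_input_length(1024, 4, 8192, 33))  # 256 (normal decode capture)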
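The early-return move can be summarized with an illustrative control-flow
sketch; Worker and its methods are placeholders standing in for
GPUModelRunner internals, not the real API:

    class Worker:
        def __init__(self, has_work: bool):
            self.has_work = has_work

        def not_need_stop(self) -> bool:
            # False means this worker currently has no requests of its own.
            return self.has_work

        def run_model(self) -> str:
            # Under EP this forward pass joins the expert-parallel
            # collectives even when the local batch is empty.
            return "model_output"

    def execute_model(worker: Worker):
        # After this patch every worker runs the model first ...
        output = worker.run_model()
        if not worker.not_need_stop():
            return None   # ... and an idle EP rank skips only the rest.
        return output

    print(execute_model(Worker(has_work=False)))  # None
    print(execute_model(Worker(has_work=True)))   # model_output

Prepending 0 to cudagraph_capture_sizes in config.py fits the same change:
presumably it makes the batch-size-0 forward pass of an idle EP rank a shape
that the cudagraph machinery also covers.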