Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00
[BugFix] Remove _execute_empty_input (#5396)
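Summary of the change, as the hunks below suggest: instead of routing idle Expert Parallelism (EP) workers through a dedicated `_execute_empty_input` pass, the config now also captures a batch-size-0 CUDA graph (`[0] + cudagraph_capture_sizes`), the dummy run guards against dividing by a zero batch size, and the idle-worker check in `GPUModelRunner` moves to after the model forward, so idle ranks still execute the model before returning.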
@@ -1577,6 +1577,9 @@ class FDConfig:
         self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
         self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)

+        if self.parallel_config.use_ep:
+            self.graph_opt_config.cudagraph_capture_sizes = [0] + self.graph_opt_config.cudagraph_capture_sizes
+
         self.tokenizer = tokenizer
         self.ips = ips
         self.tool_parser = tool_parser
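To make the intent of this hunk concrete, here is a minimal, self-contained sketch of the capture-size handling. `GraphOptConfig` and `init_capture_sizes` below are simplified stand-ins written for illustration, not FastDeploy's real classes, and the contiguous 1..N size list is an assumption:

# Hypothetical, simplified stand-in for the FDConfig capture-size logic;
# GraphOptConfig and the contiguous 1..N size list are assumptions made
# for illustration only.
from dataclasses import dataclass, field


@dataclass
class GraphOptConfig:
    cudagraph_capture_sizes: list = field(default_factory=list)


def init_capture_sizes(max_capture_size: int, use_ep: bool) -> GraphOptConfig:
    # Capture one graph per batch size up to the maximum.
    config = GraphOptConfig(cudagraph_capture_sizes=list(range(1, max_capture_size + 1)))
    if use_ep:
        # In Expert Parallelism an idle rank may be scheduled with zero
        # local requests, so a batch-size-0 graph is captured as well.
        config.cudagraph_capture_sizes = [0] + config.cudagraph_capture_sizes
    return config


print(init_capture_sizes(4, use_ep=True).cudagraph_capture_sizes)  # -> [0, 1, 2, 3, 4]

With size 0 in the list, an idle EP worker can replay a captured graph for an empty batch rather than taking a separate code path.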
@@ -1020,10 +1020,14 @@ class GPUModelRunner(ModelRunnerBase):
         """
         # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
         max_dec_len = expected_decode_len + 1
-        input_length = min(
-            num_tokens // (1 if capture_prefill else batch_size),
-            self.model_config.max_model_len - max_dec_len,
-        )
+        if batch_size == 0:
+            # NOTE(ZKK): dividing by 0 is invalid, so we use input_length = 1 here
+            input_length = 1
+        else:
+            input_length = min(
+                num_tokens // (1 if capture_prefill else batch_size),
+                self.model_config.max_model_len - max_dec_len,
+            )

         # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough, causing the result to appear as nan.
         # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
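The new guard is easy to verify in isolation. A standalone restatement of the same arithmetic; the function name and the sample values below are illustrative, not FastDeploy defaults:

def compute_input_length(
    num_tokens: int,
    batch_size: int,
    max_model_len: int,
    expected_decode_len: int,
    capture_prefill: bool = False,
) -> int:
    # Maximum decoding length: the expected decoded tokens plus the eos token.
    max_dec_len = expected_decode_len + 1
    if batch_size == 0:
        # A zero batch size would make the floor division below divide
        # by zero, so fall back to a dummy input length of 1.
        return 1
    return min(
        num_tokens // (1 if capture_prefill else batch_size),
        max_model_len - max_dec_len,
    )


assert compute_input_length(2048, 0, 4096, 32) == 1    # idle / empty batch
assert compute_input_length(2048, 8, 4096, 32) == 256  # 2048 // 8

The zero-batch case only arises because of the previous hunk: once 0 is a legal capture size, the dummy run used for graph capture can be invoked with batch_size == 0.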
@@ -2223,13 +2227,6 @@ class GPUModelRunner(ModelRunnerBase):
             for proc in self.sampling_metadata.logits_processors:
                 proc.update_state(self.share_inputs)

-        # NOTE(wufeisheng): If `not_need_stop` is False, it means the current worker is in an idle state.
-        # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
-        # when there is data on another runner, the current runner is required to execute part of the model.
-        if not self.not_need_stop():
-            self._execute_empty_input(self.forward_meta)
-            return None
-
         # 2. Padding inputs for cuda graph
         self.padding_cudagraph_inputs()
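Removing the early return here is only half of the change; together with the next hunk, the idle-worker check moves from before the model forward to after it, as shown in the sketch after the final hunk.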
@@ -2245,6 +2242,14 @@ class GPUModelRunner(ModelRunnerBase):
             self.forward_meta.ids_remove_padding,
             self.forward_meta,
         )
+
+        # NOTE(wufeisheng): If `not_need_stop` is False, it means the current worker is in an idle state.
+        # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
+        # when there is data on another runner, the current runner is required to execute part of the model,
+        # but it does not need to run the code below.
+        if not self.not_need_stop():
+            return None
+
         if self.use_cudagraph:
            model_output = model_output[: self.real_token_num]
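A condensed, hypothetical sketch of the control-flow change across the last two hunks; the stubs below stand in for the real model, scheduler, and CUDA graph machinery, and only `not_need_stop` is taken from the diff:

# Hypothetical, condensed view of the idle-worker path in
# GPUModelRunner.execute_model; the stubs are for illustration only.

class Runner:
    def __init__(self, has_local_work: bool):
        self.has_local_work = has_local_work

    def not_need_stop(self) -> bool:
        # True while this worker still has requests of its own to serve.
        return self.has_local_work

    def forward(self, batch_size: int) -> list:
        # Stand-in for the model forward. After this commit an idle EP
        # rank replays the batch-size-0 CUDA graph captured at startup,
        # so it still joins the collectives inside the model.
        return ["token"] * batch_size

    def execute_model(self, batch_size: int):
        # The forward now runs unconditionally (the old code returned
        # early here via _execute_empty_input for idle workers).
        model_output = self.forward(batch_size)
        if not self.not_need_stop():
            # Idle worker: skip only sampling and post-processing.
            return None
        return model_output


print(Runner(has_local_work=False).execute_model(0))  # -> None
print(Runner(has_local_work=True).execute_model(2))   # -> ['token', 'token']

The design point, per the NOTE in the diff: in EP mode an idle rank must still execute its part of the model when other ranks have data, and with a batch-size-0 graph available it can do so through the normal forward path, making the separate `_execute_empty_input` helper unnecessary.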