From 94c57e4175aad618bcc5e1ea644b9fcffd9140a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com>
Date: Fri, 5 Dec 2025 20:19:01 +0800
Subject: [PATCH] [BugFix] Remove _execute_empty_input (#5396)

In EP (Expert Parallelism) mode an idle worker must still execute part of
the model whenever other runners have data, so the dedicated
_execute_empty_input path is removed. Instead, 0 is registered as a
cudagraph capture size under EP, the dummy run guards input_length against
a batch_size of 0, and the idle-worker early return is moved to after
model execution.
---
 fastdeploy/config.py                  |  3 +++
 fastdeploy/worker/gpu_model_runner.py | 27 ++++++++++++++++-----------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index f1eb23852..a820e8d94 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1577,6 +1577,9 @@ class FDConfig:
         self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
         self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)
 
+        if self.parallel_config.use_ep:
+            self.graph_opt_config.cudagraph_capture_sizes = [0] + self.graph_opt_config.cudagraph_capture_sizes
+
         self.tokenizer = tokenizer
         self.ips = ips
         self.tool_parser = tool_parser
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 94c7a0b3f..9b550f104 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1020,10 +1020,14 @@ class GPUModelRunner(ModelRunnerBase):
         """
         # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
         max_dec_len = expected_decode_len + 1
-        input_length = min(
-            num_tokens // (1 if capture_prefill else batch_size),
-            self.model_config.max_model_len - max_dec_len,
-        )
+        if batch_size == 0:
+            # NOTE(ZKK): dividing by a batch_size of 0 is invalid, so fall back to input_length = 1.
+            input_length = 1
+        else:
+            input_length = min(
+                num_tokens // (1 if capture_prefill else batch_size),
+                self.model_config.max_model_len - max_dec_len,
+            )
 
         # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
         # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
@@ -2223,13 +2227,6 @@ class GPUModelRunner(ModelRunnerBase):
             for proc in self.sampling_metadata.logits_processors:
                 proc.update_state(self.share_inputs)
 
-        # NOTE(wufeisheng): If `not_need_stop`` is False, it means the current worker is in an idle state.
-        # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
-        # when there is data on other runner, the current runner is required to execute part of the model.
-        if not self.not_need_stop():
-            self._execute_empty_input(self.forward_meta)
-            return None
-
         # 2. Padding inputs for cuda graph
         self.padding_cudagraph_inputs()
 
@@ -2245,6 +2242,14 @@ class GPUModelRunner(ModelRunnerBase):
             self.forward_meta.ids_remove_padding,
             self.forward_meta,
         )
+
+        # NOTE(wufeisheng): If `not_need_stop` is False, the current worker is in an idle state.
+        # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
+        # when there is data on another runner, the current runner is still required to execute part of
+        # the model, but it does not need to run the code below.
+        if not self.not_need_stop():
+            return None
+
         if self.use_cudagraph:
            model_output = model_output[: self.real_token_num]
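
A minimal standalone sketch of the batch_size == 0 guard added to the dummy
run above. compute_input_length is a hypothetical helper and the numbers are
made up; it only mirrors the patched arithmetic, it is not FastDeploy code:

    # Hypothetical repro of the guard, not the FastDeploy API.
    def compute_input_length(num_tokens: int, batch_size: int,
                             max_model_len: int, max_dec_len: int,
                             capture_prefill: bool = False) -> int:
        if batch_size == 0:
            # An idle EP rank now runs the model on an empty batch;
            # num_tokens // batch_size would raise ZeroDivisionError,
            # so the patch falls back to an input_length of 1.
            return 1
        return min(
            num_tokens // (1 if capture_prefill else batch_size),
            max_model_len - max_dec_len,
        )

    print(compute_input_length(0, 0, 8192, 33))     # 1   (idle EP rank)
    print(compute_input_length(1024, 4, 8192, 33))  # 256 (normal decode capture)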
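The early-return move can be summarized with an illustrative control-flow
sketch; Worker and its methods are placeholders standing in for
GPUModelRunner internals, not the real API:

    class Worker:
        def __init__(self, has_work: bool):
            self.has_work = has_work

        def not_need_stop(self) -> bool:
            # False means this worker currently has no requests of its own.
            return self.has_work

        def run_model(self) -> str:
            # Under EP this forward pass joins the expert-parallel
            # collectives even when the local batch is empty.
            return "model_output"

    def execute_model(worker: Worker):
        # After this patch every worker runs the model first ...
        output = worker.run_model()
        if not worker.not_need_stop():
            return None   # ... and an idle EP rank skips only the rest.
        return output

    print(execute_model(Worker(has_work=False)))  # None
    print(execute_model(Worker(has_work=True)))   # model_output

Prepending 0 to cudagraph_capture_sizes in config.py fits the same change:
presumably it makes the batch-size-0 forward pass of an idle EP rank a shape
that the cudagraph machinery also covers.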