[BugFix]remove _execute_empty_input (#5396)

This commit is contained in:
周周周
2025-12-05 20:19:01 +08:00
committed by GitHub
parent d4979347ca
commit 94c57e4175
2 changed files with 19 additions and 11 deletions

View File

@@ -1577,6 +1577,9 @@ class FDConfig:
self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)
if self.parallel_config.use_ep:
self.graph_opt_config.cudagraph_capture_sizes = [0] + self.graph_opt_config.cudagraph_capture_sizes
self.tokenizer = tokenizer
self.ips = ips
self.tool_parser = tool_parser

View File

@@ -1020,10 +1020,14 @@ class GPUModelRunner(ModelRunnerBase):
"""
# NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
max_dec_len = expected_decode_len + 1
input_length = min(
num_tokens // (1 if capture_prefill else batch_size),
self.model_config.max_model_len - max_dec_len,
)
if batch_size == 0:
# NOTE(ZKK): division by 0 is invalid, so here we use an input_length of 1
input_length = 1
else:
input_length = min(
num_tokens // (1 if capture_prefill else batch_size),
self.model_config.max_model_len - max_dec_len,
)
# NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
# TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
@@ -2223,13 +2227,6 @@ class GPUModelRunner(ModelRunnerBase):
for proc in self.sampling_metadata.logits_processors:
proc.update_state(self.share_inputs)
# NOTE(wufeisheng): If `not_need_stop` is False, it means the current worker is in an idle state.
# This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
# when there is data on other runner, the current runner is required to execute part of the model.
if not self.not_need_stop():
self._execute_empty_input(self.forward_meta)
return None
# 2. Padding inputs for cuda graph
self.padding_cudagraph_inputs()
@@ -2245,6 +2242,14 @@ class GPUModelRunner(ModelRunnerBase):
self.forward_meta.ids_remove_padding,
self.forward_meta,
)
# NOTE(wufeisheng): If `not_need_stop` is False, it means the current worker is in an idle state.
# This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
# when there is data on another runner, the current runner is required to execute part of the model,
# but does not need to run the code below.
if not self.not_need_stop():
return None
if self.use_cudagraph:
model_output = model_output[: self.real_token_num]