Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00
[BugFix] Remove _execute_empty_input (#5396)
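Summary of the change, as the hunks below suggest: instead of routing idle Expert Parallelism (EP) workers through a dedicated `_execute_empty_input` pass, the config now also captures a batch-size-0 CUDA graph (`[0] + cudagraph_capture_sizes`), the dummy run guards against dividing by a zero batch size, and the idle-worker check in `GPUModelRunner` moves to after the model forward, so idle ranks still execute the model before returning.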
@@ -1577,6 +1577,9 @@ class FDConfig:
         self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
         self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)

+        if self.parallel_config.use_ep:
+            self.graph_opt_config.cudagraph_capture_sizes = [0] + self.graph_opt_config.cudagraph_capture_sizes
+
         self.tokenizer = tokenizer
         self.ips = ips
         self.tool_parser = tool_parser
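To make the intent of this hunk concrete, here is a minimal, self-contained sketch of the capture-size handling. `GraphOptConfig` and `init_capture_sizes` below are simplified stand-ins written for illustration, not FastDeploy's real classes, and the contiguous 1..N size list is an assumption:

# Hypothetical, simplified stand-in for the FDConfig capture-size logic;
# GraphOptConfig and the contiguous 1..N size list are assumptions made
# for illustration only.
from dataclasses import dataclass, field


@dataclass
class GraphOptConfig:
    cudagraph_capture_sizes: list = field(default_factory=list)


def init_capture_sizes(max_capture_size: int, use_ep: bool) -> GraphOptConfig:
    # Capture one graph per batch size up to the maximum.
    config = GraphOptConfig(cudagraph_capture_sizes=list(range(1, max_capture_size + 1)))
    if use_ep:
        # In Expert Parallelism an idle rank may be scheduled with zero
        # local requests, so a batch-size-0 graph is captured as well.
        config.cudagraph_capture_sizes = [0] + config.cudagraph_capture_sizes
    return config


print(init_capture_sizes(4, use_ep=True).cudagraph_capture_sizes)  # -> [0, 1, 2, 3, 4]

With size 0 in the list, an idle EP worker can replay a captured graph for an empty batch rather than taking a separate code path.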
@@ -1020,10 +1020,14 @@ class GPUModelRunner(ModelRunnerBase):
         """
         # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
         max_dec_len = expected_decode_len + 1
-        input_length = min(
-            num_tokens // (1 if capture_prefill else batch_size),
-            self.model_config.max_model_len - max_dec_len,
-        )
+        if batch_size == 0:
+            # NOTE(ZKK): dividing by 0 is invalid, so we use input_length = 1 here
+            input_length = 1
+        else:
+            input_length = min(
+                num_tokens // (1 if capture_prefill else batch_size),
+                self.model_config.max_model_len - max_dec_len,
+            )

         # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough, causing the result to appear as nan.
         # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
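The new guard is easy to verify in isolation. A standalone restatement of the same arithmetic; the function name and the sample values below are illustrative, not FastDeploy defaults:

def compute_input_length(
    num_tokens: int,
    batch_size: int,
    max_model_len: int,
    expected_decode_len: int,
    capture_prefill: bool = False,
) -> int:
    # Maximum decoding length: the expected decoded tokens plus the eos token.
    max_dec_len = expected_decode_len + 1
    if batch_size == 0:
        # A zero batch size would make the floor division below divide
        # by zero, so fall back to a dummy input length of 1.
        return 1
    return min(
        num_tokens // (1 if capture_prefill else batch_size),
        max_model_len - max_dec_len,
    )


assert compute_input_length(2048, 0, 4096, 32) == 1    # idle / empty batch
assert compute_input_length(2048, 8, 4096, 32) == 256  # 2048 // 8

The zero-batch case only arises because of the previous hunk: once 0 is a legal capture size, the dummy run used for graph capture can be invoked with batch_size == 0.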
@@ -2223,13 +2227,6 @@ class GPUModelRunner(ModelRunnerBase):
             for proc in self.sampling_metadata.logits_processors:
                 proc.update_state(self.share_inputs)

-        # NOTE(wufeisheng): If `not_need_stop` is False, it means the current worker is in an idle state.
-        # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
-        # when there is data on another runner, the current runner is required to execute part of the model.
-        if not self.not_need_stop():
-            self._execute_empty_input(self.forward_meta)
-            return None
-
         # 2. Padding inputs for cuda graph
         self.padding_cudagraph_inputs()
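Removing the early return here is only half of the change; together with the next hunk, the idle-worker check moves from before the model forward to after it, as shown in the sketch after the final hunk.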
@@ -2245,6 +2242,14 @@ class GPUModelRunner(ModelRunnerBase):
             self.forward_meta.ids_remove_padding,
             self.forward_meta,
         )
+
+        # NOTE(wufeisheng): If `not_need_stop` is False, it means the current worker is in an idle state.
+        # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
+        # when there is data on another runner, the current runner is required to execute part of the model,
+        # but it does not need to run the code below.
+        if not self.not_need_stop():
+            return None
+
         if self.use_cudagraph:
            model_output = model_output[: self.real_token_num]
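A condensed, hypothetical sketch of the control-flow change across the last two hunks; the stubs below stand in for the real model, scheduler, and CUDA graph machinery, and only `not_need_stop` is taken from the diff:

# Hypothetical, condensed view of the idle-worker path in
# GPUModelRunner.execute_model; the stubs are for illustration only.

class Runner:
    def __init__(self, has_local_work: bool):
        self.has_local_work = has_local_work

    def not_need_stop(self) -> bool:
        # True while this worker still has requests of its own to serve.
        return self.has_local_work

    def forward(self, batch_size: int) -> list:
        # Stand-in for the model forward. After this commit an idle EP
        # rank replays the batch-size-0 CUDA graph captured at startup,
        # so it still joins the collectives inside the model.
        return ["token"] * batch_size

    def execute_model(self, batch_size: int):
        # The forward now runs unconditionally (the old code returned
        # early here via _execute_empty_input for idle workers).
        model_output = self.forward(batch_size)
        if not self.not_need_stop():
            # Idle worker: skip only sampling and post-processing.
            return None
        return model_output


print(Runner(has_local_work=False).execute_model(0))  # -> None
print(Runner(has_local_work=True).execute_model(2))   # -> ['token', 'token']

The design point, per the NOTE in the diff: in EP mode an idle rank must still execute its part of the model when other ranks have data, and with a batch-size-0 graph available it can do so through the normal forward path, making the separate `_execute_empty_input` helper unnecessary.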