diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index 809f7d4f8..eba068e89 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -72,7 +72,6 @@ DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1"
 
 
 def pre_process(
-    max_len: int,
     input_ids: paddle.Tensor,
     seq_lens_this_time: int,
     speculative_decoding: bool,
@@ -83,7 +82,6 @@ def pre_process(
     """
     Preprocessing before embedding.
     Args:
-        max_len:
         input_ids:
         seq_lens_this_time:
         speculative_decoding:
@@ -97,6 +95,7 @@ def pre_process(
         cu_seqlens_k:
     """
     # Remove padding
+    max_len = input_ids.shape[1]
     cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
     token_num = paddle.sum(seq_lens_this_time)
     output_padding_offset = None
@@ -490,6 +489,7 @@ def rebuild_padding(
         )
     elif current_platform.is_dcu():
         from fastdeploy.model_executor.ops.gpu import rebuild_padding
+
         hidden_states = rebuild_padding(
             tmp_out,
             cum_offsets,
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 3acf7714d..a4748b2e5 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -502,7 +502,6 @@ class MTPProposer(Proposer):
             output_cum_offsets,
             output_padding_offset,
         ) = pre_process(
-            self.parallel_config.max_model_len,
             self.model_inputs["input_ids"],
             self.model_inputs["seq_lens_this_time"],
             True,
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 406ce53d9..751f45432 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -449,7 +449,6 @@ class GCUModelRunner(ModelRunnerBase):
             output_cum_offsets,
             output_padding_offset,
         ) = pre_process(
-            self.parallel_config.max_model_len,
             self.share_inputs["input_ids"],
             self.share_inputs["seq_lens_this_time"],
             self.speculative_decoding,
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 15dc7de77..c639c29ef 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -601,7 +601,6 @@ class GPUModelRunner(ModelRunnerBase):
             output_cum_offsets,
             output_padding_offset,
         ) = pre_process(
-            self.parallel_config.max_model_len,
             self.share_inputs["input_ids"],
             self.share_inputs["seq_lens_this_time"],
             self.speculative_decoding,
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 601d7f264..731990ff5 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -41,7 +41,6 @@ logger = get_logger("xpu_model_runner", "xpu_model_runner.log")
 
 
 def xpu_pre_process(
-    max_len: int,
     input_ids: paddle.Tensor,
     seq_lens_this_time: int,
     share_inputs: Dict,
@@ -51,6 +50,7 @@ def xpu_pre_process(
     seq_lens_decoder: Optional[paddle.Tensor] = None,
 ) -> XPUForwardMeta:
     """ """
+    max_len = input_ids.shape[1]
     cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
     token_num = paddle.sum(seq_lens_this_time)
     from fastdeploy.model_executor.ops.xpu import (
@@ -458,7 +458,6 @@ class XPUModelRunner(ModelRunnerBase):
     def _prepare_inputs(self) -> None:
         """prepare the model inputs"""
         self.forward_meta = xpu_pre_process(
-            self.parallel_config.max_model_len,
             self.share_inputs["input_ids"],
             self.share_inputs["seq_lens_this_time"],
             self.share_inputs,
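
For illustration, here is a minimal standalone sketch of the padding-removal arithmetic this patch touches. It is not FastDeploy code; the batch shape and sequence lengths are hypothetical. The one behavioral point it demonstrates is the substitution made across all five files: max_len is now derived from the actual padded width of input_ids rather than passed in as parallel_config.max_model_len.

# Toy reproduction of the offsets computed in pre_process / xpu_pre_process.
# Batch values are made up for illustration.
import paddle

# Hypothetical batch: 3 sequences padded to a common width of 6 tokens.
input_ids = paddle.zeros([3, 6], dtype="int64")
seq_lens_this_time = paddle.to_tensor([4, 1, 6], dtype="int32")

max_len = input_ids.shape[1]  # 6; previously parallel_config.max_model_len
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)  # padding tokens skipped so far
token_num = paddle.sum(seq_lens_this_time)  # real token count after un-padding

print(cum_offsets_now.numpy())  # [2 7 7]
print(token_num.numpy())        # 11

Taking max_len from input_ids.shape[1] keeps the cumulative offsets consistent with the tensor's actual padded width, whatever that width happens to be, instead of assuming sequences are padded out to the configured model maximum.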