diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py
index d6108cffc..2a1d3c56f 100644
--- a/fastdeploy/model_executor/forward_meta.py
+++ b/fastdeploy/model_executor/forward_meta.py
@@ -85,8 +85,6 @@ class ForwardMeta:
     # The sequence length processed in the current step
     seq_lens_this_time: Optional[paddle.Tensor] = None
 
-    # Accumulated offset
-    cum_offsets: Optional[paddle.Tensor] = None
     # batch_id_per_token tensor, used to indicate which token belongs which batch after padding removal to the original input_ids
     batch_id_per_token: Optional[paddle.Tensor] = None
     # Accumulated sequence length of query
@@ -112,7 +110,8 @@ class XPUForwardMeta(ForwardMeta):
     """
     XPUForwardMeta is used to store the global meta information of the forward, and some XPU specific meta info.
    """
-
+    # Accumulated offset
+    cum_offsets: Optional[paddle.Tensor] = None
     # TODO(wanghaitao): Supplementary notes
 
     # encoder_batch_map: Optional[paddle.Tensor] = None
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index c2a5d0c4b..c3c559832 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -397,7 +396,6 @@ class MTPProposer(Proposer):
             seq_lens_encoder=self.model_inputs["seq_lens_encoder"],
             seq_lens_decoder=self.model_inputs["seq_lens_decoder"],
             seq_lens_this_time=self.model_inputs["seq_lens_this_time"],
-            cum_offsets=self.model_inputs["cum_offsets"],
             batch_id_per_token=self.model_inputs["batch_id_per_token"],
             cu_seqlens_q=self.model_inputs["cu_seqlens_q"],
             cu_seqlens_k=self.model_inputs["cu_seqlens_k"],
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 6ce285081..b3c046e1d 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -680,7 +680,6 @@ class GPUModelRunner(ModelRunnerBase):
             seq_lens_encoder=self.share_inputs["seq_lens_encoder"],
             seq_lens_decoder=self.share_inputs["seq_lens_decoder"],
             seq_lens_this_time=self.share_inputs["seq_lens_this_time"],
-            cum_offsets=self.share_inputs["cum_offsets"],
             batch_id_per_token=self.share_inputs["batch_id_per_token"],
             cu_seqlens_q=self.share_inputs["cu_seqlens_q"],
             cu_seqlens_k=self.share_inputs["cu_seqlens_k"],