mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
delete max-len (#2959)
This commit is contained in:
@@ -72,7 +72,6 @@ DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1"
|
|||||||
|
|
||||||
|
|
||||||
def pre_process(
|
def pre_process(
|
||||||
max_len: int,
|
|
||||||
input_ids: paddle.Tensor,
|
input_ids: paddle.Tensor,
|
||||||
seq_lens_this_time: int,
|
seq_lens_this_time: int,
|
||||||
speculative_decoding: bool,
|
speculative_decoding: bool,
|
||||||
@@ -83,7 +82,6 @@ def pre_process(
|
|||||||
"""
|
"""
|
||||||
Preprocessing before embedding.
|
Preprocessing before embedding.
|
||||||
Args:
|
Args:
|
||||||
max_len:
|
|
||||||
input_ids:
|
input_ids:
|
||||||
seq_lens_this_time:
|
seq_lens_this_time:
|
||||||
speculative_decoding:
|
speculative_decoding:
|
||||||
@@ -97,6 +95,7 @@ def pre_process(
|
|||||||
cu_seqlens_k:
|
cu_seqlens_k:
|
||||||
"""
|
"""
|
||||||
# Remove padding
|
# Remove padding
|
||||||
|
max_len = input_ids.shape[1]
|
||||||
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
|
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
|
||||||
token_num = paddle.sum(seq_lens_this_time)
|
token_num = paddle.sum(seq_lens_this_time)
|
||||||
output_padding_offset = None
|
output_padding_offset = None
|
||||||
@@ -490,6 +489,7 @@ def rebuild_padding(
|
|||||||
)
|
)
|
||||||
elif current_platform.is_dcu():
|
elif current_platform.is_dcu():
|
||||||
from fastdeploy.model_executor.ops.gpu import rebuild_padding
|
from fastdeploy.model_executor.ops.gpu import rebuild_padding
|
||||||
|
|
||||||
hidden_states = rebuild_padding(
|
hidden_states = rebuild_padding(
|
||||||
tmp_out,
|
tmp_out,
|
||||||
cum_offsets,
|
cum_offsets,
|
||||||
|
@@ -502,7 +502,6 @@ class MTPProposer(Proposer):
|
|||||||
output_cum_offsets,
|
output_cum_offsets,
|
||||||
output_padding_offset,
|
output_padding_offset,
|
||||||
) = pre_process(
|
) = pre_process(
|
||||||
self.parallel_config.max_model_len,
|
|
||||||
self.model_inputs["input_ids"],
|
self.model_inputs["input_ids"],
|
||||||
self.model_inputs["seq_lens_this_time"],
|
self.model_inputs["seq_lens_this_time"],
|
||||||
True,
|
True,
|
||||||
|
@@ -449,7 +449,6 @@ class GCUModelRunner(ModelRunnerBase):
|
|||||||
output_cum_offsets,
|
output_cum_offsets,
|
||||||
output_padding_offset,
|
output_padding_offset,
|
||||||
) = pre_process(
|
) = pre_process(
|
||||||
self.parallel_config.max_model_len,
|
|
||||||
self.share_inputs["input_ids"],
|
self.share_inputs["input_ids"],
|
||||||
self.share_inputs["seq_lens_this_time"],
|
self.share_inputs["seq_lens_this_time"],
|
||||||
self.speculative_decoding,
|
self.speculative_decoding,
|
||||||
|
@@ -601,7 +601,6 @@ class GPUModelRunner(ModelRunnerBase):
|
|||||||
output_cum_offsets,
|
output_cum_offsets,
|
||||||
output_padding_offset,
|
output_padding_offset,
|
||||||
) = pre_process(
|
) = pre_process(
|
||||||
self.parallel_config.max_model_len,
|
|
||||||
self.share_inputs["input_ids"],
|
self.share_inputs["input_ids"],
|
||||||
self.share_inputs["seq_lens_this_time"],
|
self.share_inputs["seq_lens_this_time"],
|
||||||
self.speculative_decoding,
|
self.speculative_decoding,
|
||||||
|
@@ -41,7 +41,6 @@ logger = get_logger("xpu_model_runner", "xpu_model_runner.log")
|
|||||||
|
|
||||||
|
|
||||||
def xpu_pre_process(
|
def xpu_pre_process(
|
||||||
max_len: int,
|
|
||||||
input_ids: paddle.Tensor,
|
input_ids: paddle.Tensor,
|
||||||
seq_lens_this_time: int,
|
seq_lens_this_time: int,
|
||||||
share_inputs: Dict,
|
share_inputs: Dict,
|
||||||
@@ -51,6 +50,7 @@ def xpu_pre_process(
|
|||||||
seq_lens_decoder: Optional[paddle.Tensor] = None,
|
seq_lens_decoder: Optional[paddle.Tensor] = None,
|
||||||
) -> XPUForwardMeta:
|
) -> XPUForwardMeta:
|
||||||
""" """
|
""" """
|
||||||
|
max_len = input_ids.shape[1]
|
||||||
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
|
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
|
||||||
token_num = paddle.sum(seq_lens_this_time)
|
token_num = paddle.sum(seq_lens_this_time)
|
||||||
from fastdeploy.model_executor.ops.xpu import (
|
from fastdeploy.model_executor.ops.xpu import (
|
||||||
@@ -458,7 +458,6 @@ class XPUModelRunner(ModelRunnerBase):
|
|||||||
def _prepare_inputs(self) -> None:
|
def _prepare_inputs(self) -> None:
|
||||||
"""prepare the model inputs"""
|
"""prepare the model inputs"""
|
||||||
self.forward_meta = xpu_pre_process(
|
self.forward_meta = xpu_pre_process(
|
||||||
self.parallel_config.max_model_len,
|
|
||||||
self.share_inputs["input_ids"],
|
self.share_inputs["input_ids"],
|
||||||
self.share_inputs["seq_lens_this_time"],
|
self.share_inputs["seq_lens_this_time"],
|
||||||
self.share_inputs,
|
self.share_inputs,
|
||||||
|
Reference in New Issue
Block a user