[MTP] Optimize MTP inference speed (#2840)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled

This commit is contained in:
freeliuzc
2025-07-14 19:50:22 +08:00
committed by GitHub
parent 4c7b8bc458
commit 7cdd8d290d
6 changed files with 253 additions and 24 deletions

View File

@@ -123,7 +123,7 @@ def apply_speculative_penalty_multi_scores(
from fastdeploy.model_executor.ops.gpu import \
speculate_get_token_penalty_multi_scores
logits = speculate_get_token_penalty_multi_scores(
speculate_get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
@@ -141,5 +141,5 @@ def apply_speculative_penalty_multi_scores(
)
else:
raise NotImplementedError()
# In-place: the penalty op above is expected to update `logits` directly, so the same tensor is returned.
return logits

View File

@@ -101,6 +101,8 @@ def pre_process(
seq_lens_encoder,
seq_lens_decoder,
)
if isinstance(seq_lens_output, list):
seq_lens_output = seq_lens_output[0]
output_token_num = paddle.sum(seq_lens_output)
output_cum_offsets_tmp = paddle.cumsum(max_len - seq_lens_output)
output_padding_offset, output_cum_offsets = speculate_get_output_padding_offset(