[xpu] support mtp for xpu(mix) (#5274)

* [XPU] support kernel for mtp(base)

* [XPU] support kernel for mtp(base)

* format

* format

* format

* fix gather next token

* fix step && add test

* fix

* mv pre/post process

* add adjust batch / gather next token for mtp

* fix code style

* fix mtp kernel name

* fix mtp kernel test

* mv xpu pre/post process

* mv xpu pre/post process

* [xpu] support mtp

* fix code style
This commit is contained in:
cmcamdy
2025-12-01 11:03:14 +08:00
committed by GitHub
parent 8aec3acc8c
commit 9f4977eb74
8 changed files with 691 additions and 106 deletions

View File

@@ -182,24 +182,28 @@ def apply_speculative_penalty_multi_scores(
from fastdeploy.model_executor.ops.gpu import (
speculate_get_token_penalty_multi_scores,
)
speculate_get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
presence_penalties,
temperature,
bad_words_token_ids,
step_idx,
min_dec_lens,
eos_token_ids,
seq_lens_this_time,
output_padding_offset,
output_cum_offsets,
max_len,
elif current_platform.is_xpu():
from fastdeploy.model_executor.ops.xpu import (
speculate_get_token_penalty_multi_scores,
)
else:
raise NotImplementedError
speculate_get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
presence_penalties,
temperature,
bad_words_token_ids,
step_idx,
min_dec_lens,
eos_token_ids,
seq_lens_this_time,
output_padding_offset,
output_cum_offsets,
max_len,
)
# inplace
return logits