mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Metax] refactor cutlass moe and optimize flash attention (#5361)
* [Metax] refactor moe and flash attention backend

---------

Co-authored-by: zhangchenyi_dl <16219492+zhangchenyidl@user.noreply.gitee.com>
This commit is contained in:
@@ -344,21 +344,12 @@ def post_process_normal(
|
||||
model_output.stop_flags,
|
||||
)
|
||||
|
||||
if current_platform.is_cuda() or current_platform.is_iluvatar() or current_platform.is_dcu():
|
||||
set_stop_value_multi_ends(
|
||||
sampler_output.sampled_token_ids,
|
||||
model_output.stop_flags,
|
||||
model_output.seq_lens_this_time,
|
||||
model_output.eos_token_id,
|
||||
model_output.next_tokens,
|
||||
model_output.pre_ids,
|
||||
model_output.step_idx,
|
||||
model_output.stop_token_ids,
|
||||
model_output.stop_seqs_len,
|
||||
model_output.min_tokens,
|
||||
False,
|
||||
) # multi ends
|
||||
elif current_platform.is_maca():
|
||||
if (
|
||||
current_platform.is_cuda()
|
||||
or current_platform.is_iluvatar()
|
||||
or current_platform.is_dcu()
|
||||
or current_platform.is_maca()
|
||||
):
|
||||
set_stop_value_multi_ends(
|
||||
sampler_output.sampled_token_ids,
|
||||
model_output.stop_flags,
|
||||
|
||||
Reference in New Issue
Block a user