[Metax] refactor cutlass moe and optimize flash attention (#5361)

* [Metax] refactor moe and flash attention backend
---------

Co-authored-by: zhangchenyi_dl <16219492+zhangchenyidl@user.noreply.gitee.com>
Author: Neil Zhu
Date: 2025-12-10 17:15:17 +08:00
Committed by: GitHub
parent fbc9bce1e9
commit 4403a21d4b
19 changed files with 3087 additions and 1727 deletions
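The hunk below shows the core of the sampling-path change: the MACA branch previously sat in its own `elif` but called `set_stop_value_multi_ends` with the same arguments as the CUDA, Iluvatar, and DCU paths, so the commit folds it into one shared condition. Here is a minimal runnable sketch of that consolidation pattern; the `Platform` class and the string results are stand-ins for illustration, not the project's real types, and only the branch structure mirrors the diff:

```python
# Sketch of the platform-dispatch consolidation in the diff below.
# `Platform` and the returned strings are hypothetical stand-ins.


class Platform:
    def __init__(self, name: str) -> None:
        self.name = name

    def is_cuda(self) -> bool:
        return self.name == "cuda"

    def is_iluvatar(self) -> bool:
        return self.name == "iluvatar"

    def is_dcu(self) -> bool:
        return self.name == "dcu"

    def is_maca(self) -> bool:
        return self.name == "maca"


def pick_stop_handler(platform: Platform) -> str:
    # Before this commit, `is_maca()` had its own `elif` that invoked
    # the same kernel with identical arguments; merging it into the
    # shared condition removes the duplicated call site.
    if (
        platform.is_cuda()
        or platform.is_iluvatar()
        or platform.is_dcu()
        or platform.is_maca()
    ):
        return "set_stop_value_multi_ends"
    return "platform-specific fallback"


# MACA now takes exactly the same path as CUDA.
assert pick_stop_handler(Platform("maca")) == pick_stop_handler(Platform("cuda"))
```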


@@ -344,21 +344,12 @@ def post_process_normal(
         model_output.stop_flags,
     )
-    if current_platform.is_cuda() or current_platform.is_iluvatar() or current_platform.is_dcu():
-        set_stop_value_multi_ends(
-            sampler_output.sampled_token_ids,
-            model_output.stop_flags,
-            model_output.seq_lens_this_time,
-            model_output.eos_token_id,
-            model_output.next_tokens,
-            model_output.pre_ids,
-            model_output.step_idx,
-            model_output.stop_token_ids,
-            model_output.stop_seqs_len,
-            model_output.min_tokens,
-            False,
-        )  # multi ends
-    elif current_platform.is_maca():
+    if (
+        current_platform.is_cuda()
+        or current_platform.is_iluvatar()
+        or current_platform.is_dcu()
+        or current_platform.is_maca()
+    ):
         set_stop_value_multi_ends(
             sampler_output.sampled_token_ids,
             model_output.stop_flags,