[Metax] refactor cutlass moe and optimize flash attention (#5361)

* [Metax] refactor moe and flash attention backend
---------

Co-authored-by: zhangchenyi_dl <16219492+zhangchenyidl@user.noreply.gitee.com>
Author: Neil Zhu
Date: 2025-12-10 17:15:17 +08:00
Committed by: GitHub
parent fbc9bce1e9
commit 4403a21d4b
19 changed files with 3087 additions and 1727 deletions
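The hunk below shows the core of the sampling-path change: the MACA branch previously sat in its own `elif` but called `set_stop_value_multi_ends` with the same arguments as the CUDA, Iluvatar, and DCU paths, so the commit folds it into one shared condition. Here is a minimal runnable sketch of that consolidation pattern; the `Platform` class and the string results are stand-ins for illustration, not the project's real types, and only the branch structure mirrors the diff:

```python
# Sketch of the platform-dispatch consolidation in the diff below.
# `Platform` and the returned strings are hypothetical stand-ins.


class Platform:
    def __init__(self, name: str) -> None:
        self.name = name

    def is_cuda(self) -> bool:
        return self.name == "cuda"

    def is_iluvatar(self) -> bool:
        return self.name == "iluvatar"

    def is_dcu(self) -> bool:
        return self.name == "dcu"

    def is_maca(self) -> bool:
        return self.name == "maca"


def pick_stop_handler(platform: Platform) -> str:
    # Before this commit, `is_maca()` had its own `elif` that invoked
    # the same kernel with identical arguments; merging it into the
    # shared condition removes the duplicated call site.
    if (
        platform.is_cuda()
        or platform.is_iluvatar()
        or platform.is_dcu()
        or platform.is_maca()
    ):
        return "set_stop_value_multi_ends"
    return "platform-specific fallback"


# MACA now takes exactly the same path as CUDA.
assert pick_stop_handler(Platform("maca")) == pick_stop_handler(Platform("cuda"))
```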


@@ -344,21 +344,12 @@ def post_process_normal(
         model_output.stop_flags,
     )
-    if current_platform.is_cuda() or current_platform.is_iluvatar() or current_platform.is_dcu():
-        set_stop_value_multi_ends(
-            sampler_output.sampled_token_ids,
-            model_output.stop_flags,
-            model_output.seq_lens_this_time,
-            model_output.eos_token_id,
-            model_output.next_tokens,
-            model_output.pre_ids,
-            model_output.step_idx,
-            model_output.stop_token_ids,
-            model_output.stop_seqs_len,
-            model_output.min_tokens,
-            False,
-        )  # multi ends
-    elif current_platform.is_maca():
+    if (
+        current_platform.is_cuda()
+        or current_platform.is_iluvatar()
+        or current_platform.is_dcu()
+        or current_platform.is_maca()
+    ):
         set_stop_value_multi_ends(
             sampler_output.sampled_token_ids,
             model_output.stop_flags,