[Iluvatar GPU] Optimize attention and moe performance (#3234)

2025-10-27 02:20:31 +08:00 · 2025-08-08 10:51:24 +08:00
parent 37569cca86
commit fbdd6b0663
24 changed files with 1130 additions and 1653 deletions
--- a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
+++ b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
@@ -128,10 +128,16 @@ def rejection_top_p_sampling(
    rejection_top_p_sampling
    """
    try:
-        from fastdeploy.model_executor.ops.gpu import (
-            rejection_top_p_sampling,
-            top_k_renorm_probs,
-        )
+        if current_platform.is_iluvatar():
+            from fastdeploy.model_executor.ops.iluvatar import (
+                rejection_top_p_sampling,
+                top_k_renorm_probs,
+            )
+        else:
+            from fastdeploy.model_executor.ops.gpu import (
+                rejection_top_p_sampling,
+                top_k_renorm_probs,
+            )

        if paddle.count_nonzero(top_k) == 0:
            ids = rejection_top_p_sampling(