[XPU] Supports BF16 for ERNIE-4.5-21B-A3B and ERNIE-4.5-0.3B (#2765)

* fix the no-quant (unquantized) XPU MoE path

* change the directory of the XPU MoE weight-only code
Author: yulangz
Date: 2025-07-09 15:57:51 +08:00
Committed by: GitHub
Parent: 771e71a24d
Commit: be21ef5047
5 changed files with 234 additions and 117 deletions


@@ -20,9 +20,24 @@ from paddleformers.utils.log import logger
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.platforms import current_platform
+def get_moe_method():
+    """
+    return moe method based on device platform
+    """
+    from fastdeploy.platforms import current_platform
+    if current_platform.is_cuda():
+        from .fused_moe_cutlass_backend import CutlassMoEMethod
+        return CutlassMoEMethod(None)
+    elif current_platform.is_xpu():
+        from .fused_moe_xpu_backend import XPUMoEMethod
+        return XPUMoEMethod(None)
+    elif current_platform.is_gcu():
+        from fastdeploy.model_executor.layers.backends import GCUFusedMoeMethod
+        return GCUFusedMoeMethod(None)
+    raise NotImplementedError()
 class FusedMoE(nn.Layer):
     """
     FusedMoE is a layer that performs MoE (Mixture of Experts) computation.
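The helper above replaces the per-platform branching that each call site previously carried. A minimal usage sketch follows; the import path is an assumption (this diff view does not show the changed file names), so treat it as illustrative rather than the actual FastDeploy API surface.

# Sketch only: the module path below is assumed, and running it requires
# FastDeploy installed on a CUDA, XPU, or GCU device.
from fastdeploy.model_executor.layers.moe.fused_moe import get_moe_method  # assumed path

method = get_moe_method()
# CUDA -> CutlassMoEMethod, XPU -> XPUMoEMethod, GCU -> GCUFusedMoeMethod;
# any other platform raises NotImplementedError.
print(type(method).__name__)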
@@ -96,13 +111,7 @@ class FusedMoE(nn.Layer):
             self.moe_quant_type = moe_quant_config.name()
         else:
             # For now, the no-quant (w_fp16, a_fp16) method cannot be obtained from quant_config; this will be improved in the future.
-            if current_platform.is_cuda():
-                from .fused_moe_cutlass_backend import CutlassMoEMethod
-                self.quant_method = CutlassMoEMethod(None)
-            elif current_platform.is_gcu():
-                from fastdeploy.model_executor.layers.backends import \
-                    GCUFusedMoeMethod
-                self.quant_method = GCUFusedMoeMethod(None)
+            self.quant_method = get_moe_method()
         if self.ep_size > 1:
             self.quant_method.init_ep(self)
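For readers outside the FastDeploy tree, here is a dependency-free sketch of the same refactoring idea: one factory maps the detected platform to a backend object, so call sites such as FusedMoE.__init__ above keep a single line instead of repeating per-platform imports. Every name below is illustrative and not part of this PR.

from dataclasses import dataclass


@dataclass
class MoEMethod:
    backend: str


def detect_platform() -> str:
    # Stand-in for current_platform.is_cuda()/is_xpu()/is_gcu() checks.
    return "xpu"


def get_moe_method_sketch() -> MoEMethod:
    factories = {
        "cuda": lambda: MoEMethod("cutlass"),
        "xpu": lambda: MoEMethod("xpu"),
        "gcu": lambda: MoEMethod("gcu"),
    }
    platform = detect_platform()
    if platform not in factories:
        raise NotImplementedError(f"no MoE method for platform: {platform}")
    return factories[platform]()


if __name__ == "__main__":
    print(get_moe_method_sketch())  # MoEMethod(backend='xpu')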