[XPU] refine fused moe (#4219)

2025-12-24 13:28:13 +08:00 · 2025-10-16 19:04:07 +08:00
parent 3bbe99eae7
commit 26ff2f8683
7 changed files with 354 additions and 585 deletions
--- a/custom_ops/xpu_ops/src/ops/moe_layer.cc
+++ b/custom_ops/xpu_ops/src/ops/moe_layer.cc
@@ -228,8 +228,9 @@ MoeLayer(const paddle::Tensor &x, const paddle::Tensor &gate_weight,
               quant_method == "weight_only_int4") {
        APPLY_MOE_LAYER_KERNEL(paddle::bfloat16, int4_t);
    } else {
-        PD_THROW("MoeLayer not support x_type==%d, w_type==%d",
-                 static_cast<int>(x_type), static_cast<int>(w_type));
+        PD_THROW("MoeLayer not support x_type=", static_cast<int>(x_type),
+                 ", w_type=", static_cast<int>(w_type),
+                 ", quant_method=", quant_method);
        return {};
    }
 #undef APPLY_MOE_LAYER_KERNEL
--- a/fastdeploy/model_executor/layers/backends/xpu/init.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/init.py
@@ -16,16 +16,11 @@
 xpu backend methods
 """

-from .moe.fused_moe import (
-    XPUMoEMethod,
-    XPUWeightOnlyMoeEpMethod,
-    XPUWeightOnlyMoEMethod,
-)
+from .moe.fused_moe import XPUMoEMethod, XPUWeightOnlyMoEMethod
 from .quantization.weight_only import XPUWeightOnlyLinearMethod

 __all__ = [
    "XPUWeightOnlyLinearMethod",
    "XPUMoEMethod",
    "XPUWeightOnlyMoEMethod",
-    "XPUWeightOnlyMoeEpMethod",
 ]
--- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
--- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
@@ -30,15 +30,22 @@ class MoEMethodBase(QuantMethodBase):

    def __init__(self, quant_config):
        super().__init__()
-        if quant_config is None:
+        self.quant_config = quant_config
+        if self.quant_config is None:
            self.moe_quant_type = "w16a16"
+        elif hasattr(quant_config, "algo"):
+            self.moe_quant_type = quant_config.algo
        else:
-            self.quant_config = quant_config
+            self.moe_quant_type = quant_config.name()
        self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
        self.added_scale_attrs = [
            "up_gate_proj_weight_scale",
            "down_proj_weight_scale",
        ]
+        self.added_in_scale_attrs = [
+            "up_gate_proj_in_scale",
+            "down_proj_in_scale",
+        ]
        self.pack_num = 1
        self.ep_prefill_runner = None
        self.ep_decoder_runner = None
--- a/fastdeploy/model_executor/layers/quantization/weight_only.py
+++ b/fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -79,18 +79,11 @@ class WeightOnlyConfig(QuantConfigBase):
    def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
        if current_platform.is_xpu():
            if isinstance(layer, FusedMoE):
-                if layer.ep_size > 1:
-                    from fastdeploy.model_executor.layers.backends import (
-                        XPUWeightOnlyMoeEpMethod,
-                    )
+                from fastdeploy.model_executor.layers.backends import (
+                    XPUWeightOnlyMoEMethod,
+                )

-                    return XPUWeightOnlyMoeEpMethod(self)
-                else:
-                    from fastdeploy.model_executor.layers.backends import (
-                        XPUWeightOnlyMoEMethod,
-                    )
-
-                    return XPUWeightOnlyMoEMethod(self)
+                return XPUWeightOnlyMoEMethod(self)
            else:
                from fastdeploy.model_executor.layers.backends import (
                    XPUWeightOnlyLinearMethod,
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -404,9 +404,9 @@ class PaddleDisWorkerProc:

            if num_blocks_local <= 0:
                raise ValueError(
-                    "The total number of blocks cannot be less than zero."
-                    "Please increase gpu_memory_utilization"
-                    "Or decrease max_num_batched_tokens(max model length) "
+                    "The total number of blocks must be greater than zero. "
+                    "Please increase gpu_memory_utilization "
+                    "or decrease max_num_batched_tokens(max model length)."
                )

            if self.ranks > 1:
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -1227,7 +1227,8 @@ class XPUModelRunner(ModelRunnerBase):
        """
        Clear the block tables and kv cache after profiling.
        """
-        del self.share_inputs["caches"]
+        if "caches" in self.share_inputs:  # key lookup; hasattr() never sees mapping keys
+            del self.share_inputs["caches"]
        if self.forward_meta is not None:
            del self.forward_meta.caches
        paddle.device.xpu.empty_cache()