[XPU] Supports BF16 for ERNIE-4.5-21B-A3B and ERNIE-4.5-0.3B (#2765)

* fix no quant xpu moe
* change dir of xpu moe weight only
2025-11-03 02:53:26 +08:00 · 2025-07-09 15:57:51 +08:00
parent 771e71a24d
commit be21ef5047
5 changed files with 234 additions and 117 deletions
--- a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py
@@ -13,14 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-
-from typing import Dict
-
 import paddle
 from paddle import nn

-from fastdeploy.model_executor.layers.quantization.quant_base import \
-    QuantMethodBase
 from fastdeploy.model_executor.layers.quantization.weight_only import (
    WeightOnlyConfig, WeightOnlyLinearMethod)
 from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu
@@ -63,103 +58,3 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
        layer.linear_weight.set_value(
            paddle.transpose(quanted_weight_tensor, [1, 0]))
        layer.linear_weight_scale.set_value(weight_scale_tensor)
-
-
-class XPUWeightOnlyMoEMethod(QuantMethodBase):
-    """
-    XPU Fused MoE Method.
-    """
-
-    def __init__(
-        self,
-        quant_config: WeightOnlyConfig,
-    ) -> None:
-        super().__init__()
-        self.quant_config = quant_config
-        self.moe_quant_type = self.quant_config.algo
-
-    def create_weights(self, layer: nn.Layer, state_dict: Dict[str,
-                                                               paddle.Tensor]):
-        """
-        Paddle cutlass create weight process.
-        """
-        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
-        assert len(ffn1_weights) == layer.num_local_experts
-        assert len(ffn2_weights) == layer.num_local_experts
-        assert ffn1_weights[0].shape == [
-            layer.hidden_size, layer.moe_intermediate_size * 2
-        ]
-        assert ffn2_weights[0].shape == [
-            layer.moe_intermediate_size, layer.hidden_size
-        ]
-
-        added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
-        added_scale_attrs = ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]
-
-        for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
-            weight_name = added_weight_attrs[idx]
-            scale_name = added_scale_attrs[idx]
-
-            weight_list = []
-            weight_scale_list = []
-            for i in range(layer.num_local_experts):
-                quant_weight, scale = weight_quantize_xpu(
-                    weight_tensor[i], self.moe_quant_type, -1,
-                    -1)  # weight is [k,n]
-                weight_list.append(quant_weight.transpose(
-                    [1, 0]))  # transpose weight to [n,k]
-                weight_scale_list.append(scale)
-            quanted_weight = paddle.stack(weight_list, axis=0)
-            setattr(
-                layer, weight_name,
-                layer.create_parameter(
-                    shape=quanted_weight.shape,
-                    dtype=quanted_weight.dtype,
-                    default_initializer=paddle.nn.initializer.Constant(0),
-                ))
-            getattr(layer, weight_name).set_value(quanted_weight)
-
-            quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
-            setattr(
-                layer, scale_name,
-                layer.create_parameter(
-                    shape=quanted_weight_scale.shape,
-                    dtype=quanted_weight_scale.dtype,
-                ))
-            getattr(layer, scale_name).set_value(quanted_weight_scale)
-
-    def apply(
-        self,
-        layer: nn.Layer,
-        x: paddle.Tensor,
-        gate_out: paddle.Tensor,
-    ) -> paddle.Tensor:
-        """
-        XPU compute Fused MoE.
-        """
-        from fastdeploy.model_executor.ops.xpu import xpu_moe_layer
-
-        fused_moe_out = xpu_moe_layer(
-            x,
-            layer.gate_weight.transpose([1, 0]),
-            layer.gate_correction_bias,
-            layer.moe_ffn1_weight,
-            layer.moe_ffn2_weight,
-            None,  # ffn1 bias
-            None,  # ffn2 bias
-            (layer.moe_ffn1_weight_scale
-             if hasattr(layer, "moe_ffn1_weight_scale") else None),
-            (layer.moe_ffn2_weight_scale
-             if hasattr(layer, "moe_ffn2_weight_scale") else None),
-            (layer.moe_ffn2_in_scale
-             if hasattr(layer, "moe_ffn2_in_scale") else None),
-            self.moe_quant_type,
-            layer.top_k,
-            False,  # moe group, used in deepseek
-        )
-        if layer.tp_size > 1:
-            from fastdeploy.distributed.communication_op import \
-                tensor_model_parallel_all_reduce
-            tensor_model_parallel_all_reduce(fused_moe_out)
-
-        return fused_moe_out