commit 20839abccf
Author:       bukejiyu
Date:         2025-08-06 14:45:27 +08:00
Committed by: GitHub
Parent:       91dc87f1c5

30 changed files with 1361 additions and 1087 deletions


@@ -22,7 +22,9 @@ import paddle
 from paddle import nn
 from paddleformers.utils.log import logger
-from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMethodBase
+from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
+    UnquantizedFusedMoEMethod,
+)
 from fastdeploy.model_executor.layers.utils import (
     CpuGuard,
     create_and_set_parameter,

@@ -37,7 +39,7 @@ from fastdeploy.model_executor.ops.gcu import (
 )

-class GCUFusedMoeMethod(MoEMethodBase):
+class GCUFusedMoeMethod(UnquantizedFusedMoEMethod):
     """
     Use GCU to compute Fused MoE.
     """
@@ -46,28 +48,12 @@ class GCUFusedMoeMethod(MoEMethodBase):
         super().__init__(quant_config)
         self.group_size = -1

-    def create_weights(self, layer: nn.Layer, state_dict):
-        """
-        Paddle gcu create weight process.
-        """
-        # bf16
+    def process_loaded_weights(self, layer: nn.Layer, state_dict):
         up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
         stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
         stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)
-        for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]):
-            # shape [E, K, N] -> [E, N, K]
-            weight_tensor = paddle.transpose(weight_tensor, [0, 2, 1])
-            weight_name = self.added_weight_attrs[idx]
-            setattr(
-                layer,
-                weight_name,
-                layer.create_parameter(
-                    shape=weight_tensor.shape,
-                    dtype=weight_tensor.dtype,
-                    default_initializer=paddle.nn.initializer.Constant(0),
-                ),
-            )
-            getattr(layer, weight_name).set_value(weight_tensor)
+        layer.up_gate_proj_weight.set_value(paddle.transpose(stacked_up_gate_proj_weights, [0, 2, 1]))
+        layer.down_proj_weight.set_value(paddle.transpose(stacked_down_proj_weights, [0, 2, 1]))

     @paddle.no_grad()
     def compute_ffn(
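
The removed loop and the new two-liner are equivalent: stack the per-expert [K, N] weights and swap the last two axes so the GCU kernels see [E, N, K]. A quick shape check with illustrative sizes:

import paddle

E, K, N = 8, 1024, 4096                          # illustrative expert count and dims
experts = [paddle.zeros([K, N]) for _ in range(E)]
stacked = paddle.stack(experts, axis=0)          # [E, K, N]
swapped = paddle.transpose(stacked, [0, 2, 1])   # permute last two dims -> [E, N, K]
print(stacked.shape, swapped.shape)              # [8, 1024, 4096] [8, 4096, 1024]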
@@ -202,18 +188,19 @@ class GCUFusedMoeMethod(MoEMethodBase):
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Paddle gcu compute Fused MoE.
         """
+        gate_out = gate(x.cast("float32"))
         return self.compute_ffn(layer, x, gate_out, enable_quant=False)

     def apply_ep_prefill(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Apply the EP prefill method.
@@ -224,7 +211,7 @@ class GCUFusedMoeMethod(MoEMethodBase):
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Apply the EP decoder method.
@@ -235,7 +222,7 @@ class GCUFusedMoeMethod(MoEMethodBase):
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Paddle Cutlass compute Fused MoE.
@@ -400,9 +387,10 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Paddle gcu compute Fused MoE.
         """
+        gate_out = gate(x.cast("float32"))
         return self.compute_ffn(layer, x, gate_out, enable_quant=True)
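
All four apply* entry points now take the gate layer instead of precomputed logits, so the float32 cast before routing happens in exactly one place per backend. A sketch of the new calling convention; ApplySketch and its names are hypothetical stand-ins, not the repo's API:

import paddle
from paddle import nn

class ApplySketch:
    """Hypothetical backend showing only the new signature."""

    def apply(self, layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer) -> paddle.Tensor:
        # The method, not the caller, casts to float32 before routing, so every
        # entry point (apply, EP prefill, EP decode) behaves identically.
        gate_out = gate(x.cast("float32"))
        return gate_out  # the real methods continue into compute_ffn(...)

method = ApplySketch()
x = paddle.randn([16, 4096])         # token activations
gate = nn.Linear(4096, 64)           # routing projection: hidden -> per-expert logits
print(method.apply(nn.Layer(), x, gate).shape)  # [16, 64]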


@@ -37,7 +37,7 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
         self.quant_config = quant_config
         self.group_size = -1

-    def create_weights(self, layer):
+    def create_weights(self, layer, **extra_weight_attrs):
         # The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
         weight_scale_shape = [layer.weight_shape[1]]

@@ -45,6 +45,14 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
         if self.quant_config.name() == "wint4":
             layer.weight_shape[0] //= 2
         layer.weight_dtype = "int8"
+        layer.weight = layer.create_parameter(
+            shape=layer.weight_shape,
+            dtype=layer.weight_dtype,
+            is_bias=False,
+            default_initializer=paddle.nn.initializer.Constant(0),
+        )
         layer.weight_scale = layer.create_parameter(
             shape=weight_scale_shape,
             dtype=layer._dtype,
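
For reference, the shapes this create_weights pre-allocates, assuming layer.weight_shape is [in_features, out_features] as the per-channel scale comment above implies. A toy holder with illustrative names and sizes:

import paddle
from paddle import nn

class WeightOnlyHolder(nn.Layer):
    """Toy layer mirroring the pre-allocation above; illustrative only."""

    def __init__(self, in_features: int, out_features: int, wint4: bool = False):
        super().__init__()
        weight_shape = [in_features, out_features]
        if wint4:
            weight_shape[0] //= 2  # two int4 values are packed into each int8 byte
        # Quantized weight buffer, filled later when the checkpoint is loaded.
        self.weight = self.create_parameter(
            shape=weight_shape,
            dtype="int8",
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        # Per-channel quantization: one scale per output channel, hence the
        # scale shape equals the output dim of the weight.
        self.weight_scale = self.create_parameter(
            shape=[out_features],
            dtype="float32",
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )

holder = WeightOnlyHolder(4096, 11008, wint4=True)
print(holder.weight.shape, holder.weight_scale.shape)  # [2048, 11008] [11008]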