[Features] Support Hugging Face Qwen3 MoE (#3649)

* split ut

* qwen3-30B-A3B

* fix

* add test

* add test_torch_model.py

* fix test_torch_model.py

* delete print

* fix moe

* delete __init__.py

* fix

* fix

---------

Co-authored-by: bukejiyu <395822456@qq.com>
Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
Author: lizexu123
Date: 2025-08-30 15:26:05 +08:00
Committed by: GitHub
Parent: f206474cc7
Commit: 455205f991
9 changed files with 437 additions and 258 deletions
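
At its core, the change teaches the weight loaders to handle Hugging Face (torch-format) checkpoints, whose linear weights are stored transposed relative to Paddle's layout. A minimal sketch of that layout difference, with illustrative shapes only (this is not FastDeploy's actual loader):

```python
# Minimal sketch: PyTorch nn.Linear stores weight as [out_features, in_features],
# while Paddle nn.Linear expects [in_features, out_features], so torch-format
# checkpoints are transposed before being copied into the parameter.
import paddle

in_features, out_features = 4, 8

hf_weight = paddle.randn([out_features, in_features])  # as found in an HF checkpoint
param = paddle.zeros([in_features, out_features])       # Paddle-side parameter

model_format = "torch"  # in the diffs below this comes from fd_config.model_config.model_format
loaded_weight = hf_weight
if model_format == "torch":
    loaded_weight = loaded_weight.transpose([1, 0])

assert param.shape == loaded_weight.shape
param.copy_(loaded_weight, False)
```

The hunks below thread model_format from the model config down to each parameter so the linear and MoE loaders can apply exactly this transpose.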

@@ -294,6 +294,7 @@ class ReplicatedLinear(LinearBase):
weight_loader=(
self.weight_loader if hasattr(self, "weight_loader") else default_weight_loader(self.fd_config)
),
model_format=fd_config.model_config.model_format,
)
@@ -446,7 +447,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
shard_size = (self.local_rank + 1) * block_size
loaded_weight = slice_fn(loaded_weight, output_dim, start=shard_offset, end=shard_size)
loaded_weight = get_tensor(loaded_weight)
if not param._is_initialized():
param.initialize()
param_shard_size = output_size // 2
@@ -574,7 +574,6 @@ class QKVParallelLinear(ColumnParallelLinear):
shard_size = (shard_id + 1) * block_size
loaded_weight = slice_fn(loaded_weight, output_dim, start=shard_offset, end=shard_size)
loaded_weight = get_tensor(loaded_weight)
if not param._is_initialized():
param.initialize()
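
For context on the MergedColumnParallelLinear hunk above: after the tensor-parallel slice, the loaded shard is written into one half of the fused up_gate projection, with param_shard_size = output_size // 2 selecting the half. A rough sketch, assuming the gate half comes first (the real placement is chosen by shard_id in the loader):

```python
# Rough sketch of placing gate/up shards into a fused weight; the half is
# assumed here, while the real loader picks it via shard_id ("gate" vs "up").
import paddle

hidden_size, intermediate_size = 8, 16
output_size = 2 * intermediate_size               # fused gate + up
param = paddle.zeros([hidden_size, output_size])  # Paddle layout: [in, out]

param_shard_size = output_size // 2               # as in the hunk above

gate_weight = paddle.randn([hidden_size, param_shard_size])
up_weight = paddle.randn([hidden_size, param_shard_size])

param[:, :param_shard_size] = gate_weight         # shard_id == "gate" (assumed first half)
param[:, param_shard_size:] = up_weight           # shard_id == "up"
```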

@@ -19,7 +19,7 @@ from abc import abstractmethod
import paddle
from paddle import nn
from fastdeploy.model_executor.utils import set_weight_attrs
from fastdeploy.model_executor.utils import default_weight_loader, set_weight_attrs
from fastdeploy.platforms import current_platform
from ..quantization.quant_base import QuantMethodBase
@@ -205,5 +205,17 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
default_initializer=paddle.nn.initializer.Constant(0),
)
set_weight_attrs(layer.up_gate_proj_weight, extra_weight_attrs)
set_weight_attrs(layer.down_proj_weight, extra_weight_attrs)
set_weight_attrs(
layer.up_gate_proj_weight,
{
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
"model_format": extra_weight_attrs.get("model_format", ""),
},
)
set_weight_attrs(
layer.down_proj_weight,
{
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
"model_format": extra_weight_attrs.get("model_format", ""),
},
)
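
The set_weight_attrs change above attaches a weight_loader and the checkpoint's model_format to each MoE weight so the per-parameter loaders can read them back later. A toy illustration of that plumbing with simplified stand-ins (the real helpers live in fastdeploy.model_executor.utils):

```python
# Toy stand-ins, not FastDeploy's implementations.
def set_weight_attrs(param, attrs):
    # Attach loader metadata directly onto the parameter object.
    for key, value in attrs.items():
        setattr(param, key, value)

def default_weight_loader(fd_config):
    # Hypothetical fallback loader: just stash the tensor on the parameter.
    def loader(param, loaded_weight):
        param.data = loaded_weight
    return loader

class FakeParam:
    """Stand-in for a paddle Parameter."""

up_gate_proj_weight = FakeParam()
extra_weight_attrs = {"model_format": "torch"}  # e.g. forwarded by create_weights

set_weight_attrs(
    up_gate_proj_weight,
    {
        "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(None)),
        "model_format": extra_weight_attrs.get("model_format", ""),
    },
)

# Later, a loader such as _load_gate_up_weight recovers the format:
print(getattr(up_gate_proj_weight, "model_format", ""))  # -> "torch"
```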

@@ -151,7 +151,9 @@ class FusedMoE(nn.Layer):
self.gate_correction_bias = gate_correction_bias
else:
self.gate_correction_bias = None
self.quant_method.create_weights(self, weight_loader=self.weight_loader)
self.quant_method.create_weights(
self, weight_loader=self.weight_loader, model_format=fd_config.model_config.model_format
)
logger.info(
f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset + self.num_local_experts}), \
@@ -197,6 +199,9 @@ class FusedMoE(nn.Layer):
)
def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_dim=None):
model_format = getattr(param, "model_format", "")
if model_format == "torch":
loaded_weight = loaded_weight.transpose([1, 0])
dim = -1 if shard_dim else 0
if self.tp_size > 1:
if isinstance(loaded_weight, (np.ndarray, paddle.Tensor)):
@@ -208,8 +213,6 @@ class FusedMoE(nn.Layer):
shard_size = (self.tp_rank + 1) * block_size
loaded_weight = slice_fn(loaded_weight, shard_dim, shard_offset, shard_size)
loaded_weight = get_tensor(loaded_weight)
expert_param = param[expert_id - self.expert_id_offset]
param_shard_size = expert_param.shape[dim] // 2
if shard_id == "gate":
@@ -229,14 +232,18 @@ class FusedMoE(nn.Layer):
)
# To ensure compatibility across backends, apply an extra transpose for GCU and XPU
if expert_param.shape != loaded_weight.shape:
loaded_weight = loaded_weight.transpose([1, 0])
if current_platform.is_xpu() or current_platform.is_gcu():
if expert_param.shape != loaded_weight.shape:
loaded_weight = loaded_weight.transpose([1, 0])
assert expert_param.shape == loaded_weight.shape, (
f"Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({expert_param.shape})"
)
expert_param.copy_(loaded_weight, False)
def _load_down_weight(self, param, expert_id, loaded_weight, shard_id, shard_dim=None):
model_format = getattr(param, "model_format", "")
if model_format == "torch":
loaded_weight = loaded_weight.transpose([1, 0])
if self.tp_size > 1 and shard_dim is not None:
dim = -1 if shard_dim else 0
if isinstance(loaded_weight, (np.ndarray, paddle.Tensor)):
@@ -247,14 +254,14 @@ class FusedMoE(nn.Layer):
shard_offset = self.tp_rank * block_size
shard_size = (self.tp_rank + 1) * block_size
loaded_weight = slice_fn(loaded_weight, shard_dim, shard_offset, shard_size)
loaded_weight = get_tensor(loaded_weight)
expert_param = param[expert_id - self.expert_id_offset]
if hasattr(param, "tensor_track"):
# for dyn quant
param.tensor_track.mark(start=0, batch_id=expert_id - self.expert_id_offset)
# To ensure compatibility across backends, apply an extra transpose for GCU and XPU
if expert_param.shape != loaded_weight.shape:
loaded_weight = loaded_weight.transpose([1, 0])
if current_platform.is_xpu() or current_platform.is_gcu():
if expert_param.shape != loaded_weight.shape:
loaded_weight = loaded_weight.transpose([1, 0])
assert expert_param.shape == loaded_weight.shape, (
f"Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({expert_param.shape})"
)
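
Both _load_gate_up_weight and _load_down_weight shard the incoming tensor the same way before any transpose: each rank takes one contiguous block along shard_dim. A minimal sketch of that arithmetic, assuming slice_fn(t, dim, start, end) behaves like plain indexing along the chosen axis:

```python
# Minimal sketch of the tensor-parallel slice used by the MoE loaders above.
import paddle

tp_size, tp_rank = 2, 1                      # illustrative values
loaded_weight = paddle.arange(24, dtype="float32").reshape([4, 6])
shard_dim = -1                               # shard along the last axis (2-D example)

block_size = loaded_weight.shape[shard_dim] // tp_size
shard_offset = tp_rank * block_size          # this rank's block: [offset, size)
shard_size = (tp_rank + 1) * block_size

# Equivalent to slice_fn(loaded_weight, shard_dim, shard_offset, shard_size)
shard = loaded_weight[:, shard_offset:shard_size]
print(shard.shape)                           # [4, 3]
```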