[Metax] support default_v1 loader & thinking model (#4956)
Co-authored-by: plusNew001 <95567040+plusNew001@users.noreply.github.com>
@@ -142,11 +142,18 @@ class MetaxCutlassMoEMethod(MoEMethodBase):
                 1.0,
             )
         else:
+            added_weight_attrs0 = getattr(layer, self.added_weight_attrs[0])
+            added_weight_attrs1 = getattr(layer, self.added_weight_attrs[1])
+
+            if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
+                added_weight_attrs0 = paddle.transpose(added_weight_attrs0, perm=[0, 2, 1])
+                added_weight_attrs1 = paddle.transpose(added_weight_attrs1, perm=[0, 2, 1])
+
             fused_moe_out = fused_expert_moe(
                 x,
                 gate.weight,
-                getattr(layer, self.added_weight_attrs[0]),
-                getattr(layer, self.added_weight_attrs[1]),
+                added_weight_attrs0,
+                added_weight_attrs1,
                 None,
                 (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
                 None,
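The new branch covers checkpoints that the default_v1 loader keeps in bf16: the expert weights arrive in a transposed per-expert layout, so they are flipped back with perm=[0, 2, 1] before being handed to fused_expert_moe. A minimal standalone sketch of that transpose (shapes are made up, and the [num_experts, dim_out, dim_in] loader-side layout is an assumption, not FastDeploy's documented one):

    import paddle

    num_experts, dim_in, dim_out = 4, 8, 16
    # Assumed loader-side layout: one [dim_out, dim_in] matrix per expert.
    w = paddle.randn([num_experts, dim_out, dim_in], dtype="float32")

    # perm=[0, 2, 1] swaps the two matrix axes of every expert and leaves
    # axis 0 (the expert axis) intact.
    w_t = paddle.transpose(w, perm=[0, 2, 1])
    assert w_t.shape == [num_experts, dim_in, dim_out]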
@@ -348,6 +355,7 @@ class MetaxCutlassWeightOnlyMoEMethod(MetaxCutlassMoEMethod):
             weight_name = self.added_weight_attrs[weight_id_map[weight_type]]
+            unquantized_weight_name = weight_name.replace("quant_weight", "weight")
             weight_shape = self.up_gate_proj_weight_shape if weight_type == "gate_up" else self.down_proj_weight_shape
             weight_shape[1], weight_shape[2] = weight_shape[2], weight_shape[1]
             weight_dtype = "int8"
             # scale
             scale_name = self.added_scale_attrs[weight_id_map[weight_type]]
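In the weight-only path, the unquantized attribute name is derived by plain string replacement, and the last two dimensions of the 3-D per-expert weight shape are swapped to match the transposed layout. A hypothetical illustration with invented values:

    weight_name = "up_gate_proj_quant_weight"   # invented example value
    unquantized_weight_name = weight_name.replace("quant_weight", "weight")
    print(unquantized_weight_name)              # up_gate_proj_weight

    weight_shape = [64, 7168, 4096]             # made-up [num_experts, a, b]
    weight_shape[1], weight_shape[2] = weight_shape[2], weight_shape[1]
    print(weight_shape)                         # [64, 4096, 7168]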
@@ -24,6 +24,7 @@ from paddle.distributed import fleet
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.utils import h2d_copy, set_weight_attrs, slice_fn
+from fastdeploy.platforms import current_platform

 from .utils import (
     DEFAULT_VOCAB_PADDING_SIZE,
@@ -274,7 +275,8 @@ class VocabParallelEmbedding(nn.Layer):
         if output_dim == 0:
             h2d_copy(param[: shard_weight.shape[0]], shard_weight)
-            param[shard_weight.shape[0] :].fill_(0)
+            if not current_platform.is_maca():
+                param[shard_weight.shape[0] :].fill_(0)
         else:
             h2d_copy(param[:, : shard_weight.shape[1]], shard_weight)
             param[:, shard_weight.shape[1] :].fill_(0)
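On MACA, the zero-fill of the vocab-padding rows is now skipped in the output_dim == 0 branch. A rough standalone sketch of the copy-then-pad logic (sizes are invented, and plain slice assignment stands in for h2d_copy and fill_):

    import paddle

    is_maca = False                                # current_platform.is_maca() in the real code
    shard_weight = paddle.ones([100, 512])         # this rank's vocab shard (invented size)
    param = paddle.full([128, 512], -1.0)          # padded destination parameter

    param[: shard_weight.shape[0]] = shard_weight  # stand-in for h2d_copy
    if not is_maca:
        param[shard_weight.shape[0]:] = 0          # zero the vocab-padding rows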
@@ -326,6 +326,9 @@ class WeightOnlyLinearMethod(QuantMethodBase):
             arch=self.quant_config.weight_only_linear_arch,
         )

+        if current_platform.is_maca():
+            quanted_weight_tensor = paddle.transpose(quanted_weight_tensor, [1, 0])
+
         free_tensor(layer.weight)

         layer.weight = layer.create_parameter(
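MACA additionally wants the weight-only quantized matrix in the opposite orientation, so the tensor is transposed before the parameter is recreated. A minimal sketch (int32 stands in for the real int8 tensor; sizes are invented):

    import paddle

    quanted_weight_tensor = paddle.randint(-128, 127, [4096, 2048], dtype="int32")
    if True:  # current_platform.is_maca() in the real code
        # [1, 0] swaps the two axes of the 2-D quantized weight.
        quanted_weight_tensor = paddle.transpose(quanted_weight_tensor, [1, 0])
    assert quanted_weight_tensor.shape == [2048, 4096]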
@@ -363,8 +363,13 @@ def v1_loader_support(fd_config):
     def _err_msg(msg: str) -> str:
         logger.info(msg + "; fallback to the v0 loader for model loading.")

-    if not (current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_iluvatar()):
-        _err_msg("v1loader currently only support backends gpu, xpu and iluvatar")
+    if not (
+        current_platform.is_cuda()
+        or current_platform.is_xpu()
+        or current_platform.is_iluvatar()
+        or current_platform.is_maca()
+    ):
+        _err_msg("v1loader currently only support backends gpu, xpu, iluvatar and maca")
         return False

     if is_pre_sliced_weight(fd_config.model_config.model):
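The net effect of the last hunk is that MACA joins the platform allow-list for the v1 loader; any other backend logs the message and falls back to v0. A hypothetical condensed equivalent of the gate (helper name invented for illustration only):

    def v1_loader_platform_ok(platform) -> bool:
        # Mirrors the new condition: v1 loader only on CUDA, XPU, Iluvatar, MACA.
        return (
            platform.is_cuda()
            or platform.is_xpu()
            or platform.is_iluvatar()
            or platform.is_maca()
        )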