[Metax] support default_v1 loader & thinking model (#4956)

Co-authored-by: plusNew001 <95567040+plusNew001@users.noreply.github.com>
Authored by MingkunZhang on 2025-11-12 16:32:26 +08:00; committed by GitHub
parent bde6e2f931
commit 9d9f5df8d0
4 changed files with 23 additions and 5 deletions

@@ -142,11 +142,18 @@ class MetaxCutlassMoEMethod(MoEMethodBase):
                 1.0,
             )
         else:
+            added_weight_attrs0 = getattr(layer, self.added_weight_attrs[0])
+            added_weight_attrs1 = getattr(layer, self.added_weight_attrs[1])
+            if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
+                added_weight_attrs0 = paddle.transpose(added_weight_attrs0, perm=[0, 2, 1])
+                added_weight_attrs1 = paddle.transpose(added_weight_attrs1, perm=[0, 2, 1])
             fused_moe_out = fused_expert_moe(
                 x,
                 gate.weight,
-                getattr(layer, self.added_weight_attrs[0]),
-                getattr(layer, self.added_weight_attrs[1]),
+                added_weight_attrs0,
+                added_weight_attrs1,
                 None,
                 (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
                 None,
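
The transpose added above fixes a layout mismatch: when a bf16 checkpoint is loaded through the default_v1 loader, the fused expert weights arrive with the last two axes swapped relative to what fused_expert_moe consumes, so perm=[0, 2, 1] keeps the expert axis and flips the rest. A minimal sketch of that reshaping; the concrete layouts ([E, K, N] on disk vs. [E, N, K] for the kernel) are assumptions for illustration:

import paddle

num_experts, k, n = 4, 512, 1024
# Expert weights as the default_v1 loader materializes them (assumed [E, K, N]).
w = paddle.randn([num_experts, k, n])

# perm=[0, 2, 1] keeps dim 0 (the expert axis) and swaps the last two dims,
# producing the [E, N, K] layout assumed for the fused MoE kernel.
w_t = paddle.transpose(w, perm=[0, 2, 1])
assert w_t.shape == [num_experts, n, k]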
@@ -348,6 +355,7 @@ class MetaxCutlassWeightOnlyMoEMethod(MetaxCutlassMoEMethod):
             weight_name = self.added_weight_attrs[weight_id_map[weight_type]]
             unquantized_weight_name = weight_name.replace("quant_weight", "weight")
             weight_shape = self.up_gate_proj_weight_shape if weight_type == "gate_up" else self.down_proj_weight_shape
+            weight_shape[1], weight_shape[2] = weight_shape[2], weight_shape[1]
             weight_dtype = "int8"
             # scale
             scale_name = self.added_scale_attrs[weight_id_map[weight_type]]
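
The in-place swap of weight_shape[1] and weight_shape[2] is the load-time counterpart of the transpose above: the unquantized bf16 parameter the v1 loader creates must match the checkpoint layout, whose last two dims are reversed. A trivially runnable sketch with an assumed example shape:

# Assumed starting shape [num_experts, N, K]; after the swap the parameter
# is created as [num_experts, K, N], matching the on-disk layout.
weight_shape = [64, 1024, 4096]
weight_shape[1], weight_shape[2] = weight_shape[2], weight_shape[1]
assert weight_shape == [64, 4096, 1024]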

@@ -24,6 +24,7 @@ from paddle.distributed import fleet
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.utils import h2d_copy, set_weight_attrs, slice_fn
+from fastdeploy.platforms import current_platform
 
 from .utils import (
     DEFAULT_VOCAB_PADDING_SIZE,
@@ -274,7 +275,8 @@ class VocabParallelEmbedding(nn.Layer):
         if output_dim == 0:
             h2d_copy(param[: shard_weight.shape[0]], shard_weight)
-            param[shard_weight.shape[0] :].fill_(0)
+            if not current_platform.is_maca():
+                param[shard_weight.shape[0] :].fill_(0)
         else:
             h2d_copy(param[:, : shard_weight.shape[1]], shard_weight)
             param[:, shard_weight.shape[1] :].fill_(0)
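
For context, this branch pads a vocab-parallel embedding shard: the real rows are copied to the device and the padding rows zeroed, and MACA now skips the zero-fill of the row remainder (presumably because the buffer is already zero-initialized there or the sliced in-place fill_ is unsupported on that backend). A standalone sketch of the row-padded copy with assumed shapes:

import paddle

rows, hidden, padded = 100, 8, 128           # shard rows, width, padded rows
shard_weight = paddle.randn([rows, hidden])
param = paddle.empty([padded, hidden], dtype=shard_weight.dtype)

param[:rows] = shard_weight                  # h2d_copy in the real code
param[rows:] = 0                             # the fill_(0) that MACA skips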

@@ -326,6 +326,9 @@ class WeightOnlyLinearMethod(QuantMethodBase):
             arch=self.quant_config.weight_only_linear_arch,
         )
+        if current_platform.is_maca():
+            quanted_weight_tensor = paddle.transpose(quanted_weight_tensor, [1, 0])
+
         free_tensor(layer.weight)
         layer.weight = layer.create_parameter(
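
Here the weight-only quantized tensor is stored transposed on MACA before it replaces layer.weight. A sketch of the 2-D flip, assuming the quantizer emits an int8 matrix of shape [out_features, in_features] and the MACA matmul kernel wants [in_features, out_features]:

import paddle

out_f, in_f = 4096, 1024
quanted = paddle.randint(-128, 128, [out_f, in_f]).astype("int8")

# Same call as in the diff: a plain 2-D transpose with perm [1, 0].
quanted = paddle.transpose(quanted, [1, 0])
assert quanted.shape == [in_f, out_f]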

@@ -363,8 +363,13 @@ def v1_loader_support(fd_config):
     def _err_msg(msg: str) -> str:
         logger.info(msg + "; fallback to the v0 loader for model loading.")
 
-    if not (current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_iluvatar()):
-        _err_msg("v1loader currently only support backends gpu, xpu and iluvatar")
+    if not (
+        current_platform.is_cuda()
+        or current_platform.is_xpu()
+        or current_platform.is_iluvatar()
+        or current_platform.is_maca()
+    ):
+        _err_msg("v1loader currently only support backends gpu, xpu, iluvatar and maca")
         return False
     if is_pre_sliced_weight(fd_config.model_config.model):
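
The gate itself is a plain allow-list over current_platform probes; adding is_maca() is what lets Metax devices reach the v1 loader instead of falling back to v0. A self-contained sketch with a stubbed platform object (the probe names mirror the ones used above):

class _Platform:
    """Stub standing in for fastdeploy.platforms.current_platform."""

    def is_cuda(self):
        return False

    def is_xpu(self):
        return False

    def is_iluvatar(self):
        return False

    def is_maca(self):
        return True  # e.g. a Metax GPU

current_platform = _Platform()

supported = (
    current_platform.is_cuda()
    or current_platform.is_xpu()
    or current_platform.is_iluvatar()
    or current_platform.is_maca()
)
assert supported  # MACA now passes the gate instead of triggering _err_msg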