V1 loader support ep (#3801)

Author: YuanRisheng
Date: 2025-09-03 16:05:41 +08:00
Committed by: GitHub
Parent: fa58a9fa8f
Commit: 0a1ce612c2

6 changed files with 34 additions and 15 deletions
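
Background for the diff below: under expert parallelism (EP), each rank holds only its own slice of the experts, so the v1 loader must size weights and drive loops with the local expert count, and map local expert ids back to global checkpoint ids. A minimal sketch of that partitioning arithmetic with illustrative values (`ep_size`, `ep_rank`, and the even-split assumption are not taken from this diff):

# Hypothetical EP partitioning arithmetic; ep_size/ep_rank and the even
# split are illustrative assumptions, not values from this diff.
num_experts = 64   # global expert count in the checkpoint
ep_size = 8        # expert-parallel world size
ep_rank = 3        # this rank's index

num_local_experts = num_experts // ep_size               # 8 experts per rank
num_experts_start_offset = ep_rank * num_local_experts   # first global id: 24

local_to_global = list(range(num_experts_start_offset,
                             num_experts_start_offset + num_local_experts))
print(local_to_global)  # [24, 25, 26, 27, 28, 29, 30, 31]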

View File

@@ -189,12 +189,20 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
     def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         if current_platform.is_cuda():
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2]
-            self.down_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size, layer.hidden_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.hidden_size,
+                layer.moe_intermediate_size * 2,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
         else:
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size * 2, layer.hidden_size]
-            self.down_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.moe_intermediate_size * 2,
+                layer.hidden_size,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
         layer.up_gate_proj_weight = layer.create_parameter(
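
One plausible reading of `SHARD_ID_TO_SHARDED_DIM` is that it tells the v1 weight loader which dimension of each per-expert checkpoint tensor is sharded across tensor-parallel ranks for each shard id; the mapping flips between the CUDA and non-CUDA branches because the two weight layouts are transposed. A hypothetical sketch of how a loader could consume it (`narrow_for_tp` is an illustrative helper, not FastDeploy API):

import numpy as np

# Illustrative only; mirrors the CUDA-branch mapping from the diff above.
SHARD_ID_TO_SHARDED_DIM = {"gate": 1, "down": 0, "up": 1}

def narrow_for_tp(weight: np.ndarray, shard_id: str, tp_rank: int, tp_size: int) -> np.ndarray:
    """Slice a per-expert checkpoint tensor along its tensor-parallel dim."""
    dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
    size = weight.shape[dim] // tp_size
    index = [slice(None)] * weight.ndim
    index[dim] = slice(tp_rank * size, (tp_rank + 1) * size)
    return weight[tuple(index)]

gate = np.zeros((512, 1024))  # [hidden_size, moe_intermediate_size]
local = narrow_for_tp(gate, "gate", tp_rank=0, tp_size=4)
print(local.shape)            # (512, 256)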

View File

@@ -1054,13 +1054,13 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
         if layer.fd_config.load_config.load_choices == "default_v1":
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
@@ -1167,7 +1167,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
         # 3.quantize weight
-        for expert_id in range(layer.num_experts):
+        for expert_id in range(layer.num_local_experts):
             weight[expert_id], scale[expert_id] = weight_quantize(
                 getattr(layer, unquantized_weight_name)[expert_id], algo=self.moe_quant_type
             )
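
The loop bound must match the first dimension of the locally created parameter: under EP that tensor has only `num_local_experts` rows, so iterating up to the global `num_experts` would index past the end. A toy NumPy reproduction of the out-of-range access the fix avoids (illustrative names only):

import numpy as np

num_experts, ep_size = 64, 8
num_local_experts = num_experts // ep_size

# The per-rank parameter only has num_local_experts rows.
weights = np.zeros((num_local_experts, 128, 256))

for expert_id in range(num_local_experts):   # range(num_experts) would raise IndexError
    weights[expert_id] += 1.0                # stand-in for the weight_quantize(...) call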

View File

@@ -59,13 +59,13 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
         ]
         if self.quant_config.is_checkpoint_bf16:
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
@@ -164,7 +164,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
         # 3.quantize weight
         from fastdeploy.model_executor.layers.utils import per_block_cast_to_fp8

-        for expert_id in range(layer.num_experts):
+        for expert_id in range(layer.num_local_experts):
             weight_quant, scale[expert_id] = per_block_cast_to_fp8(
                 getattr(layer, unquantized_weight_name)[expert_id], self.quant_config.weight_block_size
             )
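
For context, block-wise FP8 casting of the kind `per_block_cast_to_fp8` performs typically derives one scale per weight tile (the diff's `weight_block_size`) from the tile's maximum absolute value. A minimal NumPy sketch of that scaling idea, assuming float8_e4m3 and 128x128 blocks; this is not the actual FastDeploy implementation:

import numpy as np

FP8_E4M3_MAX = 448.0  # largest finite magnitude representable in float8_e4m3

def per_block_scales(weight: np.ndarray, block: tuple = (128, 128)) -> np.ndarray:
    """One scale per (block[0] x block[1]) tile, from the tile's max |value|."""
    rows = -(-weight.shape[0] // block[0])  # ceil division
    cols = -(-weight.shape[1] // block[1])
    scales = np.empty((rows, cols), dtype=np.float32)
    for i in range(rows):
        for j in range(cols):
            tile = weight[i * block[0]:(i + 1) * block[0],
                          j * block[1]:(j + 1) * block[1]]
            scales[i, j] = np.abs(tile).max() / FP8_E4M3_MAX
    return scales

w = np.random.randn(256, 384).astype(np.float32)
print(per_block_scales(w).shape)  # (2, 3)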

View File

@@ -627,13 +627,13 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
         ]
         if self.quant_config.is_checkpoint_bf16:
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
@@ -732,7 +732,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
         # 3.quantize weight
         from fastdeploy.model_executor.layers.utils import per_block_cast_to_fp8

-        for expert_id in range(layer.num_experts):
+        for expert_id in range(layer.num_local_experts):
             weight_quant, scale[expert_id] = per_block_cast_to_fp8(
                 getattr(layer, unquantized_weight_name)[expert_id], self.quant_config.weight_block_size
             )

View File

@@ -299,6 +299,7 @@ class FusedMoE(nn.Layer):
         param_down_proj_name: Optional[str] = None,
         ckpt_expert_key_name: str = "experts",
         experts_offset: int = 0,
+        num_experts_start_offset: int = 0,
     ) -> list[tuple[str, str, int, str]]:
         param_name_maping = []
@@ -323,7 +324,9 @@ class FusedMoE(nn.Layer):
                 expert_id,
                 shard_id,
             )
-            for expert_id in range(experts_offset, experts_offset + num_experts)
+            for expert_id in range(
+                experts_offset + num_experts_start_offset, experts_offset + num_experts_start_offset + num_experts
+            )
             for shard_id, weight_name in param_name_maping
         ]
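
With the new `num_experts_start_offset` parameter, an EP rank can generate checkpoint-name mappings whose expert ids start at its first global expert rather than at zero. A hypothetical reconstruction of the comprehension's effect (the checkpoint key template and shard list are guesses for illustration, not taken from the diff):

# Illustrative reconstruction of the expert-id mapping after this change;
# the "experts.{id}.{name}.weight" key template is an assumption.
def expert_name_mapping(num_experts: int, experts_offset: int = 0,
                        num_experts_start_offset: int = 0) -> list:
    param_name_maping = [("gate", "gate_proj"), ("up", "up_proj"), ("down", "down_proj")]
    return [
        (f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
        for expert_id in range(
            experts_offset + num_experts_start_offset,
            experts_offset + num_experts_start_offset + num_experts,
        )
        for shard_id, weight_name in param_name_maping
    ]

# EP rank 3 of 8 over 64 global experts: 8 local experts starting at id 24.
print(expert_name_mapping(num_experts=8, num_experts_start_offset=24)[0])
# ('experts.24.gate_proj.weight', 24, 'gate')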