V1 loader support ep (#3801)
This commit makes FastDeploy's default_v1 weight loader work with expert parallelism (EP): per-rank MoE weights are allocated with layer.num_local_experts instead of the global layer.num_experts, and the checkpoint-to-parameter expert mapping is shifted by a new num_experts_start_offset so each rank loads only the experts it owns.
@@ -189,12 +189,20 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
     def create_weights(self, layer: nn.Layer, **extra_weight_attrs):

         if current_platform.is_cuda():
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2]
-            self.down_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size, layer.hidden_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.hidden_size,
+                layer.moe_intermediate_size * 2,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
         else:
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size * 2, layer.hidden_size]
-            self.down_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.moe_intermediate_size * 2,
+                layer.hidden_size,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}

         layer.up_gate_proj_weight = layer.create_parameter(
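The shape change above is the core of the EP support: each rank now allocates weights only for the experts it hosts. A rough sanity-check sketch of the effect, with hypothetical sizes that do not come from the diff and assuming an even contiguous split of experts across EP ranks:

# Hypothetical sizes; illustrates why the v1 loader switches to layer.num_local_experts.
num_experts = 64                      # global experts in the checkpoint
ep_size = 4                           # expert-parallel ranks (assuming an even split)
num_local_experts = num_experts // ep_size
hidden_size, moe_intermediate_size = 4096, 14336

full_elems = num_experts * hidden_size * moe_intermediate_size * 2        # if every rank held all experts
local_elems = num_local_experts * hidden_size * moe_intermediate_size * 2 # what each rank now allocates
print(local_elems / full_elems)       # 0.25 -> each rank holds 1/ep_size of the up_gate_proj weights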
@@ -1054,13 +1054,13 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):

         if layer.fd_config.load_config.load_choices == "default_v1":
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )

             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
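For readers unfamiliar with Paddle's parameter API, a minimal standalone sketch of the same create_parameter pattern; the TinyMoEWeights class and the sizes are made up for illustration and are not FastDeploy code:

import paddle

class TinyMoEWeights(paddle.nn.Layer):
    def __init__(self, num_local_experts=16, hidden_size=1024, moe_intermediate_size=4096):
        super().__init__()
        # Same pattern as the diff: the leading dim is the per-rank (local) expert count.
        self.up_gate_proj_weight = self.create_parameter(
            shape=[num_local_experts, hidden_size, moe_intermediate_size * 2],
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )

print(TinyMoEWeights().up_gate_proj_weight.shape)  # [16, 1024, 8192]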
@@ -1167,7 +1167,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):

         # 3.quantize weight

-        for expert_id in range(layer.num_experts):
+        for expert_id in range(layer.num_local_experts):
             weight[expert_id], scale[expert_id] = weight_quantize(
                 getattr(layer, unquantized_weight_name)[expert_id], algo=self.moe_quant_type
             )
@@ -59,13 +59,13 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
         ]
         if self.quant_config.is_checkpoint_bf16:
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )

             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
@@ -164,7 +164,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
         # 3.quantize weight
         from fastdeploy.model_executor.layers.utils import per_block_cast_to_fp8

-        for expert_id in range(layer.num_experts):
+        for expert_id in range(layer.num_local_experts):
             weight_quant, scale[expert_id] = per_block_cast_to_fp8(
                 getattr(layer, unquantized_weight_name)[expert_id], self.quant_config.weight_block_size
             )
@@ -627,13 +627,13 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
         ]
         if self.quant_config.is_checkpoint_bf16:
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )

             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
@@ -732,7 +732,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
         # 3.quantize weight
         from fastdeploy.model_executor.layers.utils import per_block_cast_to_fp8

-        for expert_id in range(layer.num_experts):
+        for expert_id in range(layer.num_local_experts):
             weight_quant, scale[expert_id] = per_block_cast_to_fp8(
                 getattr(layer, unquantized_weight_name)[expert_id], self.quant_config.weight_block_size
             )
@@ -299,6 +299,7 @@ class FusedMoE(nn.Layer):
         param_down_proj_name: Optional[str] = None,
         ckpt_expert_key_name: str = "experts",
         experts_offset: int = 0,
+        num_experts_start_offset: int = 0,
     ) -> list[tuple[str, str, int, str]]:
         param_name_maping = []

@@ -323,7 +324,9 @@ class FusedMoE(nn.Layer):
                 expert_id,
                 shard_id,
             )
-            for expert_id in range(experts_offset, experts_offset + num_experts)
+            for expert_id in range(
+                experts_offset + num_experts_start_offset, experts_offset + num_experts_start_offset + num_experts
+            )
             for shard_id, weight_name in param_name_maping
         ]

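The only behavioural change in make_expert_params_mapping is the range: checkpoint expert ids now start at experts_offset + num_experts_start_offset instead of experts_offset. A small hedged example of the arithmetic (the concrete numbers are hypothetical):

experts_offset = 0             # existing default in the signature
num_experts_start_offset = 32  # e.g. the third of four EP ranks over 64 experts
num_experts = 16               # experts owned by this rank

expert_ids = list(range(experts_offset + num_experts_start_offset,
                        experts_offset + num_experts_start_offset + num_experts))
print(expert_ids[0], expert_ids[-1])  # 32 47 -> only this rank's checkpoint experts are mapped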
@@ -543,14 +543,22 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):

         expert_params_mapping = []
         if getattr(self.fd_config.model_config, "moe_num_experts", None) is not None:
+            if self.fd_config.parallel_config.expert_parallel_size > 1:
+                num_experts = self.fd_config.parallel_config.num_experts_per_rank
+                num_experts_start_offset = self.fd_config.parallel_config.num_experts_start_offset
+            else:
+                num_experts = self.fd_config.model_config.moe_num_experts
+                num_experts_start_offset = 0
+
             expert_params_mapping = FusedMoE.make_expert_params_mapping(
-                num_experts=self.fd_config.model_config.moe_num_experts,
+                num_experts=num_experts,
                 ckpt_down_proj_name="down_proj",
                 ckpt_gate_up_proj_name="up_gate_proj",
                 ckpt_gate_proj_name="gate_proj",
                 ckpt_up_proj_name="up_proj",
                 param_gate_up_proj_name="experts.up_gate_proj_",
                 param_down_proj_name="experts.down_proj_",
+                num_experts_start_offset=num_experts_start_offset,
             )
         all_param_mapping = general_params_mapping + expert_params_mapping

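Putting it together for the Ernie4_5 loader: when expert parallelism is on, the per-rank count and offset come from parallel_config; otherwise the full expert set is used. A worked example with hypothetical values, assuming the even contiguous split sketched earlier (the diff itself does not show how parallel_config computes these fields):

moe_num_experts = 64
expert_parallel_size = 4
ep_rank = 2                                    # hypothetical rank id

if expert_parallel_size > 1:
    # assumed to mirror parallel_config.num_experts_per_rank / num_experts_start_offset
    num_experts = moe_num_experts // expert_parallel_size   # 16
    num_experts_start_offset = ep_rank * num_experts        # 32
else:
    num_experts = moe_num_experts
    num_experts_start_offset = 0

# These are the values forwarded to FusedMoE.make_expert_params_mapping(...) on this rank.
print(num_experts, num_experts_start_offset)   # 16 32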