Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 08:37:06 +08:00
refactor rl get_name_mappings_to_training (#2847)
Some checks failed: Deploy GitHub Pages / deploy (push) has been cancelled
* refactor rl get_name_mappings_to_training
* fix tp>1
* change variable name (ffn1 -> up_gate_proj / ffn2 -> down_proj)
* change variable name (linear_weight -> weight / linear_bias -> bias)
* add rl names mapping for vl
* fix ernie 0.3B error
* fix develop code
* fix
@@ -145,13 +145,13 @@ class FusedMoE(nn.Layer):
                 shape=gate_correction_bias_shape,
                 dtype="float32",
             )
-        ffn1_output_dim = self.moe_intermediate_size * 2
+        up_gate_proj_output_dim = self.moe_intermediate_size * 2
         if self.moe_quant_type in ["fp8", "wint8"]:
-            ffn1_weight_shape = [self.num_local_experts, ffn1_output_dim, self.hidden_size]
-            ffn2_weight_shape = [self.num_local_experts, self.hidden_size, self.moe_intermediate_size]
+            up_gate_proj_weight_shape = [self.num_local_experts, up_gate_proj_output_dim, self.hidden_size]
+            down_proj_weight_shape = [self.num_local_experts, self.hidden_size, self.moe_intermediate_size]
         else:
-            ffn1_weight_shape = [self.num_local_experts, self.hidden_size, ffn1_output_dim]
-            ffn2_weight_shape = [self.num_local_experts, self.moe_intermediate_size, self.hidden_size]
+            up_gate_proj_weight_shape = [self.num_local_experts, self.hidden_size, up_gate_proj_output_dim]
+            down_proj_weight_shape = [self.num_local_experts, self.moe_intermediate_size, self.hidden_size]

         # Create parameters
         if self.moe_quant_type == "fp8":
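Note: this hunk only renames; the shape convention it preserves is easy to miss. The quantized path stores each expert's matrix transposed, presumably because those kernels consume [experts, out_features, in_features] while the plain batched matmul consumes [experts, in_features, out_features]. A minimal sketch of the rule with invented sizes (not FastDeploy code):

# Toy illustration of the shape selection above; all sizes are made up.
num_local_experts, hidden_size, moe_intermediate_size = 4, 512, 1024
up_gate_proj_output_dim = moe_intermediate_size * 2  # gate and up fused

for moe_quant_type in ("wint8", "bf16"):
    if moe_quant_type in ("fp8", "wint8"):
        # quantized layout: [experts, out_features, in_features]
        shape = [num_local_experts, up_gate_proj_output_dim, hidden_size]
    else:
        # unquantized layout: [experts, in_features, out_features]
        shape = [num_local_experts, hidden_size, up_gate_proj_output_dim]
    print(moe_quant_type, shape)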
@@ -161,15 +161,15 @@ class FusedMoE(nn.Layer):
             self.weight_dtype = "int8"
             self.init_weight_only_scale()

-        # FFN1 parameters
-        self.moe_ffn1_weight = self.create_parameter(
-            shape=ffn1_weight_shape,
+        # up_gate_proj parameters
+        self.up_gate_proj_weight = self.create_parameter(
+            shape=up_gate_proj_weight_shape,
             dtype=self.weight_dtype,
             default_initializer=paddle.nn.initializer.Constant(0),
         )
-        # FFN2 parameters
-        self.moe_ffn2_weight = self.create_parameter(
-            shape=ffn2_weight_shape,
+        # down_proj parameters
+        self.down_proj_weight = self.create_parameter(
+            shape=down_proj_weight_shape,
             dtype=self.weight_dtype,
             default_initializer=paddle.nn.initializer.Constant(0),
         )
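Both parameters are created zero-initialized at their final shape and dtype; the real values only arrive when the checkpoint loader overwrites them. A standalone sketch of the same pattern using paddle's free-function create_parameter, with a hypothetical shape and dtype:

import paddle

# Zero-init placeholder at the target shape/dtype; checkpoint loading
# fills it in later. Shape and dtype here are invented for illustration.
up_gate_proj_weight = paddle.create_parameter(
    shape=[4, 512, 2048],
    dtype="float16",
    default_initializer=paddle.nn.initializer.Constant(0),
)
print(up_gate_proj_weight.shape)  # [4, 512, 2048]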
@@ -178,44 +178,44 @@ class FusedMoE(nn.Layer):
         """
         Initialize the weight scale.
         """
-        self.moe_ffn1_weight_scale = self.create_parameter(
+        self.up_gate_proj_weight_scale = self.create_parameter(
             shape=[self.num_local_experts, self.moe_intermediate_size * 2],
             dtype=self._dtype,
         )
-        self.moe_ffn2_weight_scale = self.create_parameter(
+        self.down_proj_weight_scale = self.create_parameter(
             shape=[self.num_local_experts, self.hidden_size],
             dtype=self._dtype,
         )

     def load_experts_weight(self, state_dict: dict,
-                            ffn1_expert_weight_key: str,
-                            ffn2_expert_weight_key: str):
+                            up_gate_proj_expert_weight_key: str,
+                            down_proj_expert_weight_key: str):
         """
         Load experts weight from state_dict.
         Args:
             state_dict (dict): The state_dict of model.
-            ffn1_expert_weight_key (str): The key of ffn1 expert weight.
-            ffn2_expert_weight_key (str): The key of ffn2 expert weight.
+            up_gate_proj_expert_weight_key (str): The key of up_gate_proj expert weight.
+            down_proj_expert_weight_key (str): The key of down_proj expert weight.
         """
-        ffn1_weights = []
-        ffn2_weights = []
-        is_ffn_merged = ffn1_expert_weight_key.format(
+        up_gate_proj_weights = []
+        down_proj_weights = []
+        is_ffn_merged = up_gate_proj_expert_weight_key.format(
             self.expert_id_offset) in state_dict
         if is_ffn_merged:
             for i in range(self.num_local_experts):
                 expert_idx = self.expert_id_offset + i
-                ffn1_weights.append(
+                up_gate_proj_weights.append(
                     get_tensor(
                         state_dict.pop(
-                            ffn1_expert_weight_key.format(expert_idx))))
-                ffn2_weights.append(
+                            up_gate_proj_expert_weight_key.format(expert_idx))))
+                down_proj_weights.append(
                     get_tensor(
                         state_dict.pop(
-                            ffn2_expert_weight_key.format(expert_idx))))
+                            down_proj_expert_weight_key.format(expert_idx))))
         else:
-            gate_expert_weight_key = ffn1_expert_weight_key.replace(
+            gate_expert_weight_key = up_gate_proj_expert_weight_key.replace(
                 "up_gate_proj", "gate_proj")
-            up_expert_weight_key = ffn1_expert_weight_key.replace(
+            up_expert_weight_key = up_gate_proj_expert_weight_key.replace(
                 "up_gate_proj", "up_proj")
             for j in range(self.num_local_experts):
                 expert_idx = self.expert_id_offset + j
@@ -223,12 +223,12 @@ class FusedMoE(nn.Layer):
                     state_dict.pop(gate_expert_weight_key.format(expert_idx)))
                 up = get_tensor(
                     state_dict.pop(up_expert_weight_key.format(expert_idx)))
-                ffn1_weights.append(paddle.concat([gate, up], axis=-1))
-                ffn2_weights.append(
+                up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1))
+                down_proj_weights.append(
                     get_tensor(
                         state_dict.pop(
-                            ffn2_expert_weight_key.format(expert_idx))))
-        return ffn1_weights, ffn2_weights
+                            down_proj_expert_weight_key.format(expert_idx))))
+        return up_gate_proj_weights, down_proj_weights

     def extract_moe_ffn_weights(self, state_dict: dict):
         """
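The merged/split branch in load_experts_weight is the heart of this loader: if the fused up_gate_proj tensor is absent from the checkpoint, the gate_proj and up_proj tensors are located by key substitution and concatenated along the last axis. A toy reproduction with invented keys and shapes:

import paddle

# Invented checkpoint holding split gate/up tensors (no fused key present).
state_dict = {
    "experts.0.gate_proj.weight": paddle.rand([512, 1024]),
    "experts.0.up_proj.weight": paddle.rand([512, 1024]),
}
fused_key = "experts.{}.up_gate_proj.weight"

if fused_key.format(0) in state_dict:  # merged checkpoint: take as-is
    fused = state_dict.pop(fused_key.format(0))
else:                                  # split checkpoint: rebuild the fusion
    gate_key = fused_key.replace("up_gate_proj", "gate_proj")
    up_key = fused_key.replace("up_gate_proj", "up_proj")
    gate = state_dict.pop(gate_key.format(0))
    up = state_dict.pop(up_key.format(0))
    fused = paddle.concat([gate, up], axis=-1)
print(fused.shape)  # [512, 2048]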
@@ -239,30 +239,30 @@ class FusedMoE(nn.Layer):

         Returns:
             tuple: A tuple containing two lists:
-            - ffn1_weights: List of tensors for first FFN layer weights
-            - ffn2_weights: List of tensors for second FFN layer weights
+            - up_gate_proj_weights: List of tensors for first FFN layer weights
+            - down_proj_weights: List of tensors for second FFN layer weights

         Raises:
             AssertionError: If required weight keys are missing or number of weights
             doesn't match number of local experts.
         """
-        ffn1_expert_weight_key = self.weight_key_map.get(
-            "ffn1_expert_weight_key", None)
-        ffn2_expert_weight_key = self.weight_key_map.get(
-            "ffn2_expert_weight_key", None)
-        assert ffn1_expert_weight_key is not None, "ffn1_expert_weight_key should not be none."
-        assert ffn2_expert_weight_key is not None, "ffn2_expert_weight_key should not be none."
+        up_gate_proj_expert_weight_key = self.weight_key_map.get(
+            "up_gate_proj_expert_weight_key", None)
+        down_proj_expert_weight_key = self.weight_key_map.get(
+            "down_proj_expert_weight_key", None)
+        assert up_gate_proj_expert_weight_key is not None, "up_gate_proj_expert_weight_key should not be none."
+        assert down_proj_expert_weight_key is not None, "down_proj_expert_weight_key should not be none."

-        ffn1_weights, ffn2_weights = self.load_experts_weight(
-            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
+        up_gate_proj_weights, down_proj_weights = self.load_experts_weight(
+            state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key)
         assert len(
-            ffn1_weights
-        ) == self.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
+            up_gate_proj_weights
+        ) == self.num_local_experts, "up_gate_proj_weights length should be equal to num_local_experts."
         assert len(
-            ffn2_weights
-        ) == self.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
+            down_proj_weights
+        ) == self.num_local_experts, "down_proj_weights length should be equal to num_local_experts."

-        return ffn1_weights, ffn2_weights
+        return up_gate_proj_weights, down_proj_weights

     def extract_gate_correction_bias(self, gate_correction_bias_key,
                                      state_dict):
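extract_moe_ffn_weights consumes exactly two entries of weight_key_map, and the asserts require both to be present. A hypothetical map with a single {} placeholder for the expert index (the real templates are defined per model elsewhere in FastDeploy):

# Hypothetical key templates; actual checkpoint key names are model-specific.
weight_key_map = {
    "up_gate_proj_expert_weight_key": "mlp.experts.{}.up_gate_proj.weight",
    "down_proj_expert_weight_key": "mlp.experts.{}.down_proj.weight",
}
key = weight_key_map.get("up_gate_proj_expert_weight_key", None)
assert key is not None, "up_gate_proj_expert_weight_key should not be none."
print(key.format(7))  # mlp.experts.7.up_gate_proj.weight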