refactor rl get_name_mappings_to_training (#2847)

* refactor rl get_name_mappings_to_training

* fix tp>1

* change variable names (ffn1 -> up_gate_proj, ffn2 -> down_proj)

* change variable names (linear_weight -> weight, linear_bias -> bias)

* add rl names mapping for vl

* fix ernie 0.3B error

* fix develop code

* fix
Yuanle Liu
2025-07-15 22:31:42 +08:00
committed by GitHub
parent e7bcbbab52
commit 61b3997b85
47 changed files with 1591 additions and 1629 deletions
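The rename standardizes the MoE FFN parameter names on their projection roles: the fused gate/up projection (formerly ffn1) becomes up_gate_proj, and the output projection (formerly ffn2) becomes down_proj, as the FusedMoE diff below shows. For context, a minimal sketch of the kind of inference-to-training name mapping this refactor feeds; the layer prefix and key templates here are illustrative assumptions, not the repository's actual get_name_mappings_to_training output:

def get_rl_name_mappings(num_layers: int) -> dict:
    """Sketch: map inference-side parameter names to training checkpoint keys.

    The prefix and key formats are hypothetical, for illustration only.
    """
    mappings = {}
    for i in range(num_layers):
        prefix = f"model.layers.{i}.mlp"  # hypothetical layer prefix
        # Fused gate/up projection (was "ffn1") and output projection (was "ffn2")
        # now carry their projection-role names on both sides of the mapping.
        mappings[f"{prefix}.up_gate_proj_weight"] = f"{prefix}.experts.up_gate_proj.weight"
        mappings[f"{prefix}.down_proj_weight"] = f"{prefix}.experts.down_proj.weight"
    return mappings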


@@ -145,13 +145,13 @@ class FusedMoE(nn.Layer):
                 shape=gate_correction_bias_shape,
                 dtype="float32",
             )
 
-        ffn1_output_dim = self.moe_intermediate_size * 2
+        up_gate_proj_output_dim = self.moe_intermediate_size * 2
         if self.moe_quant_type in ["fp8", "wint8"]:
-            ffn1_weight_shape = [self.num_local_experts, ffn1_output_dim, self.hidden_size]
-            ffn2_weight_shape = [self.num_local_experts, self.hidden_size, self.moe_intermediate_size]
+            up_gate_proj_weight_shape = [self.num_local_experts, up_gate_proj_output_dim, self.hidden_size]
+            down_proj_weight_shape = [self.num_local_experts, self.hidden_size, self.moe_intermediate_size]
         else:
-            ffn1_weight_shape = [self.num_local_experts, self.hidden_size, ffn1_output_dim]
-            ffn2_weight_shape = [self.num_local_experts, self.moe_intermediate_size, self.hidden_size]
+            up_gate_proj_weight_shape = [self.num_local_experts, self.hidden_size, up_gate_proj_output_dim]
+            down_proj_weight_shape = [self.num_local_experts, self.moe_intermediate_size, self.hidden_size]
 
         # Create parameters
         if self.moe_quant_type == "fp8":
@@ -161,15 +161,15 @@ class FusedMoE(nn.Layer):
             self.weight_dtype = "int8"
             self.init_weight_only_scale()
 
-        # FFN1 parameters
-        self.moe_ffn1_weight = self.create_parameter(
-            shape=ffn1_weight_shape,
+        # up_gate_proj parameters
+        self.up_gate_proj_weight = self.create_parameter(
+            shape=up_gate_proj_weight_shape,
             dtype=self.weight_dtype,
             default_initializer=paddle.nn.initializer.Constant(0),
         )
-        # FFN2 parameters
-        self.moe_ffn2_weight = self.create_parameter(
-            shape=ffn2_weight_shape,
+        # down_proj parameters
+        self.down_proj_weight = self.create_parameter(
+            shape=down_proj_weight_shape,
             dtype=self.weight_dtype,
             default_initializer=paddle.nn.initializer.Constant(0),
         )
@@ -178,44 +178,44 @@ class FusedMoE(nn.Layer):
         """
         Initialize the weight scale.
         """
-        self.moe_ffn1_weight_scale = self.create_parameter(
+        self.up_gate_proj_weight_scale = self.create_parameter(
             shape=[self.num_local_experts, self.moe_intermediate_size * 2],
             dtype=self._dtype,
         )
-        self.moe_ffn2_weight_scale = self.create_parameter(
+        self.down_proj_weight_scale = self.create_parameter(
             shape=[self.num_local_experts, self.hidden_size],
             dtype=self._dtype,
         )
 
     def load_experts_weight(self, state_dict: dict,
-                            ffn1_expert_weight_key: str,
-                            ffn2_expert_weight_key: str):
+                            up_gate_proj_expert_weight_key: str,
+                            down_proj_expert_weight_key: str):
         """
         Load experts weight from state_dict.
 
         Args:
             state_dict (dict): The state_dict of model.
-            ffn1_expert_weight_key (str): The key of ffn1 expert weight.
-            ffn2_expert_weight_key (str): The key of ffn2 expert weight.
+            up_gate_proj_expert_weight_key (str): The key of up_gate_proj expert weight.
+            down_proj_expert_weight_key (str): The key of down_proj expert weight.
         """
-        ffn1_weights = []
-        ffn2_weights = []
-        is_ffn_merged = ffn1_expert_weight_key.format(
+        up_gate_proj_weights = []
+        down_proj_weights = []
+        is_ffn_merged = up_gate_proj_expert_weight_key.format(
             self.expert_id_offset) in state_dict
         if is_ffn_merged:
             for i in range(self.num_local_experts):
                 expert_idx = self.expert_id_offset + i
-                ffn1_weights.append(
+                up_gate_proj_weights.append(
                     get_tensor(
                         state_dict.pop(
-                            ffn1_expert_weight_key.format(expert_idx))))
-                ffn2_weights.append(
+                            up_gate_proj_expert_weight_key.format(expert_idx))))
+                down_proj_weights.append(
                     get_tensor(
                         state_dict.pop(
-                            ffn2_expert_weight_key.format(expert_idx))))
+                            down_proj_expert_weight_key.format(expert_idx))))
         else:
-            gate_expert_weight_key = ffn1_expert_weight_key.replace(
+            gate_expert_weight_key = up_gate_proj_expert_weight_key.replace(
                 "up_gate_proj", "gate_proj")
-            up_expert_weight_key = ffn1_expert_weight_key.replace(
+            up_expert_weight_key = up_gate_proj_expert_weight_key.replace(
                 "up_gate_proj", "up_proj")
             for j in range(self.num_local_experts):
                 expert_idx = self.expert_id_offset + j
@@ -223,12 +223,12 @@ class FusedMoE(nn.Layer):
                     state_dict.pop(gate_expert_weight_key.format(expert_idx)))
                 up = get_tensor(
                     state_dict.pop(up_expert_weight_key.format(expert_idx)))
-                ffn1_weights.append(paddle.concat([gate, up], axis=-1))
-                ffn2_weights.append(
+                up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1))
+                down_proj_weights.append(
                     get_tensor(
                         state_dict.pop(
-                            ffn2_expert_weight_key.format(expert_idx))))
-        return ffn1_weights, ffn2_weights
+                            down_proj_expert_weight_key.format(expert_idx))))
+        return up_gate_proj_weights, down_proj_weights
 
     def extract_moe_ffn_weights(self, state_dict: dict):
         """
@@ -239,30 +239,30 @@ class FusedMoE(nn.Layer):
         Returns:
             tuple: A tuple containing two lists:
-                - ffn1_weights: List of tensors for first FFN layer weights
-                - ffn2_weights: List of tensors for second FFN layer weights
+                - up_gate_proj_weights: List of tensors for first FFN layer weights
+                - down_proj_weights: List of tensors for second FFN layer weights
 
         Raises:
             AssertionError: If required weight keys are missing or number of weights
                 doesn't match number of local experts.
         """
-        ffn1_expert_weight_key = self.weight_key_map.get(
-            "ffn1_expert_weight_key", None)
-        ffn2_expert_weight_key = self.weight_key_map.get(
-            "ffn2_expert_weight_key", None)
-        assert ffn1_expert_weight_key is not None, "ffn1_expert_weight_key should not be none."
-        assert ffn2_expert_weight_key is not None, "ffn2_expert_weight_key should not be none."
+        up_gate_proj_expert_weight_key = self.weight_key_map.get(
+            "up_gate_proj_expert_weight_key", None)
+        down_proj_expert_weight_key = self.weight_key_map.get(
+            "down_proj_expert_weight_key", None)
+        assert up_gate_proj_expert_weight_key is not None, "up_gate_proj_expert_weight_key should not be none."
+        assert down_proj_expert_weight_key is not None, "down_proj_expert_weight_key should not be none."
 
-        ffn1_weights, ffn2_weights = self.load_experts_weight(
-            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
+        up_gate_proj_weights, down_proj_weights = self.load_experts_weight(
+            state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key)
         assert len(
-            ffn1_weights
-        ) == self.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
+            up_gate_proj_weights
+        ) == self.num_local_experts, "up_gate_proj_weights length should be equal to num_local_experts."
         assert len(
-            ffn2_weights
-        ) == self.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
-        return ffn1_weights, ffn2_weights
+            down_proj_weights
+        ) == self.num_local_experts, "down_proj_weights length should be equal to num_local_experts."
+        return up_gate_proj_weights, down_proj_weights
 
     def extract_gate_correction_bias(self, gate_correction_bias_key,
                                      state_dict):
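The merged-versus-split branch is the behavioral heart of load_experts_weight: a checkpoint either stores one fused up_gate_proj tensor per expert, or separate gate_proj and up_proj tensors that must be concatenated along the last axis before use. A minimal self-contained sketch of that dispatch with toy shapes; the key templates and dimensions are assumptions for illustration (the real keys come from weight_key_map), not FastDeploy's actual checkpoint layout:

import paddle

# Toy dimensions, assumed for illustration only.
hidden, inter, num_experts = 8, 16, 2
up_gate_key = "experts.{}.up_gate_proj.weight"  # hypothetical key template

def load_experts(state_dict):
    up_gate, down = [], []
    if up_gate_key.format(0) in state_dict:  # merged checkpoint
        for e in range(num_experts):
            up_gate.append(state_dict.pop(up_gate_key.format(e)))
            down.append(state_dict.pop(f"experts.{e}.down_proj.weight"))
    else:  # split checkpoint: fuse gate_proj and up_proj along the last axis
        for e in range(num_experts):
            gate = state_dict.pop(f"experts.{e}.gate_proj.weight")
            up = state_dict.pop(f"experts.{e}.up_proj.weight")
            up_gate.append(paddle.concat([gate, up], axis=-1))
            down.append(state_dict.pop(f"experts.{e}.down_proj.weight"))
    return up_gate, down

# Split-format checkpoint: gate/up are [hidden, inter], so the fused
# up_gate_proj tensor comes out as [hidden, 2 * inter].
sd = {}
for e in range(num_experts):
    sd[f"experts.{e}.gate_proj.weight"] = paddle.zeros([hidden, inter])
    sd[f"experts.{e}.up_proj.weight"] = paddle.zeros([hidden, inter])
    sd[f"experts.{e}.down_proj.weight"] = paddle.zeros([inter, hidden])
ug, dn = load_experts(sd)
assert ug[0].shape == [hidden, 2 * inter]

The 2x factor in the fused tensor is the same one that appears above as up_gate_proj_output_dim = self.moe_intermediate_size * 2.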