refactor rl get_name_mappings_to_training (#2847)

* refactor rl get_name_mappings_to_training

* fix tp>1

* change variable names (ffn1 -> up_gate_proj, ffn2 -> down_proj)

* change variable names (linear_weight -> weight, linear_bias -> bias)

* add rl names mapping for vl

* fix ernie 0.3B error

* fix develop code

* fix
Yuanle Liu
2025-07-15 22:31:42 +08:00
committed by GitHub
parent e7bcbbab52
commit 61b3997b85
47 changed files with 1591 additions and 1629 deletions
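The rename standardizes the MoE FFN parameter names on their projection roles: the fused gate/up projection (formerly ffn1) becomes up_gate_proj, and the output projection (formerly ffn2) becomes down_proj, as the FusedMoE diff below shows. For context, a minimal sketch of the kind of inference-to-training name mapping this refactor feeds; the layer prefix and key templates here are illustrative assumptions, not the repository's actual get_name_mappings_to_training output:

def get_rl_name_mappings(num_layers: int) -> dict:
    """Sketch: map inference-side parameter names to training checkpoint keys.

    The prefix and key formats are hypothetical, for illustration only.
    """
    mappings = {}
    for i in range(num_layers):
        prefix = f"model.layers.{i}.mlp"  # hypothetical layer prefix
        # Fused gate/up projection (was "ffn1") and output projection (was "ffn2")
        # now carry their projection-role names on both sides of the mapping.
        mappings[f"{prefix}.up_gate_proj_weight"] = f"{prefix}.experts.up_gate_proj.weight"
        mappings[f"{prefix}.down_proj_weight"] = f"{prefix}.experts.down_proj.weight"
    return mappings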


@@ -145,13 +145,13 @@ class FusedMoE(nn.Layer):
                 shape=gate_correction_bias_shape,
                 dtype="float32",
             )
 
-        ffn1_output_dim = self.moe_intermediate_size * 2
+        up_gate_proj_output_dim = self.moe_intermediate_size * 2
         if self.moe_quant_type in ["fp8", "wint8"]:
-            ffn1_weight_shape = [self.num_local_experts, ffn1_output_dim, self.hidden_size]
-            ffn2_weight_shape = [self.num_local_experts, self.hidden_size, self.moe_intermediate_size]
+            up_gate_proj_weight_shape = [self.num_local_experts, up_gate_proj_output_dim, self.hidden_size]
+            down_proj_weight_shape = [self.num_local_experts, self.hidden_size, self.moe_intermediate_size]
         else:
-            ffn1_weight_shape = [self.num_local_experts, self.hidden_size, ffn1_output_dim]
-            ffn2_weight_shape = [self.num_local_experts, self.moe_intermediate_size, self.hidden_size]
+            up_gate_proj_weight_shape = [self.num_local_experts, self.hidden_size, up_gate_proj_output_dim]
+            down_proj_weight_shape = [self.num_local_experts, self.moe_intermediate_size, self.hidden_size]
 
         # Create parameters
         if self.moe_quant_type == "fp8":
@@ -161,15 +161,15 @@ class FusedMoE(nn.Layer):
             self.weight_dtype = "int8"
             self.init_weight_only_scale()
 
-        # FFN1 parameters
-        self.moe_ffn1_weight = self.create_parameter(
-            shape=ffn1_weight_shape,
+        # up_gate_proj parameters
+        self.up_gate_proj_weight = self.create_parameter(
+            shape=up_gate_proj_weight_shape,
             dtype=self.weight_dtype,
             default_initializer=paddle.nn.initializer.Constant(0),
         )
-        # FFN2 parameters
-        self.moe_ffn2_weight = self.create_parameter(
-            shape=ffn2_weight_shape,
+        # down_proj parameters
+        self.down_proj_weight = self.create_parameter(
+            shape=down_proj_weight_shape,
             dtype=self.weight_dtype,
             default_initializer=paddle.nn.initializer.Constant(0),
         )
@@ -178,44 +178,44 @@ class FusedMoE(nn.Layer):
         """
         Initialize the weight scale.
         """
-        self.moe_ffn1_weight_scale = self.create_parameter(
+        self.up_gate_proj_weight_scale = self.create_parameter(
             shape=[self.num_local_experts, self.moe_intermediate_size * 2],
             dtype=self._dtype,
         )
-        self.moe_ffn2_weight_scale = self.create_parameter(
+        self.down_proj_weight_scale = self.create_parameter(
             shape=[self.num_local_experts, self.hidden_size],
             dtype=self._dtype,
         )
 
     def load_experts_weight(self, state_dict: dict,
-                            ffn1_expert_weight_key: str,
-                            ffn2_expert_weight_key: str):
+                            up_gate_proj_expert_weight_key: str,
+                            down_proj_expert_weight_key: str):
         """
         Load experts weight from state_dict.
 
         Args:
             state_dict (dict): The state_dict of model.
-            ffn1_expert_weight_key (str): The key of ffn1 expert weight.
-            ffn2_expert_weight_key (str): The key of ffn2 expert weight.
+            up_gate_proj_expert_weight_key (str): The key of up_gate_proj expert weight.
+            down_proj_expert_weight_key (str): The key of down_proj expert weight.
         """
-        ffn1_weights = []
-        ffn2_weights = []
-        is_ffn_merged = ffn1_expert_weight_key.format(
+        up_gate_proj_weights = []
+        down_proj_weights = []
+        is_ffn_merged = up_gate_proj_expert_weight_key.format(
             self.expert_id_offset) in state_dict
         if is_ffn_merged:
             for i in range(self.num_local_experts):
                 expert_idx = self.expert_id_offset + i
-                ffn1_weights.append(
+                up_gate_proj_weights.append(
                     get_tensor(
                         state_dict.pop(
-                            ffn1_expert_weight_key.format(expert_idx))))
-                ffn2_weights.append(
+                            up_gate_proj_expert_weight_key.format(expert_idx))))
+                down_proj_weights.append(
                     get_tensor(
                         state_dict.pop(
-                            ffn2_expert_weight_key.format(expert_idx))))
+                            down_proj_expert_weight_key.format(expert_idx))))
         else:
-            gate_expert_weight_key = ffn1_expert_weight_key.replace(
+            gate_expert_weight_key = up_gate_proj_expert_weight_key.replace(
                 "up_gate_proj", "gate_proj")
-            up_expert_weight_key = ffn1_expert_weight_key.replace(
+            up_expert_weight_key = up_gate_proj_expert_weight_key.replace(
                 "up_gate_proj", "up_proj")
             for j in range(self.num_local_experts):
                 expert_idx = self.expert_id_offset + j
@@ -223,12 +223,12 @@ class FusedMoE(nn.Layer):
                     state_dict.pop(gate_expert_weight_key.format(expert_idx)))
                 up = get_tensor(
                     state_dict.pop(up_expert_weight_key.format(expert_idx)))
-                ffn1_weights.append(paddle.concat([gate, up], axis=-1))
-                ffn2_weights.append(
+                up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1))
+                down_proj_weights.append(
                     get_tensor(
                         state_dict.pop(
-                            ffn2_expert_weight_key.format(expert_idx))))
-        return ffn1_weights, ffn2_weights
+                            down_proj_expert_weight_key.format(expert_idx))))
+        return up_gate_proj_weights, down_proj_weights
 
     def extract_moe_ffn_weights(self, state_dict: dict):
         """
@@ -239,30 +239,30 @@ class FusedMoE(nn.Layer):
         Returns:
             tuple: A tuple containing two lists:
-                - ffn1_weights: List of tensors for first FFN layer weights
-                - ffn2_weights: List of tensors for second FFN layer weights
+                - up_gate_proj_weights: List of tensors for first FFN layer weights
+                - down_proj_weights: List of tensors for second FFN layer weights
 
         Raises:
             AssertionError: If required weight keys are missing or number of weights
                 doesn't match number of local experts.
         """
-        ffn1_expert_weight_key = self.weight_key_map.get(
-            "ffn1_expert_weight_key", None)
-        ffn2_expert_weight_key = self.weight_key_map.get(
-            "ffn2_expert_weight_key", None)
-        assert ffn1_expert_weight_key is not None, "ffn1_expert_weight_key should not be none."
-        assert ffn2_expert_weight_key is not None, "ffn2_expert_weight_key should not be none."
+        up_gate_proj_expert_weight_key = self.weight_key_map.get(
+            "up_gate_proj_expert_weight_key", None)
+        down_proj_expert_weight_key = self.weight_key_map.get(
+            "down_proj_expert_weight_key", None)
+        assert up_gate_proj_expert_weight_key is not None, "up_gate_proj_expert_weight_key should not be none."
+        assert down_proj_expert_weight_key is not None, "down_proj_expert_weight_key should not be none."
 
-        ffn1_weights, ffn2_weights = self.load_experts_weight(
-            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
+        up_gate_proj_weights, down_proj_weights = self.load_experts_weight(
+            state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key)
         assert len(
-            ffn1_weights
-        ) == self.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
+            up_gate_proj_weights
+        ) == self.num_local_experts, "up_gate_proj_weights length should be equal to num_local_experts."
         assert len(
-            ffn2_weights
-        ) == self.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
-        return ffn1_weights, ffn2_weights
+            down_proj_weights
+        ) == self.num_local_experts, "down_proj_weights length should be equal to num_local_experts."
+        return up_gate_proj_weights, down_proj_weights
 
     def extract_gate_correction_bias(self, gate_correction_bias_key,
                                      state_dict):
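The merged-versus-split branch is the behavioral heart of load_experts_weight: a checkpoint either stores one fused up_gate_proj tensor per expert, or separate gate_proj and up_proj tensors that must be concatenated along the last axis before use. A minimal self-contained sketch of that dispatch with toy shapes; the key templates and dimensions are assumptions for illustration (the real keys come from weight_key_map), not FastDeploy's actual checkpoint layout:

import paddle

# Toy dimensions, assumed for illustration only.
hidden, inter, num_experts = 8, 16, 2
up_gate_key = "experts.{}.up_gate_proj.weight"  # hypothetical key template

def load_experts(state_dict):
    up_gate, down = [], []
    if up_gate_key.format(0) in state_dict:  # merged checkpoint
        for e in range(num_experts):
            up_gate.append(state_dict.pop(up_gate_key.format(e)))
            down.append(state_dict.pop(f"experts.{e}.down_proj.weight"))
    else:  # split checkpoint: fuse gate_proj and up_proj along the last axis
        for e in range(num_experts):
            gate = state_dict.pop(f"experts.{e}.gate_proj.weight")
            up = state_dict.pop(f"experts.{e}.up_proj.weight")
            up_gate.append(paddle.concat([gate, up], axis=-1))
            down.append(state_dict.pop(f"experts.{e}.down_proj.weight"))
    return up_gate, down

# Split-format checkpoint: gate/up are [hidden, inter], so the fused
# up_gate_proj tensor comes out as [hidden, 2 * inter].
sd = {}
for e in range(num_experts):
    sd[f"experts.{e}.gate_proj.weight"] = paddle.zeros([hidden, inter])
    sd[f"experts.{e}.up_proj.weight"] = paddle.zeros([hidden, inter])
    sd[f"experts.{e}.down_proj.weight"] = paddle.zeros([inter, hidden])
ug, dn = load_experts(sd)
assert ug[0].shape == [hidden, 2 * inter]

The 2x factor in the fused tensor is the same one that appears above as up_gate_proj_output_dim = self.moe_intermediate_size * 2.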