From bfeb664ab8573c2ac2d257a0b0e9a6def180f9d0 Mon Sep 17 00:00:00 2001
From: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
Date: Thu, 24 Jul 2025 00:16:42 +0800
Subject: [PATCH] Load MoE expert weights by key via the safetensors index (#2978)

---
 fastdeploy/model_executor/layers/moe/moe.py   | 44 ++++++++++++++++---
 fastdeploy/model_executor/layers/utils.py     | 27 ++----------
 .../model_executor/load_weight_utils.py       | 14 ++++++
 3 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index 6ea31642f..b573ccb0d 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -228,19 +228,53 @@ class FusedMoE(nn.Layer):
         if is_ffn_merged:
             for i in range(self.num_local_experts):
                 expert_idx = self.expert_id_offset + i
+                down_proj_expert_weight_key_name = down_proj_expert_weight_key.format(expert_idx)
+                up_gate_proj_expert_weight_key_name = up_gate_proj_expert_weight_key.format(expert_idx)
                 up_gate_proj_weights.append(
-                    get_tensor(state_dict.pop(up_gate_proj_expert_weight_key.format(expert_idx)))
+                    get_tensor(
+                        state_dict.pop(up_gate_proj_expert_weight_key_name)
+                        if up_gate_proj_expert_weight_key_name in state_dict
+                        else up_gate_proj_expert_weight_key_name,
+                        self.fd_config.parallel_config.model_name_or_path,
+                    )
+                )
+                down_proj_weights.append(
+                    get_tensor(
+                        state_dict.pop(down_proj_expert_weight_key_name)
+                        if down_proj_expert_weight_key_name in state_dict
+                        else down_proj_expert_weight_key_name,
+                        self.fd_config.parallel_config.model_name_or_path,
+                    )
                 )
-                down_proj_weights.append(get_tensor(state_dict.pop(down_proj_expert_weight_key.format(expert_idx))))
         else:
             gate_expert_weight_key = up_gate_proj_expert_weight_key.replace("up_gate_proj", "gate_proj")
             up_expert_weight_key = up_gate_proj_expert_weight_key.replace("up_gate_proj", "up_proj")
             for j in range(self.num_local_experts):
                 expert_idx = self.expert_id_offset + j
-                gate = get_tensor(state_dict.pop(gate_expert_weight_key.format(expert_idx)))
-                up = get_tensor(state_dict.pop(up_expert_weight_key.format(expert_idx)))
+                gate_expert_weight_key_name = gate_expert_weight_key.format(expert_idx)
+                up_expert_weight_key_name = up_expert_weight_key.format(expert_idx)
+                down_proj_expert_weight_key_name = down_proj_expert_weight_key.format(expert_idx)
+                gate = get_tensor(
+                    state_dict.pop(gate_expert_weight_key_name)
+                    if gate_expert_weight_key_name in state_dict
+                    else gate_expert_weight_key_name,
+                    self.fd_config.parallel_config.model_name_or_path,
+                )
+                up = get_tensor(
+                    state_dict.pop(up_expert_weight_key_name)
+                    if up_expert_weight_key_name in state_dict
+                    else up_expert_weight_key_name,
+                    self.fd_config.parallel_config.model_name_or_path,
+                )
                 up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1))
-                down_proj_weights.append(get_tensor(state_dict.pop(down_proj_expert_weight_key.format(expert_idx))))
+                down_proj_weights.append(
+                    get_tensor(
+                        state_dict.pop(down_proj_expert_weight_key_name)
+                        if down_proj_expert_weight_key_name in state_dict
+                        else down_proj_expert_weight_key_name,
+                        self.fd_config.parallel_config.model_name_or_path,
+                    )
+                )
         return up_gate_proj_weights, down_proj_weights
 
     def extract_moe_ffn_weights(self, state_dict: dict):
diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py
index fa057965f..75171982f 100644
--- a/fastdeploy/model_executor/layers/utils.py
+++ b/fastdeploy/model_executor/layers/utils.py
@@ -37,7 +37,6 @@ if current_platform.is_cuda() and current_platform.available():
             "And ensure the Paddle version supports FastDeploy's custom operators"
         )
 
-import re
 
 from fastdeploy import envs
 
@@ -107,7 +106,7 @@ def _set_var_distributed(var: Tensor, split_axis: int):
         main_block._find_var_recursive(var.name).is_distributed = True
 
 
-def get_tensor(input: Union[paddle.Tensor, np.ndarray, str]) -> paddle.Tensor:
+def get_tensor(input: Union[paddle.Tensor, np.ndarray, str], model_path=None) -> paddle.Tensor:
     """
     Return a corresponding PaddlePaddle tensor based on the type and content of the input.
 
@@ -125,28 +124,9 @@ def get_tensor(input: Union[paddle.Tensor, np.ndarray, str]) -> paddle.Tensor:
     elif isinstance(input, np.ndarray):
         return paddle.to_tensor(input)
     elif isinstance(input, str):
-        if ".safetensors" in input:
-            match = re.match(r"\[(.*?)\](.*)", input)
-            if match:
-                key_name = match.group(1)
-                model_path = match.group(2)
-            from safetensors import safe_open
+        from fastdeploy.model_executor.load_weight_utils import load_reordered_experts
 
-            with safe_open(model_path, framework="np", device="cpu") as f:
-                if key_name in f.keys():
-                    weight = f.get_tensor(key_name)
-                    weight = paddle.Tensor(weight, zero_copy=True)
-                    weight = weight._copy_to(paddle.framework._current_expected_place(), False)
-                    return weight
-                else:
-                    return None
-        else:
-            if cache_params != "none":
-                tmp_key = input.split("/")[-1]
-                if tmp_key in c8_state_dict:
-                    print(f"Loading {tmp_key} in extra C8_state_dict")
-                    return paddle.to_tensor(c8_state_dict.pop(tmp_key))
-            return paddle.load(input)
+        return load_reordered_experts(model_path, input)
     else:
         return input
 
@@ -377,6 +357,7 @@ def create_and_set_parameter(layer: nn.Layer, name: str, tensor: paddle.Tensor):
     )
     getattr(layer, name).set_value(tensor)
 
+
 @functools.cache
 def create_empty_tensor(shape: Tuple[int, ...], dtype: Union[paddle.dtype, str]) -> paddle.Tensor:
     """
diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py
index f5ed5543e..f04dc9b81 100644
--- a/fastdeploy/model_executor/load_weight_utils.py
+++ b/fastdeploy/model_executor/load_weight_utils.py
@@ -32,6 +32,20 @@ from fastdeploy.model_executor.models.tp_utils import (
 from fastdeploy.platforms import current_platform
 
 
+def load_reordered_experts(model_path: str, key_name: str):
+    """Return tensor `key_name` from the shard named by the index in `model_path`; None if the shard lacks it."""
+    from safetensors import safe_open
+
+    with open(os.path.join(model_path, "model.safetensors.index.json"), "r") as index_file:
+        shard_name = json.load(index_file)["weight_map"][key_name]
+    with safe_open(os.path.join(model_path, shard_name), framework="np", device="cpu") as shard:
+        if key_name not in shard.keys():
+            return None
+        host_tensor = paddle.Tensor(shard.get_tensor(key_name), zero_copy=True)
+        # Copy to the place Paddle currently expects; the trailing False is passed through unchanged.
+        return host_tensor._copy_to(paddle.framework._current_expected_place(), False)
+
+
 def load_ep_checkpoint(model_path: str, fd_config: FDConfig, return_numpy: bool = False):
     """
     load ep checkpoint