From 9afa236e39128f4ad0f0a65fc439d176e04309a5 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Tue, 26 Aug 2025 16:19:30 +0800 Subject: [PATCH] [NewFeatures] support eplb (#3547) * [NewFeatures] support eplb * fix eplb --- .../backends/dcu/fused_moe_triton_backends.py | 4 +- .../gcu/moe/fused_moe_method_gcu_backend.py | 6 +- .../backends/gcu/quantization/weight_only.py | 2 +- .../moe/fused_moe_triton_metax_backend.py | 4 +- .../layers/moe/fused_moe_cutlass_backend.py | 109 ++++++++++++++---- .../layers/moe/fused_moe_deepgemm_backend.py | 8 +- .../layers/moe/fused_moe_marlin_backend.py | 2 +- .../layers/moe/fused_moe_triton_backend.py | 10 +- .../layers/moe/fused_moe_wint2_backend.py | 2 +- .../layers/moe/fused_moe_xpu_backend.py | 4 +- fastdeploy/model_executor/layers/moe/moe.py | 34 +++--- .../layers/quantization/block_wise_fp8.py | 2 +- .../layers/quantization/tensor_wise_fp8.py | 2 +- .../layers/quantization/weight_only.py | 2 +- .../model_executor/load_weight_utils.py | 6 +- .../model_executor/models/ernie4_5_moe.py | 40 ++++++- .../models/ernie4_5_vl/ernie4_5_vl_moe.py | 4 +- 17 files changed, 174 insertions(+), 67 deletions(-) diff --git a/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py b/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py index 0a6c31b06..cd6a51161 100644 --- a/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py +++ b/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py @@ -38,7 +38,7 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase): "down_proj_weight_scale", ] - def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None: + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False) -> None: """process_prequanted_weights""" pass @@ -46,7 +46,7 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase): """ Triton MoE create weight process. 
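The same two-part change recurs in every backend this patch touches: process_prequanted_weights gains a trailing is_rearrange flag (defaulting to False so existing call sites are unaffected), and extract_moe_ffn_weights now returns four values, with backends that do not use the expert placement discarding the last two. A minimal sketch of the pattern, assuming a QuantMethodBase-style backend (the class name is illustrative only):

    import paddle.nn as nn

    class ExampleMoEMethod:  # hypothetical backend, for illustration only
        def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False) -> None:
            # False keeps pre-EPLB callers working; the EPLB reload path passes True.
            pass

        def create_weights(self, layer: nn.Layer, state_dict):
            # The logical expert ids and the rank-to-expert table are ignored here.
            up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
            assert len(up_gate_proj_weights) == layer.num_local_experts
            assert len(down_proj_weights) == layer.num_local_experts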
""" - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts assert self.quant_method.name() == "wint8" diff --git a/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py b/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py index cf7462e26..c899cafc7 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py @@ -49,7 +49,7 @@ class GCUFusedMoeMethod(UnquantizedFusedMoEMethod): self.group_size = -1 def process_loaded_weights(self, layer: nn.Layer, state_dict): - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0) stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0) layer.up_gate_proj_weight.set_value(paddle.transpose(stacked_up_gate_proj_weights, [0, 2, 1])) @@ -254,7 +254,7 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): self.quant_multi_process_group_size = int(os.getenv("FD_MOE_QUANT_MULTI_PROCESS_GROUP_SIZE", 8)) logger.info(f"GCUWeightOnlyMoEMethod quant_multi_process_group_size: {self.quant_multi_process_group_size}") - def process_prequanted_weights(self, layer: nn.Layer, state_dict): + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False): """ Paddle gcu process prequanted weights. """ @@ -299,7 +299,7 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): """ Paddle cutlass create weight process. 
""" - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) self.check(layer, up_gate_proj_weights, down_proj_weights) def quant_worker(p_group_idx, shared_dict, weights, moe_quant_type, group_size): diff --git a/fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py b/fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py index 9aebf64ce..bfebcd247 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py @@ -59,7 +59,7 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod): is_bias=False, ) - def process_prequanted_weights(self, layer, state_dict) -> None: + def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = False) -> None: """ Process pre-quantized weights before applying them to the model Args: diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py index e4bd40021..e945a189a 100644 --- a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py +++ b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py @@ -41,7 +41,7 @@ class MetaxTritonWeightOnlyMoEMethod(QuantMethodBase): "down_proj_weight_scale", ] - def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None: + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False) -> None: """process_prequanted_weights""" pass @@ -50,7 +50,7 @@ class MetaxTritonWeightOnlyMoEMethod(QuantMethodBase): """ Triton MoE create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index bea2e130b..d111a0426 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -74,7 +74,9 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod): """ def process_loaded_weights(self, layer: nn.Layer, state_dict): - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( + layer.extract_moe_ffn_weights(state_dict) + ) stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0) stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0) @@ -333,7 +335,7 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): self.moe_quant_type = "w4a8" self.pack_num = 2 - def process_prequanted_weights(self, layer: nn.Layer, state_dict): + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False): """ Paddle cutlass process prequanted weights. 
""" @@ -349,6 +351,7 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key, + is_rearrange, ) ) @@ -358,22 +361,62 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): up_gate_proj_in_scale = [] down_proj_in_scale = [] + if isinstance(state_dict, list): + state_dict = dict(state_dict) + if layer.ep_size > 1: for expert_idx in ep_rank_to_expert_id_list: - scale_tensor = get_tensor(state_dict[up_gate_proj_expert_in_scale_key.format(expert_idx)]) + scale_tensor = get_tensor( + ( + state_dict[up_gate_proj_expert_in_scale_key.format(expert_idx)] + if up_gate_proj_expert_in_scale_key.format(expert_idx) in state_dict + else up_gate_proj_expert_in_scale_key.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) up_gate_proj_in_scale_all_experts.append(scale_tensor) for expert_idx in logical_expert_ids: up_gate_proj_weight_scale.append( - get_tensor(state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx))) + get_tensor( + ( + state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx)) + if up_gate_proj_expert_weight_scale_key.format(expert_idx) in state_dict + else up_gate_proj_expert_weight_scale_key.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) ) down_proj_weight_scale.append( - get_tensor(state_dict.pop(down_proj_expert_weight_scale_key.format(expert_idx))) + get_tensor( + ( + state_dict.pop(down_proj_expert_weight_scale_key.format(expert_idx)) + if down_proj_expert_weight_scale_key.format(expert_idx) in state_dict + else down_proj_expert_weight_scale_key.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) ) up_gate_proj_in_scale.append( - get_tensor(state_dict.pop(up_gate_proj_expert_in_scale_key.format(expert_idx))) + get_tensor( + ( + state_dict.pop(up_gate_proj_expert_in_scale_key.format(expert_idx)) + if up_gate_proj_expert_in_scale_key.format(expert_idx) in state_dict + else up_gate_proj_expert_in_scale_key.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) + ) + down_proj_in_scale.append( + get_tensor( + ( + state_dict.pop(down_proj_expert_in_scale_key.format(expert_idx)) + if down_proj_expert_in_scale_key.format(expert_idx) in state_dict + else down_proj_expert_in_scale_key.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) ) - down_proj_in_scale.append(get_tensor(state_dict.pop(down_proj_expert_in_scale_key.format(expert_idx)))) up_gate_proj_weight = paddle.stack(up_gate_proj_weights, axis=0) down_proj_weight = paddle.stack(down_proj_weights, axis=0) @@ -435,7 +478,9 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): """ Paddle cutlass load weight process. 
""" - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( + layer.extract_moe_ffn_weights(state_dict) + ) self.check(layer, up_gate_proj_weights, down_proj_weights) for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]): weight_name = self.added_weight_attrs[idx] @@ -446,7 +491,9 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): quanted_weight = paddle.stack(weight_list, axis=0) getattr(layer, weight_name).set_value(quanted_weight) - self.load_w4a8_scale_weights(layer, layer.weight_key_map, state_dict) + self.load_w4a8_scale_weights( + layer, layer.weight_key_map, state_dict, logical_expert_ids, ep_rank_to_expert_id_list + ) def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict): """ @@ -499,7 +546,14 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): ), ) - def load_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict, state_dict: dict): + def load_w4a8_scale_weights( + self, + layer: nn.Layer, + weight_key_map: dict, + state_dict: dict, + logical_expert_ids: paddle.Tensor, + ep_rank_to_expert_id_list: list, + ): """ Get w4a8 weights from state dict and process them. Args: @@ -508,8 +562,15 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): state_dict (dict): The state dict. """ - def _extract_scale_tensor(state_dict, key_template, expert_idx): - return get_tensor(state_dict.pop(key_template.format(expert_idx))) + def _extract_scale_tensor(layer: nn.Layer, state_dict, key_template, expert_idx): + return get_tensor( + ( + state_dict.pop(key_template.format(expert_idx)) + if key_template.format(expert_idx) in state_dict + else key_template.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) def _process_in_scale(name: str, in_scales: list[paddle.Tensor]): processed_in_scale = 1 / paddle.concat(in_scales) @@ -551,17 +612,23 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): # 2. Extract scale tensor from state dict if layer.ep_size > 1: - for expert_idx in range(layer.num_experts): - scale_tensor = get_tensor(state_dict[scale_key_map["up_gate_proj_in_scale"].format(expert_idx)]) + for expert_idx in ep_rank_to_expert_id_list: + scale_tensor = get_tensor( + ( + state_dict[scale_key_map["up_gate_proj_in_scale"].format(expert_idx)] + if scale_key_map["up_gate_proj_in_scale"].format(expert_idx) in state_dict + else scale_key_map["up_gate_proj_in_scale"].format(expert_idx) + ), + layer.fd_config.model_config.model, + ) up_gate_proj_in_scales_all_experts.append(1 / scale_tensor) getattr(layer, "up_gate_proj_in_scale_all_experts").set_value( paddle.concat(up_gate_proj_in_scales_all_experts) ) - for local_expert_idx in range(layer.num_local_experts): - expert_idx = local_expert_idx + layer.expert_id_offset + for expert_idx in logical_expert_ids: for name, scale_key_template in scale_key_map.items(): - scale_tensor = _extract_scale_tensor(state_dict, scale_key_template, expert_idx) + scale_tensor = _extract_scale_tensor(layer, state_dict, scale_key_template, expert_idx) scale_weight_map[name].append(scale_tensor) # 3. Process scale tensor and set to layer @@ -845,7 +912,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod): self.moe_quant_type = self.quant_config.algo self.pack_num = 1 - def process_prequanted_weights(self, layer: nn.Layer, state_dict): + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False): """ Paddle cutlass process prequanted weights. 
""" @@ -855,9 +922,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod): down_proj_expert_weight_scale_key = layer.weight_key_map.get("down_proj_expert_weight_scale_key", None) up_gate_proj_weights, down_proj_weights, logical_expert_ids, _ = layer.load_experts_weight( - state_dict, - up_gate_proj_expert_weight_key, - down_proj_expert_weight_key, + state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key, is_rearrange ) # self.check(layer, up_gate_proj_weights, down_proj_weights) up_gate_proj_weight_scale = [] @@ -1065,7 +1130,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod): """ Paddle cutlass load weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) self.check(layer, up_gate_proj_weights, down_proj_weights) for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]): weight_name = self.added_weight_attrs[idx] diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 9c2c2e9bd..b53302093 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -99,7 +99,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): """ deepgemm create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) self.check(layer, up_gate_proj_weights, down_proj_weights) @@ -124,7 +124,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): quanted_weight_scale = quanted_weight_scale.transpose([0, 2, 1]).contiguous() getattr(layer, scale_name).set_value(quanted_weight_scale) - def process_prequanted_weights(self, layer: nn.Layer, state_dict): + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False): """ Paddle cutlass process prequanted weights. """ @@ -134,9 +134,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): down_proj_expert_weight_scale_key = layer.weight_key_map.get("down_proj_expert_weight_scale_key", None) up_gate_proj_weights, down_proj_weights, logical_expert_ids, _ = layer.load_experts_weight( - state_dict, - up_gate_proj_expert_weight_key, - down_proj_expert_weight_key, + state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key, is_rearrange ) # self.check(layer, up_gate_proj_weights, down_proj_weights) up_gate_proj_weight_scale = [] diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py index fb05fff09..f41c1614e 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py @@ -197,7 +197,7 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): """ Marlin MoE load weight process. 
""" - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts assert up_gate_proj_weights[0].shape == [ diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index 8af8e9859..5526df882 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -48,7 +48,7 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase): "down_proj_weight_scale", ] - def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None: + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False) -> None: """process_prequanted_weights""" pass @@ -112,7 +112,7 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase): """ Triton MoE load weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts @@ -311,7 +311,7 @@ class TensorWiseFP8MoEMethod(QuantMethodBase): "down_proj_in_scale", ] - def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None: + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False) -> None: """process_prequanted_weights""" up_gate_proj_tensor, down_proj_tensor = layer.extract_moe_ffn_weights(state_dict) @@ -595,7 +595,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase): "down_proj_weight_scale", ] - def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None: + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False) -> None: """process_prequanted_weights""" raise NotImplementedError @@ -667,7 +667,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase): """ Triton MoE create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) self.check(layer, up_gate_proj_weights, down_proj_weights) for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]): diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py index b230d9e5e..20d8e5196 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py @@ -73,7 +73,7 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod): """ pass - def process_prequanted_weights(self, layer: nn.Layer, state_dict): + def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False): """ Paddle cutlass process prequanted weights. 
""" diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py index 190e8d425..80de3d3a0 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py @@ -34,7 +34,7 @@ class XPUMoEMethod(UnquantizedFusedMoEMethod): def process_loaded_weights(self, layer: nn.Layer, state_dict): - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) for weights in [up_gate_proj_weights, down_proj_weights]: for idx, weight in enumerate(weights): weights[idx] = weight.transpose([1, 0]) @@ -119,7 +119,7 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase): """ Paddle cutlass create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts assert up_gate_proj_weights[0].shape == [ diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 1de42ecf1..beb9bcade 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -80,6 +80,7 @@ class FusedMoE(nn.Layer): layer_idx: int = -1, moe_tag: str = "", gate_correction_bias=None, + redundant_table_manger: RedundantExpertManger = None, weight_key_map: dict = {}, ): """ @@ -147,15 +148,8 @@ class FusedMoE(nn.Layer): self.moe_quant_type = moe_quant_config.name() else: self.quant_method = get_moe_method() - self.redundant_table_manger = None + self.redundant_table_manger = redundant_table_manger if self.ep_size > 1: - if fd_config.model_config.enable_redundant_experts is True: - self.redundant_table_manger = RedundantExpertManger( - n_routed_experts=fd_config.model_config.moe_num_experts, - num_hidden_layers=fd_config.model_config.num_hidden_layers, - redundant_experts_num=fd_config.model_config.redundant_experts_num, - ep_size=self.ep_size, - ) self.quant_method.init_ep(self) if fd_config.load_config.dynamic_load_weight: @@ -423,6 +417,7 @@ class FusedMoE(nn.Layer): state_dict: dict, up_gate_proj_expert_weight_key: str, down_proj_expert_weight_key: str, + is_rearrange: bool = False, ): """ Load experts weight from state_dict. @@ -451,7 +446,12 @@ class FusedMoE(nn.Layer): ] up_gate_proj_weights = [] down_proj_weights = [] - is_ffn_merged = up_gate_proj_expert_weight_key.format(self.expert_id_offset) in state_dict + if isinstance(state_dict, list): + state_dict = dict(state_dict) + is_ffn_merged = ( + up_gate_proj_expert_weight_key.format(logical_expert_ids[0] if is_rearrange else self.expert_id_offset) + in state_dict + ) if is_ffn_merged: for expert_idx in logical_expert_ids: down_proj_expert_weight_key_name = down_proj_expert_weight_key.format(expert_idx) @@ -533,10 +533,12 @@ class FusedMoE(nn.Layer): assert up_gate_proj_expert_weight_key is not None, "up_gate_proj_expert_weight_key should not be none." assert down_proj_expert_weight_key is not None, "down_proj_expert_weight_key should not be none." 
- up_gate_proj_weights, down_proj_weights, logical_expert_ids, _ = self.load_experts_weight( - state_dict, - up_gate_proj_expert_weight_key, - down_proj_expert_weight_key, + up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( + self.load_experts_weight( + state_dict, + up_gate_proj_expert_weight_key, + down_proj_expert_weight_key, + ) ) assert ( len(up_gate_proj_weights) == self.num_local_experts @@ -545,7 +547,7 @@ class FusedMoE(nn.Layer): len(down_proj_weights) == self.num_local_experts ), "down_proj_weights length should be equal to num_local_experts." - return up_gate_proj_weights, down_proj_weights + return up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list def extract_gate_correction_bias(self, gate_correction_bias_key, state_dict): """ @@ -561,7 +563,7 @@ class FusedMoE(nn.Layer): if is_supported_moe_backend is not None and is_supported_moe_backend(self.quant_method): if self.fd_config.model_config.is_quantized: if getattr(self.fd_config.quant_config, "is_permuted", True): - self.quant_method.process_prequanted_weights(self, state_dict) + self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange) else: self.quant_method.process_loaded_weights(self, state_dict) else: @@ -569,7 +571,7 @@ class FusedMoE(nn.Layer): else: if self.fd_config.model_config.is_quantized: if getattr(self.fd_config.quant_config, "is_permuted", True): - self.quant_method.process_prequanted_weights(self, state_dict) + self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange) else: self.quant_method.create_weights(self, state_dict) else: diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index a003e1888..f76ff8ca4 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -108,7 +108,7 @@ class BlockWiseFP8LinearMethod(QuantMethodBase): layer.weight.copy_(quanted_weight_tensor, False) layer.weight_scale.set_value(weight_block_scale_tensor) - def process_prequanted_weights(self, layer, state_dict): + def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = False): """ process_prequanted_weights """ diff --git a/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py index 9576882ec..cf7161437 100644 --- a/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py @@ -90,7 +90,7 @@ class TensorWiseFP8LinearMethod(QuantMethodBase): default_initializer=paddle.nn.initializer.Constant(0), ) - def process_prequanted_weights(self, layer, state_dict) -> None: + def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = False) -> None: """ Process pre-quantized weights before applying them to the model Args: diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py index 6e4c6f34b..40161eb5d 100644 --- a/fastdeploy/model_executor/layers/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/quantization/weight_only.py @@ -305,7 +305,7 @@ class GPUWeightOnlyLinearMethod(WeightOnlyLinearMethod): ) -> None: super().__init__(quant_config) - def process_prequanted_weights(self, layer, state_dict) -> None: + def process_prequanted_weights(self, layer, state_dict, 
is_rearrange: bool = False) -> None: """ Process pre-quantized weights before applying them to the model Args: diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py index 25b4dde28..ce7bb8ac4 100644 --- a/fastdeploy/model_executor/load_weight_utils.py +++ b/fastdeploy/model_executor/load_weight_utils.py @@ -127,7 +127,11 @@ def load_ep_checkpoint(model_path: str, fd_config: FDConfig, return_numpy: bool num_local_ffn_keys.append(down_proj_in_scale_key) # for EP w4a8, we need all expert's activation_scale for up_gate_proj - for j in range(fd_config.model_config.moe_num_experts): + num_experts = fd_config.model_config.moe_num_experts + if isinstance(num_experts, list): + num_experts = num_experts[0] + + for j in range(num_experts): up_gate_proj_in_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.up_gate_proj.activation_scale" num_local_ffn_keys.append(up_gate_proj_in_scale_key) diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 8c497996b..c04e67d4e 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -49,6 +49,7 @@ from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid from fastdeploy.model_executor.models.utils import WeightMeta +from fastdeploy.worker.experts_manager import RedundantExpertManger class Ernie4_5_MLP(nn.Layer): @@ -97,7 +98,9 @@ class Ernie4_5_MLP(nn.Layer): class Ernie4_5_MoE(nn.Layer): - def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: + def __init__( + self, fd_config: FDConfig, layer_id: int, prefix: str, redundant_table_manger: RedundantExpertManger = None + ) -> None: super().__init__() moe_quant_type = "" if hasattr(fd_config.quant_config, "moe_quant_type"): @@ -175,6 +178,7 @@ class Ernie4_5_MoE(nn.Layer): top_k=fd_config.model_config.moe_k, layer_idx=layer_id, gate_correction_bias=None, + redundant_table_manger=redundant_table_manger, weight_key_map=weight_key_map, ) @@ -209,6 +213,9 @@ class Ernie4_5_MoE(nn.Layer): if self.num_shared_experts > 0: self.shared_experts.load_state_dict(state_dict) + def update_state_dict(self, state_dict): + self.fused_moe.load_state_dict(state_dict, True) + def split_allgather_out(self, hidden_states: paddle.Tensor, token_num: int): token_num_per_rank = (token_num + self.tensor_parallel_size - 1) // self.tensor_parallel_size # AllGather will hang when the data shapes on multi-ranks are different! 
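On the model side the reload path is exposed through a new update_state_dict hook: the MoE block re-invokes FusedMoE.load_state_dict with its second argument set to True, which (assuming that argument is the is_rearrange flag, as the call above suggests) reaches the rearrange-aware loaders shown earlier. A condensed view of the chain; the real dispatch in moe.py also checks is_permuted and the backend's capabilities:

    import paddle.nn as nn

    class Ernie4_5_MoE(nn.Layer):
        def update_state_dict(self, state_dict):
            # Reload expert weights after an EPLB rebalance; True -> is_rearrange.
            self.fused_moe.load_state_dict(state_dict, True)

    class FusedMoE(nn.Layer):
        def load_state_dict(self, state_dict, is_rearrange: bool = False):
            if self.fd_config.model_config.is_quantized:
                self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
            else:
                self.quant_method.process_loaded_weights(self, state_dict)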
@@ -287,6 +294,7 @@ class Ernie4_5_DecoderLayer(nn.Layer): def __init__( self, fd_config: FDConfig, + redundant_table_manger: RedundantExpertManger = None, prefix: str = "", ) -> None: super().__init__() @@ -305,6 +313,7 @@ class Ernie4_5_DecoderLayer(nn.Layer): self.mlp = Ernie4_5_MoE( fd_config=fd_config, layer_id=layer_id, + redundant_table_manger=redundant_table_manger, prefix=f"{prefix}.mlp", ) else: @@ -334,6 +343,9 @@ class Ernie4_5_DecoderLayer(nn.Layer): self.input_layernorm.load_state_dict(state_dict) self.post_attention_layernorm.load_state_dict(state_dict) + def update_state_dict(self, state_dict): + self.mlp.update_state_dict(state_dict) + def forward( self, forward_meta: ForwardMeta, @@ -374,6 +386,15 @@ class Ernie4_5_Model(nn.Layer): self.num_layers = fd_config.model_config.num_hidden_layers fd_config.model_config.pretrained_config.prefix_name = "ernie" + self.fd_config = fd_config + self.redundant_table_manger = None + if fd_config.model_config.enable_redundant_experts is True: + self.redundant_table_manger = RedundantExpertManger( + n_routed_experts=fd_config.model_config.moe_num_experts, + num_hidden_layers=fd_config.model_config.num_hidden_layers, + redundant_experts_num=fd_config.model_config.redundant_experts_num, + ep_size=fd_config.parallel_config.expert_parallel_size, + ) self.embed_tokens = VocabParallelEmbedding( fd_config=fd_config, @@ -387,6 +408,7 @@ class Ernie4_5_Model(nn.Layer): [ Ernie4_5_DecoderLayer( fd_config=fd_config, + redundant_table_manger=self.redundant_table_manger, prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}", ) for i in range(self.num_layers) @@ -415,6 +437,22 @@ class Ernie4_5_Model(nn.Layer): logger.info(f"Start load layer {i}") self.layers[i].load_state_dict(state_dict) + def update_state_dict(self, state_dict): + """ + Update model parameters from a given state dictionary. + + Args: + state_dict (dict[str, np.ndarray | paddle.Tensor]): + A dictionary containing model parameters, where keys are parameter names + and values are NumPy arrays or PaddlePaddle tensors. + """ + for i in range( + self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers, + ): + logger.info(f"Start update layer {i}") + self.layers[i].update_state_dict(state_dict) + def forward( self, ids_remove_padding: paddle.Tensor, diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index 950109c93..2df1e924f 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -86,8 +86,8 @@ class Ernie4_5_VLMoeBlock(nn.Layer): ) -> None: super().__init__() moe_quant_type = "" - if hasattr(fd_config, "quant_config") and fd_config.quant_config is not None: - moe_quant_type = getattr(fd_config.quant_config, "name", lambda: "")() + if hasattr(fd_config.quant_config, "moe_quant_type"): + moe_quant_type = fd_config.quant_config.moe_quant_type if moe_quant_type == "tensor_wise_fp8" or ( moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized