From 4f17f9aa6e1287c30959f36be9c6803efd55001e Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 Date: Fri, 15 Aug 2025 12:54:08 +0800 Subject: [PATCH] add w4a8 online quant eplb --- .../backends/dcu/fused_moe_triton_backends.py | 2 +- .../gcu/moe/fused_moe_method_gcu_backend.py | 4 +- .../layers/moe/fused_moe_cutlass_backend.py | 86 ++++++++++++++----- .../layers/moe/fused_moe_deepgemm_backend.py | 4 +- .../layers/moe/fused_moe_marlin_backend.py | 2 +- .../layers/moe/fused_moe_triton_backend.py | 6 +- .../layers/moe/fused_moe_xpu_backend.py | 4 +- fastdeploy/model_executor/layers/moe/moe.py | 12 +-- 8 files changed, 83 insertions(+), 37 deletions(-) diff --git a/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py b/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py index 0a6c31b06..bd02e78f4 100644 --- a/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py +++ b/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py @@ -46,7 +46,7 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase): """ Triton MoE create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts assert self.quant_method.name() == "wint8" diff --git a/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py b/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py index 1877bf901..b55113649 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py @@ -51,7 +51,7 @@ class GCUFusedMoeMethod(MoEMethodBase): Paddle gcu create weight process. """ # bf16 - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0) stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0) for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]): @@ -312,7 +312,7 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): """ Paddle cutlass create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) self.check(layer, up_gate_proj_weights, down_proj_weights) def quant_worker(p_group_idx, shared_dict, weights, moe_quant_type, group_size): diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 56ce1e874..2f5ee6d50 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -76,7 +76,9 @@ class CutlassMoEMethod(MoEMethodBase): Paddle cutlass create weight process. 
""" # bf16 - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( + layer.extract_moe_ffn_weights(state_dict) + ) stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0) stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0) for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]): @@ -443,7 +445,9 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): """ Paddle cutlass create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( + layer.extract_moe_ffn_weights(state_dict) + ) self.check(layer, up_gate_proj_weights, down_proj_weights) for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]): weight_name = self.added_weight_attrs[idx] @@ -454,9 +458,13 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): quanted_weight = paddle.stack(weight_list, axis=0) create_and_set_parameter(layer, weight_name, quanted_weight) - self.create_w4a8_scale_weights(layer, layer.weight_key_map, state_dict) + self.create_w4a8_scale_weights( + layer, layer.weight_key_map, state_dict, logical_expert_ids, ep_rank_to_expert_id_list + ) - def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict, state_dict: dict): + def create_w4a8_scale_weights( + self, layer: nn.Layer, weight_key_map: dict, state_dict: dict, logical_expert_ids, ep_rank_to_expert_id_list + ): """ Get w4a8 weights from state dict and process them. Args: @@ -465,8 +473,15 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): state_dict (dict): The state dict. """ - def _extract_scale_tensor(state_dict, key_template, expert_idx): - return get_tensor(state_dict.pop(key_template.format(expert_idx))) + def _extract_scale_tensor(layer: nn.Layer, state_dict, key_template, expert_idx): + return get_tensor( + ( + state_dict.pop(key_template.format(expert_idx)) + if key_template.format(expert_idx) in state_dict + else key_template.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) def _process_in_scale(name: str, in_scales: list[paddle.Tensor]): processed_in_scale = 1 / paddle.concat(in_scales) @@ -508,17 +523,23 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): # 2. 
Extract scale tensor from state dict if layer.ep_size > 1: - for expert_idx in range(layer.num_experts): - scale_tensor = get_tensor(state_dict[scale_key_map["up_gate_proj_in_scale"].format(expert_idx)]) + for expert_idx in ep_rank_to_expert_id_list: + scale_tensor = get_tensor( + ( + state_dict[scale_key_map["up_gate_proj_in_scale"].format(expert_idx)] + if scale_key_map["up_gate_proj_in_scale"].format(expert_idx) in state_dict + else scale_key_map["up_gate_proj_in_scale"].format(expert_idx) + ), + layer.fd_config.model_config.model, + ) up_gate_proj_in_scales_all_experts.append(1 / scale_tensor) create_and_set_parameter( layer, "up_gate_proj_in_scale_all_experts", paddle.concat(up_gate_proj_in_scales_all_experts) ) - for local_expert_idx in range(layer.num_local_experts): - expert_idx = local_expert_idx + layer.expert_id_offset + for expert_idx in logical_expert_ids: for name, scale_key_template in scale_key_map.items(): - scale_tensor = _extract_scale_tensor(state_dict, scale_key_template, expert_idx) + scale_tensor = _extract_scale_tensor(layer, state_dict, scale_key_template, expert_idx) scale_weight_map[name].append(scale_tensor) # 3. Process scale tensor and set to layer @@ -647,7 +668,9 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod): """ Paddle cutlass create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( + layer.extract_moe_ffn_weights(state_dict) + ) self.check(layer, up_gate_proj_weights, down_proj_weights) for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]): weight_name = self.added_weight_attrs[idx] @@ -658,9 +681,13 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod): quanted_weight = paddle.stack(weight_list, axis=0) create_and_set_parameter(layer, weight_name, quanted_weight) - self.create_w4afp8_scale_weights(layer, layer.weight_key_map, state_dict) + self.create_w4afp8_scale_weights( + layer, layer.weight_key_map, state_dict, logical_expert_ids, ep_rank_to_expert_id_list + ) - def create_w4afp8_scale_weights(self, layer: nn.Layer, weight_key_map: dict, state_dict: dict): + def create_w4afp8_scale_weights( + self, layer: nn.Layer, weight_key_map: dict, state_dict: dict, logical_expert_ids, ep_rank_to_expert_id_list + ): """ Get w4a8 weights from state dict and process them. Args: @@ -669,8 +696,15 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod): state_dict (dict): The state dict. """ - def _extract_scale_tensor(state_dict, key_template, expert_idx): - return get_tensor(state_dict.pop(key_template.format(expert_idx))) + def _extract_scale_tensor(layer: nn.Layer, state_dict, key_template, expert_idx): + return get_tensor( + ( + state_dict.pop(key_template.format(expert_idx)) + if key_template.format(expert_idx) in state_dict + else key_template.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) def _process_in_scale(name: str, in_scales: list[paddle.Tensor]): processed_in_scale = 1 / paddle.concat(in_scales) @@ -713,17 +747,23 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod): # 2. 
Extract scale tensor from state dict if layer.ep_size > 1: - for expert_idx in range(layer.num_experts): - scale_tensor = get_tensor(state_dict[scale_key_map["up_gate_proj_in_scale"].format(expert_idx)]) + for expert_idx in ep_rank_to_expert_id_list: + scale_tensor = get_tensor( + ( + state_dict[scale_key_map["up_gate_proj_in_scale"].format(expert_idx)] + if scale_key_map["up_gate_proj_in_scale"].format(expert_idx) in state_dict + else scale_key_map["up_gate_proj_in_scale"].format(expert_idx) + ), + layer.fd_config.model_config.model, + ) up_gate_proj_in_scales_all_experts.append(1 / scale_tensor) create_and_set_parameter( layer, "up_gate_proj_in_scale_all_experts", paddle.concat(up_gate_proj_in_scales_all_experts) ) - for local_expert_idx in range(layer.num_local_experts): - expert_idx = local_expert_idx + layer.expert_id_offset + for expert_idx in logical_expert_ids: for name, scale_key_template in scale_key_map.items(): - scale_tensor = _extract_scale_tensor(state_dict, scale_key_template, expert_idx) + scale_tensor = _extract_scale_tensor(layer, state_dict, scale_key_template, expert_idx) scale_weight_map[name].append(scale_tensor) # 3. Process scale tensor and set to layer @@ -793,7 +833,9 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod): """ Paddle cutlass create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( + layer.extract_moe_ffn_weights(state_dict) + ) self.check(layer, up_gate_proj_weights, down_proj_weights) for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]): diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index bf39adffd..7c81d4732 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -37,7 +37,9 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): deepgemm create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( + layer.extract_moe_ffn_weights(state_dict) + ) self.check(layer, up_gate_proj_weights, down_proj_weights) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py index 848f52b95..97002edf0 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py @@ -143,7 +143,7 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): """ Marlin MoE create weight process. 
""" - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts assert up_gate_proj_weights[0].shape == [ diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index 352fdbca2..6f5683af4 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -56,7 +56,7 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase): """ Triton MoE create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts @@ -267,7 +267,7 @@ class TensorWiseFP8MoEMethod(QuantMethodBase): def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None: """process_prequanted_weights""" - up_gate_proj_tensor, down_proj_tensor = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_tensor, down_proj_tensor, _, _ = layer.extract_moe_ffn_weights(state_dict) assert up_gate_proj_tensor[0].shape == [ layer.hidden_size, layer.moe_intermediate_size * 2, @@ -534,7 +534,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase): """ Triton MoE create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) self.check(layer, up_gate_proj_weights, down_proj_weights) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py index c320ed481..aa413f5f4 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py @@ -36,7 +36,7 @@ class XPUMoEMethod(MoEMethodBase): Paddle cutlass create weight process. """ # bf16 - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) for weights in [up_gate_proj_weights, down_proj_weights]: for idx, weight in enumerate(weights): weights[idx] = weight.transpose([1, 0]) @@ -130,7 +130,7 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase): """ Paddle cutlass create weight process. """ - up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts assert up_gate_proj_weights[0].shape == [ diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 627041b17..ef7cb32f5 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -334,10 +334,12 @@ class FusedMoE(nn.Layer): assert up_gate_proj_expert_weight_key is not None, "up_gate_proj_expert_weight_key should not be none." assert down_proj_expert_weight_key is not None, "down_proj_expert_weight_key should not be none." 
- up_gate_proj_weights, down_proj_weights, logical_expert_ids, _ = self.load_experts_weight( - state_dict, - up_gate_proj_expert_weight_key, - down_proj_expert_weight_key, + up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( + self.load_experts_weight( + state_dict, + up_gate_proj_expert_weight_key, + down_proj_expert_weight_key, + ) ) assert ( len(up_gate_proj_weights) == self.num_local_experts @@ -346,7 +348,7 @@ class FusedMoE(nn.Layer): len(down_proj_weights) == self.num_local_experts ), "down_proj_weights length should be equal to num_local_experts." - return up_gate_proj_weights, down_proj_weights + return up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list def extract_gate_correction_bias(self, gate_correction_bias_key, state_dict): """
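
Notes on the change (the sketches below are illustrative stand-ins, not FastDeploy's exact APIs):

extract_moe_ffn_weights now returns a 4-tuple instead of a 2-tuple, threading the EPLB (expert-parallel load-balancing) bookkeeping from load_experts_weight through to the quantization backends: logical_expert_ids holds the checkpoint expert ids hosted by this EP rank, and ep_rank_to_expert_id_list maps every physical expert slot, in EP-rank order, to its expert id. Backends that do not need the metadata discard it with "_, _". A minimal sketch of the new contract, assuming plain int lists and hypothetical key templates:

    from typing import Dict, List, Tuple

    def extract_moe_ffn_weights_sketch(
        state_dict: Dict[str, object],
        logical_expert_ids: List[int],
        ep_rank_to_expert_id_list: List[int],
        up_key: str = "experts.{}.up_gate_proj.weight",
        down_key: str = "experts.{}.down_proj.weight",
    ) -> Tuple[List[object], List[object], List[int], List[int]]:
        # Gather weights for the experts this EP rank actually hosts; under
        # EPLB these ids need not form a contiguous [offset, offset + n) range.
        up = [state_dict.pop(up_key.format(i)) for i in logical_expert_ids]
        down = [state_dict.pop(down_key.format(i)) for i in logical_expert_ids]
        # The EPLB metadata rides along so backends can resolve scale keys.
        return up, down, logical_expert_ids, ep_rank_to_expert_id_list

    # Backends that do not need the metadata simply discard it:
    sd = {f"experts.{i}.{p}.weight": f"w{i}" for i in (0, 2) for p in ("up_gate_proj", "down_proj")}
    up, down, _, _ = extract_moe_ffn_weights_sketch(sd, [0, 2], [0, 2, 1, 2])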
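
The scale-loading loops in CutlassW4A8MoEMethod and CutlassW4AFP8MoEMethod change to match. Previously a rank's experts were assumed contiguous (expert_idx = local_expert_idx + layer.expert_id_offset, and range(layer.num_experts) for the all-experts pass); under EPLB a rank can host an arbitrary and possibly redundant set, so the per-rank loop walks logical_expert_ids and the all-experts pass walks ep_rank_to_expert_id_list. A toy version of the all-experts in-scale pass, assuming scalar scales and a hypothetical key template:

    from typing import Dict, List

    def collect_in_scales(
        state_dict: Dict[str, float],
        ep_rank_to_expert_id_list: List[int],
        key_template: str = "experts.{}.up_gate_proj.activation_scale",
    ) -> List[float]:
        # One entry per physical expert slot, in EP-rank order. Under EPLB a
        # hot expert may occupy several slots, so the same checkpoint key can
        # legitimately be read more than once -- hence the slot list, not
        # range(num_experts), drives the loop.
        return [1.0 / state_dict[key_template.format(e)] for e in ep_rank_to_expert_id_list]

    # Two ranks, four slots; expert 2 is replicated onto both ranks:
    scales = {f"experts.{i}.up_gate_proj.activation_scale": float(i + 1) for i in range(3)}
    print(collect_in_scales(scales, [0, 2, 1, 2]))  # [1.0, 0.333..., 0.5, 0.333...]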
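
_extract_scale_tensor and the inline in-scale lookups also stop assuming the scale is preloaded: when the formatted key is absent from state_dict, the key string itself is handed to get_tensor together with layer.fd_config.model_config.model, so the tensor can be resolved from the checkpoint on demand, which appears to be what the online-quant path relies on. Note that "state_dict.pop(key) if key in state_dict else key" is equivalent to dict.pop(key, key). A self-contained sketch with a stand-in loader:

    from typing import Dict, Union

    def get_tensor_sketch(name_or_tensor: Union[str, object], model_dir: str) -> object:
        # Stand-in for FastDeploy's get_tensor: a string is treated as a weight
        # name to resolve from the checkpoint under model_dir; anything else is
        # assumed to already be a tensor and is returned unchanged.
        if isinstance(name_or_tensor, str):
            return f"<{name_or_tensor} loaded from {model_dir}>"  # placeholder for a real read
        return name_or_tensor

    def extract_scale(state_dict: Dict[str, object], key_template: str, expert_idx: int, model_dir: str):
        key = key_template.format(expert_idx)
        # Pop eagerly when preloaded; otherwise pass the key through so the
        # loader can fetch it lazily.
        return get_tensor_sketch(state_dict.pop(key, key), model_dir)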