From e150a418d44281f4564cf115d1027be2822bd772 Mon Sep 17 00:00:00 2001
From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com>
Date: Mon, 24 Nov 2025 18:59:18 +0800
Subject: [PATCH] support moe offline quant (#5142)

---
 fastdeploy/config.py                                      | 1 +
 fastdeploy/model_executor/layers/moe/moe.py               | 2 +-
 fastdeploy/model_executor/layers/quantization/__init__.py | 6 ++++++
 fastdeploy/model_executor/models/ernie4_5_moe.py          | 3 ++-
 .../model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py  | 3 ++-
 5 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 91d2bae64..7fbe16a00 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -180,6 +180,7 @@ class ModelConfig:
     ):
         self.model = ""
         self.is_quantized = False
+        self.is_moe_quantized = False
         self.max_model_len = 0
         self.dtype = "bfloat16"
         self.enable_logprob = False
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index d8abac428..223c3f84b 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -584,7 +584,7 @@ class FusedMoE(nn.Layer):
         """
         load_state_dict function.
         """
-        if self.is_quantized:
+        if self.is_quantized or self.fd_config.model_config.is_moe_quantized:
             if getattr(self.fd_config.quant_config, "is_permuted", True):
                 self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
             else:
diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py
index f87163698..5d882aed2 100644
--- a/fastdeploy/model_executor/layers/quantization/__init__.py
+++ b/fastdeploy/model_executor/layers/quantization/__init__.py
@@ -51,8 +51,14 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
     if quantization_config is not None:
         if "is_quantized" in quantization_config:
             model_config.is_quantized = quantization_config["is_quantized"]
+        elif "is_moe_quantized" in quantization_config:
+            model_config.is_moe_quantized = quantization_config["is_moe_quantized"]
         elif "kv_cache_quant_type" not in quantization_config:
             model_config.is_quantized = True
+            if "is_moe_quantized" not in quantization_config:
+                model_config.is_quantized = True
+            else:
+                model_config.is_moe_quantized = True
     if quantization_config is not None and quantization_config.get("quantization", None) is None:
         raise ValueError(
             "quantization_config should have a key named 'quantization' for specify quant config."
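
Note: the sketch below is editorial, not part of the patch. It is a minimal standalone rendering of the parsing behavior added above; _ModelConfigStub and _parse_quant_flags are illustrative stand-ins, and the real parse_quant_config also takes args, is_ernie, and is_v1_loader.

# Editorial sketch, assuming a bare-bones stand-in for ModelConfig.
class _ModelConfigStub:
    def __init__(self):
        self.is_quantized = False
        self.is_moe_quantized = False  # new flag introduced by this patch

def _parse_quant_flags(model_config, quantization_config):
    # Mirrors the branching added in quantization/__init__.py: an explicit
    # "is_quantized" wins; otherwise "is_moe_quantized" marks a checkpoint
    # whose MoE weights were quantized offline.
    if quantization_config is None:
        return
    if "is_quantized" in quantization_config:
        model_config.is_quantized = quantization_config["is_quantized"]
    elif "is_moe_quantized" in quantization_config:
        model_config.is_moe_quantized = quantization_config["is_moe_quantized"]
    elif "kv_cache_quant_type" not in quantization_config:
        model_config.is_quantized = True

cfg = _ModelConfigStub()
_parse_quant_flags(cfg, {"quantization": "block_wise_fp8", "is_moe_quantized": True})
assert cfg.is_moe_quantized and not cfg.is_quantized  # MoE-only offline quant
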
diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py
index 9ff0a2185..1a12f3997 100644
--- a/fastdeploy/model_executor/models/ernie4_5_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_moe.py
@@ -138,7 +138,8 @@ class Ernie4_5_MoE(nn.Layer):
                 "down_proj_expert_code_zp_key": f"{prefix}.experts.{{}}.down_proj.code_zp",
             }
         elif moe_quant_type == "tensor_wise_fp8" or (
-            moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized
+            moe_quant_type == "block_wise_fp8"
+            and (fd_config.model_config.is_quantized or fd_config.model_config.is_moe_quantized)
         ):
             weight_key_map = {
                 "gate_weight_key": f"{prefix}.gate.weight",
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
index 026d8c7d7..2e357579b 100644
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -105,7 +105,8 @@ class Ernie4_5_VLMoeBlock(nn.Layer):
         moe_quant_type = fd_config.quant_config.moe_quant_type

         if moe_quant_type == "tensor_wise_fp8" or (
-            moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized
+            moe_quant_type == "block_wise_fp8"
+            and (fd_config.model_config.is_quantized or fd_config.model_config.is_moe_quantized)
         ):
             weight_key_map = {
                 "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias",
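
Note (editorial, not part of the patch): both model files now gate on the same predicate when selecting the prequantized FP8 weight-key map. A hedged sketch of the combined effect follows; the helper name and the bare model_config argument are illustrative stand-ins, not FastDeploy API.

def _uses_prequant_fp8_keys(moe_quant_type: str, model_config) -> bool:
    # "tensor_wise_fp8" checkpoints always carry prequantized expert weights;
    # "block_wise_fp8" checkpoints do only when the whole model, or just the
    # MoE layers (the new is_moe_quantized flag), were quantized offline.
    return moe_quant_type == "tensor_wise_fp8" or (
        moe_quant_type == "block_wise_fp8"
        and (model_config.is_quantized or model_config.is_moe_quantized)
    )

Combined with the FusedMoE.load_state_dict change above, a checkpoint whose quantization_config carries is_moe_quantized therefore routes its MoE weights through process_prequanted_weights without marking the whole model as quantized.
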