support w4afp8 moe offline permute & load (#5613)

2025-12-24 13:28:13 +08:00 · 2025-12-22 15:12:57 +08:00
parent 81384ef29e
commit 40f3897a4e
3 changed files with 70 additions and 38 deletions
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -212,8 +212,10 @@ class FusedMoE(nn.Layer):
        self._dtype = self._helper.get_default_dtype()
        self.weight_dtype = self._dtype

-        self.is_quantized = fd_config.model_config.is_quantized and not (
-            fd_config.quant_config.name() == "mix_quant" and fd_config.quant_config.moe_quant_type is None
+        self.is_moe_quantized = getattr(self.fd_config.model_config, "is_moe_quantized", False)
+        self.is_quantized = self.is_moe_quantized or (
+            fd_config.model_config.is_quantized
+            and not (fd_config.quant_config.name() == "mix_quant" and fd_config.quant_config.moe_quant_type is None)
        )
        moe_quant_config = fd_config.quant_config
        self.moe_quant_config = moe_quant_config