Support w4afp8 MoE offline permute & load (#5613)

2025-12-24 13:28:13 +08:00 · 2025-12-22 15:12:57 +08:00
parent 81384ef29e
commit 40f3897a4e
3 changed files with 70 additions and 38 deletions
--- a/fastdeploy/model_executor/layers/quantization/mix_quant.py
+++ b/fastdeploy/model_executor/layers/quantization/mix_quant.py
@@ -40,6 +40,7 @@ class MixQuantConfig(QuantConfigBase):
        is_quantized: bool = False,
        hadamard_block_size: int = 128,
        moe_dynamic_quant: bool = False,
+        is_moe_quantized: bool = False,
    ) -> None:
        super().__init__()
        self.dense_quant_type = dense_quant_type
@@ -59,6 +60,7 @@ class MixQuantConfig(QuantConfigBase):
        self.is_quantized = is_quantized
        self.hadamard_block_size = hadamard_block_size
        self.moe_dynamic_quant = moe_dynamic_quant
+        self.is_moe_quantized = is_moe_quantized

    def name(self) -> str:
        return "mix_quant"
@@ -76,6 +78,7 @@ class MixQuantConfig(QuantConfigBase):
            config.get("is_quantized", False),
            config.get("hadamard_block_size", 128),
            config.get("moe_dynamic_quant", False),
+            config.get("is_moe_quantized", False),
        )

    def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
@@ -102,7 +105,7 @@ class MixQuantConfig(QuantConfigBase):
                        .from_config(
                            {
                                "is_permuted": self.is_permuted,
-                                "is_quantized": not self.is_checkpoint_bf16,
+                                "is_quantized": not self.is_checkpoint_bf16 or self.is_moe_quantized,
                                "hadamard_block_size": self.hadamard_block_size,
                            }
                        )