support model loading for w4a8 offline quant (#3064)

Support loading offline-quantized weights for W4A8 EP (expert parallelism).
This commit is contained in:
Yuan Xiaolan
2025-07-29 21:54:37 +08:00
committed by GitHub
parent be0a0f2bb2
commit 3214fb5393
4 changed files with 80 additions and 10 deletions

View File

@@ -25,15 +25,17 @@ class W4A8Config(QuantConfigBase):
quantization config for weight 4bits and activation 8bits
"""
def __init__(self, is_permuted: bool = False) -> None:
    """Create a W4A8 (4-bit weight / 8-bit activation) quantization config.

    Args:
        is_permuted: Whether the offline-quantized weights are stored in a
            pre-permuted layout. Defaults to False, matching the default
            used by ``from_config`` when the key is absent, so existing
            zero-argument callers keep working.
    """
    super().__init__()
    # Remember the layout flag so weight-loading code can choose the
    # matching load path for permuted vs. non-permuted checkpoints.
    self.is_permuted = is_permuted
def name(self) -> str:
    """Return the canonical identifier of this quantization scheme."""
    scheme_id = "w4a8"
    return scheme_id
@classmethod
def from_config(cls, config: dict) -> "W4A8Config":
    """Build a W4A8Config from a plain configuration dict.

    Args:
        config: Quantization config mapping; may contain the boolean
            key ``"is_permuted"``.

    Returns:
        A W4A8Config with ``is_permuted`` taken from the dict,
        defaulting to False when the key is absent.
    """
    # Bug fix: getattr() on a dict inspects object *attributes*, not
    # keys, so it always fell through to the default False. dict.get()
    # performs the intended key lookup.
    is_permuted = config.get("is_permuted", False)
    return cls(is_permuted)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if isinstance(layer, FusedMoE):