【New Feature】W4afp8 supports per group quantization (#4987)

* w4afp8 supports per-group quantization

* code style

* fix transpose

* revert fast hadamard

---------

Co-authored-by: yuanxiaolan <yuanxiaolan01@baidu.com>
Co-authored-by: plusNew001 <95567040+plusNew001@users.noreply.github.com>
This commit is contained in:
yangjianfengo1
2025-11-13 19:17:27 +08:00
committed by GitHub
parent a5e949d9d0
commit ae7bee8122
21 changed files with 3114 additions and 2248 deletions

View File

@@ -39,6 +39,7 @@ class MixQuantConfig(QuantConfigBase):
is_permuted: bool = True,
is_quantized: bool = False,
hadamard_block_size: int = 128,
moe_dynamic_quant: bool = False,
) -> None:
super().__init__()
self.dense_quant_type = dense_quant_type
@@ -55,7 +56,9 @@ class MixQuantConfig(QuantConfigBase):
self.quant_round_type = 0
self.is_permuted = is_permuted
self.is_checkpoint_bf16 = not is_quantized
self.is_quantized = is_quantized
self.hadamard_block_size = hadamard_block_size
self.moe_dynamic_quant = moe_dynamic_quant
def name(self) -> str:
    """Return the registry identifier for this quantization config."""
    config_name = "mix_quant"
    return config_name
@@ -72,6 +75,7 @@ class MixQuantConfig(QuantConfigBase):
config.get("is_permuted", True),
config.get("is_quantized", False),
config.get("hadamard_block_size", 128),
config.get("moe_dynamic_quant", False),
)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]: