Mirror of https://github.com/PaddlePaddle/FastDeploy.git
support moe offline quant (#5142)
@@ -180,6 +180,7 @@ class ModelConfig:
     ):
         self.model = ""
         self.is_quantized = False
+        self.is_moe_quantized = False
         self.max_model_len = 0
         self.dtype = "bfloat16"
         self.enable_logprob = False
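For orientation: `is_quantized` marks a checkpoint whose weights are offline-quantized end to end, while the new `is_moe_quantized` marks one where only the MoE expert weights are pre-quantized. A minimal sketch of how the two flags are meant to combine (the helper name is made up; it simply restates the FusedMoE condition changed below):

# Hypothetical helper, not part of FastDeploy's API; it mirrors the
# FusedMoE.load_state_dict change in the next hunk.
def has_prequantized_weights(model_config) -> bool:
    # True if the checkpoint ships offline-quantized weights, either for
    # the whole model or only for the MoE experts.
    return model_config.is_quantized or model_config.is_moe_quantized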
@@ -584,7 +584,7 @@ class FusedMoE(nn.Layer):
         """
         load_state_dict function.
         """
-        if self.is_quantized:
+        if self.is_quantized or self.fd_config.model_config.is_moe_quantized:
             if getattr(self.fd_config.quant_config, "is_permuted", True):
                 self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
             else:
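A simplified restatement of the control flow above, with `layer` standing in for the FusedMoE instance; the else branch is elided in the hunk, so it is stubbed here:

# Sketch only: mirrors the branch structure of FusedMoE.load_state_dict.
def load_state_dict_sketch(layer, state_dict, is_rearrange):
    if layer.is_quantized or layer.fd_config.model_config.is_moe_quantized:
        # is_permuted defaults to True when the quant config does not set it
        if getattr(layer.fd_config.quant_config, "is_permuted", True):
            layer.quant_method.process_prequanted_weights(layer, state_dict, is_rearrange)
        else:
            ...  # non-permuted pre-quantized path (not shown in the hunk)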
@@ -51,8 +51,14 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
     if quantization_config is not None:
         if "is_quantized" in quantization_config:
             model_config.is_quantized = quantization_config["is_quantized"]
+        elif "is_moe_quantized" in quantization_config:
+            model_config.is_moe_quantized = quantization_config["is_moe_quantized"]
         elif "kv_cache_quant_type" not in quantization_config:
-            model_config.is_quantized = True
+            if "is_moe_quantized" not in quantization_config:
+                model_config.is_quantized = True
+            else:
+                model_config.is_moe_quantized = True
     if quantization_config is not None and quantization_config.get("quantization", None) is None:
         raise ValueError(
             "quantization_config should have a key named 'quantization' for specify quant config."
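To make the precedence concrete, a few illustrative quantization_config inputs and the flags they resolve to under the logic above. The dicts are made-up examples, not shipped configs; only the keys shown matter for flag resolution, and each carries a "quantization" key so the ValueError is not raised:

# Explicit flag wins first.
cfg_explicit = {"quantization": "wint8", "is_quantized": True}
# -> model_config.is_quantized = True

# Explicit MoE-only flag is checked next.
cfg_moe_only = {"quantization": "block_wise_fp8", "is_moe_quantized": True}
# -> model_config.is_moe_quantized = True

# A KV-cache quant type suppresses the implicit-offline-quant fallback.
cfg_kv_cache_only = {"quantization": "wint8", "kv_cache_quant_type": "int8"}
# -> neither flag is set; only the KV cache is quantized

# No explicit flag and no kv_cache_quant_type implies an offline-quantized model.
cfg_implicit = {"quantization": "wint8"}
# -> model_config.is_quantized = True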
@@ -138,7 +138,8 @@ class Ernie4_5_MoE(nn.Layer):
                 "down_proj_expert_code_zp_key": f"{prefix}.experts.{{}}.down_proj.code_zp",
             }
         elif moe_quant_type == "tensor_wise_fp8" or (
-            moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized
+            moe_quant_type == "block_wise_fp8"
+            and (fd_config.model_config.is_quantized or fd_config.model_config.is_moe_quantized)
         ):
             weight_key_map = {
                 "gate_weight_key": f"{prefix}.gate.weight",
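The same condition is repeated in the VL model below. Written as a standalone predicate (a hypothetical helper, not in the codebase), it reads:

# Hypothetical predicate: the FP8 weight-key map is chosen for tensor_wise_fp8
# checkpoints, or for block_wise_fp8 checkpoints whose weights were quantized
# offline (fully or MoE-only).
def uses_fp8_weight_keys(moe_quant_type, model_config) -> bool:
    return moe_quant_type == "tensor_wise_fp8" or (
        moe_quant_type == "block_wise_fp8"
        and (model_config.is_quantized or model_config.is_moe_quantized)
    )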
@@ -105,7 +105,8 @@ class Ernie4_5_VLMoeBlock(nn.Layer):
         moe_quant_type = fd_config.quant_config.moe_quant_type

         if moe_quant_type == "tensor_wise_fp8" or (
-            moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized
+            moe_quant_type == "block_wise_fp8"
+            and (fd_config.model_config.is_quantized or fd_config.model_config.is_moe_quantized)
         ):
             weight_key_map = {
                 "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias",