""" # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ from typing import Optional from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.moe.moe import FusedMoE from . import get_quantization_config from .quant_base import QuantConfigBase, QuantMethodBase class MixQuantConfig(QuantConfigBase): """ Quantization config for layers that has different quantization methods. """ def __init__( self, dense_quant_type: str, moe_quant_type: str, kv_cache_quant_type: str = None, image_moe_quant_type: str = None, is_channel_wise: bool = False, has_zero_point: bool = False, is_permuted: bool = True, is_quantized: bool = False, hadamard_block_size: int = 128, ) -> None: super().__init__() self.dense_quant_type = dense_quant_type self.moe_quant_type = moe_quant_type self.kv_cache_quant_type = kv_cache_quant_type if image_moe_quant_type is None: self.image_moe_quant_type = moe_quant_type else: self.image_moe_quant_type = image_moe_quant_type self.is_channel_wise = is_channel_wise self.has_zero_point = has_zero_point self.quant_max_bound = 0 self.quant_min_bound = 0 self.quant_round_type = 0 self.is_permuted = is_permuted self.is_checkpoint_bf16 = not is_quantized self.is_quantized = is_quantized self.hadamard_block_size = hadamard_block_size def name(self) -> str: return "mix_quant" @classmethod def from_config(cls, config: dict) -> "MixQuantConfig": return cls( config["dense_quant_type"], config["moe_quant_type"], config.get("kv_cache_quant_type", None), config.get("image_moe_quant_type", None), config.get("is_channel_wise", False), config.get("has_zero_point", False), config.get("is_permuted", True), config.get("is_quantized", False), config.get("hadamard_block_size", 128), ) def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if isinstance(layer, FusedMoE): if layer.moe_tag == "Image": return ( get_quantization_config(self.image_moe_quant_type) .from_config( { "is_permuted": self.is_permuted, "is_quantized": self.is_quantized, "hadamard_block_size": self.hadamard_block_size, } ) .get_quant_method(layer) ) else: return ( get_quantization_config(self.moe_quant_type) .from_config( { "is_permuted": self.is_permuted, "is_quantized": self.is_quantized, "hadamard_block_size": self.hadamard_block_size, } ) .get_quant_method(layer) ) elif isinstance(layer, Attention): if self.kv_cache_quant_type is not None: return ( get_quantization_config("kvcache") .from_config(self.kv_cache_quant_type, self.is_channel_wise, self.has_zero_point) .get_quant_method(layer) ) else: return None else: return ( get_quantization_config(self.dense_quant_type) .from_config({"is_quantized": self.is_quantized}) .get_quant_method(layer) )