FastDeploy/fastdeploy/model_executor/layers/quantization/wint2.py

"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from typing import Optional

from ..moe import FusedMoE
from . import get_quantization_config
from .quant_base import QuantConfigBase, QuantMethodBase


class WINT2Config(QuantConfigBase):
    """
    Quantization config for wint8 linear and w4w2 MoE.
    """

    def __init__(
        self,
        dense_quant_type: str,
        dense_quant_granularity: str,
        moe_quant_type: str,
        moe_w4_quant_type: str,
        moe_w4_quant_granularity: str,
        moe_w4_quant_start_layer: int,
        moe_w4_quant_end_layer: int,
        moe_w2_quant_type: str,
        moe_w2_quant_granularity: str,
        moe_w2_quant_group_size: int,
        moe_w2_quant_start_layer: int,
        moe_w2_quant_end_layer: int,
    ) -> None:
        super().__init__()
        self.quant_max_bound = 0
        self.quant_min_bound = 0
        self.quant_round_type = 0

        # wint2 quantization config
        self.dense_quant_type = dense_quant_type
        self.dense_quant_granularity = dense_quant_granularity
        self.moe_quant_type = moe_quant_type
        self.moe_w4_quant_type = moe_w4_quant_type
        self.moe_w4_quant_granularity = moe_w4_quant_granularity
        self.moe_w4_quant_start_layer = moe_w4_quant_start_layer
        self.moe_w4_quant_end_layer = moe_w4_quant_end_layer
        self.moe_w2_quant_type = moe_w2_quant_type
        self.moe_w2_quant_granularity = moe_w2_quant_granularity
        self.moe_w2_quant_group_size = moe_w2_quant_group_size
        self.moe_w2_quant_start_layer = moe_w2_quant_start_layer
        self.moe_w2_quant_end_layer = moe_w2_quant_end_layer

    def name(self) -> str:
        """
        Get the name of the quantization configuration.
        Returns:
            str: The name of the quantization configuration.
        """
        return "wint2"

    @classmethod
    def from_config(cls, config: dict) -> "WINT2Config":
        """
        Create a new instance of `WINT2Config` using the provided configuration dictionary.
        Args:
            config (dict): A dictionary containing the configuration parameters for the new instance.

        Returns:
            WINT2Config: The newly created instance of `WINT2Config`.
        """

        dense_quant_type = config.get("dense_quant_config", "wint8")
        dense_quant_granularity = config.get("dense_quant_granularity", "per_channel")

        moe_quant_config = config.get("moe_quant_config", {})
        moe_quant_type = moe_quant_config.get("quant_type", "w4w2")

        moe_w4_quant_config = moe_quant_config.get("moe_w4_quant_config", {})
        moe_w4_quant_type = moe_w4_quant_config.get("quant_type", "wint4")
        moe_w4_quant_granularity = moe_w4_quant_config.get("quant_granularity", "per_channel")
        moe_w4_quant_start_layer = moe_w4_quant_config.get("quant_start_layer", 0)
        moe_w4_quant_end_layer = moe_w4_quant_config.get("quant_end_layer", 6)

        moe_w2_quant_config = moe_quant_config.get("moe_w2_quant_config", {})
        moe_w2_quant_type = moe_w2_quant_config.get("quant_type", "wint2")
        moe_w2_quant_granularity = moe_w2_quant_config.get("quant_granularity", "pp_acc")
        moe_w2_quant_group_size = moe_w2_quant_config.get("quant_group_size", 0)
        moe_w2_quant_start_layer = moe_w2_quant_config.get("quant_start_layer", 0)
        moe_w2_quant_end_layer = moe_w2_quant_config.get("quant_end_layer", 0)

        return cls(
            dense_quant_type,
            dense_quant_granularity,
            moe_quant_type,
            moe_w4_quant_type,
            moe_w4_quant_granularity,
            moe_w4_quant_start_layer,
            moe_w4_quant_end_layer,
            moe_w2_quant_type,
            moe_w2_quant_granularity,
            moe_w2_quant_group_size,
            moe_w2_quant_start_layer,
            moe_w2_quant_end_layer,
        )

    def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
        """
        Get the quantization method associated with the given layer based on the current quantization configuration.
        Args:
            layer (Layer): The layer for which the quantization method should be retrieved.

        Returns:
            QuantMethodBase: The quantization method associated with the given layer.
        """
        if isinstance(layer, FusedMoE):
            if layer.layer_idx <= self.moe_w4_quant_end_layer:
                return get_quantization_config(self.moe_w4_quant_type).from_config({}).get_quant_method(layer)
            else:
                from fastdeploy.model_executor.layers.moe.fused_moe_wint2_backend import (
                    CutlassWint2FusedMoeMethod,
                )

                return CutlassWint2FusedMoeMethod(self)
        else:
            return get_quantization_config(self.dense_quant_type).from_config({}).get_quant_method(layer)