mirror of
				https://github.com/PaddlePaddle/FastDeploy.git
				synced 2025-10-27 02:20:31 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			136 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			136 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 | |
| #
 | |
| # Licensed under the Apache License, Version 2.0 (the "License");
 | |
| # you may not use this file except in compliance with the License.
 | |
| # You may obtain a copy of the License at
 | |
| #
 | |
| #     http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| # Unless required by applicable law or agreed to in writing, software
 | |
| # distributed under the License is distributed on an "AS IS" BASIS,
 | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| # See the License for the specific language governing permissions and
 | |
| # limitations under the License.
 | |
| """
 | |
| 
 | |
| from typing import Optional
 | |
| 
 | |
| from ..moe import FusedMoE
 | |
| from . import get_quantization_config
 | |
| from .quant_base import QuantConfigBase, QuantMethodBase
 | |
| 
 | |
| 
 | |
class WINT2Config(QuantConfigBase):
    """
    Mixed quantization config: one quant type for dense (non-MoE) layers
    and a w4/w2 pair of quant types for FusedMoE layers.
    """

    def __init__(
        self,
        dense_quant_type: str,
        dense_quant_granularity: str,
        moe_quant_type: str,
        moe_w4_quant_type: str,
        moe_w4_quant_granularity: str,
        moe_w4_quant_start_layer: int,
        moe_w4_quant_end_layer: int,
        moe_w2_quant_type: str,
        moe_w2_quant_granularity: str,
        moe_w2_quant_group_size: int,
        moe_w2_quant_start_layer: int,
        moe_w2_quant_end_layer: int,
    ) -> None:
        """
        Record every quantization setting on the instance, unchanged.
        Args:
            dense_quant_type (str): Quant config name used for non-MoE layers.
            dense_quant_granularity (str): Granularity setting for dense layers.
            moe_quant_type (str): Overall MoE quantization scheme name.
            moe_w4_quant_type (str): Quant config name used for the w4 MoE layers.
            moe_w4_quant_granularity (str): Granularity setting for the w4 MoE layers.
            moe_w4_quant_start_layer (int): First layer index of the w4 range.
            moe_w4_quant_end_layer (int): Last layer index (inclusive) of the w4 range.
            moe_w2_quant_type (str): Quant type name for the w2 MoE layers.
            moe_w2_quant_granularity (str): Granularity setting for the w2 MoE layers.
            moe_w2_quant_group_size (int): Group size setting for the w2 MoE layers.
            moe_w2_quant_start_layer (int): First layer index of the w2 range.
            moe_w2_quant_end_layer (int): Last layer index of the w2 range.
        """
        super().__init__()

        # Bound / rounding fields are all initialised to 0 by this config.
        self.quant_max_bound = 0
        self.quant_min_bound = 0
        self.quant_round_type = 0

        # Dense (non-MoE) layer settings.
        self.dense_quant_type = dense_quant_type
        self.dense_quant_granularity = dense_quant_granularity

        # MoE settings: overall scheme, then the w4 group.
        self.moe_quant_type = moe_quant_type
        self.moe_w4_quant_type = moe_w4_quant_type
        self.moe_w4_quant_granularity = moe_w4_quant_granularity
        self.moe_w4_quant_start_layer = moe_w4_quant_start_layer
        self.moe_w4_quant_end_layer = moe_w4_quant_end_layer

        # MoE settings: the w2 group.
        self.moe_w2_quant_type = moe_w2_quant_type
        self.moe_w2_quant_granularity = moe_w2_quant_granularity
        self.moe_w2_quant_group_size = moe_w2_quant_group_size
        self.moe_w2_quant_start_layer = moe_w2_quant_start_layer
        self.moe_w2_quant_end_layer = moe_w2_quant_end_layer

    def name(self) -> str:
        """
        Report the identifier of this quantization configuration.
        Returns:
            str: Always the literal "wint2".
        """
        return "wint2"

    @classmethod
    def from_config(cls, config: dict) -> "WINT2Config":
        """
        Build a `WINT2Config` from a (possibly partial) configuration dictionary.

        Dense settings are read from the top level of `config`; MoE settings are
        read from the nested "moe_quant_config" mapping and its
        "moe_w4_quant_config" / "moe_w2_quant_config" sub-mappings. Every missing
        key falls back to the default written next to its lookup below.

        NOTE(review): the dense quant type is looked up under the key
        "dense_quant_config" (not "dense_quant_type") -- confirm this matches the
        key actually emitted in checkpoint configs.

        Args:
            config (dict): Quantization settings; any key may be absent.

        Returns:
            WINT2Config: A new instance populated from `config` and the defaults.
        """
        moe_section = config.get("moe_quant_config", {})
        w4_section = moe_section.get("moe_w4_quant_config", {})
        w2_section = moe_section.get("moe_w2_quant_config", {})

        return cls(
            dense_quant_type=config.get("dense_quant_config", "wint8"),
            dense_quant_granularity=config.get("dense_quant_granularity", "per_channel"),
            moe_quant_type=moe_section.get("quant_type", "w4w2"),
            moe_w4_quant_type=w4_section.get("quant_type", "wint4"),
            moe_w4_quant_granularity=w4_section.get("quant_granularity", "per_channel"),
            moe_w4_quant_start_layer=w4_section.get("quant_start_layer", 0),
            moe_w4_quant_end_layer=w4_section.get("quant_end_layer", 6),
            moe_w2_quant_type=w2_section.get("quant_type", "wint2"),
            moe_w2_quant_granularity=w2_section.get("quant_granularity", "pp_acc"),
            moe_w2_quant_group_size=w2_section.get("quant_group_size", 0),
            moe_w2_quant_start_layer=w2_section.get("quant_start_layer", 0),
            moe_w2_quant_end_layer=w2_section.get("quant_end_layer", 0),
        )

    def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
        """
        Select the quantization method to use for `layer`.

        Dispatch rules, in order:
          * Not a `FusedMoE` layer: delegate to the config registered under
            `dense_quant_type`.
          * `FusedMoE` with `layer_idx <= moe_w4_quant_end_layer`: delegate to
            the config registered under `moe_w4_quant_type`.
          * Any other `FusedMoE` layer: `CutlassWint2FusedMoeMethod`.

        Delegated configs are built with `from_config({})`, i.e. from an empty
        dictionary.

        NOTE(review): only `moe_w4_quant_end_layer` is consulted here; the w4
        start layer and the w2 start/end layers are stored but not read by this
        method -- confirm that is intended.

        Args:
            layer (Layer): The layer for which a quantization method is needed.

        Returns:
            QuantMethodBase: The method chosen by the rules above.
        """
        if not isinstance(layer, FusedMoE):
            dense_config_cls = get_quantization_config(self.dense_quant_type)
            return dense_config_cls.from_config({}).get_quant_method(layer)

        if layer.layer_idx <= self.moe_w4_quant_end_layer:
            w4_config_cls = get_quantization_config(self.moe_w4_quant_type)
            return w4_config_cls.from_config({}).get_quant_method(layer)

        # The wint2 backend is imported only when this branch is reached.
        from fastdeploy.model_executor.layers.moe.fused_moe_wint2_backend import (
            CutlassWint2FusedMoeMethod,
        )

        return CutlassWint2FusedMoeMethod(self)
 | 
