Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-04 00:06:38 +08:00
[Optimize] Support Machete weight-only GEMM (#3561)
* support machete weight only gemm
* add generate
* update
* fix
* change file location
* add sm_version limit
* fix
* fix
* fix ci
* fix coverage
* fix xpu
@@ -21,6 +21,7 @@ from typing import Optional

import paddle
from paddle.nn.quant import weight_only_linear, weight_quantize

from fastdeploy import envs
from fastdeploy.model_executor.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
@@ -33,6 +34,12 @@ from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase


def get_sm_version():
    # Compute capability as an integer: major * 10 + minor,
    # e.g. 80 on A100 (SM80) and 90 on H100 (SM90).
    prop = paddle.device.cuda.get_device_properties()
    cc = prop.major * 10 + prop.minor
    return cc


class WeightOnlyConfig(QuantConfigBase):
    """
    Quantization config for weight only
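For reference, the same compute-capability check can be run standalone. A minimal sketch using only the Paddle call that get_sm_version() itself relies on:

import paddle

# Query the current CUDA device; compute capability is major*10 + minor,
# so the Machete path's get_sm_version() == 90 check matches Hopper GPUs.
prop = paddle.device.cuda.get_device_properties()
print(prop.major * 10 + prop.minor)  # e.g. 90 on an H100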
@@ -132,6 +139,14 @@ class WeightOnlyConfig(QuantConfigBase):
            else:
                raise ValueError(f"Unsupported MOE backend {layer.use_method}")
        else:
            # Route to the Machete kernel only for wint4 on SM90 (Hopper),
            # opted in via FD_USE_MACHETE, and only when the output dim is a
            # non-zero multiple of 128; everything else keeps the default path.
            if (
                self.name() == "wint4"
                and envs.FD_USE_MACHETE == "1"
                and get_sm_version() == 90
                and layer.weight_shape[1]
                and layer.weight_shape[1] % 128 == 0
            ):
                return MacheteWeightOnlyLinearMethod(self)
            return GPUWeightOnlyLinearMethod(self)
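The four conditions above are easy to misread in diff form. Here is a standalone distillation of the dispatch predicate (the function name and plain arguments are illustrative, not FastDeploy API):

def uses_machete(quant_name: str, use_machete_env: str, sm: int, out_dim: int) -> bool:
    # Mirrors the branch above: wint4 only, explicit opt-in, Hopper only,
    # and an output dim that is a non-zero multiple of 128.
    return (
        quant_name == "wint4"
        and use_machete_env == "1"
        and sm == 90
        and bool(out_dim)
        and out_dim % 128 == 0
    )

assert uses_machete("wint4", "1", 90, 8192)
assert not uses_machete("wint4", "1", 80, 8192)  # pre-Hopper -> default GPU path
assert not uses_machete("wint4", "1", 90, 100)   # 100 % 128 != 0 -> default GPU path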
@@ -329,3 +344,73 @@ class GPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
        quanted_weight_tensor = paddle.transpose(quanted_weight_tensor, [1, 0])
        layer.weight.set_value(quanted_weight_tensor)
        layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod):
    """
    Weight-only quantization method for linear layers on GPU using Machete.
    Weights are loaded in bf16; after loading, the quantization scales are
    computed and the weights are quantized and packed to int4 (only wint4
    is supported).
    """

    def __init__(
        self,
        quant_config: WeightOnlyConfig,
    ) -> None:
        super().__init__(quant_config)

    def create_weights(self, layer, **extra_weight_attrs):
        assert layer.bias is None, "Machete weight only linear method does not support bias."
        assert self.quant_config.name() == "wint4", "Machete weight only linear method only supports wint4."

        # With per-channel quantization, the scale shape equals the output
        # dim of the weight.
        weight_scale_shape = [1, layer.weight_shape[1]]

        # For wint4, eight 4-bit values are packed into one int32 word, so
        # the stored weight's first (input) dim shrinks by a factor of 8.
        if self.quant_config.name() == "wint4":
            layer.weight_shape[0] //= 8
        layer.weight_dtype = "int32"

        layer.weight = layer.create_parameter(
            shape=layer.weight_shape,
            dtype=layer.weight_dtype,
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )

        layer.weight_scale = layer.create_parameter(
            shape=weight_scale_shape,
            dtype=layer._dtype,
            is_bias=False,
        )

    def process_prequanted_weights(self, layer, state_dict) -> None:
        # Pre-quantized checkpoints are not handled here; weights are
        # quantized at load time in process_loaded_weights below.
        pass

    def process_loaded_weights(self, layer, weight) -> None:
        from fastdeploy.model_executor.layers.quantization.ops import (
            machete_quantize_and_pack,
        )

        # Quantize the bf16 weight to uint4b8 (4-bit values stored with a
        # bias of 8) and pre-pack it into Machete's weight layout.
        quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
            w=weight,
            atype=layer._dtype,
            quant_type="uint4b8",
        )
        layer.weight.set_value(quanted_weight_tensor)
        layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

    def apply(self, layer, x):
        assert layer.bias is None, "Machete weight only linear method does not support bias."
        assert self.quant_config.name() == "wint4", "Machete weight only linear method only supports wint4."
        from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm

        # Mixed-precision GEMM: full-precision activations against the
        # pre-packed int4 weight, dequantized on the fly with the scales.
        linear_out = machete_wint_mm(
            x,
            w_prepack=layer.weight,
            w_g_s=layer.weight_scale,  # per-output-channel scales
            weight_dtype="uint4b8",
        )

        return linear_out
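The shape bookkeeping in create_weights is easy to trip over, so here is the arithmetic spelled out; the [4096, 8192] shape is an illustrative example, not from the diff:

# Eight 4-bit values share one 32-bit word, so the input dim shrinks 8x.
weight_shape = [4096, 8192]                 # [in_dim, out_dim], as loaded in bf16
weight_scale_shape = [1, weight_shape[1]]   # one scale per output channel

weight_shape[0] //= 8                       # packed int32 storage
assert weight_shape == [512, 8192]
assert weight_scale_shape == [1, 8192]
assert 8192 % 128 == 0                      # also satisfies the dispatch condition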
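Finally, a sketch of how the two Machete ops compose end to end, using only the call signatures visible in this diff. The tensor shapes and the standalone (layer-free) usage are assumptions, and running it requires a FastDeploy build with these ops on an SM90 GPU:

import paddle

from fastdeploy.model_executor.layers.quantization.ops import (
    machete_quantize_and_pack,
    machete_wint_mm,
)

# Assumed shapes: activations [M, K], bf16 weight [K, N] with N % 128 == 0.
x = paddle.randn([16, 4096], dtype="bfloat16")
w = paddle.randn([4096, 8192], dtype="bfloat16")

# One-time: quantize to uint4b8 and pre-pack into Machete's layout.
w_prepack, w_scale = machete_quantize_and_pack(w=w, atype="bfloat16", quant_type="uint4b8")

# Per forward pass: mixed-precision GEMM on the packed weight.
out = machete_wint_mm(x, w_prepack=w_prepack, w_g_s=w_scale, weight_dtype="uint4b8")
print(out.shape)  # expected [16, 8192]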