[Optimize] Support Machete weight-only GEMM (#3561)

* support machete weight only gemm

* add generate

* update

* fix

* change file location

* add sm_version limit

* fix

* fix

* fix ci

* fix coverage

* fix xpu
Author: Sunny-bot1
Date: 2025-08-28 09:49:58 +08:00
Committed by: GitHub
Parent: e37e86b3b8
Commit: 479c8b85d3
29 changed files with 5436 additions and 0 deletions


@@ -21,6 +21,7 @@ from typing import Optional
import paddle
from paddle.nn.quant import weight_only_linear, weight_quantize
from fastdeploy import envs
from fastdeploy.model_executor.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
@@ -33,6 +34,12 @@ from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase


def get_sm_version():
    prop = paddle.device.cuda.get_device_properties()
    cc = prop.major * 10 + prop.minor
    return cc


class WeightOnlyConfig(QuantConfigBase):
    """
    Quantization config for weight only
@@ -132,6 +139,14 @@ class WeightOnlyConfig(QuantConfigBase):
            else:
                raise ValueError(f"Unsupported MOE backend {layer.use_method}")
        else:
            if (
                self.name() == "wint4"
                and envs.FD_USE_MACHETE == "1"
                and get_sm_version() == 90
                and layer.weight_shape[1]
                and layer.weight_shape[1] % 128 == 0
            ):
                return MacheteWeightOnlyLinearMethod(self)
            return GPUWeightOnlyLinearMethod(self)
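In effect, the Machete kernel is only chosen when the quantization algorithm is wint4, the FD_USE_MACHETE environment variable is set to "1", the GPU reports SM 90 (Hopper), and the weight's output dimension is a non-zero multiple of 128; every other case keeps the existing GPUWeightOnlyLinearMethod. A minimal sketch of opting in and of the same eligibility check outside the config class (the machete_eligible helper is hypothetical, for illustration only):

import os

import paddle

# Opt in to the Machete weight-only GEMM path before the model is built;
# otherwise the dispatch above falls back to GPUWeightOnlyLinearMethod.
os.environ["FD_USE_MACHETE"] = "1"


def machete_eligible(weight_shape, quant_name):
    # Hypothetical helper mirroring the dispatch condition above.
    prop = paddle.device.cuda.get_device_properties()
    sm = prop.major * 10 + prop.minor
    return (
        quant_name == "wint4"
        and os.environ.get("FD_USE_MACHETE") == "1"
        and sm == 90                       # Hopper, e.g. H100/H800
        and bool(weight_shape[1])
        and weight_shape[1] % 128 == 0     # output dim must be a multiple of 128
    )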
@@ -329,3 +344,73 @@ class GPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
        quanted_weight_tensor = paddle.transpose(quanted_weight_tensor, [1, 0])
        layer.weight.set_value(quanted_weight_tensor)
        layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod):
    """
    Weight only quantization method for linear layer on GPU using Machete.
    The weights are loaded in the BF16 numerical format. After loading, the quantization
    coefficients are computed and the weights are quantized to int4.
    """

    def __init__(
        self,
        quant_config: WeightOnlyConfig,
    ) -> None:
        super().__init__(quant_config)

    def create_weights(self, layer, **extra_weight_attrs):
        assert layer.bias is None, "Machete weight only linear method does not support bias."
        assert self.quant_config.name() == "wint4", "Machete weight only linear method only supports wint4."
        # The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
        weight_scale_shape = [1, layer.weight_shape[1]]
        # layer.weight_shape.reverse()
        if self.quant_config.name() == "wint4":
            # Eight 4-bit values are packed into each int32, so the first weight dim shrinks by 8x.
            layer.weight_shape[0] //= 8
        layer.weight_dtype = "int32"
        layer.weight = layer.create_parameter(
            shape=layer.weight_shape,
            dtype=layer.weight_dtype,
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.weight_scale = layer.create_parameter(
            shape=weight_scale_shape,
            dtype=layer._dtype,
            is_bias=False,
        )

    def process_prequanted_weights(self, layer, state_dict) -> None:
        pass

    def process_loaded_weights(self, layer, weight) -> None:
        from fastdeploy.model_executor.layers.quantization.ops import (
            machete_quantize_and_pack,
        )

        quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
            w=weight,
            atype=layer._dtype,
            quant_type="uint4b8",
        )
        layer.weight.set_value(quanted_weight_tensor)
        layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

    def apply(self, layer, x):
        assert layer.bias is None, "Machete weight only linear method does not support bias."
        assert self.quant_config.name() == "wint4", "Machete weight only linear method only supports wint4."
        from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm

        linear_out = machete_wint_mm(
            x,
            w_prepack=layer.weight,
            w_g_s=layer.weight_scale,
            weight_dtype="uint4b8",
        )
        return linear_out
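Taken together, the new class relies on two custom ops from fastdeploy.model_executor.layers.quantization.ops: machete_quantize_and_pack packs a [K, N] half-precision weight into a [K // 8, N] int32 tensor (eight 4-bit values per int32) plus a [1, N] per-channel scale, and machete_wint_mm runs the fused weight-only GEMM against the prepacked weight. A hedged end-to-end sketch using only the call signatures visible in this diff (concrete shapes and dtypes are assumptions):

import paddle

from fastdeploy.model_executor.layers.quantization.ops import (
    machete_quantize_and_pack,
    machete_wint_mm,
)

# Assumed sizes: K = input dim, N = output dim (N % 128 == 0, SM90 GPU).
K, N = 4096, 12288
x = paddle.randn([8, K]).astype("bfloat16")   # activations
w = paddle.randn([K, N]).astype("bfloat16")   # unquantized weight

# Offline: quantize to 4 bit and prepack for the Machete kernel.
# Expected outputs: int32 weight of shape [K // 8, N], scale of shape [1, N].
w_prepack, w_scale = machete_quantize_and_pack(
    w=w,
    atype="bfloat16",      # mirrors layer._dtype in the method above
    quant_type="uint4b8",
)

# Runtime: fused weight-only int4 GEMM.
y = machete_wint_mm(
    x,
    w_prepack=w_prepack,
    w_g_s=w_scale,         # per-channel scales
    weight_dtype="uint4b8",
)
print(y.shape)  # expected [8, N]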