[Optimize] Support Machete weight-only GEMM (#3561)

* support machete weight only gemm

* add generate

* update

* fix

* change file location

* add sm_version limit

* fix

* fix

* fix ci

* fix coverage

* fix xpu
Author: Sunny-bot1
Date: 2025-08-28 09:49:58 +08:00
Committed by: GitHub
Parent: e37e86b3b8
Commit: 479c8b85d3
29 changed files with 5436 additions and 0 deletions


@@ -21,6 +21,7 @@ from typing import Optional
import paddle
from paddle.nn.quant import weight_only_linear, weight_quantize
from fastdeploy import envs
from fastdeploy.model_executor.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
@@ -33,6 +34,12 @@ from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase


def get_sm_version():
    prop = paddle.device.cuda.get_device_properties()
    cc = prop.major * 10 + prop.minor
    return cc


class WeightOnlyConfig(QuantConfigBase):
    """
    Quantization config for weight only
@@ -132,6 +139,14 @@ class WeightOnlyConfig(QuantConfigBase):
            else:
                raise ValueError(f"Unsupported MOE backend {layer.use_method}")
        else:
            if (
                self.name() == "wint4"
                and envs.FD_USE_MACHETE == "1"
                and get_sm_version() == 90
                and layer.weight_shape[1]
                and layer.weight_shape[1] % 128 == 0
            ):
                return MacheteWeightOnlyLinearMethod(self)
            return GPUWeightOnlyLinearMethod(self)
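In effect, the Machete kernel is only chosen when the quantization algorithm is wint4, the FD_USE_MACHETE environment variable is set to "1", the GPU reports SM 90 (Hopper), and the weight's output dimension is a non-zero multiple of 128; every other case keeps the existing GPUWeightOnlyLinearMethod. A minimal sketch of opting in and of the same eligibility check outside the config class (the machete_eligible helper is hypothetical, for illustration only):

import os

import paddle

# Opt in to the Machete weight-only GEMM path before the model is built;
# otherwise the dispatch above falls back to GPUWeightOnlyLinearMethod.
os.environ["FD_USE_MACHETE"] = "1"


def machete_eligible(weight_shape, quant_name):
    # Hypothetical helper mirroring the dispatch condition above.
    prop = paddle.device.cuda.get_device_properties()
    sm = prop.major * 10 + prop.minor
    return (
        quant_name == "wint4"
        and os.environ.get("FD_USE_MACHETE") == "1"
        and sm == 90                       # Hopper, e.g. H100/H800
        and bool(weight_shape[1])
        and weight_shape[1] % 128 == 0     # output dim must be a multiple of 128
    )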
@@ -329,3 +344,73 @@ class GPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
        quanted_weight_tensor = paddle.transpose(quanted_weight_tensor, [1, 0])
        layer.weight.set_value(quanted_weight_tensor)
        layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod):
    """
    Weight only quantization method for linear layer on GPU using Machete.
    The weights are loaded in the BF16 numerical format. After loading, the quantization
    coefficients are computed and the weights are quantized to int4.
    """

    def __init__(
        self,
        quant_config: WeightOnlyConfig,
    ) -> None:
        super().__init__(quant_config)

    def create_weights(self, layer, **extra_weight_attrs):
        assert layer.bias is None, "Machete weight only linear method does not support bias."
        assert self.quant_config.name() == "wint4", "Machete weight only linear method only supports wint4."
        # The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
        weight_scale_shape = [1, layer.weight_shape[1]]
        # layer.weight_shape.reverse()
        if self.quant_config.name() == "wint4":
            # Eight 4-bit values are packed into each int32, so the first weight dim shrinks by 8x.
            layer.weight_shape[0] //= 8
        layer.weight_dtype = "int32"
        layer.weight = layer.create_parameter(
            shape=layer.weight_shape,
            dtype=layer.weight_dtype,
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.weight_scale = layer.create_parameter(
            shape=weight_scale_shape,
            dtype=layer._dtype,
            is_bias=False,
        )

    def process_prequanted_weights(self, layer, state_dict) -> None:
        pass

    def process_loaded_weights(self, layer, weight) -> None:
        from fastdeploy.model_executor.layers.quantization.ops import (
            machete_quantize_and_pack,
        )

        quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
            w=weight,
            atype=layer._dtype,
            quant_type="uint4b8",
        )
        layer.weight.set_value(quanted_weight_tensor)
        layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

    def apply(self, layer, x):
        assert layer.bias is None, "Machete weight only linear method does not support bias."
        assert self.quant_config.name() == "wint4", "Machete weight only linear method only supports wint4."
        from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm

        linear_out = machete_wint_mm(
            x,
            w_prepack=layer.weight,
            w_g_s=layer.weight_scale,
            weight_dtype="uint4b8",
        )
        return linear_out
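Taken together, the new class relies on two custom ops from fastdeploy.model_executor.layers.quantization.ops: machete_quantize_and_pack packs a [K, N] half-precision weight into a [K // 8, N] int32 tensor (eight 4-bit values per int32) plus a [1, N] per-channel scale, and machete_wint_mm runs the fused weight-only GEMM against the prepacked weight. A hedged end-to-end sketch using only the call signatures visible in this diff (concrete shapes and dtypes are assumptions):

import paddle

from fastdeploy.model_executor.layers.quantization.ops import (
    machete_quantize_and_pack,
    machete_wint_mm,
)

# Assumed sizes: K = input dim, N = output dim (N % 128 == 0, SM90 GPU).
K, N = 4096, 12288
x = paddle.randn([8, K]).astype("bfloat16")   # activations
w = paddle.randn([K, N]).astype("bfloat16")   # unquantized weight

# Offline: quantize to 4 bit and prepack for the Machete kernel.
# Expected outputs: int32 weight of shape [K // 8, N], scale of shape [1, N].
w_prepack, w_scale = machete_quantize_and_pack(
    w=w,
    atype="bfloat16",      # mirrors layer._dtype in the method above
    quant_type="uint4b8",
)

# Runtime: fused weight-only int4 GEMM.
y = machete_wint_mm(
    x,
    w_prepack=w_prepack,
    w_g_s=w_scale,         # per-channel scales
    weight_dtype="uint4b8",
)
print(y.shape)  # expected [8, N]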