From aa1cc09c5b447236c93097072b13d13a34ab854a Mon Sep 17 00:00:00 2001 From: Sunny-bot1 <68891411+Sunny-bot1@users.noreply.github.com> Date: Sun, 28 Sep 2025 16:11:09 +0800 Subject: [PATCH] fix machete pre quant (#4295) --- fastdeploy/model_executor/layers/linear.py | 1 + fastdeploy/model_executor/layers/quantization/weight_only.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index ff9c16a3e..83c84454f 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -129,6 +129,7 @@ class LinearBase(nn.Layer): self.with_bias = with_bias self.add_bias = add_bias self.prefix = prefix + self.is_quantized = fd_config.model_config.is_quantized # key if weight_key: self.weight_key = f"{prefix}.{weight_key}" diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py index 070d0fbf4..8d18b8607 100644 --- a/fastdeploy/model_executor/layers/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/quantization/weight_only.py @@ -20,6 +20,7 @@ from typing import Optional import paddle from paddle.nn.quant import weight_quantize +from paddleformers.utils.log import logger from fastdeploy import envs from fastdeploy.model_executor.layers.linear import ( @@ -159,9 +160,11 @@ class WeightOnlyConfig(QuantConfigBase): if ( _ENABLE_MACHETE and envs.FD_USE_MACHETE == "1" + and not layer.is_quantized and layer.weight_shape[1] and layer.weight_shape[1] % 128 == 0 ): + logger.info("Using Machete kernel for WeightOnlyLinearMethod") return MacheteWeightOnlyLinearMethod(self) return GPUWeightOnlyLinearMethod(self) @@ -399,7 +402,7 @@ class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod): super().__init__(quant_config) def process_prequanted_weights(self, layer, state_dict) -> None: - pass + raise NotImplementedError("Machete kernel doesn't support prequant. Please set FD_USE_MACHETE to 0.") def process_loaded_weights(self, layer, weight) -> None: from fastdeploy.model_executor.layers.quantization.ops import (