From aa1cc09c5b447236c93097072b13d13a34ab854a Mon Sep 17 00:00:00 2001
From: Sunny-bot1 <68891411+Sunny-bot1@users.noreply.github.com>
Date: Sun, 28 Sep 2025 16:11:09 +0800
Subject: [PATCH] Fix Machete with pre-quantized weights: fall back to the GPU weight-only method (#4295)

---
 fastdeploy/model_executor/layers/linear.py                   | 1 +
 fastdeploy/model_executor/layers/quantization/weight_only.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index ff9c16a3e..83c84454f 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -129,6 +129,7 @@ class LinearBase(nn.Layer):
         self.with_bias = with_bias
         self.add_bias = add_bias
         self.prefix = prefix
+        self.is_quantized = fd_config.model_config.is_quantized
         # key
         if weight_key:
             self.weight_key = f"{prefix}.{weight_key}"
diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py
index 070d0fbf4..8d18b8607 100644
--- a/fastdeploy/model_executor/layers/quantization/weight_only.py
+++ b/fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -20,6 +20,7 @@ from typing import Optional
 
 import paddle
 from paddle.nn.quant import weight_quantize
+from paddleformers.utils.log import logger
 
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.linear import (
@@ -159,9 +160,11 @@ class WeightOnlyConfig(QuantConfigBase):
                 if (
                     _ENABLE_MACHETE
                     and envs.FD_USE_MACHETE == "1"
+                    and not layer.is_quantized
                     and layer.weight_shape[1]
                     and layer.weight_shape[1] % 128 == 0
                 ):
+                    logger.info("Using Machete kernel for WeightOnlyLinearMethod")
                     return MacheteWeightOnlyLinearMethod(self)
                 return GPUWeightOnlyLinearMethod(self)
 
@@ -399,7 +402,7 @@ class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod):
         super().__init__(quant_config)
 
     def process_prequanted_weights(self, layer, state_dict) -> None:
-        pass
+        raise NotImplementedError("Machete kernel doesn't support prequant. Please set FD_USE_MACHETE to 0.")
 
     def process_loaded_weights(self, layer, weight) -> None:
         from fastdeploy.model_executor.layers.quantization.ops import (