Mirror of https://github.com/PaddlePaddle/FastDeploy.git
fix machete pre quant (#4295)
@@ -129,6 +129,7 @@ class LinearBase(nn.Layer):
         self.with_bias = with_bias
         self.add_bias = add_bias
         self.prefix = prefix
+        self.is_quantized = fd_config.model_config.is_quantized
         # key
         if weight_key:
             self.weight_key = f"{prefix}.{weight_key}"
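Taken with the later hunks, this flag tells backend selection whether the checkpoint was quantized offline. A minimal sketch of the plumbing, using hypothetical simplified stand-ins for fd_config.model_config (only the is_quantized attribute mirrors the diff):

from dataclasses import dataclass

@dataclass
class ModelConfig:
    is_quantized: bool = False  # True when the checkpoint was quantized offline

@dataclass
class FDConfig:
    model_config: ModelConfig

class LinearBaseSketch:
    def __init__(self, fd_config: FDConfig, prefix: str = "linear_0"):
        self.prefix = prefix
        # New in this commit: the layer records whether its weights arrive
        # pre-quantized, so quant-method selection can branch on it later.
        self.is_quantized = fd_config.model_config.is_quantized

layer = LinearBaseSketch(FDConfig(ModelConfig(is_quantized=True)))
print(layer.is_quantized)  # True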
|
@@ -20,6 +20,7 @@ from typing import Optional
 
 import paddle
 from paddle.nn.quant import weight_quantize
+from paddleformers.utils.log import logger
 
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.linear import (
@@ -159,9 +160,11 @@ class WeightOnlyConfig(QuantConfigBase):
             if (
                 _ENABLE_MACHETE
                 and envs.FD_USE_MACHETE == "1"
+                and not layer.is_quantized
                 and layer.weight_shape[1]
                 and layer.weight_shape[1] % 128 == 0
             ):
+                logger.info("Using Machete kernel for WeightOnlyLinearMethod")
                 return MacheteWeightOnlyLinearMethod(self)
             return GPUWeightOnlyLinearMethod(self)
 
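Read as a predicate, Machete is now chosen only when the kernel is built, FD_USE_MACHETE=1, the layer is not pre-quantized (the new check), and the weight's second dimension is a non-zero multiple of 128; everything else falls back to GPUWeightOnlyLinearMethod. A stand-alone sketch of that behavior (the helper function and its arguments are hypothetical; only the conditions themselves come from the diff):

import os

def pick_weight_only_backend(machete_built: bool, layer_is_prequantized: bool, weight_cols: int) -> str:
    # Mirrors the condition above: Machete only when the kernel is built,
    # FD_USE_MACHETE=1, the layer is not pre-quantized (new check), and the
    # weight's second dimension is a non-zero multiple of 128.
    if (
        machete_built
        and os.getenv("FD_USE_MACHETE", "0") == "1"
        and not layer_is_prequantized
        and weight_cols
        and weight_cols % 128 == 0
    ):
        return "MacheteWeightOnlyLinearMethod"
    return "GPUWeightOnlyLinearMethod"

os.environ["FD_USE_MACHETE"] = "1"
print(pick_weight_only_backend(True, True, 4096))   # GPUWeightOnlyLinearMethod (pre-quantized falls back)
print(pick_weight_only_backend(True, False, 4096))  # MacheteWeightOnlyLinearMethod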
@@ -399,7 +402,7 @@ class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod):
         super().__init__(quant_config)
 
     def process_prequanted_weights(self, layer, state_dict) -> None:
-        pass
+        raise NotImplementedError("Machete kernel doesn't support prequant. Please set FD_USE_MACHETE to 0.")
 
     def process_loaded_weights(self, layer, weight) -> None:
         from fastdeploy.model_executor.layers.quantization.ops import (
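With the silent pass replaced, a pre-quantized checkpoint that still reaches the Machete method now fails loudly and points users at FD_USE_MACHETE=0. A hedged illustration of the new failure mode (the driver code below is hypothetical, not FastDeploy's loader):

class MacheteLikeMethod:
    # Same behavior as the new method body: fail loudly instead of silently
    # skipping pre-quantized weight processing.
    def process_prequanted_weights(self, layer, state_dict) -> None:
        raise NotImplementedError("Machete kernel doesn't support prequant. Please set FD_USE_MACHETE to 0.")

try:
    MacheteLikeMethod().process_prequanted_weights(layer=None, state_dict={})
except NotImplementedError as err:
    # Setting FD_USE_MACHETE=0 falls back to the GPU weight-only path, as the
    # error message advises.
    print(f"caught: {err}")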
|