[Optimize] Machete using group scale default (#4121)

Authored by Sunny-bot1 on 2025-09-18 13:51:11 +08:00; committed by GitHub.
Parent: 62b8b02e08  Commit: c3b8ebeb18
2 changed files with 15 additions and 5 deletions
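
In short: the Machete weight-only linear path now uses group-wise weight scales by default (fixed group size 128) instead of a single per-channel scale row, passes group_size=128 through machete_quantize_and_pack and machete_wint_mm, and gains bias support in MacheteWeightOnlyLinearMethod.apply; the unit tests are updated to exercise the bias path.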

File 1 of 2 (weight-only quantization methods):

@@ -161,7 +161,6 @@ class WeightOnlyConfig(QuantConfigBase):
             and envs.FD_USE_MACHETE == "1"
             and layer.weight_shape[1]
             and layer.weight_shape[1] % 128 == 0
-            and not layer.add_bias
         ):
             return MacheteWeightOnlyLinearMethod(self)
         return GPUWeightOnlyLinearMethod(self)
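
Since MacheteWeightOnlyLinearMethod.apply now handles bias itself (see the last hunk of this file), layers with a bias no longer have to fall back to GPUWeightOnlyLinearMethod, so the add_bias guard is dropped from the dispatch condition.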
@@ -244,7 +243,8 @@ class WeightOnlyLinearMethod(QuantMethodBase):
             )
         else:
             if isinstance(self, MacheteWeightOnlyLinearMethod):
-                weight_scale_shape = [1, layer.weight_shape[1]]
+                # Using group scale for machete, group size is 128
+                weight_scale_shape = [(layer.weight_shape[0] + 127) // 128, layer.weight_shape[1]]
             if self.quant_config.name() == "wint4":
                 layer.weight_shape[0] //= 8
             else:
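
The first dimension of the scale tensor grows from 1 to ceil(K / 128), i.e. one scale row per 128-element group along the reduction dimension. A minimal sketch of the shape arithmetic, with illustrative dimensions (K = 7168 matches the int8 test below; N is arbitrary):

K, N = 7168, 1024  # illustrative [in_features, out_features]; N is arbitrary
group_size = 128
per_channel_scale_shape = [1, N]                             # old layout: one scale per output column
group_scale_shape = [(K + group_size - 1) // group_size, N]  # new layout: ceil(K / 128) scale rows
print(per_channel_scale_shape, group_scale_shape)            # [1, 1024] [56, 1024]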
@@ -299,10 +299,12 @@ class WeightOnlyLinearMethod(QuantMethodBase):
                 machete_quantize_and_pack,
             )
+            # Using group scale for machete, group size is 128
             quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
                 w=layer.weight,
                 atype=layer._dtype,
                 quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+                group_size=128,
             )
         else:
             quanted_weight_tensor, weight_scale_tensor = weight_quantize(
@@ -404,23 +406,27 @@ class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod):
             machete_quantize_and_pack,
         )

+        # Using group scale for machete, group size is 128
         quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
             w=weight,
             atype=layer._dtype,
             quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
         layer.weight.set_value(quanted_weight_tensor)
         layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

     def apply(self, layer, x):
-        assert layer.bias is None, "Machete weight only linear method does not support bias."
         from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm

+        # Using group scale for machete, group size is 128
         linear_out = machete_wint_mm(
             x,
             w_prepack=layer.weight,
             w_g_s=layer.weight_scale,
             weight_dtype="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
+        if layer.with_bias:
+            linear_out = paddle.add(linear_out, layer.bias)
         return linear_out
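
For intuition about what the group scales store, here is an illustrative NumPy sketch of group-wise dequantization; the helper name is hypothetical, and the real machete_wint_mm kernel fuses this scaling into the GEMM rather than materializing the dequantized weight:

import numpy as np

def dequantize_groupwise(q_weight, scales, group_size=128):
    # q_weight: [K, N] integer codes; scales: [ceil(K / group_size), N]
    K, N = q_weight.shape
    out = np.empty((K, N), dtype=np.float32)
    for g in range(scales.shape[0]):
        rows = slice(g * group_size, min((g + 1) * group_size, K))
        out[rows] = q_weight[rows].astype(np.float32) * scales[g]  # scale row broadcasts over N
    return out

Per-channel scaling is the degenerate case with a single scale row; 128-row granularity lets each group of weights use its own scale, which is what the larger weight_scale tensor above holds.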

File 2 of 2 (unit tests):

@@ -135,6 +135,8 @@ class WeightOnlyInt4LinearTestCase(unittest.TestCase):
             weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128",  # weight_dtype
             group_size=self.machete_group_size,
         )
+        if self.bias is not None:
+            out = paddle.add(out, self.bias)
         return out.numpy()

     def test_weight_only_linear(self):
@@ -158,7 +160,7 @@ class WeightOnlyInt8LinearTestCase(unittest.TestCase):
         self.dtype = "float16"
         self.rtol = 1e-5
         self.atol = 1e-1
-        self.bias = False
+        self.bias = True
         self.batch = 1
         self.token = 512
         self.in_features = 7168
@@ -224,6 +226,8 @@ class WeightOnlyInt8LinearTestCase(unittest.TestCase):
             weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128",  # weight_dtype
             group_size=self.machete_group_size,
         )
+        if self.bias is not None:
+            out = paddle.add(out, self.bias)
         return out.numpy()

     def test_weight_only_linear(self):
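
Both test cases now add self.bias to the machete reference output, and the int8 case switches self.bias to True so the new bias path in MacheteWeightOnlyLinearMethod.apply is covered by default.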