mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-08 01:50:27 +08:00
[Optimize] Machete using group scale default (#4121)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
This commit is contained in:
@@ -161,7 +161,6 @@ class WeightOnlyConfig(QuantConfigBase):
|
|||||||
and envs.FD_USE_MACHETE == "1"
|
and envs.FD_USE_MACHETE == "1"
|
||||||
and layer.weight_shape[1]
|
and layer.weight_shape[1]
|
||||||
and layer.weight_shape[1] % 128 == 0
|
and layer.weight_shape[1] % 128 == 0
|
||||||
and not layer.add_bias
|
|
||||||
):
|
):
|
||||||
return MacheteWeightOnlyLinearMethod(self)
|
return MacheteWeightOnlyLinearMethod(self)
|
||||||
return GPUWeightOnlyLinearMethod(self)
|
return GPUWeightOnlyLinearMethod(self)
|
||||||
@@ -244,7 +243,8 @@ class WeightOnlyLinearMethod(QuantMethodBase):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if isinstance(self, MacheteWeightOnlyLinearMethod):
|
if isinstance(self, MacheteWeightOnlyLinearMethod):
|
||||||
weight_scale_shape = [1, layer.weight_shape[1]]
|
# Machete uses group-wise weight scales with a group size of 128: one scale row per 128 entries along weight_shape[0], rounded up.
|
||||||
|
weight_scale_shape = [(layer.weight_shape[0] + 127) // 128, layer.weight_shape[1]]
|
||||||
if self.quant_config.name() == "wint4":
|
if self.quant_config.name() == "wint4":
|
||||||
layer.weight_shape[0] //= 8
|
layer.weight_shape[0] //= 8
|
||||||
else:
|
else:
|
||||||
@@ -299,10 +299,12 @@ class WeightOnlyLinearMethod(QuantMethodBase):
|
|||||||
machete_quantize_and_pack,
|
machete_quantize_and_pack,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Quantize and pack for Machete with group-wise scales; the group size is 128.
|
||||||
quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
|
quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
|
||||||
w=layer.weight,
|
w=layer.weight,
|
||||||
atype=layer._dtype,
|
atype=layer._dtype,
|
||||||
quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
|
quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
|
||||||
|
group_size=128,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
quanted_weight_tensor, weight_scale_tensor = weight_quantize(
|
quanted_weight_tensor, weight_scale_tensor = weight_quantize(
|
||||||
@@ -404,23 +406,27 @@ class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod):
|
|||||||
machete_quantize_and_pack,
|
machete_quantize_and_pack,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Quantize and pack for Machete with group-wise scales; the group size is 128.
|
||||||
quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
|
quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
|
||||||
w=weight,
|
w=weight,
|
||||||
atype=layer._dtype,
|
atype=layer._dtype,
|
||||||
quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
|
quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
|
||||||
|
group_size=128,
|
||||||
)
|
)
|
||||||
layer.weight.set_value(quanted_weight_tensor)
|
layer.weight.set_value(quanted_weight_tensor)
|
||||||
layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))
|
layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))
|
||||||
|
|
||||||
def apply(self, layer, x):
|
def apply(self, layer, x):
|
||||||
assert layer.bias is None, "Machete weight only linear method does not support bias."
|
|
||||||
from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm
|
from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm
|
||||||
|
|
||||||
|
# Run the Machete matmul with group-wise scales; the group size is 128.
|
||||||
linear_out = machete_wint_mm(
|
linear_out = machete_wint_mm(
|
||||||
x,
|
x,
|
||||||
w_prepack=layer.weight,
|
w_prepack=layer.weight,
|
||||||
w_g_s=layer.weight_scale,
|
w_g_s=layer.weight_scale,
|
||||||
weight_dtype="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
|
weight_dtype="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
|
||||||
|
group_size=128,
|
||||||
)
|
)
|
||||||
|
if layer.with_bias:
|
||||||
|
linear_out = paddle.add(linear_out, layer.bias)
|
||||||
return linear_out
|
return linear_out
|
||||||
|
@@ -135,6 +135,8 @@ class WeightOnlyInt4LinearTestCase(unittest.TestCase):
|
|||||||
weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128", # weight_dtype
|
weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128", # weight_dtype
|
||||||
group_size=self.machete_group_size,
|
group_size=self.machete_group_size,
|
||||||
)
|
)
|
||||||
|
if self.bias is not None:
|
||||||
|
out = paddle.add(out, self.bias)
|
||||||
return out.numpy()
|
return out.numpy()
|
||||||
|
|
||||||
def test_weight_only_linear(self):
|
def test_weight_only_linear(self):
|
||||||
@@ -158,7 +160,7 @@ class WeightOnlyInt8LinearTestCase(unittest.TestCase):
|
|||||||
self.dtype = "float16"
|
self.dtype = "float16"
|
||||||
self.rtol = 1e-5
|
self.rtol = 1e-5
|
||||||
self.atol = 1e-1
|
self.atol = 1e-1
|
||||||
self.bias = False
|
self.bias = True
|
||||||
self.batch = 1
|
self.batch = 1
|
||||||
self.token = 512
|
self.token = 512
|
||||||
self.in_features = 7168
|
self.in_features = 7168
|
||||||
@@ -224,6 +226,8 @@ class WeightOnlyInt8LinearTestCase(unittest.TestCase):
|
|||||||
weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128", # weight_dtype
|
weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128", # weight_dtype
|
||||||
group_size=self.machete_group_size,
|
group_size=self.machete_group_size,
|
||||||
)
|
)
|
||||||
|
if self.bias is not None:
|
||||||
|
out = paddle.add(out, self.bias)
|
||||||
return out.numpy()
|
return out.numpy()
|
||||||
|
|
||||||
def test_weight_only_linear(self):
|
def test_weight_only_linear(self):
|
||||||
|
Reference in New Issue
Block a user