[Feature] refactor metax_gpu attention and moe and remove some useless code (#3688)

Co-authored-by: yongqiangma <xing.wo@163.com>
This commit is contained in:
SuperNova
2025-09-12 14:40:25 +08:00
committed by GitHub
parent cab7a633fe
commit 805f29a06c
5 changed files with 389 additions and 289 deletions

View File

@@ -313,24 +313,14 @@ class WeightOnlyLinearMethod(QuantMethodBase):
raise NotImplementedError
def apply(self, layer, x):
if current_platform.is_maca():
linear_out = weight_only_linear(
x,
weight=layer.weight,
bias=layer.bias if layer.add_bias else None,
weight_scale=layer.weight_scale,
weight_dtype=("int8" if self.quant_config.name() == "wint8" else "int4"),
arch=80,
)
else:
linear_out = weight_only_linear(
x,
weight=layer.weight,
bias=layer.bias if layer.add_bias else None,
weight_scale=layer.weight_scale,
weight_dtype=("int8" if self.quant_config.name() == "wint8" else "int4"),
arch=self.quant_config.weight_only_linear_arch,
)
linear_out = weight_only_linear(
x,
weight=layer.weight,
bias=layer.bias if layer.add_bias else None,
weight_scale=layer.weight_scale,
weight_dtype=("int8" if self.quant_config.name() == "wint8" else "int4"),
arch=self.quant_config.weight_only_linear_arch,
)
return linear_out