diff --git a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py
index ed07c55ad..121cba0fa 100644
--- a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py
@@ -90,44 +90,49 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
             is_bias=False,
         )

-    def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
-        """
-        loaded_weights using xpu special quantization
-        """
+    def _quantize_weight_in_blocks(self, weight: paddle.Tensor) -> tuple[paddle.Tensor, paddle.Tensor]:
         k, n = weight.shape
-        quanted_weight_tensors = []
-        weight_scale_tensors = []
-        offset = 30720
-        for i in range(0, n, offset):
-            end_n = min(i + offset, n)
-            weight_i = weight[:, i:end_n]
-            quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(weight_i, self.quant_config.algo, -1, -1)
-            quanted_weight_tensors.append(quanted_weight_tensor)
-            weight_scale_tensors.append(weight_scale_tensor)
-        quanted_weight_tensor = paddle.concat(quanted_weight_tensors, axis=1)
-        weight_scale_tensor = paddle.concat(weight_scale_tensors, axis=0)
-        layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
-        layer.weight_scale.set_value(weight_scale_tensor)
+        BLOCK_SIZE = 30720
+
+        quanted_weight_list = []
+        scale_list = []
+
+        for i in range(0, n, BLOCK_SIZE):
+            end_n = min(i + BLOCK_SIZE, n)
+            weight_block = weight[:, i:end_n]
+
+            quanted_weight, scale = weight_quantize_xpu(weight_block, self.quant_config.algo, -1, -1)
+            quanted_weight_list.append(quanted_weight)
+            scale_list.append(scale)
+
+        quanted_weight = paddle.concat(quanted_weight_list, axis=1)
+        weight_scale = paddle.concat(scale_list, axis=0)
+
+        return quanted_weight, weight_scale
+
+    def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
+        quanted_weight, weight_scale = self._quantize_weight_in_blocks(weight)
+        layer.weight.set_value(paddle.transpose(quanted_weight, [1, 0]))
+        layer.weight_scale.set_value(weight_scale)

     def process_weights_after_loading(self, layer) -> None:
         if not self.quant_config.is_checkpoint_bf16:
             return
-        quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(layer.weight, self.quant_config.algo, -1, -1)
-
+        quanted_weight, weight_scale = self._quantize_weight_in_blocks(layer.weight)
         free_tensor(layer.weight)

         layer.weight = layer.create_parameter(
-            shape=quanted_weight_tensor.shape[::-1],
+            shape=quanted_weight.shape[::-1],
             dtype="int8",
             is_bias=False,
             default_initializer=paddle.nn.initializer.Constant(0),
         )

         layer.weight_scale = layer.create_parameter(
-            shape=weight_scale_tensor.shape,
-            dtype=weight_scale_tensor.dtype,
+            shape=weight_scale.shape,
+            dtype=weight_scale.dtype,
             is_bias=False,
             default_initializer=paddle.nn.initializer.Constant(0),
         )
-        layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
-        layer.weight_scale.copy_(weight_scale_tensor, False)
+        layer.weight.set_value(paddle.transpose(quanted_weight, [1, 0]))
+        layer.weight_scale.copy_(weight_scale, False)
diff --git a/tests/xpu_ci/test_ep4tp1_online.py b/tests/xpu_ci/test_ep4tp1_online.py
index e83ba2275..8acb5da3a 100644
--- a/tests/xpu_ci/test_ep4tp1_online.py
+++ b/tests/xpu_ci/test_ep4tp1_online.py
@@ -79,8 +79,6 @@ def test_ep4tp1_online(xpu_env):
         str(port_num + 47873),
         "--gpu-memory-utilization",
         "0.9",
-        "--load-choices",
-        "default",
     ]

     # Start the server
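
Note: the sketch below shows, in isolation, the slice/quantize/concat pattern that _quantize_weight_in_blocks factors out of the two call sites above. weight_quantize_xpu requires an XPU, so a toy per-column absmax int8 quantizer stands in for it; the toy_quantize and quantize_in_blocks names are illustrative and not part of the patch.

import paddle


def toy_quantize(block: paddle.Tensor) -> tuple[paddle.Tensor, paddle.Tensor]:
    # Stand-in for weight_quantize_xpu: symmetric per-column absmax int8.
    scale = paddle.max(paddle.abs(block), axis=0)  # one scale per column
    quanted = paddle.round(block / scale * 127.0).astype("int8")
    return quanted, scale


def quantize_in_blocks(weight: paddle.Tensor, block_size: int = 30720):
    # Slice the [k, n] weight along n, quantize each slice independently,
    # then stitch the quantized blocks back along n and the scales along
    # axis 0 -- the same bookkeeping as _quantize_weight_in_blocks.
    _, n = weight.shape
    quanted_list, scale_list = [], []
    for i in range(0, n, block_size):
        block = weight[:, i : min(i + block_size, n)]
        quanted, scale = toy_quantize(block)
        quanted_list.append(quanted)
        scale_list.append(scale)
    return paddle.concat(quanted_list, axis=1), paddle.concat(scale_list, axis=0)


if __name__ == "__main__":
    w = paddle.randn([128, 4096], dtype="float32")
    qw, s = quantize_in_blocks(w, block_size=1024)
    assert qw.shape == [128, 4096]
    assert s.shape == [4096]

Concatenating the per-block scales on axis 0 works because each block yields a 1-D tensor of per-output-column scales, matching how the patched code joins the outputs of weight_quantize_xpu.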