mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[XPU] support ep4tp1+v1 loader (#5398)
This commit is contained in:
@@ -90,44 +90,49 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
|
||||
is_bias=False,
|
||||
)
|
||||
|
||||
def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
|
||||
"""
|
||||
loaded_weights using xpu special quantization
|
||||
"""
|
||||
def _quantize_weight_in_blocks(self, weight: paddle.Tensor) -> tuple[paddle.Tensor, paddle.Tensor]:
|
||||
k, n = weight.shape
|
||||
quanted_weight_tensors = []
|
||||
weight_scale_tensors = []
|
||||
offset = 30720
|
||||
for i in range(0, n, offset):
|
||||
end_n = min(i + offset, n)
|
||||
weight_i = weight[:, i:end_n]
|
||||
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(weight_i, self.quant_config.algo, -1, -1)
|
||||
quanted_weight_tensors.append(quanted_weight_tensor)
|
||||
weight_scale_tensors.append(weight_scale_tensor)
|
||||
quanted_weight_tensor = paddle.concat(quanted_weight_tensors, axis=1)
|
||||
weight_scale_tensor = paddle.concat(weight_scale_tensors, axis=0)
|
||||
layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
|
||||
layer.weight_scale.set_value(weight_scale_tensor)
|
||||
BLOCK_SIZE = 30720
|
||||
|
||||
quanted_weight_list = []
|
||||
scale_list = []
|
||||
|
||||
for i in range(0, n, BLOCK_SIZE):
|
||||
end_n = min(i + BLOCK_SIZE, n)
|
||||
weight_block = weight[:, i:end_n]
|
||||
|
||||
quanted_weight, scale = weight_quantize_xpu(weight_block, self.quant_config.algo, -1, -1)
|
||||
quanted_weight_list.append(quanted_weight)
|
||||
scale_list.append(scale)
|
||||
|
||||
quanted_weight = paddle.concat(quanted_weight_list, axis=1)
|
||||
weight_scale = paddle.concat(scale_list, axis=0)
|
||||
|
||||
return quanted_weight, weight_scale
|
||||
|
||||
def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
|
||||
quanted_weight, weight_scale = self._quantize_weight_in_blocks(weight)
|
||||
layer.weight.set_value(paddle.transpose(quanted_weight, [1, 0]))
|
||||
layer.weight_scale.set_value(weight_scale)
|
||||
|
||||
def process_weights_after_loading(self, layer) -> None:
|
||||
if not self.quant_config.is_checkpoint_bf16:
|
||||
return
|
||||
|
||||
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(layer.weight, self.quant_config.algo, -1, -1)
|
||||
|
||||
quanted_weight, weight_scale = self._quantize_weight_in_blocks(layer.weight)
|
||||
free_tensor(layer.weight)
|
||||
|
||||
layer.weight = layer.create_parameter(
|
||||
shape=quanted_weight_tensor.shape[::-1],
|
||||
shape=quanted_weight.shape[::-1],
|
||||
dtype="int8",
|
||||
is_bias=False,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
)
|
||||
layer.weight_scale = layer.create_parameter(
|
||||
shape=weight_scale_tensor.shape,
|
||||
dtype=weight_scale_tensor.dtype,
|
||||
shape=weight_scale.shape,
|
||||
dtype=weight_scale.dtype,
|
||||
is_bias=False,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
)
|
||||
layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
|
||||
layer.weight_scale.copy_(weight_scale_tensor, False)
|
||||
layer.weight.set_value(paddle.transpose(quanted_weight, [1, 0]))
|
||||
layer.weight_scale.copy_(weight_scale, False)
|
||||
|
||||
@@ -79,8 +79,6 @@ def test_ep4tp1_online(xpu_env):
|
||||
str(port_num + 47873),
|
||||
"--gpu-memory-utilization",
|
||||
"0.9",
|
||||
"--load-choices",
|
||||
"default",
|
||||
]
|
||||
|
||||
# 启动服务器
|
||||
|
||||
Reference in New Issue
Block a user