[XPU] support ep4tp1+v1 loader (#5398)

This commit is contained in:
zccjjj
2025-12-05 18:51:15 +08:00
committed by GitHub
parent 35846909c7
commit 5b900667e3
2 changed files with 29 additions and 26 deletions

View File

# NOTE(review): this span is a unified-diff hunk from a git web UI; the +/-
# markers and the Python indentation were stripped during extraction, so the
# removed (old) and added (new) variants of the code are interleaved below.
# The removed/added labels are reconstructed from the duplicated logic and the
# commit summary ("29 additions and 26 deletions") — confirm against commit
# 5b900667e3 before relying on them.
@@ -90,44 +90,49 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
# context: tail of a create_parameter(...) call from a method above this hunk
is_bias=False,
)
# --- removed: old process_loaded_weights, which quantized inline ---
def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
"""
loaded_weights using xpu special quantization
"""
# +++ added: helper that quantizes `weight` in column blocks +++
def _quantize_weight_in_blocks(self, weight: paddle.Tensor) -> tuple[paddle.Tensor, paddle.Tensor]:
# context line shared by both versions: weight is 2-D (k, n); only n is used
k, n = weight.shape
# --- removed: body of the old process_loaded_weights ---
quanted_weight_tensors = []
weight_scale_tensors = []
offset = 30720
for i in range(0, n, offset):
end_n = min(i + offset, n)
weight_i = weight[:, i:end_n]
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(weight_i, self.quant_config.algo, -1, -1)
quanted_weight_tensors.append(quanted_weight_tensor)
weight_scale_tensors.append(weight_scale_tensor)
quanted_weight_tensor = paddle.concat(quanted_weight_tensors, axis=1)
weight_scale_tensor = paddle.concat(weight_scale_tensors, axis=0)
layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
layer.weight_scale.set_value(weight_scale_tensor)
# +++ added: body of _quantize_weight_in_blocks — the same per-30720-column
# quantization loop as the removed code above, factored out so it can be
# shared by both load paths; returns (quanted_weight, weight_scale) +++
BLOCK_SIZE = 30720
quanted_weight_list = []
scale_list = []
for i in range(0, n, BLOCK_SIZE):
end_n = min(i + BLOCK_SIZE, n)
weight_block = weight[:, i:end_n]
quanted_weight, scale = weight_quantize_xpu(weight_block, self.quant_config.algo, -1, -1)
quanted_weight_list.append(quanted_weight)
scale_list.append(scale)
# per-block results concatenated: quantized weights along axis 1, scales along axis 0
quanted_weight = paddle.concat(quanted_weight_list, axis=1)
weight_scale = paddle.concat(scale_list, axis=0)
return quanted_weight, weight_scale
# +++ added: new process_loaded_weights, now delegating to the helper +++
def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
quanted_weight, weight_scale = self._quantize_weight_in_blocks(weight)
# the quantized weight is stored transposed ([1, 0]), as in the old version
layer.weight.set_value(paddle.transpose(quanted_weight, [1, 0]))
layer.weight_scale.set_value(weight_scale)
# NOTE(review): still inside the stripped diff hunk — old and new lines of
# process_weights_after_loading are interleaved below; removed/added labels
# are reconstructed and should be confirmed against commit 5b900667e3.
def process_weights_after_loading(self, layer) -> None:
# context: only quantize here when the checkpoint was stored as bf16
if not self.quant_config.is_checkpoint_bf16:
return
# --- removed: single-shot quantization of the whole weight tensor ---
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(layer.weight, self.quant_config.algo, -1, -1)
# +++ added: reuse the block-wise helper, consistent with process_loaded_weights +++
quanted_weight, weight_scale = self._quantize_weight_in_blocks(layer.weight)
# context: release the original bf16 weight before re-creating the parameter
free_tensor(layer.weight)
layer.weight = layer.create_parameter(
# --- removed --- (old variable name)
shape=quanted_weight_tensor.shape[::-1],
# +++ added +++ (reversed shape because the weight is stored transposed)
shape=quanted_weight.shape[::-1],
dtype="int8",
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
layer.weight_scale = layer.create_parameter(
# --- removed ---
shape=weight_scale_tensor.shape,
dtype=weight_scale_tensor.dtype,
# +++ added +++
shape=weight_scale.shape,
dtype=weight_scale.dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
# --- removed ---
layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
layer.weight_scale.copy_(weight_scale_tensor, False)
# +++ added +++
layer.weight.set_value(paddle.transpose(quanted_weight, [1, 0]))
layer.weight_scale.copy_(weight_scale, False)

View File

# NOTE(review): second diff hunk — a fragment of a CLI argument list inside
# test_ep4tp1_online (the function definition is outside this view). Per the
# hunk header (-79,8 +79,6) this commit removes two lines and adds none.
@@ -79,8 +79,6 @@ def test_ep4tp1_online(xpu_env):
str(port_num + 47873),
"--gpu-memory-utilization",
"0.9",
# --- removed: explicit "--load-choices default" flag ---
"--load-choices",
"default",
]
# Start the server