Sync v2.0 version of code to github repo

Author: Jiang-Jia-Jun
Date: 2025-06-29 23:29:37 +00:00
Parent: d151496038
Commit: 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

View File

@@ -16,6 +16,6 @@
xpu backend methods
"""
from .quantization.weight_only import XPUWeightOnlyLinearMethod
from .quantization.weight_only import XPUWeightOnlyLinearMethod, XPUWeightOnlyMoEMethod
__all__ = ['XPUWeightOnlyLinearMethod']
__all__ = ['XPUWeightOnlyLinearMethod', 'XPUWeightOnlyMoEMethod']

View File

@@ -0,0 +1,16 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
xpu quantization methods
"""

View File

@@ -13,15 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
from typing import Optional
from typing import Dict
import paddle
from paddle import nn
from .utils import xpu_quant_weight
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
from fastdeploy.model_executor.layers.quantization.weight_only import (
WeightOnlyConfig, WeightOnlyLinearMethod)
from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu
from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfigBase
from fastdeploy.model_executor.layers.quantization.weight_only import WeightOnlyConfig, WeightOnlyLinearMethod
class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
"""
@@ -34,12 +37,133 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
) -> None:
super().__init__(quant_config)
def process_loaded_weights(self, layer, weight) -> None:
def create_weights(self, layer: nn.Layer) -> None:
"""
Create weights for the linear layer on XPU.
"""
layer.linear_weight_shape.reverse()
if self.quant_config.name() == "weight_only_int4":
layer.linear_weight_shape[0] //= 2
layer.weight_dtype = "int8"
linear_weight_scale_shape = [layer.embed_dim]
if hasattr(layer, "linear_weight_shape"):
if isinstance(layer.linear_weight_shape, list):
layer_weight_shape = layer.linear_weight_shape
linear_weight_scale_shape = layer_weight_shape[:1]
layer.linear_weight_scale = layer.create_parameter(
shape=linear_weight_scale_shape,
dtype="float32",
is_bias=False,
)
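# Note: on the weight_only_int4 branch above, two int4 values are packed into
# each int8 byte, which is why the leading dimension of the reversed weight
# shape is halved while weight_dtype stays "int8" (e.g. a reversed shape of
# [4096, 8192] would be stored as an int8 buffer of shape [2048, 8192]).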
def process_loaded_weights(self, layer: nn.Layer,
weight: paddle.Tensor) -> None:
"""
Quantize the loaded weights with XPU-specific quantization and store them on the layer.
"""
quanted_weight_tensor, weight_scale_tensor = xpu_quant_weight(
weight.cpu().numpy())
layer.linear_weight.set_value(quanted_weight_tensor)
layer.linear_weight_scale.set_value(
weight_scale_tensor.astype(paddle.get_default_dtype()))
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(
weight, self.quant_config.algo, -1, -1)
layer.linear_weight.set_value(
paddle.transpose(quanted_weight_tensor, [1, 0]))
layer.linear_weight_scale.set_value(weight_scale_tensor)
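# For intuition, a minimal NumPy sketch of per-channel abs-max int8 weight-only
# quantization (an illustration only -- NOT the actual weight_quantize_xpu
# kernel, whose layout, rounding, and scale handling may differ):
import numpy as np

def _reference_weight_only_int8(weight):
    """Quantize a [k, n] float weight to int8 with one scale per output channel."""
    scale = np.maximum(np.abs(weight).max(axis=0) / 127.0, 1e-8)  # per-column scale, guard zeros
    quant = np.clip(np.around(weight / scale), -127, 127).astype("int8")
    return quant, scale.astype("float32")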
class XPUWeightOnlyMoEMethod(QuantMethodBase):
"""
XPU Fused MoE Method.
"""
def __init__(
self,
quant_config: WeightOnlyConfig,
) -> None:
super().__init__()
self.quant_config = quant_config
self.moe_quant_type = self.quant_config.algo
def create_weights(self, layer: nn.Layer, state_dict: Dict[str,
paddle.Tensor]):
"""
Create and quantize the per-expert MoE FFN weights for XPU.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
assert len(ffn1_weights) == layer.num_local_experts
assert len(ffn2_weights) == layer.num_local_experts
assert ffn1_weights[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
]
assert ffn2_weights[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
]
added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
added_scale_attrs = ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
weight_name = added_weight_attrs[idx]
scale_name = added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
for i in range(layer.num_local_experts):
quant_weight, scale = weight_quantize_xpu(
weight_tensor[i], self.moe_quant_type, -1,
-1) # weight is [k,n]
weight_list.append(quant_weight.transpose(
[1, 0])) # transpose weight to [n,k]
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
setattr(
layer, weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, weight_name).set_value(quanted_weight)
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
setattr(
layer, scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
))
getattr(layer, scale_name).set_value(quanted_weight_scale)
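# Shape sketch for the stacked parameters created above (illustrative, assuming
# weight_only_int8 and one scale per output channel; int4 would additionally
# pack two values per int8 byte):
#   moe_ffn1_weight       : [num_local_experts, moe_intermediate_size * 2, hidden_size]
#   moe_ffn1_weight_scale : [num_local_experts, moe_intermediate_size * 2]
#   moe_ffn2_weight       : [num_local_experts, hidden_size, moe_intermediate_size]
#   moe_ffn2_weight_scale : [num_local_experts, hidden_size]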
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Compute the fused MoE forward pass on XPU.
"""
from fastdeploy.model_executor.ops.xpu import xpu_moe_layer
fused_moe_out = xpu_moe_layer(
x,
layer.gate_weight.transpose([1, 0]),
layer.gate_correction_bias,
layer.moe_ffn1_weight,
layer.moe_ffn2_weight,
None, # ffn1 bias
None, # ffn2 bias
(layer.moe_ffn1_weight_scale
if hasattr(layer, "moe_ffn1_weight_scale") else None),
(layer.moe_ffn2_weight_scale
if hasattr(layer, "moe_ffn2_weight_scale") else None),
(layer.moe_ffn2_in_scale
if hasattr(layer, "moe_ffn2_in_scale") else None),
self.moe_quant_type,
layer.top_k,
False, # moe group, used in deepseek
)
if layer.tp_size > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
tensor_model_parallel_all_reduce(fused_moe_out)
return fused_moe_out
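# For orientation, a generic, unfused NumPy sketch of top-k expert routing.
# This is an illustration only: the fused xpu_moe_layer op performs routing,
# the expert GEMMs, and the weighted combination in one kernel, and its exact
# gating math (including gate_correction_bias handling) is not shown here.
import numpy as np

def _reference_topk_routing(gate_out, top_k):
    """Return per-token expert ids and normalized routing weights.

    gate_out: [num_tokens, num_experts] router logits.
    """
    probs = np.exp(gate_out - gate_out.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)            # softmax over experts
    topk_ids = np.argsort(-probs, axis=-1)[:, :top_k]     # chosen experts per token
    topk_w = np.take_along_axis(probs, topk_ids, axis=-1)
    topk_w /= topk_w.sum(axis=-1, keepdims=True)          # renormalize top_k weights
    return topk_ids, topk_w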

View File

@@ -16,11 +16,13 @@
!! This file will be deleted after the platform is fully functional
"""
from typing import Tuple
import numpy as np
import paddle
def xpu_clip_and_round(x):
def xpu_clip_and_round(x: np.ndarray) -> np.ndarray:
"""
Clip and round the input array to the range [-127, 127] and convert to int8.
@@ -33,7 +35,8 @@ def xpu_clip_and_round(x):
return np.clip(np.around(x), -127, 127).astype("int8")
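# Quick illustrative check of the helper above (values are rounded first, then
# clipped to the signed int8 range):
#   xpu_clip_and_round(np.array([-300.4, -1.6, 0.49, 127.7], dtype="float32"))
#   -> array([-127,   -2,    0,  127], dtype=int8)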
def xpu_quant_qkv_weight(weight_np):
def xpu_quant_qkv_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the query, key, and value weights for the Transformer model.
@@ -61,7 +64,8 @@ def xpu_quant_qkv_weight(weight_np):
return quanted_weight, weight_scales
def xpu_quant_weight(weight_np):
def xpu_quant_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the weight tensor for XPU devices.