# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle

from fastdeploy.model_executor.ops.xpu import moe_expert_ffn

np.random.seed(2025)
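
# Smoke test for the fused XPU moe_expert_ffn operator: quantize random expert
# weights, run a NumPy reference of dequant -> FFN1 -> SwiGLU -> shift/smooth
# -> blockwise Hadamard -> FFN2, and compare it against the fused kernel.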

token_num = 7
expert_num = 64
hidden_dim = 8192
ffn_inter_dim = 7168
ffn_outer_dim = ffn_inter_dim // 2
num_max_dispatch_tokens_per_rank = 128
num_rank = 8
expert_num_per_rank = expert_num // num_rank
used_in_ep_low_latency = True
hadamard_blocksize = 512
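
# Random test inputs. token_num_lod is a sorted cumulative-offset (LoD) array
# of length expert_num_per_rank + 1: expert i owns rows
# token_num_lod[i]:token_num_lod[i + 1] of ffn_in, and token_num_per_expert is
# its first difference, which sums to token_num.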
ffn_in = (np.random.random([token_num, hidden_dim]) - 0.5).astype("float32")
token_num_lod = np.full([expert_num_per_rank + 1], 0, "int32")
token_num_lod[-1] = token_num
token_num_lod[1:-1] = np.random.randint(0, token_num, [expert_num_per_rank - 1])
token_num_lod = np.sort(token_num_lod)
token_num_per_expert = token_num_lod[1:] - token_num_lod[:-1]
ffn1_w = (np.random.random([expert_num_per_rank, ffn_inter_dim, hidden_dim]) - 0.5).astype("float32")
ffn2_w = (np.random.random([expert_num_per_rank, hidden_dim, ffn_outer_dim]) - 0.5).astype("float32")
ffn2_shift = (np.random.random([1, ffn_outer_dim]) - 0.5).astype("float32")
ffn2_smooth = (np.random.random([1, ffn_outer_dim]) - 0.5).astype("float32")
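
# In EP low-latency mode the dense [token_num, hidden_dim] input is scattered
# into a per-expert padded buffer and the kernel receives per-expert token
# counts; otherwise the input stays dense and the kernel receives the
# cumulative offsets instead.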
if used_in_ep_low_latency:
    ffn_in_tmp = ffn_in
    ffn_in = np.zeros(
        [
            expert_num_per_rank,
            num_max_dispatch_tokens_per_rank * num_rank,
            hidden_dim,
        ],
        "float32",
    )
    for i in range(expert_num_per_rank):
        ffn_in[i][: token_num_per_expert[i]] = ffn_in_tmp[token_num_lod[i] : token_num_lod[i + 1]]
    token_num_info = token_num_per_expert
else:
    token_num_info = token_num_lod

print(f"ffn_in: {ffn_in}")
print(f"token_num_lod: {token_num_lod}")
print(f"token_num_per_expert: {token_num_per_expert}")
print(f"ffn1_w: {ffn1_w}")
print(f"ffn2_w: {ffn2_w}")
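

# Weights are quantized symmetrically per output row (absmax over the last
# axis). For int4 the values are clipped to [-7, 7] and adjacent pairs along
# the last axis are packed into one byte: the even-index element in the low
# nibble, the odd-index element in the high nibble, halving the last dimension.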
def clip_and_round(x, quant_max_bound=127):
    return np.clip(np.around(x), -quant_max_bound, quant_max_bound).astype("int8")


def weight_quant_wint8(w_fp32):
    w_max = np.max(np.abs(w_fp32), axis=-1, keepdims=True)
    w_int8 = clip_and_round(w_fp32 / w_max * 127.0)
    return w_int8, w_max.reshape([-1])


def weight_quant_wint4(w_fp32):
    w_max = np.max(np.abs(w_fp32), axis=-1, keepdims=True)
    w_int4 = clip_and_round(w_fp32 / w_max * 7.0, 7)
    w_int4 = (w_int4[:, :, 1::2] & 0xF) << 4 | (w_int4[:, :, ::2] & 0xF)  # pack int4
    return w_int4, w_max.reshape([-1])


def weight_quant(w_fp32, algo="weight_only_int8"):
    if algo == "weight_only_int8":
        return weight_quant_wint8(w_fp32)
    elif algo == "weight_only_int4":
        return weight_quant_wint4(w_fp32)
    else:
        return None, None


quant_method = "weight_only_int4"
print(f"quant_method={quant_method}, used_in_ep_low_latency={used_in_ep_low_latency}")
ffn1_quant_w, ffn1_w_scale = weight_quant(ffn1_w, quant_method)
ffn2_quant_w, ffn2_w_scale = weight_quant(ffn2_w, quant_method)
print(f"ffn1_w {ffn1_w.shape}: {ffn1_w}")
print(f"ffn2_w {ffn2_w.shape}: {ffn2_w}")
print(f"ffn1_quant_w {ffn1_quant_w.shape}: {ffn1_quant_w}")
print(f"ffn1_w_scale {ffn1_w_scale.shape}: {ffn1_w_scale}")
print(f"ffn2_quant_w {ffn2_quant_w.shape}: {ffn2_quant_w}")
print(f"ffn2_w_scale {ffn2_w_scale.shape}: {ffn2_w_scale}")
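

# The dequant helpers below invert the quant helpers above; they are used only
# by the NumPy reference path, while the fused kernel consumes the packed
# weights and scales directly.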
def weight_dequant_wint8(w_int, w_scale):
    w_shape = w_int.shape
    w_scale_new_shape = list(w_shape)
    w_scale_new_shape[-1] = 1
    w_scale_new = w_scale.reshape(w_scale_new_shape)
    w_fp32 = w_int.astype("float32") / 127.0 * w_scale_new
    return w_fp32


def weight_dequant_wint4(w_int, w_scale):
    w_shape = w_int.shape
    w_scale_new_shape = list(w_shape)
    w_scale_new_shape[-1] = 1
    w_scale_new = w_scale.reshape(w_scale_new_shape)
    w_new_shape = list(w_shape)
    w_new_shape[-1] = w_new_shape[-1] * 2
    w_int8 = np.zeros(w_new_shape, dtype=np.int8)
    w_int8[:, :, ::2] = w_int & 0xF  # low nibble -> even indices
    w_int8[:, :, 1::2] = (w_int >> 4) & 0xF  # high nibble -> odd indices
    w_int8 = np.where(w_int8 >= 8, w_int8 - 16, w_int8)  # sign-extend int4
    w_fp32 = w_int8.astype("float32") / 7.0 * w_scale_new
    return w_fp32


def weight_dequant(w_int, w_scale, algo="weight_only_int8"):
    if algo == "weight_only_int8":
        return weight_dequant_wint8(w_int, w_scale)
    elif algo == "weight_only_int4":
        return weight_dequant_wint4(w_int, w_scale)
    else:
        # Callers expect a single array (not a pair as in weight_quant).
        return None
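

# fwt() computes the unnormalized, natural-order Walsh-Hadamard transform with
# the standard in-place butterfly; e.g. fwt([1, 0, 1, 0]) == [2, 2, 0, 0].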
def fwt(a):
    """
    Fast Walsh-Hadamard transform (forward transform).
    :param a: input array; its length must be a power of two
    :return: the transformed array
    """
    n = len(a)
    # Check that the input length is a power of two
    if n == 0 or n & (n - 1) != 0:
        raise ValueError("Input length must be a power of two")

    # Copy the input to avoid modifying the original data
    a = a.copy()
    h = 1
    while h < n:
        for i in range(0, n, 2 * h):
            for j in range(i, i + h):
                x = a[j]
                y = a[j + h]
                a[j] = x + y
                a[j + h] = x - y
        h <<= 1  # equivalent to h *= 2
    return a
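

# hadamard() applies fwt independently to each contiguous block of block_size
# elements along the last axis (block_size == -1 disables the transform),
# matching the hadamard_blocksize argument passed to the fused kernel below.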
def hadamard(_x, block_size):
    x = np.copy(_x).reshape((-1, _x.shape[-1]))
    if block_size == -1:
        return x.reshape(_x.shape)
    m = x.shape[0]  # x is 2-D after the reshape above
    n = x.shape[-1]
    for i in range(m):
        for j in range(0, n, block_size):
            x[i][j : j + block_size] = fwt(x[i][j : j + block_size])
    return x.reshape(_x.shape)


# print(f"ffn1_w {ffn1_w.shape}: {ffn1_w}")
# ffn1_quant_w8, ffn1_w8_scale = weight_quant(ffn1_w, "weight_only_int8")
# ffn1_quant_w4, ffn1_w4_scale = weight_quant(ffn1_w, "weight_only_int4")
# print(f"ffn1_quant_w8 {ffn1_quant_w8.shape}: {ffn1_quant_w8}")
# print(f"ffn1_w8_scale {ffn1_w8_scale.shape}: {ffn1_w8_scale}")
# print(f"ffn1_quant_w4 {ffn1_quant_w4.shape}: {ffn1_quant_w4}")
# print(f"ffn1_w4_scale {ffn1_w4_scale.shape}: {ffn1_w4_scale}")

# ffn1_w8_dq = weight_dequant(ffn1_quant_w8, ffn1_w8_scale, "weight_only_int8")
# ffn1_w4_dq = weight_dequant(ffn1_quant_w4, ffn1_w4_scale, "weight_only_int4")
# print(f"ffn1_w8_dq {ffn1_w8_dq.shape}: {ffn1_w8_dq}")
# print(f"ffn1_w4_dq {ffn1_w4_dq.shape}: {ffn1_w4_dq}")
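

# Reference grouped GEMM: dequantize each expert's weight and multiply only
# that expert's tokens, selected by LoD offsets in the dense layout or by
# per-expert counts into the padded buffer in the low-latency layout.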
def batch_matmul(x, token_num_info, w, w_scale, algo):
    w_fp32 = weight_dequant(w, w_scale, algo)
    print(f"x {x.shape}, w {w_fp32.shape}")
    out_hidden_dim = w_fp32.shape[1]
    if not used_in_ep_low_latency:
        y = np.zeros([x.shape[0], out_hidden_dim], "float32")
        token_num_lod = token_num_info
        for i in range(expert_num_per_rank):
            start_i = token_num_lod[i]
            end_i = token_num_lod[i + 1]
            subx = x[start_i:end_i]
            subw = w_fp32[i : i + 1].transpose([0, 2, 1])
            y[start_i:end_i] = np.matmul(subx, subw)
    else:
        y = np.zeros(
            [
                expert_num_per_rank,
                num_max_dispatch_tokens_per_rank,
                out_hidden_dim,
            ],
            "float32",
        )
        token_num_per_expert = token_num_info
        for i in range(expert_num_per_rank):
            subx = x[i][: token_num_per_expert[i]]
            subw = w_fp32[i : i + 1].transpose([0, 2, 1])
            y[i][: token_num_per_expert[i]] = np.matmul(subx, subw)
    return y
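

# SwiGLU: split the last axis in half and gate, y = silu(x1) * x2 with
# silu(x) = x * sigmoid(x).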
def swiglu(x):
    new_shape = list(x.shape)
    new_shape[-1] //= 2
    x1 = np.copy(x[..., : new_shape[-1]])
    x2 = np.copy(x[..., new_shape[-1] :])
    y = x1 * 1.0 / (1.0 + np.exp(-x1)) * x2
    return y
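

# NumPy reference pipeline: FFN1 -> SwiGLU -> (+ shift) * smooth -> blockwise
# Hadamard -> FFN2, all in float32.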
ref_ffn1_out = batch_matmul(ffn_in, token_num_info, ffn1_quant_w, ffn1_w_scale, quant_method)
print(f"ref_ffn1_out {ref_ffn1_out.shape}: {ref_ffn1_out}")
ref_swiglu_out = swiglu(ref_ffn1_out)
print(f"ref_swiglu_out {ref_swiglu_out.shape}: {ref_swiglu_out}")
ref_swiglu_out = (ref_swiglu_out + ffn2_shift) * ffn2_smooth
ref_hadamard_out = hadamard(ref_swiglu_out, hadamard_blocksize)
ref_ffn2_out = batch_matmul(
    ref_hadamard_out,
    token_num_info,
    ffn2_quant_w,
    ffn2_w_scale,
    quant_method,
)
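
# Device path: cast activations, shift and smooth to bfloat16 (the dtype this
# test exercises the fused kernel with); the reference above stays in float32,
# which is one reason the tolerances below are loose.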
ffn_in_tensor = paddle.to_tensor(ffn_in).astype("bfloat16")
token_num_info_tensor = paddle.to_tensor(token_num_info)
ffn1_quant_w_tensor = paddle.to_tensor(ffn1_quant_w)
ffn2_quant_w_tensor = paddle.to_tensor(ffn2_quant_w)
ffn1_w_scale_tensor = paddle.to_tensor(ffn1_w_scale)
ffn2_w_scale_tensor = paddle.to_tensor(ffn2_w_scale)
ffn2_shift_tensor = paddle.to_tensor(ffn2_shift).astype("bfloat16")
ffn2_smooth_tensor = paddle.to_tensor(ffn2_smooth).astype("bfloat16")

ffn2_out = moe_expert_ffn(
    ffn_in_tensor,
    token_num_info_tensor,
    ffn1_quant_w_tensor,
    ffn2_quant_w_tensor,
    None,  # ffn1_bias
    None,  # ffn2_bias
    None,  # ffn1_act_scale
    None,  # ffn2_act_scale
    ffn1_w_scale_tensor,
    ffn2_w_scale_tensor,
    ffn2_shift_tensor,
    ffn2_smooth_tensor,
    quant_method,
    hadamard_blocksize,
    token_num,
)
ffn2_out = ffn2_out.astype("float32").numpy()
print(f"ffn2_out: {ffn2_out}")
print(f"ref_ffn2_out: {ref_ffn2_out}")
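
# Accuracy check via relative L1 error: the dense layout is compared in one
# shot (tolerance 1%); the low-latency layout compares only the valid rows of
# each expert slot, sums the errors of non-empty experts, and divides by
# expert_num_per_rank (tolerance 3%).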
if not used_in_ep_low_latency:
    diff = np.sum(np.abs(ffn2_out - ref_ffn2_out)) / np.sum(np.abs(ffn2_out))
    print(f"diff: {diff}")
    assert diff < 0.01, f"diff: {diff}\nffn2_out:\n{ffn2_out}\nref_ffn2_out:\n{ref_ffn2_out}\n"
else:
    diff_all = 0
    for i in range(expert_num_per_rank):
        token_num_this_expert = token_num_per_expert[i]
        if token_num_this_expert == 0:
            continue
        tmp_ffn2_out = ffn2_out[i][:token_num_this_expert]
        tmp_ref_ffn2_out = ref_ffn2_out[i][:token_num_this_expert]
        diff = np.sum(np.abs(tmp_ffn2_out - tmp_ref_ffn2_out)) / np.sum(np.abs(tmp_ffn2_out))
        print(f"diff: {diff}")
        print(f"{i}, tmp_ffn2_out: {tmp_ffn2_out}")
        print(f"{i}, tmp_ref_ffn2_out: {tmp_ref_ffn2_out}")
        diff_all += diff
    diff_avg = diff_all / expert_num_per_rank
    print(f"diff_avg: {diff_avg}")
    assert diff_avg < 0.03, f"diff_avg: {diff_avg}\nffn2_out:\n{ffn2_out}\nref_ffn2_out:\n{ref_ffn2_out}\n"