[XPU] moe support VL 0-dim input (#4408)

Lucas
2025-10-16 14:01:01 +08:00
committed by GitHub
parent fd5dd1a0f1
commit a5063b96c8


@@ -262,7 +262,7 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
 # layer.top_k,
 # False, # moe group, used in deepseek
 # )
-# if layer.tp_size > 1:
+# if layer.reduce_results and layer.tp_size > 1:
 # from fastdeploy.distributed.communication import (
 #     tensor_model_parallel_all_reduce,
 # )
@@ -271,48 +271,53 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
 # return fused_moe_out
-        gate_out = paddle.matmul(x.cast("float32"), gate.weight.transpose([1, 0]), transpose_y=True)
-        topk_idx, topk_weights = moe_topk_select(gate_out, layer.gate_correction_bias, layer.top_k, True)
-        token_nums_per_expert_list = list(range(64))  # padding, used as a placeholder
-        permute_input, permute_indices_per_token, token_num_lod, dst_weights, ffn1_act_scale_per_token = (
-            ep_moe_expert_dispatch(
-                x,
-                topk_idx,
-                topk_weights,
-                (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
-                token_nums_per_expert_list,
-                x.shape[0] * layer.top_k,
-                self.moe_quant_type,
+        token_num = x.shape[0]
+        if token_num > 0:
+            gate_out = paddle.matmul(x.cast("float32"), gate.weight.transpose([1, 0]), transpose_y=True)
+            topk_idx, topk_weights = moe_topk_select(gate_out, layer.gate_correction_bias, layer.top_k, True)
+            token_nums_per_expert_list = list(range(64))  # padding, used as a placeholder
+            permute_input, permute_indices_per_token, token_num_lod, dst_weights, ffn1_act_scale_per_token = (
+                ep_moe_expert_dispatch(
+                    x,
+                    topk_idx,
+                    topk_weights,
+                    (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
+                    token_nums_per_expert_list,
+                    x.shape[0] * layer.top_k,
+                    self.moe_quant_type,
+                )
             )
-        )
-        ffn_out = moe_expert_ffn(
-            permute_input,
-            token_num_lod,
-            layer.up_gate_proj_weight,
-            layer.down_proj_weight,
-            None, # moe_ffn1_bias
-            None, # moe_ffn2_bias
-            None, # ffn1 in scale
-            None, # ffn2 in scale
-            (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
-            (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
-            None, # moe_ffn2_shift
-            None, # moe_ffn2_smooth
-            self.moe_quant_type,
-            -1,
-            x.shape[0] * layer.top_k, # token_all_num
-        )
-        topk_weights_bf16 = topk_weights.astype("bfloat16")
-        tmp_ffn_out = ep_moe_expert_combine(
-            ffn_out,
-            permute_indices_per_token,
-            topk_weights_bf16,
-            permute_indices_per_token.shape[0],
-            ffn_out.shape[0],
-            ffn_out.shape[1],
-            permute_indices_per_token.shape[1],
-        )
+            ffn_out = moe_expert_ffn(
+                permute_input,
+                token_num_lod,
+                layer.up_gate_proj_weight,
+                layer.down_proj_weight,
+                None, # moe_ffn1_bias
+                None, # moe_ffn2_bias
+                None, # ffn1 in scale
+                None, # ffn2 in scale
+                (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
+                (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
+                None, # moe_ffn2_shift
+                None, # moe_ffn2_smooth
+                self.moe_quant_type,
+                -1,
+                x.shape[0] * layer.top_k, # token_all_num
+            )
+            topk_weights_bf16 = topk_weights.astype("bfloat16")
+            tmp_ffn_out = ep_moe_expert_combine(
+                ffn_out,
+                permute_indices_per_token,
+                topk_weights_bf16,
+                permute_indices_per_token.shape[0],
+                ffn_out.shape[0],
+                ffn_out.shape[1],
+                permute_indices_per_token.shape[1],
+            )
+        else:
+            tmp_ffn_out = paddle.empty(x.shape, x.dtype)
         if layer.reduce_results and layer.tp_size > 1:
             from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
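The core of this change is the zero-token guard: a vision-language (VL) request can reach the XPU MoE layer with an empty token batch, and dispatching a 0-dim `x` into the expert kernels fails. Below is a minimal standalone sketch of the pattern; `run_experts` is a hypothetical stand-in for the dispatch -> FFN -> combine pipeline above, not the actual FastDeploy API.

    import paddle

    def moe_forward(x, run_experts):
        # Zero-token guard, as in the diff above: skip every expert
        # kernel when the batch is empty and return an empty tensor
        # of the same shape and dtype instead.
        token_num = x.shape[0]
        if token_num > 0:
            return run_experts(x)
        # 0-dim input, e.g. a VL batch with no tokens on this rank.
        return paddle.empty(x.shape, x.dtype)

    # Sanity check with a trivial stand-in "expert" path.
    out = moe_forward(paddle.ones([4, 8], dtype="float32"), lambda t: t * 2.0)
    nil = moe_forward(paddle.empty([0, 8], dtype="float32"), lambda t: t * 2.0)
    print(out.shape, nil.shape)  # [4, 8] [0, 8]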
@@ -858,7 +863,7 @@ class XPUW4A8MoEMethod(XPUMoEMethod):
             ffn_out.shape[1],
             permute_indices_per_token.shape[1],
         )
-        if layer.tp_size > 1:
+        if layer.reduce_results and layer.tp_size > 1:
             from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )
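The second half of the patch applies the same condition to XPUW4A8MoEMethod: the tensor-parallel all-reduce now also requires `layer.reduce_results`, so a caller that combines several partial outputs (for example shared plus routed experts) can reduce once afterwards instead of once per branch. A sketch of the resulting control flow, assuming only the `reduce_results` and `tp_size` attributes and the import shown in the diff:

    import paddle

    def finalize_moe_output(layer, tmp_ffn_out):
        # Post-MoE reduction gate from the diff: all-reduce only when
        # this layer is asked to reduce its own results AND tensor
        # parallelism is actually in use.
        if layer.reduce_results and layer.tp_size > 1:
            from fastdeploy.distributed.communication import (
                tensor_model_parallel_all_reduce,
            )

            # Assumed call shape; the truncated hunks above only show
            # the import, not the call site.
            tensor_model_parallel_all_reduce(tmp_ffn_out)
        return tmp_ffn_out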