mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[XPU] moe support VL 0-dim input (#4408)
This commit is contained in:
@@ -262,7 +262,7 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
|
||||
# layer.top_k,
|
||||
# False, # moe group, used in deepseek
|
||||
# )
|
||||
# if layer.tp_size > 1:
|
||||
# if layer.reduce_results and layer.tp_size > 1:
|
||||
# from fastdeploy.distributed.communication import (
|
||||
# tensor_model_parallel_all_reduce,
|
||||
# )
|
||||
@@ -271,48 +271,53 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
|
||||
|
||||
# return fused_moe_out
|
||||
|
||||
gate_out = paddle.matmul(x.cast("float32"), gate.weight.transpose([1, 0]), transpose_y=True)
|
||||
topk_idx, topk_weights = moe_topk_select(gate_out, layer.gate_correction_bias, layer.top_k, True)
|
||||
token_nums_per_expert_list = list(range(64)) # 填充做占位符
|
||||
permute_input, permute_indices_per_token, token_num_lod, dst_weights, ffn1_act_scale_per_token = (
|
||||
ep_moe_expert_dispatch(
|
||||
x,
|
||||
topk_idx,
|
||||
topk_weights,
|
||||
(layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
|
||||
token_nums_per_expert_list,
|
||||
x.shape[0] * layer.top_k,
|
||||
self.moe_quant_type,
|
||||
token_num = x.shape[0]
|
||||
if token_num > 0:
|
||||
gate_out = paddle.matmul(x.cast("float32"), gate.weight.transpose([1, 0]), transpose_y=True)
|
||||
topk_idx, topk_weights = moe_topk_select(gate_out, layer.gate_correction_bias, layer.top_k, True)
|
||||
token_nums_per_expert_list = list(range(64)) # 填充做占位符
|
||||
permute_input, permute_indices_per_token, token_num_lod, dst_weights, ffn1_act_scale_per_token = (
|
||||
ep_moe_expert_dispatch(
|
||||
x,
|
||||
topk_idx,
|
||||
topk_weights,
|
||||
(layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
|
||||
token_nums_per_expert_list,
|
||||
x.shape[0] * layer.top_k,
|
||||
self.moe_quant_type,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
ffn_out = moe_expert_ffn(
|
||||
permute_input,
|
||||
token_num_lod,
|
||||
layer.up_gate_proj_weight,
|
||||
layer.down_proj_weight,
|
||||
None, # moe_ffn1_bias
|
||||
None, # moe_ffn2_bias
|
||||
None, # ffn1 in scale
|
||||
None, # ffn2 in scale
|
||||
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
||||
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
||||
None, # moe_ffn2_shift
|
||||
None, # moe_ffn2_smooth
|
||||
self.moe_quant_type,
|
||||
-1,
|
||||
x.shape[0] * layer.top_k, # token_all_num
|
||||
)
|
||||
topk_weights_bf16 = topk_weights.astype("bfloat16")
|
||||
tmp_ffn_out = ep_moe_expert_combine(
|
||||
ffn_out,
|
||||
permute_indices_per_token,
|
||||
topk_weights_bf16,
|
||||
permute_indices_per_token.shape[0],
|
||||
ffn_out.shape[0],
|
||||
ffn_out.shape[1],
|
||||
permute_indices_per_token.shape[1],
|
||||
)
|
||||
ffn_out = moe_expert_ffn(
|
||||
permute_input,
|
||||
token_num_lod,
|
||||
layer.up_gate_proj_weight,
|
||||
layer.down_proj_weight,
|
||||
None, # moe_ffn1_bias
|
||||
None, # moe_ffn2_bias
|
||||
None, # ffn1 in scale
|
||||
None, # ffn2 in scale
|
||||
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
||||
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
||||
None, # moe_ffn2_shift
|
||||
None, # moe_ffn2_smooth
|
||||
self.moe_quant_type,
|
||||
-1,
|
||||
x.shape[0] * layer.top_k, # token_all_num
|
||||
)
|
||||
topk_weights_bf16 = topk_weights.astype("bfloat16")
|
||||
tmp_ffn_out = ep_moe_expert_combine(
|
||||
ffn_out,
|
||||
permute_indices_per_token,
|
||||
topk_weights_bf16,
|
||||
permute_indices_per_token.shape[0],
|
||||
ffn_out.shape[0],
|
||||
ffn_out.shape[1],
|
||||
permute_indices_per_token.shape[1],
|
||||
)
|
||||
else:
|
||||
tmp_ffn_out = paddle.empty(x.shape, x.dtype)
|
||||
|
||||
if layer.reduce_results and layer.tp_size > 1:
|
||||
from fastdeploy.distributed.communication import (
|
||||
tensor_model_parallel_all_reduce,
|
||||
@@ -858,7 +863,7 @@ class XPUW4A8MoEMethod(XPUMoEMethod):
|
||||
ffn_out.shape[1],
|
||||
permute_indices_per_token.shape[1],
|
||||
)
|
||||
if layer.tp_size > 1:
|
||||
if layer.reduce_results and layer.tp_size > 1:
|
||||
from fastdeploy.distributed.communication import (
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user