【Inference Optimize】Support ERNIE-4_5-300B-A47B-2BITS-Paddle model TP2/TP4 Inference (#2666)

* Support TP2&TP4 Wint

* Support TP2&TP4 Wint2 Inference
This commit is contained in:
AIbin
2025-07-01 18:29:11 +08:00
committed by GitHub
parent 2b7f74d427
commit a197dcd729

View File

@@ -18,6 +18,8 @@ import paddle
from paddle import nn
import fastdeploy
from fastdeploy.distributed.communication_op import \
    tensor_model_parallel_all_reduce
from ..quantization.quant_base import QuantMethodBase
from ..utils import create_and_set_parameter, get_tensor
@@ -222,7 +224,6 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
        )
        from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
        fused_moe_out = moe_expert_reduce(
ffn_out, ffn_out,
topk_weights, topk_weights,
@@ -233,4 +234,7 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
            routed_scaling_factor=1.0,
        )
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(fused_moe_out)
        return fused_moe_out