【Inference Optimize】Support ERNIE-4_5-300B-A47B-2BITS-Paddle model TP2/TP4 Inference (#2666)

* Support TP2&TP4 Wint2

* Support TP2&TP4 Wint2 Inference
This commit is contained in:
AIbin
2025-07-01 18:29:11 +08:00
committed by GitHub
parent 2b7f74d427
commit a197dcd729

View File

@@ -18,6 +18,8 @@ import paddle
from paddle import nn
import fastdeploy
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from ..quantization.quant_base import QuantMethodBase
from ..utils import create_and_set_parameter, get_tensor
@@ -222,7 +224,6 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
)
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
fused_moe_out = moe_expert_reduce(
ffn_out,
topk_weights,
@@ -233,4 +234,7 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
routed_scaling_factor=1.0,
)
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(fused_moe_out)
return fused_moe_out