From a197dcd729c4a854f7e8a57c68ffa20c00d0eb01 Mon Sep 17 00:00:00 2001 From: AIbin <37361953+chang-wenbin@users.noreply.github.com> Date: Tue, 1 Jul 2025 18:29:11 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90Inference=20Optimize=E3=80=91Support?= =?UTF-8?q?=20ERNIE-4=5F5-300B-A47B-2BITS-Paddle=20model=20TP2/TP4=20Infer?= =?UTF-8?q?ence=20(#2666)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Support TP2&TP4 Wint * Support TP2&TP4 Wint2 Inference --- .../model_executor/layers/moe/fused_moe_wint2_backend.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py index bcad4b41b..ea7d722c7 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py @@ -18,6 +18,8 @@ import paddle from paddle import nn import fastdeploy +from fastdeploy.distributed.communication_op import \ + tensor_model_parallel_all_reduce from ..quantization.quant_base import QuantMethodBase from ..utils import create_and_set_parameter, get_tensor @@ -222,7 +224,6 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod): ) from fastdeploy.model_executor.ops.gpu import moe_expert_reduce - fused_moe_out = moe_expert_reduce( ffn_out, topk_weights, @@ -233,4 +234,7 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod): routed_scaling_factor=1.0, ) + if layer.tp_size > 1: + tensor_model_parallel_all_reduce(fused_moe_out) + return fused_moe_out