【Inference Optimize】Support ERNIE-4_5-300B-A47B-2BITS-Paddle model TP2/TP4 Inference (#2666)

* Support TP2&TP4 Wint
* Support TP2&TP4 Wint2 Inference
2025-09-26 20:41:53 +08:00 · 2025-07-01 18:29:11 +08:00
parent 2b7f74d427
commit a197dcd729
1 changed file with 5 additions and 1 deletion
--- a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py
@@ -18,6 +18,8 @@ import paddle
 from paddle import nn

 import fastdeploy
+from fastdeploy.distributed.communication_op import \
+    tensor_model_parallel_all_reduce

 from ..quantization.quant_base import QuantMethodBase
 from ..utils import create_and_set_parameter, get_tensor
@@ -222,7 +224,6 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
        )

        from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
-
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
@@ -233,4 +234,7 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
            routed_scaling_factor=1.0,
        )

+        if layer.tp_size > 1:
+            tensor_model_parallel_all_reduce(fused_moe_out)
+
        return fused_moe_out