mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-09-26 20:41:53 +08:00
【Inference Optimize】Support ERNIE-4_5-300B-A47B-2BITS-Paddle model TP2/TP4 Inference (#2666)
* Support TP2&TP4 Wint
* Support TP2&TP4 Wint2 Inference
This commit is contained in:
@@ -18,6 +18,8 @@ import paddle
|
||||
from paddle import nn
|
||||
|
||||
import fastdeploy
|
||||
from fastdeploy.distributed.communication_op import \
|
||||
tensor_model_parallel_all_reduce
|
||||
|
||||
from ..quantization.quant_base import QuantMethodBase
|
||||
from ..utils import create_and_set_parameter, get_tensor
|
||||
@@ -222,7 +224,6 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
|
||||
)
|
||||
|
||||
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
|
||||
|
||||
fused_moe_out = moe_expert_reduce(
|
||||
ffn_out,
|
||||
topk_weights,
|
||||
@@ -233,4 +234,7 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
|
||||
routed_scaling_factor=1.0,
|
||||
)
|
||||
|
||||
if layer.tp_size > 1:
|
||||
tensor_model_parallel_all_reduce(fused_moe_out)
|
||||
|
||||
return fused_moe_out
|
||||
|
Reference in New Issue
Block a user