mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-04 16:22:57 +08:00
【Inference Optimize】Support ERNIE-4_5-300B-A47B-2BITS-Paddle model TP2/TP4 Inference (#2666)
* Support TP2&TP4 Wint
* Support TP2&TP4 Wint2 Inference
This commit is contained in:
@@ -18,6 +18,8 @@ import paddle
|
|||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
import fastdeploy
|
import fastdeploy
|
||||||
|
from fastdeploy.distributed.communication_op import \
|
||||||
|
tensor_model_parallel_all_reduce
|
||||||
|
|
||||||
from ..quantization.quant_base import QuantMethodBase
|
from ..quantization.quant_base import QuantMethodBase
|
||||||
from ..utils import create_and_set_parameter, get_tensor
|
from ..utils import create_and_set_parameter, get_tensor
|
||||||
@@ -222,7 +224,6 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
|
|||||||
)
|
)
|
||||||
|
|
||||||
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
|
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
|
||||||
|
|
||||||
fused_moe_out = moe_expert_reduce(
|
fused_moe_out = moe_expert_reduce(
|
||||||
ffn_out,
|
ffn_out,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
@@ -233,4 +234,7 @@ class TritonWint2FusedMoeMethod(Wint2MoeMethod):
|
|||||||
routed_scaling_factor=1.0,
|
routed_scaling_factor=1.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if layer.tp_size > 1:
|
||||||
|
tensor_model_parallel_all_reduce(fused_moe_out)
|
||||||
|
|
||||||
return fused_moe_out
|
return fused_moe_out
|
||||||
|
Reference in New Issue
Block a user