polish code with new pre-commit rule (#2923)
@@ -20,6 +20,7 @@ import paddle
 from paddle import nn
 from paddle.base.core import Config
 from paddleformers.utils.log import logger

 try:
     from paddle.distributed.communication import deep_ep
 except:
@@ -103,10 +104,12 @@ class DeepEPEngine:
             self.num_experts,
         )
         # Allocate a buffer if not existed or not enough buffer size
-        if (self.deepep_engine is None
-                or self.deepep_engine.group != self.group
-                or not self.deepep_engine.low_latency_mode
-                or self.deepep_engine.num_rdma_bytes < num_rdma_bytes):
+        if (
+            self.deepep_engine is None
+            or self.deepep_engine.group != self.group
+            or not self.deepep_engine.low_latency_mode
+            or self.deepep_engine.num_rdma_bytes < num_rdma_bytes
+        ):
             # NOTES: for best performance, the QP number **must** be equal to the number of the local experts
             assert self.num_experts % self.ep_size == 0
             self.deepep_engine = deep_ep.Buffer(
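For context, the condition reformatted above decides whether an existing deep_ep buffer can be reused or must be reallocated. A minimal standalone sketch of that reuse check follows; BufferLike and needs_new_buffer are illustrative names, not part of FastDeploy or DeepEP.

# Illustrative sketch only: BufferLike stands in for deep_ep.Buffer.
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class BufferLike:
    group: Any
    low_latency_mode: bool
    num_rdma_bytes: int

def needs_new_buffer(buf: Optional[BufferLike], group: Any, num_rdma_bytes: int) -> bool:
    # Reallocate when no buffer exists yet, or the cached one was built for a
    # different group, is not in low-latency mode, or is too small for the
    # requested RDMA byte count.
    return (
        buf is None
        or buf.group != group
        or not buf.low_latency_mode
        or buf.num_rdma_bytes < num_rdma_bytes
    )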
@@ -140,13 +143,7 @@ class DeepEPEngine:
             event: the event after executing the kernel (valid only if `async_finish` is set).
             hook: the receiving hook function (valid only if `return_recv_hook` is set).
         """
-        (
-            packed_recv_x,
-            recv_expert_count,
-            handle,
-            _,
-            dispatch_hook,
-        ) = self.deepep_engine.low_latency_dispatch(
+        (packed_recv_x, recv_expert_count, handle, _, dispatch_hook,) = self.deepep_engine.low_latency_dispatch(
             hidden_states,
             topk_idx,
             expertwise_scale,
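As the docstring above notes, the hook is only meaningful when return_recv_hook is set. A hedged sketch of how a caller might consume the engine-level dispatch result, mirroring the four-value unpacking used later in EPDecoderRunner; engine and the tensor arguments are placeholders, not the actual FastDeploy objects.

# Hypothetical caller; `engine` stands in for a DeepEPEngine instance.
def run_dispatch(engine, hidden_states, topk_idx, expertwise_scale=None, use_fp8=False):
    recv_x, recv_expert_count, handle, recv_hook = engine.low_latency_dispatch(
        hidden_states, topk_idx, expertwise_scale, use_fp8
    )
    # With return_recv_hook=True the kernel returns immediately; calling the
    # hook is what actually waits for the receive to finish.
    if recv_hook is not None:
        recv_hook()
    return recv_x, recv_expert_count, handle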
@@ -172,15 +169,14 @@ class DeepEPEngine:
             combined_hidden_states: [num_tokens, hidden]
         """

-        combined_hidden_states, _, combine_hook = (
-            self.deepep_engine.low_latency_combine(
-                hidden_states,
-                topk_idx,
-                topk_weights,
-                handle,
-                async_finish=False,
-                return_recv_hook=True,
-            ))
+        combined_hidden_states, _, combine_hook = self.deepep_engine.low_latency_combine(
+            hidden_states,
+            topk_idx,
+            topk_weights,
+            handle,
+            async_finish=False,
+            return_recv_hook=True,
+        )
         return combined_hidden_states, combine_hook

     def clean_low_latency_buffer(self):
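The combine path is symmetric: it returns the merged hidden states plus a combine_hook when return_recv_hook=True. A short hedged sketch of consuming it, again with a placeholder engine object:

# Hypothetical caller mirroring DeepEPEngine.low_latency_combine above.
def run_combine(engine, ffn_out, topk_idx, topk_weights, handle):
    combined, combine_hook = engine.low_latency_combine(ffn_out, topk_idx, topk_weights, handle)
    if combine_hook is not None:
        combine_hook()  # wait for the combined tokens to land before using them
    return combined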
@@ -188,8 +184,8 @@ class DeepEPEngine:
             clean_low_latency_buffer
         """
         self.deepep_engine.clean_low_latency_buffer(
-            self.num_max_dispatch_tokens_per_rank, self.hidden,
-            self.num_experts)
+            self.num_max_dispatch_tokens_per_rank, self.hidden, self.num_experts
+        )

     def barrier_all(self):
         """
@@ -203,14 +199,16 @@ class EPRunner:
     EPRunnerBase
     """

-    def __init__(self,
-                 top_k: int,
-                 hidden: int,
-                 num_experts: int,
-                 moe_phase: MoEPhase,
-                 num_max_dispatch_tokens_per_rank: int = 1,
-                 ep_size: int = 1,
-                 ep_rank: int = 0):
+    def __init__(
+        self,
+        top_k: int,
+        hidden: int,
+        num_experts: int,
+        moe_phase: MoEPhase,
+        num_max_dispatch_tokens_per_rank: int = 1,
+        ep_size: int = 1,
+        ep_rank: int = 0,
+    ):
         self.top_k = top_k
         self.num_experts = num_experts
         self.ep_engine = DeepEPEngine(
@@ -255,24 +253,38 @@ class EPPrefillRunner(EPRunner):
     EPPrefillRunner
     """

-    def __init__(self,
-                 top_k: int,
-                 hidden: int,
-                 num_experts: int,
-                 ep_size: int = 1,
-                 ep_rank: int = 0):
-        super().__init__(top_k,
-                         hidden,
-                         num_experts,
-                         MoEPhase.PREFILL,
-                         ep_size=ep_size,
-                         ep_rank=ep_rank)
+    def __init__(
+        self,
+        top_k: int,
+        hidden: int,
+        num_experts: int,
+        ep_size: int = 1,
+        ep_rank: int = 0,
+    ):
+        super().__init__(
+            top_k,
+            hidden,
+            num_experts,
+            MoEPhase.PREFILL,
+            ep_size=ep_size,
+            ep_rank=ep_rank,
+        )

-    def dispatch(self, x: paddle.Tensor, topk_idx: paddle.Tensor,
-                 topk_weights: paddle.Tensor, *args, **kwargs):
-        (num_tokens_per_rank, _, num_tokens_per_expert, is_token_in_rank,
-         _) = self.ep_engine.deepep_engine.get_dispatch_layout(
-             topk_idx, self.num_experts)
+    def dispatch(
+        self,
+        x: paddle.Tensor,
+        topk_idx: paddle.Tensor,
+        topk_weights: paddle.Tensor,
+        *args,
+        **kwargs,
+    ):
+        (
+            num_tokens_per_rank,
+            _,
+            num_tokens_per_expert,
+            is_token_in_rank,
+            _,
+        ) = self.ep_engine.deepep_engine.get_dispatch_layout(topk_idx, self.num_experts)

         x_scale_tensor = kwargs.get("x_scale_tensor", None)
         dispatch_args = {
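get_dispatch_layout derives, from the per-token expert indices, how many tokens each expert and each rank will receive. A toy pure-Python version of that bookkeeping, assuming experts are split evenly across ranks; this is only an illustration of the idea, not the DeepEP kernel.

# Toy dispatch-layout computation; real DeepEP does this on the GPU.
def dispatch_layout(topk_idx, num_experts, ep_size):
    assert num_experts % ep_size == 0
    experts_per_rank = num_experts // ep_size
    num_tokens_per_expert = [0] * num_experts
    num_tokens_per_rank = [0] * ep_size
    is_token_in_rank = []
    for token_experts in topk_idx:            # one list of expert ids per token
        ranks = set()
        for e in token_experts:
            num_tokens_per_expert[e] += 1
            ranks.add(e // experts_per_rank)  # which rank hosts this expert
        for r in ranks:
            num_tokens_per_rank[r] += 1
        is_token_in_rank.append([r in ranks for r in range(ep_size)])
    return num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank

# e.g. two tokens, top_k=2, 4 experts over 2 ranks:
# dispatch_layout([[0, 3], [1, 2]], num_experts=4, ep_size=2)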
@@ -287,8 +299,12 @@ class EPPrefillRunner(EPRunner):
         }
         return self.ep_engine.deepep_engine.dispatch(**dispatch_args)

-    def combine(self, tmp_ffn_out: paddle.Tensor, handle: tuple,
-                recv_topk_weights: paddle.Tensor):
+    def combine(
+        self,
+        tmp_ffn_out: paddle.Tensor,
+        handle: tuple,
+        recv_topk_weights: paddle.Tensor,
+    ):
         combine_args = {
             "x": tmp_ffn_out,
             "handle": handle,
@@ -296,8 +312,7 @@ class EPPrefillRunner(EPRunner):
             "async_finish": self.ep_engine.async_finish,
             "topk_weights": recv_topk_weights,
         }
-        fused_moe_out, _, _ = (self.ep_engine.deepep_engine.combine(
-            **combine_args))
+        fused_moe_out, _, _ = self.ep_engine.deepep_engine.combine(**combine_args)

         return fused_moe_out

@@ -307,36 +322,46 @@ class EPDecoderRunner(EPRunner):
     EPPrefillRunner
     """

-    def __init__(self,
-                 top_k: int,
-                 hidden: int,
-                 num_experts: int,
-                 num_max_dispatch_tokens_per_rank: int,
-                 ep_size: int = 1,
-                 ep_rank: int = 0):
-        super().__init__(top_k,
-                         hidden,
-                         num_experts,
-                         MoEPhase.DECODER,
-                         num_max_dispatch_tokens_per_rank,
-                         ep_size=ep_size,
-                         ep_rank=ep_rank)
+    def __init__(
+        self,
+        top_k: int,
+        hidden: int,
+        num_experts: int,
+        num_max_dispatch_tokens_per_rank: int,
+        ep_size: int = 1,
+        ep_rank: int = 0,
+    ):
+        super().__init__(
+            top_k,
+            hidden,
+            num_experts,
+            MoEPhase.DECODER,
+            num_max_dispatch_tokens_per_rank,
+            ep_size=ep_size,
+            ep_rank=ep_rank,
+        )

-    def dispatch(self, x: paddle.Tensor, topk_idx: paddle.Tensor,
-                 topk_weights: paddle.Tensor, *args, **kwargs):
+    def dispatch(
+        self,
+        x: paddle.Tensor,
+        topk_idx: paddle.Tensor,
+        topk_weights: paddle.Tensor,
+        *args,
+        **kwargs,
+    ):
         expertwise_scale = kwargs.get("expertwise_scale", None)
         use_fp8 = kwargs.get("use_fp8", False)

-        recv_hidden_states, recv_expert_count, handle, dispatch_hook = (
-            self.ep_engine.low_latency_dispatch(x, topk_idx, expertwise_scale,
-                                                use_fp8))
+        recv_hidden_states, recv_expert_count, handle, dispatch_hook = self.ep_engine.low_latency_dispatch(
+            x, topk_idx, expertwise_scale, use_fp8
+        )
         if dispatch_hook is not None:
             dispatch_hook()

         return recv_hidden_states, recv_expert_count, handle

     def combine(self, ffn_out, topk_idx, topk_weights, handle):
-        # TODO(@wufeisheng): Delete them when deepep in PaddlePaddle is fixed
+        # TODO(@wufeisheng): Delete them when deepep in PaddlePaddle is fixed
         (
             src_info,
             layout_range,
@@ -353,7 +378,8 @@ class EPDecoderRunner(EPRunner):
         )

         combined_hidden_states, combine_hook = self.ep_engine.low_latency_combine(
-            ffn_out, topk_idx, topk_weights, handle)
+            ffn_out, topk_idx, topk_weights, handle
+        )
         if combine_hook is not None:
             combine_hook()

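Putting the decode-side pieces together, one expert-parallel step is dispatch, run the local experts, then combine. A hedged end-to-end sketch using the runner interface shown above; the runner and ffn objects and the tensors are placeholders, and the expert FFN call is schematic.

# Schematic decode step; `runner` stands in for an EPDecoderRunner and
# `local_experts_ffn` for the per-rank expert computation.
def decode_moe_step(runner, x, topk_idx, topk_weights, local_experts_ffn):
    recv_x, recv_expert_count, handle = runner.dispatch(x, topk_idx, topk_weights)
    ffn_out = local_experts_ffn(recv_x, recv_expert_count)   # run this rank's experts
    return runner.combine(ffn_out, topk_idx, topk_weights, handle)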