mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-04 16:22:57 +08:00
[Feature] Support_eplb (#2997)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* [Feature] support_eplb * [Feature] support_eplb * [Fix] fix mm ep
This commit is contained in:
@@ -77,7 +77,7 @@ class DeepEPEngine:
|
||||
elif moe_phase == MoEPhase.PREFILL:
|
||||
self.deepep_engine = deep_ep.Buffer(
|
||||
self.group,
|
||||
int(1e9),
|
||||
int(5e8),
|
||||
0,
|
||||
low_latency_mode=False,
|
||||
num_qps_per_rank=1,
|
||||
@@ -214,13 +214,15 @@ class EPRunner:
|
||||
num_max_dispatch_tokens_per_rank: int = 1,
|
||||
ep_size: int = 1,
|
||||
ep_rank: int = 0,
|
||||
redundant_experts_num: int = 0,
|
||||
):
|
||||
self.top_k = top_k
|
||||
self.num_experts = num_experts
|
||||
self.redundant_experts_num = redundant_experts_num
|
||||
self.ep_engine = DeepEPEngine(
|
||||
num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank,
|
||||
hidden=hidden,
|
||||
num_experts=num_experts,
|
||||
num_experts=num_experts + redundant_experts_num,
|
||||
moe_phase=moe_phase,
|
||||
ep_size=ep_size,
|
||||
ep_rank=ep_rank,
|
||||
@@ -230,13 +232,33 @@ class EPRunner:
|
||||
"""
|
||||
moe_select
|
||||
"""
|
||||
topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
|
||||
gate_out,
|
||||
layer.gate_correction_bias,
|
||||
self.top_k,
|
||||
True, # apply_norm_weight,
|
||||
False,
|
||||
)
|
||||
if layer.redundant_table_manger is not None:
|
||||
(
|
||||
ep_rank_to_expert_id_list,
|
||||
expert_id_to_ep_rank_array,
|
||||
expert_in_rank_num_list,
|
||||
tokens_per_expert_stats_list,
|
||||
) = layer.redundant_table_manger.get_ep_rank_to_expert_id_list_by_layer(layer.layer_idx)
|
||||
|
||||
topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_redundant_topk_select(
|
||||
gating_logits=gate_out,
|
||||
expert_id_to_ep_rank_array=expert_id_to_ep_rank_array,
|
||||
expert_in_rank_num_list=expert_in_rank_num_list,
|
||||
tokens_per_expert_stats_list=tokens_per_expert_stats_list,
|
||||
bias=layer.gate_correction_bias,
|
||||
moe_topk=self.top_k,
|
||||
apply_norm_weight=True, # apply_norm_weight
|
||||
enable_softmax_top_k_fused=False,
|
||||
redundant_ep_rank_num_plus_one=layer.fd_config.model_config.redundant_experts_num + 1,
|
||||
)
|
||||
else:
|
||||
topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
|
||||
gate_out,
|
||||
layer.gate_correction_bias,
|
||||
self.top_k,
|
||||
True, # apply_norm_weight,
|
||||
False,
|
||||
)
|
||||
return topk_idx, topk_weights
|
||||
|
||||
@abstractmethod
|
||||
@@ -266,6 +288,7 @@ class EPPrefillRunner(EPRunner):
|
||||
num_experts: int,
|
||||
ep_size: int = 1,
|
||||
ep_rank: int = 0,
|
||||
redundant_experts_num: int = 0,
|
||||
):
|
||||
super().__init__(
|
||||
top_k,
|
||||
@@ -274,6 +297,7 @@ class EPPrefillRunner(EPRunner):
|
||||
MoEPhase.PREFILL,
|
||||
ep_size=ep_size,
|
||||
ep_rank=ep_rank,
|
||||
redundant_experts_num=redundant_experts_num,
|
||||
)
|
||||
|
||||
def dispatch(
|
||||
@@ -336,6 +360,7 @@ class EPDecoderRunner(EPRunner):
|
||||
num_max_dispatch_tokens_per_rank: int,
|
||||
ep_size: int = 1,
|
||||
ep_rank: int = 0,
|
||||
redundant_experts_num: int = 0,
|
||||
):
|
||||
super().__init__(
|
||||
top_k,
|
||||
@@ -345,6 +370,7 @@ class EPDecoderRunner(EPRunner):
|
||||
num_max_dispatch_tokens_per_rank,
|
||||
ep_size=ep_size,
|
||||
ep_rank=ep_rank,
|
||||
redundant_experts_num=redundant_experts_num,
|
||||
)
|
||||
|
||||
def dispatch(
|
||||
|
Reference in New Issue
Block a user