mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-11-01 12:22:53 +08:00
remove dev sync in prefill (#4598)
This commit is contained in:
@@ -435,6 +435,7 @@ class EPPrefillRunner(EPRunner):
|
||||
x: paddle.Tensor,
|
||||
topk_idx: paddle.Tensor,
|
||||
topk_weights: paddle.Tensor,
|
||||
expert_alignment: int = 1,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
@@ -461,6 +462,7 @@ class EPPrefillRunner(EPRunner):
|
||||
"async_finish": self.ep_engine.async_finish,
|
||||
"topk_idx": topk_idx,
|
||||
"topk_weights": topk_weights,
|
||||
"expert_alignment": expert_alignment,
|
||||
}
|
||||
return buffer.dispatch(**dispatch_args)
|
||||
|
||||
|
||||
@@ -335,7 +335,9 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
|
||||
recv_num_tokens_per_expert_list,
|
||||
handle,
|
||||
_,
|
||||
) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights, x_scale_tensor=x_scale_tensor)
|
||||
) = self.ep_prefill_runner.dispatch(
|
||||
x, topk_idx, topk_weights, x_scale_tensor=x_scale_tensor, expert_alignment=128
|
||||
)
|
||||
|
||||
token_all_num = sum(recv_num_tokens_per_expert_list)
|
||||
|
||||
@@ -345,7 +347,6 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
|
||||
(recv_x, recv_x_scale) = recv_x
|
||||
|
||||
token_nums_this_rank = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
|
||||
token_nums_this_rank_padded = sum(token_nums_this_rank[1].numpy().tolist())
|
||||
|
||||
(
|
||||
permute_input,
|
||||
@@ -365,7 +366,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
|
||||
token_nums_this_rank[0],
|
||||
token_nums_this_rank[1],
|
||||
True, # use_in_ep
|
||||
token_nums_this_rank_padded,
|
||||
token_all_num,
|
||||
)
|
||||
|
||||
permute_scale = permute_scale.transpose([1, 0]).contiguous()
|
||||
|
||||
Reference in New Issue
Block a user