From aa35ce449d7461ebe05732be957c409c0d1d85d8 Mon Sep 17 00:00:00 2001 From: chen <103103266+ckl117@users.noreply.github.com> Date: Mon, 1 Dec 2025 21:10:40 +0800 Subject: [PATCH] [Optimization] EP empty_input_forward Remove Communication (#5254) --- custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu | 5 +++++ .../model_executor/layers/moe/fused_moe_cutlass_backend.py | 4 +++- fastdeploy/model_executor/models/glm4_moe.py | 4 ++-- fastdeploy/model_executor/models/qwen3moe.py | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu index 369a92ee2..4416d2045 100644 --- a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu +++ b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu @@ -58,6 +58,11 @@ __VA_ARGS__ \ break; \ } \ + case 20: { \ + constexpr size_t NUM_EXPERTS_PER_RANK = 20; \ + __VA_ARGS__ \ + break; \ + } \ case 32: { \ constexpr size_t NUM_EXPERTS_PER_RANK = 32; \ __VA_ARGS__ \ diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index d87894b81..4802a6aab 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -146,8 +146,10 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod): recv_topk_weights, recv_num_tokens_per_expert_list, handle, - _, + event, ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights) + if self.ep_prefill_runner.ep_engine.async_finish: + event.current_stream_wait() token_all_num = sum(recv_num_tokens_per_expert_list) # 3. Compute ffn diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index a095b7b04..9cd0c7003 100644 --- a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -498,8 +498,8 @@ class Glm4MoeForCausalLM(ModelForCasualLM): """ empty_input_forward """ - fake_hidden_states = paddle.ones( - shape=[1, self.fd_config.model_config.hidden_size], + fake_hidden_states = paddle.empty( + shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) for i in range( diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 3e9a72d76..74adb5cc3 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -421,7 +421,7 @@ class Qwen3MoeForCausalLM(ModelForCasualLM): empty_input_forward """ fake_hidden_states = paddle.empty( - shape=[1, self.fd_config.model_config.hidden_size], + shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) for i in range(