From 209970836eaea19478dc18b69ae7da90d871703a Mon Sep 17 00:00:00 2001
From: chen <103103266+ckl117@users.noreply.github.com>
Date: Wed, 26 Nov 2025 19:16:22 +0800
Subject: [PATCH] [BugFix] BF16 MoE Cutlass Backend Support EP (#5242)

---
 fastdeploy/config.py                              |  2 ++
 .../layers/moe/fused_moe_cutlass_backend.py       |  7 +++++--
 .../layers/moe/fused_moe_triton_backend.py        |  2 +-
 fastdeploy/model_executor/models/glm4_moe.py      | 14 ++++++++++++++
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 4cf05ea98..e3297e4d1 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -304,6 +304,8 @@ class ModelConfig:
 
         if hasattr(self, "num_experts") and getattr(self, "moe_num_experts") is None:
             self.moe_num_experts = self.num_experts
+        if hasattr(self, "n_routed_experts") and getattr(self, "moe_num_experts") is None:
+            self.moe_num_experts = self.n_routed_experts
 
     def read_from_env(self):
         """
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
index d00dd45a0..b83cc339e 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
@@ -206,7 +206,10 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
             tmp_ffn_out = recv_x
 
         # 4. EP combine
-        return self.ep_prefill_runner.combine(tmp_ffn_out, handle, recv_topk_weights)
+        tmp_ffn_out, event = self.ep_prefill_runner.combine(tmp_ffn_out, handle, recv_topk_weights)
+        if self.ep_prefill_runner.ep_engine.async_finish:
+            event.current_stream_wait()
+        return tmp_ffn_out
 
     def apply_ep_decode(
         self,
@@ -242,7 +245,7 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
         if self.moe_quant_type == "w4a8" or self.moe_quant_type == "w4afp8":
             num_local_experts, max_num, _ = permute_input.shape
             expert_idx_per_token = paddle.arange(num_local_experts)[:, None].tile([1, max_num])
-        elif self.moe_quant_type in ["weight_only_int8", "weight_only_int4"]:
+        elif self.moe_quant_type in ["weight_only_int8", "weight_only_int4", "w16a16"]:
             expert_idx_per_token = None
         else:
             raise NotImplementedError
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
index b3155e100..e26a051a7 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -808,7 +808,7 @@ class Wfp8Afp8MoEMethod(QuantMethodBase):
             N=hidden_size,
             K=moe_intermediate_size,
             stride_am=x_q.strides[0],
-            stride_ak=x_scale.strides[1],
+            stride_ak=x_q.strides[1],
             stride_be=layer.down_proj_weight.strides[0],
             stride_bk=layer.down_proj_weight.strides[2],
             stride_bn=layer.down_proj_weight.strides[1],
diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py
index 8850ce812..a095b7b04 100644
--- a/fastdeploy/model_executor/models/glm4_moe.py
+++ b/fastdeploy/model_executor/models/glm4_moe.py
@@ -494,6 +494,20 @@ class Glm4MoeForCausalLM(ModelForCasualLM):
 
         return logits
 
+    def empty_input_forward(self):
+        """
+        empty_input_forward
+        """
+        fake_hidden_states = paddle.ones(
+            shape=[1, self.fd_config.model_config.hidden_size],
+            dtype=paddle.get_default_dtype(),
+        )
+        for i in range(
+            self.fd_config.model_config.first_k_dense_replace,
+            self.fd_config.model_config.num_hidden_layers,
+        ):
+            self.model.layers[i].mlp.experts(fake_hidden_states, self.model.layers[i].mlp.gate)
+
     def forward(
         self,
         ids_remove_padding: paddle.Tensor,
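
A note on the fused_moe_cutlass_backend.py hunk above: the EP prefill path previously returned the result of combine() directly, whereas the patch unpacks it as (tmp_ffn_out, event) and, when the EP engine runs with async_finish, waits on the event before returning the tensor. The snippet below is a minimal, runnable sketch of that pattern under stated assumptions; FakeEvent and fake_combine are illustrative stand-ins, not FastDeploy or DeepEP APIs.

class FakeEvent:
    def current_stream_wait(self):
        # A real event would make the current CUDA stream wait until the
        # all-to-all combine communication has finished.
        print("waiting on combine event")


def fake_combine(ffn_out, handle, topk_weights, async_finish=True):
    # Stand-in for self.ep_prefill_runner.combine(...), which after this patch
    # returns (tmp_ffn_out, event) rather than the combined output alone.
    return ffn_out, (FakeEvent() if async_finish else None)


tmp_ffn_out, event = fake_combine([0.1, 0.2], handle=None, topk_weights=None)
if event is not None:  # mirrors: if self.ep_prefill_runner.ep_engine.async_finish:
    event.current_stream_wait()
print(tmp_ffn_out)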