From 710753377f0fda73870ec684305b92d4e46096cd Mon Sep 17 00:00:00 2001
From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com>
Date: Wed, 26 Nov 2025 17:51:10 +0800
Subject: [PATCH] [Cherry-Pick] Fix eplb noaux(#5239) (#5240)

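* fix eplb noaux

* fix eplb noaux

invokeNoAuxTcRedundant: drop the PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
guards and the cudaLaunchKernelEx branch so both kernels are always
launched with the <<<...>>> syntax. The removed branch launched
group_idx_and_topk_idx_kernel<T, IdxT> (with a different argument list)
rather than group_idx_and_topk_idx_redundant_kernel<T>.

RedundantExpertManager: remove the "prefill" splitwise_role from the
condition that gates the "allreduce_load_weight_result success, notify
infer.py" branch.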
---
 custom_ops/gpu_ops/noauxtc_kernel.h | 48 -----------------------------
 fastdeploy/eplb/experts_manager.py  |  1 -
 2 files changed, 49 deletions(-)
diff --git a/custom_ops/gpu_ops/noauxtc_kernel.h b/custom_ops/gpu_ops/noauxtc_kernel.h
index 7ac3bb174..c65ae1c5f 100644
--- a/custom_ops/gpu_ops/noauxtc_kernel.h
+++ b/custom_ops/gpu_ops/noauxtc_kernel.h
@@ -969,33 +969,12 @@ void invokeNoAuxTcRedundant(T* scores,
   int64_t num_cases = num_tokens * n_group;
   int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
 
-#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
   topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0, stream>>>(
       group_scores,
       scores_with_bias,
       num_cases,
       n_group,
       num_experts / n_group);
-#else
-  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
-  cudaLaunchConfig_t config;
-  config.gridDim = topk_with_k2_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = 0;
-  config.stream = stream;
-  cudaLaunchAttribute attrs[1];
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = false;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config,
-                     kernel_instance1,
-                     group_scores,
-                     scores_with_bias,
-                     num_cases,
-                     n_group,
-                     num_experts / n_group);
-#endif
 
   int64_t topk_with_k_group_num_blocks =
       (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -1003,7 +982,6 @@ void invokeNoAuxTcRedundant(T* scores,
       warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
                                                            topk);
 
-#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
   group_idx_and_topk_idx_redundant_kernel<T>
       <<<topk_with_k_group_num_blocks,
          BLOCK_SIZE,
@@ -1025,32 +1003,6 @@ void invokeNoAuxTcRedundant(T* scores,
                    num_experts / n_group,
                    routed_scaling_factor,
                    redundant_ep_rank_num_plus_one);
-#else
-  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
-  config.gridDim = topk_with_k_group_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = dynamic_smem_in_bytes;
-  config.stream = stream;
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = false;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config,
-                     kernel_instance2,
-                     scores,
-                     group_scores,
-                     topk_values,
-                     topk_indices,
-                     scores_with_bias,
-                     num_tokens,
-                     n_group,
-                     topk_group,
-                     topk,
-                     num_experts,
-                     num_experts / n_group,
-                     renormalize,
-                     routed_scaling_factor);
-#endif
 }
 
 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                      \
diff --git a/fastdeploy/eplb/experts_manager.py b/fastdeploy/eplb/experts_manager.py
index 3c3e91547..e140b3a89 100644
--- a/fastdeploy/eplb/experts_manager.py
+++ b/fastdeploy/eplb/experts_manager.py
@@ -448,7 +448,6 @@ class RedundantExpertManager:
             if (
                 self.fd_config.scheduler_config.splitwise_role == "mixed"
                 or self.fd_config.scheduler_config.splitwise_role == "decode"
-                or self.fd_config.scheduler_config.splitwise_role == "prefill"
                 or not self.eplb_config.redundant_expert_enable_schedule_cordon
             ):
                 self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")