[Cherry-Pick] Fix eplb noaux (#5239) (#5240)

* fix eplb noaux
* fix eplb noaux
2025-12-24 13:28:13 +08:00 · 2025-11-26 17:51:10 +08:00
parent 49be443d02
commit 710753377f
2 changed files with 0 additions and 49 deletions
--- a/custom_ops/gpu_ops/noauxtc_kernel.h
+++ b/custom_ops/gpu_ops/noauxtc_kernel.h
@@ -969,33 +969,12 @@ void invokeNoAuxTcRedundant(T* scores,
  int64_t num_cases = num_tokens * n_group;
  int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;

-#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
  topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0, stream>>>(
      group_scores,
      scores_with_bias,
      num_cases,
      n_group,
      num_experts / n_group);
-#else
-  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
-  cudaLaunchConfig_t config;
-  config.gridDim = topk_with_k2_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = 0;
-  config.stream = stream;
-  cudaLaunchAttribute attrs[1];
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = false;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config,
-                     kernel_instance1,
-                     group_scores,
-                     scores_with_bias,
-                     num_cases,
-                     n_group,
-                     num_experts / n_group);
-#endif

  int64_t topk_with_k_group_num_blocks =
      (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -1003,7 +982,6 @@ void invokeNoAuxTcRedundant(T* scores,
      warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
                                                           topk);

-#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
  group_idx_and_topk_idx_redundant_kernel<T>
      <<<topk_with_k_group_num_blocks,
         BLOCK_SIZE,
@@ -1025,32 +1003,6 @@ void invokeNoAuxTcRedundant(T* scores,
                   num_experts / n_group,
                   routed_scaling_factor,
                   redundant_ep_rank_num_plus_one);
-#else
-  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
-  config.gridDim = topk_with_k_group_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = dynamic_smem_in_bytes;
-  config.stream = stream;
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = false;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config,
-                     kernel_instance2,
-                     scores,
-                     group_scores,
-                     topk_values,
-                     topk_indices,
-                     scores_with_bias,
-                     num_tokens,
-                     n_group,
-                     topk_group,
-                     topk,
-                     num_experts,
-                     num_experts / n_group,
-                     renormalize,
-                     routed_scaling_factor);
-#endif
 }

 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                      \
--- a/fastdeploy/eplb/experts_manager.py
+++ b/fastdeploy/eplb/experts_manager.py
@@ -448,7 +448,6 @@ class RedundantExpertManager:
            if (
                self.fd_config.scheduler_config.splitwise_role == "mixed"
                or self.fd_config.scheduler_config.splitwise_role == "decode"
-                or self.fd_config.scheduler_config.splitwise_role == "prefill"
                or not self.eplb_config.redundant_expert_enable_schedule_cordon
            ):
                self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")