From 710753377f0fda73870ec684305b92d4e46096cd Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Wed, 26 Nov 2025 17:51:10 +0800 Subject: [PATCH] [Cherry-Pick] Fix eplb noaux(#5239) (#5240) * fix eplb noaux * fix eplb noaux --- custom_ops/gpu_ops/noauxtc_kernel.h | 48 ----------------------------- fastdeploy/eplb/experts_manager.py | 1 - 2 files changed, 49 deletions(-) diff --git a/custom_ops/gpu_ops/noauxtc_kernel.h b/custom_ops/gpu_ops/noauxtc_kernel.h index 7ac3bb174..c65ae1c5f 100644 --- a/custom_ops/gpu_ops/noauxtc_kernel.h +++ b/custom_ops/gpu_ops/noauxtc_kernel.h @@ -969,33 +969,12 @@ void invokeNoAuxTcRedundant(T* scores, int64_t num_cases = num_tokens * n_group; int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1; -#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU topk_with_k2_kernel<<>>( group_scores, scores_with_bias, num_cases, n_group, num_experts / n_group); -#else - auto* kernel_instance1 = &topk_with_k2_kernel; - cudaLaunchConfig_t config; - config.gridDim = topk_with_k2_num_blocks; - config.blockDim = BLOCK_SIZE; - config.dynamicSmemBytes = 0; - config.stream = stream; - cudaLaunchAttribute attrs[1]; - attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = false; - config.numAttrs = 1; - config.attrs = attrs; - cudaLaunchKernelEx(&config, - kernel_instance1, - group_scores, - scores_with_bias, - num_cases, - n_group, - num_experts / n_group); -#endif int64_t topk_with_k_group_num_blocks = (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1; @@ -1003,7 +982,6 @@ void invokeNoAuxTcRedundant(T* scores, warp_topk::calc_smem_size_for_block_wide(NUM_WARPS_PER_BLOCK, topk); -#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU group_idx_and_topk_idx_redundant_kernel <<; - config.gridDim = topk_with_k_group_num_blocks; - config.blockDim = BLOCK_SIZE; - config.dynamicSmemBytes = dynamic_smem_in_bytes; - config.stream = stream; - attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = false; - config.numAttrs = 1; - config.attrs = attrs; - cudaLaunchKernelEx(&config, - kernel_instance2, - scores, - group_scores, - topk_values, - topk_indices, - scores_with_bias, - num_tokens, - n_group, - topk_group, - topk, - num_experts, - num_experts / n_group, - renormalize, - routed_scaling_factor); -#endif } #define INSTANTIATE_NOAUX_TC(T, IdxT) \ diff --git a/fastdeploy/eplb/experts_manager.py b/fastdeploy/eplb/experts_manager.py index 3c3e91547..e140b3a89 100644 --- a/fastdeploy/eplb/experts_manager.py +++ b/fastdeploy/eplb/experts_manager.py @@ -448,7 +448,6 @@ class RedundantExpertManager: if ( self.fd_config.scheduler_config.splitwise_role == "mixed" or self.fd_config.scheduler_config.splitwise_role == "decode" - or self.fd_config.scheduler_config.splitwise_role == "prefill" or not self.eplb_config.redundant_expert_enable_schedule_cordon ): self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")