diff --git a/custom_ops/gpu_ops/noauxtc_kernel.h b/custom_ops/gpu_ops/noauxtc_kernel.h
index 7ac3bb174..c65ae1c5f 100644
--- a/custom_ops/gpu_ops/noauxtc_kernel.h
+++ b/custom_ops/gpu_ops/noauxtc_kernel.h
@@ -969,33 +969,12 @@ void invokeNoAuxTcRedundant(T* scores,
 
   int64_t num_cases = num_tokens * n_group;
   int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
-#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
   topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0, stream>>>(
       group_scores,
       scores_with_bias,
       num_cases,
       n_group,
       num_experts / n_group);
-#else
-  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
-  cudaLaunchConfig_t config;
-  config.gridDim = topk_with_k2_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = 0;
-  config.stream = stream;
-  cudaLaunchAttribute attrs[1];
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = false;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config,
-                     kernel_instance1,
-                     group_scores,
-                     scores_with_bias,
-                     num_cases,
-                     n_group,
-                     num_experts / n_group);
-#endif
 
   int64_t topk_with_k_group_num_blocks =
       (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -1003,7 +982,6 @@ void invokeNoAuxTcRedundant(T* scores,
   size_t dynamic_smem_in_bytes =
       warp_topk::calc_smem_size_for_block_wide(NUM_WARPS_PER_BLOCK, topk);
 
-#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
   group_idx_and_topk_idx_redundant_kernel<T, IdxT>
       <<<topk_with_k_group_num_blocks,
         BLOCK_SIZE,
@@ -1021,32 +999,6 @@ void invokeNoAuxTcRedundant(T* scores,
                   num_experts / n_group,
                   renormalize,
                   routed_scaling_factor);
-#else
-  auto* kernel_instance2 = &group_idx_and_topk_idx_redundant_kernel<T, IdxT>;
-  config.gridDim = topk_with_k_group_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = dynamic_smem_in_bytes;
-  config.stream = stream;
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = false;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config,
-                     kernel_instance2,
-                     scores,
-                     group_scores,
-                     topk_values,
-                     topk_indices,
-                     scores_with_bias,
-                     num_tokens,
-                     n_group,
-                     topk_group,
-                     topk,
-                     num_experts,
-                     num_experts / n_group,
-                     renormalize,
-                     routed_scaling_factor);
-#endif
 }
 
 #define INSTANTIATE_NOAUX_TC(T, IdxT) \
diff --git a/fastdeploy/eplb/experts_manager.py b/fastdeploy/eplb/experts_manager.py
index 3c3e91547..e140b3a89 100644
--- a/fastdeploy/eplb/experts_manager.py
+++ b/fastdeploy/eplb/experts_manager.py
@@ -448,7 +448,6 @@ class RedundantExpertManager:
         if (
             self.fd_config.scheduler_config.splitwise_role == "mixed"
             or self.fd_config.scheduler_config.splitwise_role == "decode"
-            or self.fd_config.scheduler_config.splitwise_role == "prefill"
             or not self.eplb_config.redundant_expert_enable_schedule_cordon
         ):
             self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")
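Note on the deleted `#else` branches in noauxtc_kernel.h: they launched the same kernels through CUDA's extended launch API (`cudaLaunchKernelEx`) with `cudaLaunchAttributeProgrammaticStreamSerialization` explicitly set to false, i.e. programmatic dependent launch disabled. With that attribute off, the launch is semantically an ordinary stream-ordered launch, which is presumably why the patch collapses both branches into the plain `<<<...>>>` form. A minimal sketch of the removed pattern, assuming a CUDA 11.8+ toolkit; `dummy_kernel`, `launch_without_pdl`, and the block size are hypothetical placeholders, while the config and attribute fields mirror the deleted code:

```cuda
#include <cuda_runtime.h>

// Hypothetical placeholder; stands in for topk_with_k2_kernel<T> etc.
__global__ void dummy_kernel(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;
}

// The pattern removed by this patch: launch via cudaLaunchKernelEx with the
// programmatic-stream-serialization attribute explicitly disabled, so the
// kernel observes normal stream ordering rather than overlapping with the
// previous kernel (programmatic dependent launch).
void launch_without_pdl(float* data, int n, cudaStream_t stream) {
  cudaLaunchConfig_t config = {};
  config.gridDim = (n + 255) / 256;  // placeholder grid/block sizing
  config.blockDim = 256;
  config.dynamicSmemBytes = 0;
  config.stream = stream;

  cudaLaunchAttribute attrs[1];
  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attrs[0].val.programmaticStreamSerializationAllowed = 0;  // PDL off
  config.numAttrs = 1;
  config.attrs = attrs;

  cudaLaunchKernelEx(&config, dummy_kernel, data, n);
}
```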