[Cherry-Pick] Fix eplb noaux(#5239) (#5240)

* fix eplb noaux

* fix eplb noaux
This commit is contained in:
xiaoxiaohehe001
2025-11-26 17:51:10 +08:00
committed by GitHub
parent 49be443d02
commit 710753377f
2 changed files with 0 additions and 49 deletions

View File

@@ -969,33 +969,12 @@ void invokeNoAuxTcRedundant(T* scores,
int64_t num_cases = num_tokens * n_group;
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0, stream>>>(
group_scores,
scores_with_bias,
num_cases,
n_group,
num_experts / n_group);
#else
auto* kernel_instance1 = &topk_with_k2_kernel<T>;
cudaLaunchConfig_t config;
config.gridDim = topk_with_k2_num_blocks;
config.blockDim = BLOCK_SIZE;
config.dynamicSmemBytes = 0;
config.stream = stream;
cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = false;
config.numAttrs = 1;
config.attrs = attrs;
cudaLaunchKernelEx(&config,
kernel_instance1,
group_scores,
scores_with_bias,
num_cases,
n_group,
num_experts / n_group);
#endif
int64_t topk_with_k_group_num_blocks =
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -1003,7 +982,6 @@ void invokeNoAuxTcRedundant(T* scores,
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
topk);
#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
group_idx_and_topk_idx_redundant_kernel<T>
<<<topk_with_k_group_num_blocks,
BLOCK_SIZE,
@@ -1025,32 +1003,6 @@ void invokeNoAuxTcRedundant(T* scores,
num_experts / n_group,
routed_scaling_factor,
redundant_ep_rank_num_plus_one);
#else
auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
config.gridDim = topk_with_k_group_num_blocks;
config.blockDim = BLOCK_SIZE;
config.dynamicSmemBytes = dynamic_smem_in_bytes;
config.stream = stream;
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = false;
config.numAttrs = 1;
config.attrs = attrs;
cudaLaunchKernelEx(&config,
kernel_instance2,
scores,
group_scores,
topk_values,
topk_indices,
scores_with_bias,
num_tokens,
n_group,
topk_group,
topk,
num_experts,
num_experts / n_group,
renormalize,
routed_scaling_factor);
#endif
}
#define INSTANTIATE_NOAUX_TC(T, IdxT) \

View File

@@ -448,7 +448,6 @@ class RedundantExpertManager:
if (
self.fd_config.scheduler_config.splitwise_role == "mixed"
or self.fd_config.scheduler_config.splitwise_role == "decode"
or self.fd_config.scheduler_config.splitwise_role == "prefill"
or not self.eplb_config.redundant_expert_enable_schedule_cordon
):
self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")