mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
* fix eplb noaux * fix eplb noaux
This commit is contained in:
@@ -969,33 +969,12 @@ void invokeNoAuxTcRedundant(T* scores,
|
||||
int64_t num_cases = num_tokens * n_group;
|
||||
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
|
||||
#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
|
||||
topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0, stream>>>(
|
||||
group_scores,
|
||||
scores_with_bias,
|
||||
num_cases,
|
||||
n_group,
|
||||
num_experts / n_group);
|
||||
#else
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T>;
|
||||
cudaLaunchConfig_t config;
|
||||
config.gridDim = topk_with_k2_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = 0;
|
||||
config.stream = stream;
|
||||
cudaLaunchAttribute attrs[1];
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = false;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
cudaLaunchKernelEx(&config,
|
||||
kernel_instance1,
|
||||
group_scores,
|
||||
scores_with_bias,
|
||||
num_cases,
|
||||
n_group,
|
||||
num_experts / n_group);
|
||||
#endif
|
||||
|
||||
int64_t topk_with_k_group_num_blocks =
|
||||
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
@@ -1003,7 +982,6 @@ void invokeNoAuxTcRedundant(T* scores,
|
||||
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
|
||||
topk);
|
||||
|
||||
#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
|
||||
group_idx_and_topk_idx_redundant_kernel<T>
|
||||
<<<topk_with_k_group_num_blocks,
|
||||
BLOCK_SIZE,
|
||||
@@ -1025,32 +1003,6 @@ void invokeNoAuxTcRedundant(T* scores,
|
||||
num_experts / n_group,
|
||||
routed_scaling_factor,
|
||||
redundant_ep_rank_num_plus_one);
|
||||
#else
|
||||
auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
|
||||
config.gridDim = topk_with_k_group_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = dynamic_smem_in_bytes;
|
||||
config.stream = stream;
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = false;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
cudaLaunchKernelEx(&config,
|
||||
kernel_instance2,
|
||||
scores,
|
||||
group_scores,
|
||||
topk_values,
|
||||
topk_indices,
|
||||
scores_with_bias,
|
||||
num_tokens,
|
||||
n_group,
|
||||
topk_group,
|
||||
topk,
|
||||
num_experts,
|
||||
num_experts / n_group,
|
||||
renormalize,
|
||||
routed_scaling_factor);
|
||||
#endif
|
||||
}
|
||||
|
||||
#define INSTANTIATE_NOAUX_TC(T, IdxT) \
|
||||
|
||||
@@ -448,7 +448,6 @@ class RedundantExpertManager:
|
||||
if (
|
||||
self.fd_config.scheduler_config.splitwise_role == "mixed"
|
||||
or self.fd_config.scheduler_config.splitwise_role == "decode"
|
||||
or self.fd_config.scheduler_config.splitwise_role == "prefill"
|
||||
or not self.eplb_config.redundant_expert_enable_schedule_cordon
|
||||
):
|
||||
self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")
|
||||
|
||||
Reference in New Issue
Block a user