diff --git a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu index 2e2e8c7ba..d00e63875 100644 --- a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu +++ b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu @@ -331,7 +331,7 @@ void GetBlockShapeAndSplitKVBlock( // decoder if (max_dec_len_this_time > 0) { - const bool mla_use_tensorcore = GetMlaUseTensorcore(); + const bool mla_use_tensorcore = true; //GetMlaUseTensorcore(); if (mla_use_tensorcore && group_size <= 64) { const int set_chunk_size = get_mla_dec_chunk_size(bsz); diff --git a/custom_ops/gpu_ops/multi_head_latent_attention.cu b/custom_ops/gpu_ops/multi_head_latent_attention.cu index 126b014b8..6e804f3eb 100644 --- a/custom_ops/gpu_ops/multi_head_latent_attention.cu +++ b/custom_ops/gpu_ops/multi_head_latent_attention.cu @@ -66,7 +66,7 @@ std::vector MultiHeadLatentAttentionKernel( // int chunk_size = decoder_chunk_size_cpu.data()[0]; // - const bool mla_use_tensorcore = get_mla_use_tensorcore(); + const bool mla_use_tensorcore = true; //get_mla_use_tensorcore(); auto sm_version = GetSMVersion(); if ((speculate_decoder || mla_use_tensorcore) && sm_version < 90) { PD_THROW("Please use speculate_decoder=0 and FLAGS_mla_use_tensorcore=0 when sm < 90.");