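fast compile: comment out kernel dispatch branches and explicit template instantiations, and delete template-instantiation files, to reduce build time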

2025-12-24 13:28:13 +08:00 · 2025-09-17 20:12:30 +08:00
parent fdf49de161
commit 834639a7ff
18 changed files with 147 additions and 1324 deletions
--- a/custom_ops/gpu_ops/append_attn/append_attention_kernel.h
+++ b/custom_ops/gpu_ops/append_attn/append_attention_kernel.h
@@ -232,113 +232,113 @@ void CascadeAppendAttentionKernel(
                                                enable_prefill,
                                                stream,
                                                out);
-    } else if (cache_quant_type_str == "cache_int8") {
-        CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                cache_quant_type_str,
-                                                stream,
-                                                out);
-    } else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
-        CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                cache_quant_type_str,
-                                                stream,
-                                                out);
-    } else if (cache_quant_type_str == "cache_int4_zp") {
-        CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                stream,
-                                                out);
+    // } else if (cache_quant_type_str == "cache_int8") {
+    //     CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
+    //                                             qkv,
+    //                                             cache_k,
+    //                                             cache_v,
+    //                                             attn_mask,
+    //                                             cache_k_scale,
+    //                                             cache_v_scale,
+    //                                             cache_k_zp,
+    //                                             cache_v_zp,
+    //                                             shift_bias,
+    //                                             smooth_weight,
+    //                                             seq_lens_q,
+    //                                             seq_lens_kv,
+    //                                             seq_lens_encoder,
+    //                                             batch_id_per_token,
+    //                                             cu_seqlens_q,
+    //                                             block_table,
+    //                                             batch_ids,
+    //                                             tile_ids_per_batch,
+    //                                             num_blocks,
+    //                                             block_shape_q,
+    //                                             max_seq_len,
+    //                                             max_dec_len,
+    //                                             quant_max_bound,
+    //                                             quant_min_bound,
+    //                                             in_scale,
+    //                                             max_partition_size,
+    //                                             encoder_max_partition_size,
+    //                                             speculate_max_draft_token_num,
+    //                                             causal,
+    //                                             is_decoder,
+    //                                             enable_prefill,
+    //                                             cache_quant_type_str,
+    //                                             stream,
+    //                                             out);
+    // } else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
+    //     CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
+    //                                             qkv,
+    //                                             cache_k,
+    //                                             cache_v,
+    //                                             attn_mask,
+    //                                             cache_k_scale,
+    //                                             cache_v_scale,
+    //                                             cache_k_zp,
+    //                                             cache_v_zp,
+    //                                             shift_bias,
+    //                                             smooth_weight,
+    //                                             seq_lens_q,
+    //                                             seq_lens_kv,
+    //                                             seq_lens_encoder,
+    //                                             batch_id_per_token,
+    //                                             cu_seqlens_q,
+    //                                             block_table,
+    //                                             batch_ids,
+    //                                             tile_ids_per_batch,
+    //                                             num_blocks,
+    //                                             block_shape_q,
+    //                                             max_seq_len,
+    //                                             max_dec_len,
+    //                                             quant_max_bound,
+    //                                             quant_min_bound,
+    //                                             in_scale,
+    //                                             max_partition_size,
+    //                                             encoder_max_partition_size,
+    //                                             speculate_max_draft_token_num,
+    //                                             causal,
+    //                                             is_decoder,
+    //                                             enable_prefill,
+    //                                             cache_quant_type_str,
+    //                                             stream,
+    //                                             out);
+    // } else if (cache_quant_type_str == "cache_int4_zp") {
+    //     CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
+    //                                             qkv,
+    //                                             cache_k,
+    //                                             cache_v,
+    //                                             attn_mask,
+    //                                             cache_k_scale,
+    //                                             cache_v_scale,
+    //                                             cache_k_zp,
+    //                                             cache_v_zp,
+    //                                             shift_bias,
+    //                                             smooth_weight,
+    //                                             seq_lens_q,
+    //                                             seq_lens_kv,
+    //                                             seq_lens_encoder,
+    //                                             batch_id_per_token,
+    //                                             cu_seqlens_q,
+    //                                             block_table,
+    //                                             batch_ids,
+    //                                             tile_ids_per_batch,
+    //                                             num_blocks,
+    //                                             block_shape_q,
+    //                                             max_seq_len,
+    //                                             max_dec_len,
+    //                                             quant_max_bound,
+    //                                             quant_min_bound,
+    //                                             in_scale,
+    //                                             max_partition_size,
+    //                                             encoder_max_partition_size,
+    //                                             speculate_max_draft_token_num,
+    //                                             causal,
+    //                                             is_decoder,
+    //                                             enable_prefill,
+    //                                             stream,
+    //                                             out);
    } else {
        PD_THROW(
            "cache_quant_type_str should be one of [none, cache_int8, "
--- a/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu
@@ -517,44 +517,44 @@ void DecodeMLAAttentionKernel(
                  block_table, max_seq_len, max_dec_len, rope_scale, rope_theta, softmax_scale, in_scale, out);})})})})})});
 }

-template void DecodeMLAAttentionKernel<paddle::bfloat16>(
-  const AppendAttnMetaData& meta_data,
-  const paddle::Tensor &q, // [token_num, num_heads, head_dim]
-  const paddle::Tensor &cache_k,
-  const paddle::Tensor &cache_v,
-  const paddle::optional<paddle::Tensor>& attn_mask,
-  const paddle::optional<paddle::Tensor>& shift_bias,
-  const paddle::optional<paddle::Tensor>& smooth_weight,
-  const paddle::Tensor &seq_lens_q, // q_seq_len is 1
-  const paddle::Tensor &seq_lens_kv,
-  const paddle::Tensor &batch_id_per_token,
-  const paddle::Tensor &cu_seqlens_q,
-  const paddle::Tensor &block_table,
-  int max_seq_len,
-  int max_dec_len,
-  float softmax_scale,
-  float in_scale,
-  bool causal,
-  cudaStream_t &stream,
-  paddle::Tensor *out);
+// template void DecodeMLAAttentionKernel<paddle::bfloat16>(
+//   const AppendAttnMetaData& meta_data,
+//   const paddle::Tensor &q, // [token_num, num_heads, head_dim]
+//   const paddle::Tensor &cache_k,
+//   const paddle::Tensor &cache_v,
+//   const paddle::optional<paddle::Tensor>& attn_mask,
+//   const paddle::optional<paddle::Tensor>& shift_bias,
+//   const paddle::optional<paddle::Tensor>& smooth_weight,
+//   const paddle::Tensor &seq_lens_q, // q_seq_len is 1
+//   const paddle::Tensor &seq_lens_kv,
+//   const paddle::Tensor &batch_id_per_token,
+//   const paddle::Tensor &cu_seqlens_q,
+//   const paddle::Tensor &block_table,
+//   int max_seq_len,
+//   int max_dec_len,
+//   float softmax_scale,
+//   float in_scale,
+//   bool causal,
+//   cudaStream_t &stream,
+//   paddle::Tensor *out);

-template void DecodeMLAAttentionKernel<paddle::float16>(
-  const AppendAttnMetaData& meta_data,
-  const paddle::Tensor &q, // [token_num, num_heads, head_dim]
-  const paddle::Tensor &cache_k,
-  const paddle::Tensor &cache_v,
-  const paddle::optional<paddle::Tensor>& attn_mask,
-  const paddle::optional<paddle::Tensor>& shift_bias,
-  const paddle::optional<paddle::Tensor>& smooth_weight,
-  const paddle::Tensor &seq_lens_q, // q_seq_len is 1
-  const paddle::Tensor &seq_lens_kv,
-  const paddle::Tensor &batch_id_per_token,
-  const paddle::Tensor &cu_seqlens_q,
-  const paddle::Tensor &block_table,
-  int max_seq_len,
-  int max_dec_len,
- float softmax_scale,
-  float in_scale,
-  bool causal,
-  cudaStream_t &stream,
-  paddle::Tensor *out);
+// template void DecodeMLAAttentionKernel<paddle::float16>(
+//   const AppendAttnMetaData& meta_data,
+//   const paddle::Tensor &q, // [token_num, num_heads, head_dim]
+//   const paddle::Tensor &cache_k,
+//   const paddle::Tensor &cache_v,
+//   const paddle::optional<paddle::Tensor>& attn_mask,
+//   const paddle::optional<paddle::Tensor>& shift_bias,
+//   const paddle::optional<paddle::Tensor>& smooth_weight,
+//   const paddle::Tensor &seq_lens_q, // q_seq_len is 1
+//   const paddle::Tensor &seq_lens_kv,
+//   const paddle::Tensor &batch_id_per_token,
+//   const paddle::Tensor &cu_seqlens_q,
+//   const paddle::Tensor &block_table,
+//   int max_seq_len,
+//   int max_dec_len,
+//  float softmax_scale,
+//   float in_scale,
+//   bool causal,
+//   cudaStream_t &stream,
+//   paddle::Tensor *out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::bfloat16>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::float8_e4m3fn>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, int8_t>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu
@@ -1,59 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-
-template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float16>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float8_e4m3fn>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::float16, int8_t>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu
@@ -1,109 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-
-template void
-CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-
-template void
-CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu
@@ -1,104 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4m3fn, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4m3fn, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu
@@ -1,106 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-
-template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu
@@ -1,106 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu
@@ -1,105 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m3fn, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m3fn, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu
@@ -1,105 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    const std::string& cache_quant_type_str,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu
@@ -1,49 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../encoder_write_cache_with_rope_kernel.h"
-
-template void
-EncoderWriteCacheWithRopeKernel<paddle::bfloat16, paddle::bfloat16>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor&
-        qkv,  // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
-              // kv_num_heads, head_dim] if GQA)
-    const paddle::Tensor& seq_lens_this_time,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& seq_lens_decoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_tables,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids,
-    const paddle::optional<paddle::Tensor>& rotary_embs,
-    const paddle::optional<paddle::Tensor>& qkv_out_scales,
-    const paddle::optional<paddle::Tensor>& qkv_biases,
-    const paddle::optional<paddle::Tensor>& cache_k_scale,
-    const paddle::optional<paddle::Tensor>& cache_v_scale,
-    const paddle::optional<paddle::Tensor>& cache_k_zp,
-    const paddle::optional<paddle::Tensor>& cache_v_zp,
-    const paddle::optional<paddle::Tensor>& kv_signal_data,
-    const std::string& cache_quant_type_str,
-    const int num_blocks,
-    const int max_seq_len,
-    const bool use_neox_style,
-    const bool rope_3d,
-    cudaStream_t& stream,
-    paddle::Tensor* qkv_out,
-    paddle::Tensor* key_cache_out,
-    paddle::Tensor* value_cache_out,
-    const paddle::optional<paddle::Tensor>& q_norm_weight,
-    const paddle::optional<paddle::Tensor>& k_norm_weight,
-    const float rms_norm_eps);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu
@@ -1,48 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../encoder_write_cache_with_rope_kernel.h"
-
-template void EncoderWriteCacheWithRopeKernel<paddle::bfloat16, int>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor&
-        qkv,  // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
-              // kv_num_heads, head_dim] if GQA)
-    const paddle::Tensor& seq_lens_this_time,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& seq_lens_decoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_tables,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids,
-    const paddle::optional<paddle::Tensor>& rotary_embs,
-    const paddle::optional<paddle::Tensor>& qkv_out_scales,
-    const paddle::optional<paddle::Tensor>& qkv_biases,
-    const paddle::optional<paddle::Tensor>& cache_k_scale,
-    const paddle::optional<paddle::Tensor>& cache_v_scale,
-    const paddle::optional<paddle::Tensor>& cache_k_zp,
-    const paddle::optional<paddle::Tensor>& cache_v_zp,
-    const paddle::optional<paddle::Tensor>& kv_signal_data,
-    const std::string& cache_quant_type_str,
-    const int num_blocks,
-    const int max_seq_len,
-    const bool use_neox_style,
-    const bool rope_3d,
-    cudaStream_t& stream,
-    paddle::Tensor* qkv_out,
-    paddle::Tensor* key_cache_out,
-    paddle::Tensor* value_cache_out,
-    const paddle::optional<paddle::Tensor>& q_norm_weight,
-    const paddle::optional<paddle::Tensor>& k_norm_weight,
-    const float rms_norm_eps);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu
@@ -1,48 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../encoder_write_cache_with_rope_kernel.h"
-
-template void EncoderWriteCacheWithRopeKernel<paddle::float16, paddle::float16>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor&
-        qkv,  // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
-              // kv_num_heads, head_dim] if GQA)
-    const paddle::Tensor& seq_lens_this_time,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& seq_lens_decoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_tables,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids,
-    const paddle::optional<paddle::Tensor>& rotary_embs,
-    const paddle::optional<paddle::Tensor>& qkv_out_scales,
-    const paddle::optional<paddle::Tensor>& qkv_biases,
-    const paddle::optional<paddle::Tensor>& cache_k_scale,
-    const paddle::optional<paddle::Tensor>& cache_v_scale,
-    const paddle::optional<paddle::Tensor>& cache_k_zp,
-    const paddle::optional<paddle::Tensor>& cache_v_zp,
-    const paddle::optional<paddle::Tensor>& kv_signal_data,
-    const std::string& cache_quant_type_str,
-    const int num_blocks,
-    const int max_seq_len,
-    const bool use_neox_style,
-    const bool rope_3d,
-    cudaStream_t& stream,
-    paddle::Tensor* qkv_out,
-    paddle::Tensor* key_cache_out,
-    paddle::Tensor* value_cache_out,
-    const paddle::optional<paddle::Tensor>& q_norm_weight,
-    const paddle::optional<paddle::Tensor>& k_norm_weight,
-    const float rms_norm_eps);
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu
@@ -1,48 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../encoder_write_cache_with_rope_kernel.h"
-
-template void EncoderWriteCacheWithRopeKernel<paddle::float16, int>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor&
-        qkv,  // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
-              // kv_num_heads, head_dim] if GQA)
-    const paddle::Tensor& seq_lens_this_time,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& seq_lens_decoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_tables,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids,
-    const paddle::optional<paddle::Tensor>& rotary_embs,
-    const paddle::optional<paddle::Tensor>& qkv_out_scales,
-    const paddle::optional<paddle::Tensor>& qkv_biases,
-    const paddle::optional<paddle::Tensor>& cache_k_scale,
-    const paddle::optional<paddle::Tensor>& cache_v_scale,
-    const paddle::optional<paddle::Tensor>& cache_k_zp,
-    const paddle::optional<paddle::Tensor>& cache_v_zp,
-    const paddle::optional<paddle::Tensor>& kv_signal_data,
-    const std::string& cache_quant_type_str,
-    const int num_blocks,
-    const int max_seq_len,
-    const bool use_neox_style,
-    const bool rope_3d,
-    cudaStream_t& stream,
-    paddle::Tensor* qkv_out,
-    paddle::Tensor* key_cache_out,
-    paddle::Tensor* value_cache_out,
-    const paddle::optional<paddle::Tensor>& q_norm_weight,
-    const paddle::optional<paddle::Tensor>& k_norm_weight,
-    const float rms_norm_eps);