diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh
index 815d0947d..375052431 100644
--- a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh
@@ -998,8 +998,10 @@ void MultiQueryAppendAttention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
-                         num_heads);  // 128k is too large
+        // dim3 grids_merge(min(sm_count * 4, token_num),
+        //                  num_heads);  // 128k is too large
+        printf("\nsm_count * 4: %d\t token_num:%d", sm_count * 4, token_num);
+        dim3 grids_merge(sm_count * 4, num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
                                      vec_size,
@@ -1233,8 +1235,10 @@ void MultiQueryAppendAttention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
-                          num_heads);
+        // dim3 grids_merge(min(sm_count * 4, token_num),
+        //                   num_heads);
+        printf("\nsm_count * 4: %d\t token_num:%d", sm_count * 4, token_num);
+        dim3 grids_merge(sm_count * 4, num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
                                       vec_size,
diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c4_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c4_impl.cuh
index 6f403ed47..1c980b09e 100644
--- a/custom_ops/gpu_ops/append_attn/append_attention_c4_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/append_attention_c4_impl.cuh
@@ -1210,8 +1210,9 @@ void MultiQueryAppendC4Attention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
-                         num_heads);
+        // dim3 grids_merge(min(sm_count * 4, token_num),
+        //                   num_heads);
+        dim3 grids_merge(sm_count * 4, num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
                                      vec_size,
@@ -1472,8 +1473,9 @@ void MultiQueryAppendC4Attention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
-                          num_heads);
+        // dim3 grids_merge(min(sm_count * 4, token_num),
+        //                   num_heads);
+        dim3 grids_merge(sm_count * 4, num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
                                       vec_size,
diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c8_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c8_impl.cuh
index a17d5ae29..1128f5548 100644
--- a/custom_ops/gpu_ops/append_attn/append_attention_c8_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/append_attention_c8_impl.cuh
@@ -1170,8 +1170,9 @@ void MultiQueryAppendC8Attention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
-                         num_heads);
+        // dim3 grids_merge(min(sm_count * 4, token_num),
+        //                   num_heads);
+        dim3 grids_merge(sm_count * 4, num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
                                      vec_size,
@@ -1444,8 +1445,9 @@ void MultiQueryAppendC8Attention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
-                          num_heads);
+        // dim3 grids_merge(min(sm_count * 4, token_num),
+        //                   num_heads);
+        dim3 grids_merge(sm_count * 4, num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
                                       vec_size,
diff --git a/custom_ops/gpu_ops/append_attn/append_attention_kernel.h b/custom_ops/gpu_ops/append_attn/append_attention_kernel.h
index 8799c0a70..f539839a0 100644
--- a/custom_ops/gpu_ops/append_attn/append_attention_kernel.h
+++ b/custom_ops/gpu_ops/append_attn/append_attention_kernel.h
@@ -231,111 +231,6 @@ void CascadeAppendAttentionKernel(
                                                 enable_prefill,
                                                 stream,
                                                 out);
-    } else if (cache_quant_type_str == "cache_int8") {
-        CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                stream,
-                                                out);
-    } else if (cache_quant_type_str == "cache_fp8") {
-        CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                stream,
-                                                out);
-    } else if (cache_quant_type_str == "cache_int4_zp") {
-        CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                stream,
-                                                out);
     } else {
         PD_THROW(
             "cache_quant_type_str should be one of [none, cache_int8, "
diff --git a/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu b/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu
index 701ba42df..69493ce0d 100644
--- a/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu
@@ -517,44 +517,44 @@ void DecodeMLAAttentionKernel(
                   block_table, max_seq_len, max_dec_len, rope_scale, rope_theta, softmax_scale, in_scale, out);})})})})})});
 }
 
-template void DecodeMLAAttentionKernel<paddle::bfloat16>(
-  const AppendAttnMetaData& meta_data,
-  const paddle::Tensor &q, // [token_num, num_heads, head_dim]
-  const paddle::Tensor &cache_k,
-  const paddle::Tensor &cache_v,
-  const paddle::optional<paddle::Tensor>& attn_mask,
-  const paddle::optional<paddle::Tensor>& shift_bias,
-  const paddle::optional<paddle::Tensor>& smooth_weight,
-  const paddle::Tensor &seq_lens_q, // q_seq_len is 1
-  const paddle::Tensor &seq_lens_kv,
-  const paddle::Tensor &batch_id_per_token,
-  const paddle::Tensor &cu_seqlens_q,
-  const paddle::Tensor &block_table,
-  int max_seq_len,
-  int max_dec_len,
-  float softmax_scale,
-  float in_scale,
-  bool causal,
-  cudaStream_t &stream,
-  paddle::Tensor *out);
+// template void DecodeMLAAttentionKernel<paddle::bfloat16>(
+//   const AppendAttnMetaData& meta_data,
+//   const paddle::Tensor &q, // [token_num, num_heads, head_dim]
+//   const paddle::Tensor &cache_k,
+//   const paddle::Tensor &cache_v,
+//   const paddle::optional<paddle::Tensor>& attn_mask,
+//   const paddle::optional<paddle::Tensor>& shift_bias,
+//   const paddle::optional<paddle::Tensor>& smooth_weight,
+//   const paddle::Tensor &seq_lens_q, // q_seq_len is 1
+//   const paddle::Tensor &seq_lens_kv,
+//   const paddle::Tensor &batch_id_per_token,
+//   const paddle::Tensor &cu_seqlens_q,
+//   const paddle::Tensor &block_table,
+//   int max_seq_len,
+//   int max_dec_len,
+//   float softmax_scale,
+//   float in_scale,
+//   bool causal,
+//   cudaStream_t &stream,
+//   paddle::Tensor *out);
 
-template void DecodeMLAAttentionKernel<paddle::float16>(
-  const AppendAttnMetaData& meta_data,
-  const paddle::Tensor &q, // [token_num, num_heads, head_dim]
-  const paddle::Tensor &cache_k,
-  const paddle::Tensor &cache_v,
-  const paddle::optional<paddle::Tensor>& attn_mask,
-  const paddle::optional<paddle::Tensor>& shift_bias,
-  const paddle::optional<paddle::Tensor>& smooth_weight,
-  const paddle::Tensor &seq_lens_q, // q_seq_len is 1
-  const paddle::Tensor &seq_lens_kv,
-  const paddle::Tensor &batch_id_per_token,
-  const paddle::Tensor &cu_seqlens_q,
-  const paddle::Tensor &block_table,
-  int max_seq_len,
-  int max_dec_len,
- float softmax_scale,
-  float in_scale,
-  bool causal,
-  cudaStream_t &stream,
-  paddle::Tensor *out);
+// template void DecodeMLAAttentionKernel<paddle::float16>(
+//   const AppendAttnMetaData& meta_data,
+//   const paddle::Tensor &q, // [token_num, num_heads, head_dim]
+//   const paddle::Tensor &cache_k,
+//   const paddle::Tensor &cache_v,
+//   const paddle::optional<paddle::Tensor>& attn_mask,
+//   const paddle::optional<paddle::Tensor>& shift_bias,
+//   const paddle::optional<paddle::Tensor>& smooth_weight,
+//   const paddle::Tensor &seq_lens_q, // q_seq_len is 1
+//   const paddle::Tensor &seq_lens_kv,
+//   const paddle::Tensor &batch_id_per_token,
+//   const paddle::Tensor &cu_seqlens_q,
+//   const paddle::Tensor &block_table,
+//   int max_seq_len,
+//   int max_dec_len,
+//  float softmax_scale,
+//   float in_scale,
+//   bool causal,
+//   cudaStream_t &stream,
+//   paddle::Tensor *out);
diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
index 57612c458..e693deaf1 100644
--- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
@@ -226,6 +226,7 @@ __global__ void append_speculate_cache_rope_kernel(
 
     const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
     const int block_idx = block_table_now[write_seq_id / block_size];
+    // printf("\nblock_idx:%d\t", block_idx);
     if (block_idx < 0) {
       printf(
           "Fatal Error!!!, block idx %d when write_seq_id is %d\n some key var "
@@ -236,6 +237,7 @@ __global__ void append_speculate_cache_rope_kernel(
           seq_lens_decoder[ori_bi],
           token_id,
           cu_seqlens_q[ori_bi]);
+      return ;
     }
     const int block_offset = write_seq_id % block_size;
 
@@ -283,8 +285,10 @@ __global__ void append_speculate_cache_rope_kernel(
     }
     if (hi < num_heads) {
       // write q
+      // printf("\nwrite q ");
       Store<T, VecSize>(bias_vec, &q_out[write_q_idx]);
     } else {
+      // printf("\nwrite k/v ");
       //  write k/v
       const int kv_head_idx = (hi - num_heads) % gqa_group_size;
       const int tgt_idx = (block_idx * gqa_group_size * block_size * head_size +
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu
deleted file mode 100644
index 923f9b0d3..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::bfloat16>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu
deleted file mode 100644
index 888c410bb..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::float8_e4m3fn>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu
deleted file mode 100644
index fcef546ea..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, int8_t>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu
deleted file mode 100644
index 656374937..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-
-template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float16>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu
deleted file mode 100644
index fba62df2b..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float8_e4m3fn>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu
deleted file mode 100644
index 7a6e21fa7..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c4_impl.cuh"
-
-template void CascadeAppendAttentionC4Kernel<paddle::float16, int8_t>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu
deleted file mode 100644
index e860a0462..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu
+++ /dev/null
@@ -1,107 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-
-template void
-CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-
-template void
-CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu
deleted file mode 100644
index 3b61ecd16..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4m3fn, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4m3fn, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu
deleted file mode 100644
index e864722b5..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-
-template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu
deleted file mode 100644
index 4d7b11d99..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu
deleted file mode 100644
index d03d618b2..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m3fn, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m3fn, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu
deleted file mode 100644
index 1ab83eb52..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../append_attention_c8_impl.cuh"
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, false>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
-
-
-template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, true>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor& qkv,  // [token_num, num_heads, head_dim]
-    const paddle::Tensor&
-        cache_k,  // [max_block_num, num_heads, block_size, head_dim]
-    const paddle::Tensor&
-        cache_v,  // [max_block_num, num_heads, head_dim, block_size]
-    const paddle::optional<paddle::Tensor>& attn_mask,
-    const paddle::optional<paddle::Tensor>&
-        cache_k_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_scale,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_k_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        cache_v_zp,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        shift_bias,  // [num_kv_heads, head_dim]
-    const paddle::optional<paddle::Tensor>&
-        smooth_weight,  // [num_kv_heads, head_dim]
-    const paddle::Tensor& seq_lens_q,
-    const paddle::Tensor& seq_lens_kv,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_table,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids_per_batch,
-    const int num_blocks,
-    const int block_shape_q,
-    const int max_seq_len,
-    const int max_dec_len,
-    const float quant_max_bound,
-    const float quant_min_bound,
-    const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
-    const int speculate_max_draft_token_num,
-    const bool causal,
-    const bool is_decoder,
-    const bool enable_prefill,
-    cudaStream_t& stream,
-    paddle::Tensor* out);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu
deleted file mode 100644
index 915039908..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../encoder_write_cache_with_rope_kernel.h"
-
-template void
-EncoderWriteCacheWithRopeKernel<paddle::bfloat16, paddle::bfloat16>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor&
-        qkv,  // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
-              // kv_num_heads, head_dim] if GQA)
-    const paddle::Tensor& seq_lens_this_time,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& seq_lens_decoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_tables,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids,
-    const paddle::optional<paddle::Tensor>& rotary_embs,
-    const paddle::optional<paddle::Tensor>& qkv_out_scales,
-    const paddle::optional<paddle::Tensor>& qkv_biases,
-    const paddle::optional<paddle::Tensor>& cache_k_scale,
-    const paddle::optional<paddle::Tensor>& cache_v_scale,
-    const paddle::optional<paddle::Tensor>& cache_k_zp,
-    const paddle::optional<paddle::Tensor>& cache_v_zp,
-    const paddle::optional<paddle::Tensor>& kv_signal_data,
-    const std::string& cache_quant_type_str,
-    const int num_blocks,
-    const int max_seq_len,
-    const bool use_neox_style,
-    const bool rope_3d,
-    cudaStream_t& stream,
-    paddle::Tensor* qkv_out,
-    paddle::Tensor* key_cache_out,
-    paddle::Tensor* value_cache_out,
-    const paddle::optional<paddle::Tensor>& q_norm_weight,
-    const paddle::optional<paddle::Tensor>& k_norm_weight,
-    const float rms_norm_eps);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu
deleted file mode 100644
index 3f3539b8a..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../encoder_write_cache_with_rope_kernel.h"
-
-template void EncoderWriteCacheWithRopeKernel<paddle::bfloat16, int>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor&
-        qkv,  // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
-              // kv_num_heads, head_dim] if GQA)
-    const paddle::Tensor& seq_lens_this_time,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& seq_lens_decoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_tables,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids,
-    const paddle::optional<paddle::Tensor>& rotary_embs,
-    const paddle::optional<paddle::Tensor>& qkv_out_scales,
-    const paddle::optional<paddle::Tensor>& qkv_biases,
-    const paddle::optional<paddle::Tensor>& cache_k_scale,
-    const paddle::optional<paddle::Tensor>& cache_v_scale,
-    const paddle::optional<paddle::Tensor>& cache_k_zp,
-    const paddle::optional<paddle::Tensor>& cache_v_zp,
-    const paddle::optional<paddle::Tensor>& kv_signal_data,
-    const std::string& cache_quant_type_str,
-    const int num_blocks,
-    const int max_seq_len,
-    const bool use_neox_style,
-    const bool rope_3d,
-    cudaStream_t& stream,
-    paddle::Tensor* qkv_out,
-    paddle::Tensor* key_cache_out,
-    paddle::Tensor* value_cache_out,
-    const paddle::optional<paddle::Tensor>& q_norm_weight,
-    const paddle::optional<paddle::Tensor>& k_norm_weight,
-    const float rms_norm_eps);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu
deleted file mode 100644
index a559ec77f..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../encoder_write_cache_with_rope_kernel.h"
-
-template void EncoderWriteCacheWithRopeKernel<paddle::float16, paddle::float16>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor&
-        qkv,  // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
-              // kv_num_heads, head_dim] if GQA)
-    const paddle::Tensor& seq_lens_this_time,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& seq_lens_decoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_tables,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids,
-    const paddle::optional<paddle::Tensor>& rotary_embs,
-    const paddle::optional<paddle::Tensor>& qkv_out_scales,
-    const paddle::optional<paddle::Tensor>& qkv_biases,
-    const paddle::optional<paddle::Tensor>& cache_k_scale,
-    const paddle::optional<paddle::Tensor>& cache_v_scale,
-    const paddle::optional<paddle::Tensor>& cache_k_zp,
-    const paddle::optional<paddle::Tensor>& cache_v_zp,
-    const paddle::optional<paddle::Tensor>& kv_signal_data,
-    const std::string& cache_quant_type_str,
-    const int num_blocks,
-    const int max_seq_len,
-    const bool use_neox_style,
-    const bool rope_3d,
-    cudaStream_t& stream,
-    paddle::Tensor* qkv_out,
-    paddle::Tensor* key_cache_out,
-    paddle::Tensor* value_cache_out,
-    const paddle::optional<paddle::Tensor>& q_norm_weight,
-    const paddle::optional<paddle::Tensor>& k_norm_weight,
-    const float rms_norm_eps);
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu
deleted file mode 100644
index 3318a3647..000000000
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "../encoder_write_cache_with_rope_kernel.h"
-
-template void EncoderWriteCacheWithRopeKernel<paddle::float16, int>(
-    const AppendAttnMetaData& meta_data,
-    const paddle::Tensor&
-        qkv,  // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
-              // kv_num_heads, head_dim] if GQA)
-    const paddle::Tensor& seq_lens_this_time,
-    const paddle::Tensor& seq_lens_encoder,
-    const paddle::Tensor& seq_lens_decoder,
-    const paddle::Tensor& batch_id_per_token,
-    const paddle::Tensor& cu_seqlens_q,
-    const paddle::Tensor& block_tables,
-    const paddle::Tensor& batch_ids,
-    const paddle::Tensor& tile_ids,
-    const paddle::optional<paddle::Tensor>& rotary_embs,
-    const paddle::optional<paddle::Tensor>& qkv_out_scales,
-    const paddle::optional<paddle::Tensor>& qkv_biases,
-    const paddle::optional<paddle::Tensor>& cache_k_scale,
-    const paddle::optional<paddle::Tensor>& cache_v_scale,
-    const paddle::optional<paddle::Tensor>& cache_k_zp,
-    const paddle::optional<paddle::Tensor>& cache_v_zp,
-    const paddle::optional<paddle::Tensor>& kv_signal_data,
-    const std::string& cache_quant_type_str,
-    const int num_blocks,
-    const int max_seq_len,
-    const bool use_neox_style,
-    const bool rope_3d,
-    cudaStream_t& stream,
-    paddle::Tensor* qkv_out,
-    paddle::Tensor* key_cache_out,
-    paddle::Tensor* value_cache_out,
-    const paddle::optional<paddle::Tensor>& q_norm_weight,
-    const paddle::optional<paddle::Tensor>& k_norm_weight,
-    const float rms_norm_eps);
diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h
index 468aff1fc..fe30081f1 100644
--- a/custom_ops/gpu_ops/helper.h
+++ b/custom_ops/gpu_ops/helper.h
@@ -137,6 +137,10 @@ inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) {
   *num_blocks = std::max<int>(
       1, std::min<int64_t>((n + kBlockSize - 1) / kBlockSize,
                            sm_count * tpm / kBlockSize * kNumWaves));
+  // int64_t tmp_num1 = (n + kBlockSize - 1) / kBlockSize;       // dynamic
+  // int64_t tmp_num2 = sm_count * tpm / kBlockSize * kNumWaves; // static
+  // *num_blocks = std::max<int>(1,tmp_num2);
+  // printf("\nGetNumBlocks tmp_num1: %d \ttmp_num2: %d", tmp_num1, tmp_num2);
   return cudaSuccess;
 }
 
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 9c59b8bab..527df844d 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -539,6 +539,7 @@ class GraphOptimizationConfig:
         draft_capture_sizes += [32 * i for i in range(17, 33)]
 
         draft_capture_sizes.append(max_num_seqs)
+        draft_capture_sizes.append(max_num_seqs * 2)  # for spec decode
         self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
 
     def to_json_string(self):
@@ -950,7 +951,7 @@ class FDConfig:
         # Initialize cuda graph capture list
         if self.graph_opt_config.cudagraph_capture_sizes is None:
             self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)
-        self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=self.parallel_config.max_num_seqs)
+        self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=(self.parallel_config.max_num_seqs * 2))
 
         # TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
         if self.graph_opt_config.graph_opt_level == 2:
diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
index 1b1bebebc..869f0593f 100644
--- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
+++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -77,7 +77,7 @@ class CudaGraphPiecewiseBackend:
         ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
         real_shape = ids_remove_padding.shape[0]
         padding_real_shape = self.real_shape_to_captured_size[real_shape]
-        logger.debug(
+        logger.info(
             f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, "
             f"The padded shape is :{padding_real_shape}"
         )
@@ -86,7 +86,7 @@ class CudaGraphPiecewiseBackend:
         assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
         if entry.runnable is None:
             entry.runnable = self.runnable
-            logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
+            logger.info(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
 
         if not entry.use_cudagraph:
             return entry.runnable(**kwargs)
@@ -97,7 +97,7 @@ class CudaGraphPiecewiseBackend:
             for n in range(entry.num_finished_warmup, self.warm_up_size):
                 entry.num_finished_warmup += 1
                 entry.runnable(**kwargs)
-                logger.debug(
+                logger.info(
                     f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, "
                     f"finished ({n + 1}/{entry.num_finished_warmup}) times"
                 )
@@ -122,9 +122,19 @@ class CudaGraphPiecewiseBackend:
             output._clear
 
             paddle.device.synchronize()
-            logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
+            self.save_cudagrpah_dot_files(entry)
+            logger.info(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
 
         # Replay
         entry.cuda_graph.replay()
-        logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
+        logger.info(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
         return entry.output_buffer
+
+    def save_cudagrpah_dot_files(self, entry):
+        """Print CUDAGrpah to dot files"""
+        if entry.cuda_graph:
+            print("save graph")
+            entry.cuda_graph.print_to_dot_files(
+                f"/root/paddlejob/workspace/env_run/output/gongshaotian/FastDeploy/GraphDotFiles/backend{id(self)}_shape{entry.runtime_bs}",
+                1 << 0,
+            )  # backend{id(self)}_shape{entry.runtime_bs}
diff --git a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
index 9f56d313c..211106d41 100644
--- a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
+++ b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
@@ -115,7 +115,7 @@ class GraphOptBackend:
         self.runnable = runnable
         self.fd_config = fd_config
 
-        self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
+        self.max_captre_size = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
         if self.fd_config.graph_opt_config.graph_opt_level > 0:
             # 1. Prepare cuda grpah input buffers (contain output of subgraphs)
 
@@ -138,9 +138,11 @@ class GraphOptBackend:
             )
 
         assert kwargs["forward_meta"].ids_remove_padding is not None
-        batch_size = kwargs["forward_meta"].ids_remove_padding.shape[0]
+        real_shape = kwargs["forward_meta"].ids_remove_padding.shape[0]
+        print(f"[GraphOptBackend] real_shape :{real_shape}")
 
-        if (not kwargs["forward_meta"].step_use_cudagraph) or (batch_size > self.max_captre_batch):
+        if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.max_captre_size):
+            print("[GraphOptBackend] run dynamic graph")
             return self.runnable(**kwargs)
         else:
             return self.cudagraph_piecewise_backend.__call__(**kwargs)
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index af567cba1..b0e8d4bad 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1051,6 +1051,7 @@ class GPUModelRunner(ModelRunnerBase):
             self.padding_cudagraph_inputs()
 
             # 3. Run model
+            print(f"[Dummy] step use cuda graph:{self.forward_meta.step_use_cudagraph}")
             if self.enable_mm:
                 model_output = self.model(
                     self.share_inputs["ids_remove_padding"],
@@ -1315,6 +1316,7 @@ class GPUModelRunner(ModelRunnerBase):
 
         # 2. Padding inputs for cuda graph
         self.padding_cudagraph_inputs()
+        print(f"[Debug] step use cuda graph:{self.forward_meta.step_use_cudagraph}")
 
         # 3. Execute model
         if self.enable_mm:
@@ -1329,6 +1331,7 @@ class GPUModelRunner(ModelRunnerBase):
                 ids_remove_padding=self.share_inputs["ids_remove_padding"],
                 forward_meta=self.forward_meta,
             )
+            paddle.device.synchronize()
             hidden_states = rebuild_padding(
                 model_output,
                 self.share_inputs["cu_seqlens_q"],
@@ -1427,6 +1430,7 @@ class GPUModelRunner(ModelRunnerBase):
                 self.proposer.run(full_hidden_states=model_output)
             else:
                 self.proposer.run(share_inputs=self.share_inputs)
+                print("proposer run")
 
         # 7. Updata 'infer_seed' and step_cuda()
         self.share_inputs["infer_seed"].add_(self.infer_seed_increment)