diff --git a/custom_ops/gpu_ops/append_attn/append_attention_kernel.h b/custom_ops/gpu_ops/append_attn/append_attention_kernel.h index 2cc069592..859b0365d 100644 --- a/custom_ops/gpu_ops/append_attn/append_attention_kernel.h +++ b/custom_ops/gpu_ops/append_attn/append_attention_kernel.h @@ -232,113 +232,113 @@ void CascadeAppendAttentionKernel( enable_prefill, stream, out); - } else if (cache_quant_type_str == "cache_int8") { - CascadeAppendAttentionC8Kernel(meta_data, - qkv, - cache_k, - cache_v, - attn_mask, - cache_k_scale, - cache_v_scale, - cache_k_zp, - cache_v_zp, - shift_bias, - smooth_weight, - seq_lens_q, - seq_lens_kv, - seq_lens_encoder, - batch_id_per_token, - cu_seqlens_q, - block_table, - batch_ids, - tile_ids_per_batch, - num_blocks, - block_shape_q, - max_seq_len, - max_dec_len, - quant_max_bound, - quant_min_bound, - in_scale, - max_partition_size, - encoder_max_partition_size, - speculate_max_draft_token_num, - causal, - is_decoder, - enable_prefill, - cache_quant_type_str, - stream, - out); - } else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") { - CascadeAppendAttentionC8Kernel(meta_data, - qkv, - cache_k, - cache_v, - attn_mask, - cache_k_scale, - cache_v_scale, - cache_k_zp, - cache_v_zp, - shift_bias, - smooth_weight, - seq_lens_q, - seq_lens_kv, - seq_lens_encoder, - batch_id_per_token, - cu_seqlens_q, - block_table, - batch_ids, - tile_ids_per_batch, - num_blocks, - block_shape_q, - max_seq_len, - max_dec_len, - quant_max_bound, - quant_min_bound, - in_scale, - max_partition_size, - encoder_max_partition_size, - speculate_max_draft_token_num, - causal, - is_decoder, - enable_prefill, - cache_quant_type_str, - stream, - out); - } else if (cache_quant_type_str == "cache_int4_zp") { - CascadeAppendAttentionC4Kernel(meta_data, - qkv, - cache_k, - cache_v, - attn_mask, - cache_k_scale, - cache_v_scale, - cache_k_zp, - cache_v_zp, - shift_bias, - smooth_weight, - seq_lens_q, - seq_lens_kv, - seq_lens_encoder, - batch_id_per_token, - cu_seqlens_q, - block_table, - batch_ids, - tile_ids_per_batch, - num_blocks, - block_shape_q, - max_seq_len, - max_dec_len, - quant_max_bound, - quant_min_bound, - in_scale, - max_partition_size, - encoder_max_partition_size, - speculate_max_draft_token_num, - causal, - is_decoder, - enable_prefill, - stream, - out); + // } else if (cache_quant_type_str == "cache_int8") { + // CascadeAppendAttentionC8Kernel(meta_data, + // qkv, + // cache_k, + // cache_v, + // attn_mask, + // cache_k_scale, + // cache_v_scale, + // cache_k_zp, + // cache_v_zp, + // shift_bias, + // smooth_weight, + // seq_lens_q, + // seq_lens_kv, + // seq_lens_encoder, + // batch_id_per_token, + // cu_seqlens_q, + // block_table, + // batch_ids, + // tile_ids_per_batch, + // num_blocks, + // block_shape_q, + // max_seq_len, + // max_dec_len, + // quant_max_bound, + // quant_min_bound, + // in_scale, + // max_partition_size, + // encoder_max_partition_size, + // speculate_max_draft_token_num, + // causal, + // is_decoder, + // enable_prefill, + // cache_quant_type_str, + // stream, + // out); + // } else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") { + // CascadeAppendAttentionC8Kernel(meta_data, + // qkv, + // cache_k, + // cache_v, + // attn_mask, + // cache_k_scale, + // cache_v_scale, + // cache_k_zp, + // cache_v_zp, + // shift_bias, + // smooth_weight, + // seq_lens_q, + // seq_lens_kv, + // seq_lens_encoder, + // batch_id_per_token, + // cu_seqlens_q, + // block_table, + // batch_ids, + // tile_ids_per_batch, + // num_blocks, + // block_shape_q, + // max_seq_len, + // max_dec_len, + // quant_max_bound, + // quant_min_bound, + // in_scale, + // max_partition_size, + // encoder_max_partition_size, + // speculate_max_draft_token_num, + // causal, + // is_decoder, + // enable_prefill, + // cache_quant_type_str, + // stream, + // out); + // } else if (cache_quant_type_str == "cache_int4_zp") { + // CascadeAppendAttentionC4Kernel(meta_data, + // qkv, + // cache_k, + // cache_v, + // attn_mask, + // cache_k_scale, + // cache_v_scale, + // cache_k_zp, + // cache_v_zp, + // shift_bias, + // smooth_weight, + // seq_lens_q, + // seq_lens_kv, + // seq_lens_encoder, + // batch_id_per_token, + // cu_seqlens_q, + // block_table, + // batch_ids, + // tile_ids_per_batch, + // num_blocks, + // block_shape_q, + // max_seq_len, + // max_dec_len, + // quant_max_bound, + // quant_min_bound, + // in_scale, + // max_partition_size, + // encoder_max_partition_size, + // speculate_max_draft_token_num, + // causal, + // is_decoder, + // enable_prefill, + // stream, + // out); } else { PD_THROW( "cache_quant_type_str should be one of [none, cache_int8, " diff --git a/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu b/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu index 701ba42df..69493ce0d 100644 --- a/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu @@ -517,44 +517,44 @@ void DecodeMLAAttentionKernel( block_table, max_seq_len, max_dec_len, rope_scale, rope_theta, softmax_scale, in_scale, out);})})})})})}); } -template void DecodeMLAAttentionKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor &q, // [token_num, num_heads, head_dim] - const paddle::Tensor &cache_k, - const paddle::Tensor &cache_v, - const paddle::optional& attn_mask, - const paddle::optional& shift_bias, - const paddle::optional& smooth_weight, - const paddle::Tensor &seq_lens_q, // q_seq_len is 1 - const paddle::Tensor &seq_lens_kv, - const paddle::Tensor &batch_id_per_token, - const paddle::Tensor &cu_seqlens_q, - const paddle::Tensor &block_table, - int max_seq_len, - int max_dec_len, - float softmax_scale, - float in_scale, - bool causal, - cudaStream_t &stream, - paddle::Tensor *out); +// template void DecodeMLAAttentionKernel( +// const AppendAttnMetaData& meta_data, +// const paddle::Tensor &q, // [token_num, num_heads, head_dim] +// const paddle::Tensor &cache_k, +// const paddle::Tensor &cache_v, +// const paddle::optional& attn_mask, +// const paddle::optional& shift_bias, +// const paddle::optional& smooth_weight, +// const paddle::Tensor &seq_lens_q, // q_seq_len is 1 +// const paddle::Tensor &seq_lens_kv, +// const paddle::Tensor &batch_id_per_token, +// const paddle::Tensor &cu_seqlens_q, +// const paddle::Tensor &block_table, +// int max_seq_len, +// int max_dec_len, +// float softmax_scale, +// float in_scale, +// bool causal, +// cudaStream_t &stream, +// paddle::Tensor *out); -template void DecodeMLAAttentionKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor &q, // [token_num, num_heads, head_dim] - const paddle::Tensor &cache_k, - const paddle::Tensor &cache_v, - const paddle::optional& attn_mask, - const paddle::optional& shift_bias, - const paddle::optional& smooth_weight, - const paddle::Tensor &seq_lens_q, // q_seq_len is 1 - const paddle::Tensor &seq_lens_kv, - const paddle::Tensor &batch_id_per_token, - const paddle::Tensor &cu_seqlens_q, - const paddle::Tensor &block_table, - int max_seq_len, - int max_dec_len, - float softmax_scale, - float in_scale, - bool causal, - cudaStream_t &stream, - paddle::Tensor *out); +// template void DecodeMLAAttentionKernel( +// const AppendAttnMetaData& meta_data, +// const paddle::Tensor &q, // [token_num, num_heads, head_dim] +// const paddle::Tensor &cache_k, +// const paddle::Tensor &cache_v, +// const paddle::optional& attn_mask, +// const paddle::optional& shift_bias, +// const paddle::optional& smooth_weight, +// const paddle::Tensor &seq_lens_q, // q_seq_len is 1 +// const paddle::Tensor &seq_lens_kv, +// const paddle::Tensor &batch_id_per_token, +// const paddle::Tensor &cu_seqlens_q, +// const paddle::Tensor &block_table, +// int max_seq_len, +// int max_dec_len, +// float softmax_scale, +// float in_scale, +// bool causal, +// cudaStream_t &stream, +// paddle::Tensor *out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu deleted file mode 100644 index 923f9b0d3..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu deleted file mode 100644 index 888c410bb..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu deleted file mode 100644 index fcef546ea..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu deleted file mode 100644 index 656374937..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu deleted file mode 100644 index fba62df2b..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu deleted file mode 100644 index 7a6e21fa7..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu deleted file mode 100644 index 757cccaf9..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - - -template void -CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); - - - -template void -CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu deleted file mode 100644 index 54b0b0be4..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu deleted file mode 100644 index c6bd95576..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); - - - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu deleted file mode 100644 index 153b81ee0..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); - - - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu deleted file mode 100644 index 7e2539b0a..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); - - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu deleted file mode 100644 index e46fb31c1..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); - - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - const std::string& cache_quant_type_str, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu deleted file mode 100644 index 915039908..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../encoder_write_cache_with_rope_kernel.h" - -template void -EncoderWriteCacheWithRopeKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& - qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * - // kv_num_heads, head_dim] if GQA) - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids, - const paddle::optional& rotary_embs, - const paddle::optional& qkv_out_scales, - const paddle::optional& qkv_biases, - const paddle::optional& cache_k_scale, - const paddle::optional& cache_v_scale, - const paddle::optional& cache_k_zp, - const paddle::optional& cache_v_zp, - const paddle::optional& kv_signal_data, - const std::string& cache_quant_type_str, - const int num_blocks, - const int max_seq_len, - const bool use_neox_style, - const bool rope_3d, - cudaStream_t& stream, - paddle::Tensor* qkv_out, - paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const float rms_norm_eps); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu deleted file mode 100644 index 3f3539b8a..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../encoder_write_cache_with_rope_kernel.h" - -template void EncoderWriteCacheWithRopeKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& - qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * - // kv_num_heads, head_dim] if GQA) - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids, - const paddle::optional& rotary_embs, - const paddle::optional& qkv_out_scales, - const paddle::optional& qkv_biases, - const paddle::optional& cache_k_scale, - const paddle::optional& cache_v_scale, - const paddle::optional& cache_k_zp, - const paddle::optional& cache_v_zp, - const paddle::optional& kv_signal_data, - const std::string& cache_quant_type_str, - const int num_blocks, - const int max_seq_len, - const bool use_neox_style, - const bool rope_3d, - cudaStream_t& stream, - paddle::Tensor* qkv_out, - paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const float rms_norm_eps); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu deleted file mode 100644 index a559ec77f..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../encoder_write_cache_with_rope_kernel.h" - -template void EncoderWriteCacheWithRopeKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& - qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * - // kv_num_heads, head_dim] if GQA) - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids, - const paddle::optional& rotary_embs, - const paddle::optional& qkv_out_scales, - const paddle::optional& qkv_biases, - const paddle::optional& cache_k_scale, - const paddle::optional& cache_v_scale, - const paddle::optional& cache_k_zp, - const paddle::optional& cache_v_zp, - const paddle::optional& kv_signal_data, - const std::string& cache_quant_type_str, - const int num_blocks, - const int max_seq_len, - const bool use_neox_style, - const bool rope_3d, - cudaStream_t& stream, - paddle::Tensor* qkv_out, - paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const float rms_norm_eps); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu deleted file mode 100644 index 3318a3647..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../encoder_write_cache_with_rope_kernel.h" - -template void EncoderWriteCacheWithRopeKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& - qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * - // kv_num_heads, head_dim] if GQA) - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids, - const paddle::optional& rotary_embs, - const paddle::optional& qkv_out_scales, - const paddle::optional& qkv_biases, - const paddle::optional& cache_k_scale, - const paddle::optional& cache_v_scale, - const paddle::optional& cache_k_zp, - const paddle::optional& cache_v_zp, - const paddle::optional& kv_signal_data, - const std::string& cache_quant_type_str, - const int num_blocks, - const int max_seq_len, - const bool use_neox_style, - const bool rope_3d, - cudaStream_t& stream, - paddle::Tensor* qkv_out, - paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const float rms_norm_eps);