diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh index 815d0947d..375052431 100644 --- a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh @@ -998,8 +998,10 @@ void MultiQueryAppendAttention( } else { constexpr int blockx = HEAD_DIM / vec_size; constexpr int blocky = (128 + blockx - 1) / blockx; - dim3 grids_merge(min(sm_count * 4, token_num), - num_heads); // 128k is too large + // dim3 grids_merge(min(sm_count * 4, token_num), + // num_heads); // 128k is too large + printf("\nsm_count * 4: %d\t token_num:%d", sm_count * 4, token_num); + dim3 grids_merge(sm_count * 4, num_heads); dim3 blocks_merge(blockx, blocky); merge_multi_chunks_v2_kernel(meta_data, - qkv, - cache_k, - cache_v, - attn_mask, - cache_k_scale, - cache_v_scale, - cache_k_zp, - cache_v_zp, - shift_bias, - smooth_weight, - seq_lens_q, - seq_lens_kv, - seq_lens_encoder, - batch_id_per_token, - cu_seqlens_q, - block_table, - batch_ids, - tile_ids_per_batch, - num_blocks, - block_shape_q, - max_seq_len, - max_dec_len, - quant_max_bound, - quant_min_bound, - in_scale, - max_partition_size, - encoder_max_partition_size, - speculate_max_draft_token_num, - causal, - is_decoder, - enable_prefill, - stream, - out); - } else if (cache_quant_type_str == "cache_fp8") { - CascadeAppendAttentionC8Kernel(meta_data, - qkv, - cache_k, - cache_v, - attn_mask, - cache_k_scale, - cache_v_scale, - cache_k_zp, - cache_v_zp, - shift_bias, - smooth_weight, - seq_lens_q, - seq_lens_kv, - seq_lens_encoder, - batch_id_per_token, - cu_seqlens_q, - block_table, - batch_ids, - tile_ids_per_batch, - num_blocks, - block_shape_q, - max_seq_len, - max_dec_len, - quant_max_bound, - quant_min_bound, - in_scale, - max_partition_size, - encoder_max_partition_size, - speculate_max_draft_token_num, - causal, - is_decoder, - enable_prefill, - stream, - out); - } else if (cache_quant_type_str == "cache_int4_zp") { - CascadeAppendAttentionC4Kernel(meta_data, - qkv, - cache_k, - cache_v, - attn_mask, - cache_k_scale, - cache_v_scale, - cache_k_zp, - cache_v_zp, - shift_bias, - smooth_weight, - seq_lens_q, - seq_lens_kv, - seq_lens_encoder, - batch_id_per_token, - cu_seqlens_q, - block_table, - batch_ids, - tile_ids_per_batch, - num_blocks, - block_shape_q, - max_seq_len, - max_dec_len, - quant_max_bound, - quant_min_bound, - in_scale, - max_partition_size, - encoder_max_partition_size, - speculate_max_draft_token_num, - causal, - is_decoder, - enable_prefill, - stream, - out); } else { PD_THROW( "cache_quant_type_str should be one of [none, cache_int8, " diff --git a/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu b/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu index 701ba42df..69493ce0d 100644 --- a/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/decode_attention_kernel.cu @@ -517,44 +517,44 @@ void DecodeMLAAttentionKernel( block_table, max_seq_len, max_dec_len, rope_scale, rope_theta, softmax_scale, in_scale, out);})})})})})}); } -template void DecodeMLAAttentionKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor &q, // [token_num, num_heads, head_dim] - const paddle::Tensor &cache_k, - const paddle::Tensor &cache_v, - const paddle::optional& attn_mask, - const paddle::optional& shift_bias, - const paddle::optional& smooth_weight, - const paddle::Tensor &seq_lens_q, // q_seq_len is 1 - const paddle::Tensor &seq_lens_kv, - const paddle::Tensor &batch_id_per_token, - const paddle::Tensor &cu_seqlens_q, - const paddle::Tensor &block_table, - int max_seq_len, - int max_dec_len, - float softmax_scale, - float in_scale, - bool causal, - cudaStream_t &stream, - paddle::Tensor *out); +// template void DecodeMLAAttentionKernel( +// const AppendAttnMetaData& meta_data, +// const paddle::Tensor &q, // [token_num, num_heads, head_dim] +// const paddle::Tensor &cache_k, +// const paddle::Tensor &cache_v, +// const paddle::optional& attn_mask, +// const paddle::optional& shift_bias, +// const paddle::optional& smooth_weight, +// const paddle::Tensor &seq_lens_q, // q_seq_len is 1 +// const paddle::Tensor &seq_lens_kv, +// const paddle::Tensor &batch_id_per_token, +// const paddle::Tensor &cu_seqlens_q, +// const paddle::Tensor &block_table, +// int max_seq_len, +// int max_dec_len, +// float softmax_scale, +// float in_scale, +// bool causal, +// cudaStream_t &stream, +// paddle::Tensor *out); -template void DecodeMLAAttentionKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor &q, // [token_num, num_heads, head_dim] - const paddle::Tensor &cache_k, - const paddle::Tensor &cache_v, - const paddle::optional& attn_mask, - const paddle::optional& shift_bias, - const paddle::optional& smooth_weight, - const paddle::Tensor &seq_lens_q, // q_seq_len is 1 - const paddle::Tensor &seq_lens_kv, - const paddle::Tensor &batch_id_per_token, - const paddle::Tensor &cu_seqlens_q, - const paddle::Tensor &block_table, - int max_seq_len, - int max_dec_len, - float softmax_scale, - float in_scale, - bool causal, - cudaStream_t &stream, - paddle::Tensor *out); +// template void DecodeMLAAttentionKernel( +// const AppendAttnMetaData& meta_data, +// const paddle::Tensor &q, // [token_num, num_heads, head_dim] +// const paddle::Tensor &cache_k, +// const paddle::Tensor &cache_v, +// const paddle::optional& attn_mask, +// const paddle::optional& shift_bias, +// const paddle::optional& smooth_weight, +// const paddle::Tensor &seq_lens_q, // q_seq_len is 1 +// const paddle::Tensor &seq_lens_kv, +// const paddle::Tensor &batch_id_per_token, +// const paddle::Tensor &cu_seqlens_q, +// const paddle::Tensor &block_table, +// int max_seq_len, +// int max_dec_len, +// float softmax_scale, +// float in_scale, +// bool causal, +// cudaStream_t &stream, +// paddle::Tensor *out); diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh index 57612c458..e693deaf1 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh @@ -226,6 +226,7 @@ __global__ void append_speculate_cache_rope_kernel( const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq; const int block_idx = block_table_now[write_seq_id / block_size]; + // printf("\nblock_idx:%d\t", block_idx); if (block_idx < 0) { printf( "Fatal Error!!!, block idx %d when write_seq_id is %d\n some key var " @@ -236,6 +237,7 @@ __global__ void append_speculate_cache_rope_kernel( seq_lens_decoder[ori_bi], token_id, cu_seqlens_q[ori_bi]); + return ; } const int block_offset = write_seq_id % block_size; @@ -283,8 +285,10 @@ __global__ void append_speculate_cache_rope_kernel( } if (hi < num_heads) { // write q + // printf("\nwrite q "); Store(bias_vec, &q_out[write_q_idx]); } else { + // printf("\nwrite k/v "); // write k/v const int kv_head_idx = (hi - num_heads) % gqa_group_size; const int tgt_idx = (block_idx * gqa_group_size * block_size * head_size + diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu deleted file mode 100644 index 923f9b0d3..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu deleted file mode 100644 index 888c410bb..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu deleted file mode 100644 index fcef546ea..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_bfloat16_int8_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu deleted file mode 100644 index 656374937..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu deleted file mode 100644 index fba62df2b..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu deleted file mode 100644 index 7a6e21fa7..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c4_float16_int8_kernel.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c4_impl.cuh" - -template void CascadeAppendAttentionC4Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu deleted file mode 100644 index e860a0462..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - - -template void -CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); - - - -template void -CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu deleted file mode 100644 index 3b61ecd16..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu deleted file mode 100644 index e864722b5..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_bfloat16_int8_kernel.cu +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); - - - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu deleted file mode 100644 index 4d7b11d99..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); - - - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu deleted file mode 100644 index d03d618b2..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); - - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu deleted file mode 100644 index 1ab83eb52..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/append_attention_c8_float16_int8_kerne.cu +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../append_attention_c8_impl.cuh" - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); - - -template void CascadeAppendAttentionC8Kernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& qkv, // [token_num, num_heads, head_dim] - const paddle::Tensor& - cache_k, // [max_block_num, num_heads, block_size, head_dim] - const paddle::Tensor& - cache_v, // [max_block_num, num_heads, head_dim, block_size] - const paddle::optional& attn_mask, - const paddle::optional& - cache_k_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_scale, // [num_kv_heads, head_dim] - const paddle::optional& - cache_k_zp, // [num_kv_heads, head_dim] - const paddle::optional& - cache_v_zp, // [num_kv_heads, head_dim] - const paddle::optional& - shift_bias, // [num_kv_heads, head_dim] - const paddle::optional& - smooth_weight, // [num_kv_heads, head_dim] - const paddle::Tensor& seq_lens_q, - const paddle::Tensor& seq_lens_kv, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_table, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids_per_batch, - const int num_blocks, - const int block_shape_q, - const int max_seq_len, - const int max_dec_len, - const float quant_max_bound, - const float quant_min_bound, - const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, - const int speculate_max_draft_token_num, - const bool causal, - const bool is_decoder, - const bool enable_prefill, - cudaStream_t& stream, - paddle::Tensor* out); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu deleted file mode 100644 index 915039908..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../encoder_write_cache_with_rope_kernel.h" - -template void -EncoderWriteCacheWithRopeKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& - qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * - // kv_num_heads, head_dim] if GQA) - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids, - const paddle::optional& rotary_embs, - const paddle::optional& qkv_out_scales, - const paddle::optional& qkv_biases, - const paddle::optional& cache_k_scale, - const paddle::optional& cache_v_scale, - const paddle::optional& cache_k_zp, - const paddle::optional& cache_v_zp, - const paddle::optional& kv_signal_data, - const std::string& cache_quant_type_str, - const int num_blocks, - const int max_seq_len, - const bool use_neox_style, - const bool rope_3d, - cudaStream_t& stream, - paddle::Tensor* qkv_out, - paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const float rms_norm_eps); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu deleted file mode 100644 index 3f3539b8a..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../encoder_write_cache_with_rope_kernel.h" - -template void EncoderWriteCacheWithRopeKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& - qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * - // kv_num_heads, head_dim] if GQA) - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids, - const paddle::optional& rotary_embs, - const paddle::optional& qkv_out_scales, - const paddle::optional& qkv_biases, - const paddle::optional& cache_k_scale, - const paddle::optional& cache_v_scale, - const paddle::optional& cache_k_zp, - const paddle::optional& cache_v_zp, - const paddle::optional& kv_signal_data, - const std::string& cache_quant_type_str, - const int num_blocks, - const int max_seq_len, - const bool use_neox_style, - const bool rope_3d, - cudaStream_t& stream, - paddle::Tensor* qkv_out, - paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const float rms_norm_eps); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu deleted file mode 100644 index a559ec77f..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../encoder_write_cache_with_rope_kernel.h" - -template void EncoderWriteCacheWithRopeKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& - qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * - // kv_num_heads, head_dim] if GQA) - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids, - const paddle::optional& rotary_embs, - const paddle::optional& qkv_out_scales, - const paddle::optional& qkv_biases, - const paddle::optional& cache_k_scale, - const paddle::optional& cache_v_scale, - const paddle::optional& cache_k_zp, - const paddle::optional& cache_v_zp, - const paddle::optional& kv_signal_data, - const std::string& cache_quant_type_str, - const int num_blocks, - const int max_seq_len, - const bool use_neox_style, - const bool rope_3d, - cudaStream_t& stream, - paddle::Tensor* qkv_out, - paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const float rms_norm_eps); diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu deleted file mode 100644 index 3318a3647..000000000 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "../encoder_write_cache_with_rope_kernel.h" - -template void EncoderWriteCacheWithRopeKernel( - const AppendAttnMetaData& meta_data, - const paddle::Tensor& - qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * - // kv_num_heads, head_dim] if GQA) - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const paddle::Tensor& batch_ids, - const paddle::Tensor& tile_ids, - const paddle::optional& rotary_embs, - const paddle::optional& qkv_out_scales, - const paddle::optional& qkv_biases, - const paddle::optional& cache_k_scale, - const paddle::optional& cache_v_scale, - const paddle::optional& cache_k_zp, - const paddle::optional& cache_v_zp, - const paddle::optional& kv_signal_data, - const std::string& cache_quant_type_str, - const int num_blocks, - const int max_seq_len, - const bool use_neox_style, - const bool rope_3d, - cudaStream_t& stream, - paddle::Tensor* qkv_out, - paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const float rms_norm_eps); diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h index 468aff1fc..fe30081f1 100644 --- a/custom_ops/gpu_ops/helper.h +++ b/custom_ops/gpu_ops/helper.h @@ -137,6 +137,10 @@ inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) { *num_blocks = std::max( 1, std::min((n + kBlockSize - 1) / kBlockSize, sm_count * tpm / kBlockSize * kNumWaves)); + // int64_t tmp_num1 = (n + kBlockSize - 1) / kBlockSize; // dynamic + // int64_t tmp_num2 = sm_count * tpm / kBlockSize * kNumWaves; // static + // *num_blocks = std::max(1,tmp_num2); + // printf("\nGetNumBlocks tmp_num1: %d \ttmp_num2: %d", tmp_num1, tmp_num2); return cudaSuccess; } diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 9c59b8bab..527df844d 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -539,6 +539,7 @@ class GraphOptimizationConfig: draft_capture_sizes += [32 * i for i in range(17, 33)] draft_capture_sizes.append(max_num_seqs) + draft_capture_sizes.append(max_num_seqs * 2) # for spec decode self.cudagraph_capture_sizes = sorted(draft_capture_sizes) def to_json_string(self): @@ -950,7 +951,7 @@ class FDConfig: # Initialize cuda graph capture list if self.graph_opt_config.cudagraph_capture_sizes is None: self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs) - self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=self.parallel_config.max_num_seqs) + self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=(self.parallel_config.max_num_seqs * 2)) # TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn if self.graph_opt_config.graph_opt_level == 2: diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py index 1b1bebebc..869f0593f 100644 --- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py +++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py @@ -77,7 +77,7 @@ class CudaGraphPiecewiseBackend: ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"] real_shape = ids_remove_padding.shape[0] padding_real_shape = self.real_shape_to_captured_size[real_shape] - logger.debug( + logger.info( f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, " f"The padded shape is :{padding_real_shape}" ) @@ -86,7 +86,7 @@ class CudaGraphPiecewiseBackend: assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list." if entry.runnable is None: entry.runnable = self.runnable - logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}") + logger.info(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}") if not entry.use_cudagraph: return entry.runnable(**kwargs) @@ -97,7 +97,7 @@ class CudaGraphPiecewiseBackend: for n in range(entry.num_finished_warmup, self.warm_up_size): entry.num_finished_warmup += 1 entry.runnable(**kwargs) - logger.debug( + logger.info( f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, " f"finished ({n + 1}/{entry.num_finished_warmup}) times" ) @@ -122,9 +122,19 @@ class CudaGraphPiecewiseBackend: output._clear paddle.device.synchronize() - logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}") + self.save_cudagrpah_dot_files(entry) + logger.info(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}") # Replay entry.cuda_graph.replay() - logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}") + logger.info(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}") return entry.output_buffer + + def save_cudagrpah_dot_files(self, entry): + """Print CUDAGrpah to dot files""" + if entry.cuda_graph: + print("save graph") + entry.cuda_graph.print_to_dot_files( + f"/root/paddlejob/workspace/env_run/output/gongshaotian/FastDeploy/GraphDotFiles/backend{id(self)}_shape{entry.runtime_bs}", + 1 << 0, + ) # backend{id(self)}_shape{entry.runtime_bs} diff --git a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py index 9f56d313c..211106d41 100644 --- a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py +++ b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py @@ -115,7 +115,7 @@ class GraphOptBackend: self.runnable = runnable self.fd_config = fd_config - self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0] + self.max_captre_size = fd_config.graph_opt_config.cudagraph_capture_sizes[0] if self.fd_config.graph_opt_config.graph_opt_level > 0: # 1. Prepare cuda grpah input buffers (contain output of subgraphs) @@ -138,9 +138,11 @@ class GraphOptBackend: ) assert kwargs["forward_meta"].ids_remove_padding is not None - batch_size = kwargs["forward_meta"].ids_remove_padding.shape[0] + real_shape = kwargs["forward_meta"].ids_remove_padding.shape[0] + print(f"[GraphOptBackend] real_shape :{real_shape}") - if (not kwargs["forward_meta"].step_use_cudagraph) or (batch_size > self.max_captre_batch): + if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.max_captre_size): + print("[GraphOptBackend] run dynamic graph") return self.runnable(**kwargs) else: return self.cudagraph_piecewise_backend.__call__(**kwargs) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index af567cba1..b0e8d4bad 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1051,6 +1051,7 @@ class GPUModelRunner(ModelRunnerBase): self.padding_cudagraph_inputs() # 3. Run model + print(f"[Dummy] step use cuda graph:{self.forward_meta.step_use_cudagraph}") if self.enable_mm: model_output = self.model( self.share_inputs["ids_remove_padding"], @@ -1315,6 +1316,7 @@ class GPUModelRunner(ModelRunnerBase): # 2. Padding inputs for cuda graph self.padding_cudagraph_inputs() + print(f"[Debug] step use cuda graph:{self.forward_meta.step_use_cudagraph}") # 3. Execute model if self.enable_mm: @@ -1329,6 +1331,7 @@ class GPUModelRunner(ModelRunnerBase): ids_remove_padding=self.share_inputs["ids_remove_padding"], forward_meta=self.forward_meta, ) + paddle.device.synchronize() hidden_states = rebuild_padding( model_output, self.share_inputs["cu_seqlens_q"], @@ -1427,6 +1430,7 @@ class GPUModelRunner(ModelRunnerBase): self.proposer.run(full_hidden_states=model_output) else: self.proposer.run(share_inputs=self.share_inputs) + print("proposer run") # 7. Updata 'infer_seed' and step_cuda() self.share_inputs["infer_seed"].add_(self.infer_seed_increment)