success run ngram

This commit is contained in:
gongshaotian
2025-08-20 16:27:29 +08:00
parent 9ff2dfb162
commit 8351e83993
27 changed files with 94 additions and 1331 deletions

View File

@@ -998,8 +998,10 @@ void MultiQueryAppendAttention(
} else {
constexpr int blockx = HEAD_DIM / vec_size;
constexpr int blocky = (128 + blockx - 1) / blockx;
dim3 grids_merge(min(sm_count * 4, token_num),
num_heads); // 128k is too large
// dim3 grids_merge(min(sm_count * 4, token_num),
// num_heads); // 128k is too large
printf("\nsm_count * 4: %d\t token_num:%d", sm_count * 4, token_num);
dim3 grids_merge(sm_count * 4, num_heads);
dim3 blocks_merge(blockx, blocky);
merge_multi_chunks_v2_kernel<NV_TYPE,
vec_size,
@@ -1233,8 +1235,10 @@ void MultiQueryAppendAttention(
} else {
constexpr int blockx = HEAD_DIM / vec_size;
constexpr int blocky = (128 + blockx - 1) / blockx;
dim3 grids_merge(min(sm_count * 4, token_num),
num_heads);
// dim3 grids_merge(min(sm_count * 4, token_num),
// num_heads);
printf("\nsm_count * 4: %d\t token_num:%d", sm_count * 4, token_num);
dim3 grids_merge(sm_count * 4, num_heads);
dim3 blocks_merge(blockx, blocky);
merge_multi_chunks_v2_kernel<NV_TYPE,
vec_size,

View File

@@ -1210,8 +1210,9 @@ void MultiQueryAppendC4Attention(
} else {
constexpr int blockx = HEAD_DIM / vec_size;
constexpr int blocky = (128 + blockx - 1) / blockx;
dim3 grids_merge(min(sm_count * 4, token_num),
num_heads);
// dim3 grids_merge(min(sm_count * 4, token_num),
// num_heads);
dim3 grids_merge(sm_count * 4, num_heads);
dim3 blocks_merge(blockx, blocky);
merge_multi_chunks_v2_kernel<NV_TYPE,
vec_size,
@@ -1472,8 +1473,9 @@ void MultiQueryAppendC4Attention(
} else {
constexpr int blockx = HEAD_DIM / vec_size;
constexpr int blocky = (128 + blockx - 1) / blockx;
dim3 grids_merge(min(sm_count * 4, token_num),
num_heads);
// dim3 grids_merge(min(sm_count * 4, token_num),
// num_heads);
dim3 grids_merge(sm_count * 4, num_heads);
dim3 blocks_merge(blockx, blocky);
merge_multi_chunks_v2_kernel<NV_TYPE,
vec_size,

View File

@@ -1170,8 +1170,9 @@ void MultiQueryAppendC8Attention(
} else {
constexpr int blockx = HEAD_DIM / vec_size;
constexpr int blocky = (128 + blockx - 1) / blockx;
dim3 grids_merge(min(sm_count * 4, token_num),
num_heads);
// dim3 grids_merge(min(sm_count * 4, token_num),
// num_heads);
dim3 grids_merge(sm_count * 4, num_heads);
dim3 blocks_merge(blockx, blocky);
merge_multi_chunks_v2_kernel<NV_TYPE,
vec_size,
@@ -1444,8 +1445,9 @@ void MultiQueryAppendC8Attention(
} else {
constexpr int blockx = HEAD_DIM / vec_size;
constexpr int blocky = (128 + blockx - 1) / blockx;
dim3 grids_merge(min(sm_count * 4, token_num),
num_heads);
// dim3 grids_merge(min(sm_count * 4, token_num),
// num_heads);
dim3 grids_merge(sm_count * 4, num_heads);
dim3 blocks_merge(blockx, blocky);
merge_multi_chunks_v2_kernel<NV_TYPE,
vec_size,

View File

@@ -231,111 +231,6 @@ void CascadeAppendAttentionKernel(
enable_prefill,
stream,
out);
} else if (cache_quant_type_str == "cache_int8") {
CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
qkv,
cache_k,
cache_v,
attn_mask,
cache_k_scale,
cache_v_scale,
cache_k_zp,
cache_v_zp,
shift_bias,
smooth_weight,
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
num_blocks,
block_shape_q,
max_seq_len,
max_dec_len,
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
is_decoder,
enable_prefill,
stream,
out);
} else if (cache_quant_type_str == "cache_fp8") {
CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
qkv,
cache_k,
cache_v,
attn_mask,
cache_k_scale,
cache_v_scale,
cache_k_zp,
cache_v_zp,
shift_bias,
smooth_weight,
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
num_blocks,
block_shape_q,
max_seq_len,
max_dec_len,
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
is_decoder,
enable_prefill,
stream,
out);
} else if (cache_quant_type_str == "cache_int4_zp") {
CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
qkv,
cache_k,
cache_v,
attn_mask,
cache_k_scale,
cache_v_scale,
cache_k_zp,
cache_v_zp,
shift_bias,
smooth_weight,
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
num_blocks,
block_shape_q,
max_seq_len,
max_dec_len,
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
is_decoder,
enable_prefill,
stream,
out);
} else {
PD_THROW(
"cache_quant_type_str should be one of [none, cache_int8, "

View File

@@ -517,44 +517,44 @@ void DecodeMLAAttentionKernel(
block_table, max_seq_len, max_dec_len, rope_scale, rope_theta, softmax_scale, in_scale, out);})})})})})});
}
template void DecodeMLAAttentionKernel<paddle::bfloat16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
const paddle::Tensor &cache_k,
const paddle::Tensor &cache_v,
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>& shift_bias,
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,
float softmax_scale,
float in_scale,
bool causal,
cudaStream_t &stream,
paddle::Tensor *out);
// template void DecodeMLAAttentionKernel<paddle::bfloat16>(
// const AppendAttnMetaData& meta_data,
// const paddle::Tensor &q, // [token_num, num_heads, head_dim]
// const paddle::Tensor &cache_k,
// const paddle::Tensor &cache_v,
// const paddle::optional<paddle::Tensor>& attn_mask,
// const paddle::optional<paddle::Tensor>& shift_bias,
// const paddle::optional<paddle::Tensor>& smooth_weight,
// const paddle::Tensor &seq_lens_q, // q_seq_len is 1
// const paddle::Tensor &seq_lens_kv,
// const paddle::Tensor &batch_id_per_token,
// const paddle::Tensor &cu_seqlens_q,
// const paddle::Tensor &block_table,
// int max_seq_len,
// int max_dec_len,
// float softmax_scale,
// float in_scale,
// bool causal,
// cudaStream_t &stream,
// paddle::Tensor *out);
template void DecodeMLAAttentionKernel<paddle::float16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
const paddle::Tensor &cache_k,
const paddle::Tensor &cache_v,
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>& shift_bias,
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,
float softmax_scale,
float in_scale,
bool causal,
cudaStream_t &stream,
paddle::Tensor *out);
// template void DecodeMLAAttentionKernel<paddle::float16>(
// const AppendAttnMetaData& meta_data,
// const paddle::Tensor &q, // [token_num, num_heads, head_dim]
// const paddle::Tensor &cache_k,
// const paddle::Tensor &cache_v,
// const paddle::optional<paddle::Tensor>& attn_mask,
// const paddle::optional<paddle::Tensor>& shift_bias,
// const paddle::optional<paddle::Tensor>& smooth_weight,
// const paddle::Tensor &seq_lens_q, // q_seq_len is 1
// const paddle::Tensor &seq_lens_kv,
// const paddle::Tensor &batch_id_per_token,
// const paddle::Tensor &cu_seqlens_q,
// const paddle::Tensor &block_table,
// int max_seq_len,
// int max_dec_len,
// float softmax_scale,
// float in_scale,
// bool causal,
// cudaStream_t &stream,
// paddle::Tensor *out);

View File

@@ -226,6 +226,7 @@ __global__ void append_speculate_cache_rope_kernel(
const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
const int block_idx = block_table_now[write_seq_id / block_size];
// printf("\nblock_idx:%d\t", block_idx);
if (block_idx < 0) {
printf(
"Fatal Error!!!, block idx %d when write_seq_id is %d\n some key var "
@@ -236,6 +237,7 @@ __global__ void append_speculate_cache_rope_kernel(
seq_lens_decoder[ori_bi],
token_id,
cu_seqlens_q[ori_bi]);
return ;
}
const int block_offset = write_seq_id % block_size;
@@ -283,8 +285,10 @@ __global__ void append_speculate_cache_rope_kernel(
}
if (hi < num_heads) {
// write q
// printf("\nwrite q ");
Store<T, VecSize>(bias_vec, &q_out[write_q_idx]);
} else {
// printf("\nwrite k/v ");
// write k/v
const int kv_head_idx = (hi - num_heads) % gqa_group_size;
const int tgt_idx = (block_idx * gqa_group_size * block_size * head_size +

View File

@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::bfloat16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::float8_e4m3fn>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, int8_t>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,59 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float8_e4m3fn>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
template void CascadeAppendAttentionC4Kernel<paddle::float16, int8_t>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,107 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c8_impl.cuh"
template void
CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, false>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
template void
CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, true>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,102 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c8_impl.cuh"
template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4m3fn, false>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4m3fn, true>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,104 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c8_impl.cuh"
template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, false>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, true>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,104 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c8_impl.cuh"
template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, false>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, true>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,103 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c8_impl.cuh"
template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m3fn, false>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m3fn, true>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,103 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c8_impl.cuh"
template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, false>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, true>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -1,49 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../encoder_write_cache_with_rope_kernel.h"
template void
EncoderWriteCacheWithRopeKernel<paddle::bfloat16, paddle::bfloat16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor&
qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
const paddle::optional<paddle::Tensor>& qkv_biases,
const paddle::optional<paddle::Tensor>& cache_k_scale,
const paddle::optional<paddle::Tensor>& cache_v_scale,
const paddle::optional<paddle::Tensor>& cache_k_zp,
const paddle::optional<paddle::Tensor>& cache_v_zp,
const paddle::optional<paddle::Tensor>& kv_signal_data,
const std::string& cache_quant_type_str,
const int num_blocks,
const int max_seq_len,
const bool use_neox_style,
const bool rope_3d,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -1,48 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../encoder_write_cache_with_rope_kernel.h"
template void EncoderWriteCacheWithRopeKernel<paddle::bfloat16, int>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor&
qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
const paddle::optional<paddle::Tensor>& qkv_biases,
const paddle::optional<paddle::Tensor>& cache_k_scale,
const paddle::optional<paddle::Tensor>& cache_v_scale,
const paddle::optional<paddle::Tensor>& cache_k_zp,
const paddle::optional<paddle::Tensor>& cache_v_zp,
const paddle::optional<paddle::Tensor>& kv_signal_data,
const std::string& cache_quant_type_str,
const int num_blocks,
const int max_seq_len,
const bool use_neox_style,
const bool rope_3d,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -1,48 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../encoder_write_cache_with_rope_kernel.h"
template void EncoderWriteCacheWithRopeKernel<paddle::float16, paddle::float16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor&
qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
const paddle::optional<paddle::Tensor>& qkv_biases,
const paddle::optional<paddle::Tensor>& cache_k_scale,
const paddle::optional<paddle::Tensor>& cache_v_scale,
const paddle::optional<paddle::Tensor>& cache_k_zp,
const paddle::optional<paddle::Tensor>& cache_v_zp,
const paddle::optional<paddle::Tensor>& kv_signal_data,
const std::string& cache_quant_type_str,
const int num_blocks,
const int max_seq_len,
const bool use_neox_style,
const bool rope_3d,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -1,48 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../encoder_write_cache_with_rope_kernel.h"
template void EncoderWriteCacheWithRopeKernel<paddle::float16, int>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor&
qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
const paddle::optional<paddle::Tensor>& qkv_biases,
const paddle::optional<paddle::Tensor>& cache_k_scale,
const paddle::optional<paddle::Tensor>& cache_v_scale,
const paddle::optional<paddle::Tensor>& cache_k_zp,
const paddle::optional<paddle::Tensor>& cache_v_zp,
const paddle::optional<paddle::Tensor>& kv_signal_data,
const std::string& cache_quant_type_str,
const int num_blocks,
const int max_seq_len,
const bool use_neox_style,
const bool rope_3d,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -137,6 +137,10 @@ inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) {
*num_blocks = std::max<int>(
1, std::min<int64_t>((n + kBlockSize - 1) / kBlockSize,
sm_count * tpm / kBlockSize * kNumWaves));
// int64_t tmp_num1 = (n + kBlockSize - 1) / kBlockSize; // dynamic
// int64_t tmp_num2 = sm_count * tpm / kBlockSize * kNumWaves; // static
// *num_blocks = std::max<int>(1,tmp_num2);
// printf("\nGetNumBlocks tmp_num1: %d \ttmp_num2: %d", tmp_num1, tmp_num2);
return cudaSuccess;
}

View File

@@ -539,6 +539,7 @@ class GraphOptimizationConfig:
draft_capture_sizes += [32 * i for i in range(17, 33)]
draft_capture_sizes.append(max_num_seqs)
draft_capture_sizes.append(max_num_seqs * 2) # for spec decode
self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
def to_json_string(self):
@@ -950,7 +951,7 @@ class FDConfig:
# Initialize cuda graph capture list
if self.graph_opt_config.cudagraph_capture_sizes is None:
self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)
self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=self.parallel_config.max_num_seqs)
self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=(self.parallel_config.max_num_seqs * 2))
# TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
if self.graph_opt_config.graph_opt_level == 2:

View File

@@ -77,7 +77,7 @@ class CudaGraphPiecewiseBackend:
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
real_shape = ids_remove_padding.shape[0]
padding_real_shape = self.real_shape_to_captured_size[real_shape]
logger.debug(
logger.info(
f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, "
f"The padded shape is :{padding_real_shape}"
)
@@ -86,7 +86,7 @@ class CudaGraphPiecewiseBackend:
assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
if entry.runnable is None:
entry.runnable = self.runnable
logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
logger.info(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
if not entry.use_cudagraph:
return entry.runnable(**kwargs)
@@ -97,7 +97,7 @@ class CudaGraphPiecewiseBackend:
for n in range(entry.num_finished_warmup, self.warm_up_size):
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
logger.debug(
logger.info(
f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, "
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
)
@@ -122,9 +122,19 @@ class CudaGraphPiecewiseBackend:
output._clear
paddle.device.synchronize()
logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
self.save_cudagrpah_dot_files(entry)
logger.info(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
# Replay
entry.cuda_graph.replay()
logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
logger.info(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
return entry.output_buffer
def save_cudagrpah_dot_files(self, entry):
"""Print CUDAGrpah to dot files"""
if entry.cuda_graph:
print("save graph")
entry.cuda_graph.print_to_dot_files(
f"/root/paddlejob/workspace/env_run/output/gongshaotian/FastDeploy/GraphDotFiles/backend{id(self)}_shape{entry.runtime_bs}",
1 << 0,
) # backend{id(self)}_shape{entry.runtime_bs}

View File

@@ -115,7 +115,7 @@ class GraphOptBackend:
self.runnable = runnable
self.fd_config = fd_config
self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
self.max_captre_size = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
if self.fd_config.graph_opt_config.graph_opt_level > 0:
# 1. Prepare cuda grpah input buffers (contain output of subgraphs)
@@ -138,9 +138,11 @@ class GraphOptBackend:
)
assert kwargs["forward_meta"].ids_remove_padding is not None
batch_size = kwargs["forward_meta"].ids_remove_padding.shape[0]
real_shape = kwargs["forward_meta"].ids_remove_padding.shape[0]
print(f"[GraphOptBackend] real_shape :{real_shape}")
if (not kwargs["forward_meta"].step_use_cudagraph) or (batch_size > self.max_captre_batch):
if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.max_captre_size):
print("[GraphOptBackend] run dynamic graph")
return self.runnable(**kwargs)
else:
return self.cudagraph_piecewise_backend.__call__(**kwargs)

View File

@@ -1051,6 +1051,7 @@ class GPUModelRunner(ModelRunnerBase):
self.padding_cudagraph_inputs()
# 3. Run model
print(f"[Dummy] step use cuda graph:{self.forward_meta.step_use_cudagraph}")
if self.enable_mm:
model_output = self.model(
self.share_inputs["ids_remove_padding"],
@@ -1315,6 +1316,7 @@ class GPUModelRunner(ModelRunnerBase):
# 2. Padding inputs for cuda graph
self.padding_cudagraph_inputs()
print(f"[Debug] step use cuda graph:{self.forward_meta.step_use_cudagraph}")
# 3. Execute model
if self.enable_mm:
@@ -1329,6 +1331,7 @@ class GPUModelRunner(ModelRunnerBase):
ids_remove_padding=self.share_inputs["ids_remove_padding"],
forward_meta=self.forward_meta,
)
paddle.device.synchronize()
hidden_states = rebuild_padding(
model_output,
self.share_inputs["cu_seqlens_q"],
@@ -1427,6 +1430,7 @@ class GPUModelRunner(ModelRunnerBase):
self.proposer.run(full_hidden_states=model_output)
else:
self.proposer.run(share_inputs=self.share_inputs)
print("proposer run")
# 7. Updata 'infer_seed' and step_cuda()
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)