mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
success run ngram
This commit is contained in:
@@ -998,8 +998,10 @@ void MultiQueryAppendAttention(
|
||||
} else {
|
||||
constexpr int blockx = HEAD_DIM / vec_size;
|
||||
constexpr int blocky = (128 + blockx - 1) / blockx;
|
||||
dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
num_heads); // 128k is too large
|
||||
// dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
// num_heads); // 128k is too large
|
||||
printf("\nsm_count * 4: %d\t token_num:%d", sm_count * 4, token_num);
|
||||
dim3 grids_merge(sm_count * 4, num_heads);
|
||||
dim3 blocks_merge(blockx, blocky);
|
||||
merge_multi_chunks_v2_kernel<NV_TYPE,
|
||||
vec_size,
|
||||
@@ -1233,8 +1235,10 @@ void MultiQueryAppendAttention(
|
||||
} else {
|
||||
constexpr int blockx = HEAD_DIM / vec_size;
|
||||
constexpr int blocky = (128 + blockx - 1) / blockx;
|
||||
dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
num_heads);
|
||||
// dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
// num_heads);
|
||||
printf("\nsm_count * 4: %d\t token_num:%d", sm_count * 4, token_num);
|
||||
dim3 grids_merge(sm_count * 4, num_heads);
|
||||
dim3 blocks_merge(blockx, blocky);
|
||||
merge_multi_chunks_v2_kernel<NV_TYPE,
|
||||
vec_size,
|
||||
|
||||
@@ -1210,8 +1210,9 @@ void MultiQueryAppendC4Attention(
|
||||
} else {
|
||||
constexpr int blockx = HEAD_DIM / vec_size;
|
||||
constexpr int blocky = (128 + blockx - 1) / blockx;
|
||||
dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
num_heads);
|
||||
// dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
// num_heads);
|
||||
dim3 grids_merge(sm_count * 4, num_heads);
|
||||
dim3 blocks_merge(blockx, blocky);
|
||||
merge_multi_chunks_v2_kernel<NV_TYPE,
|
||||
vec_size,
|
||||
@@ -1472,8 +1473,9 @@ void MultiQueryAppendC4Attention(
|
||||
} else {
|
||||
constexpr int blockx = HEAD_DIM / vec_size;
|
||||
constexpr int blocky = (128 + blockx - 1) / blockx;
|
||||
dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
num_heads);
|
||||
// dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
// num_heads);
|
||||
dim3 grids_merge(sm_count * 4, num_heads);
|
||||
dim3 blocks_merge(blockx, blocky);
|
||||
merge_multi_chunks_v2_kernel<NV_TYPE,
|
||||
vec_size,
|
||||
|
||||
@@ -1170,8 +1170,9 @@ void MultiQueryAppendC8Attention(
|
||||
} else {
|
||||
constexpr int blockx = HEAD_DIM / vec_size;
|
||||
constexpr int blocky = (128 + blockx - 1) / blockx;
|
||||
dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
num_heads);
|
||||
// dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
// num_heads);
|
||||
dim3 grids_merge(sm_count * 4, num_heads);
|
||||
dim3 blocks_merge(blockx, blocky);
|
||||
merge_multi_chunks_v2_kernel<NV_TYPE,
|
||||
vec_size,
|
||||
@@ -1444,8 +1445,9 @@ void MultiQueryAppendC8Attention(
|
||||
} else {
|
||||
constexpr int blockx = HEAD_DIM / vec_size;
|
||||
constexpr int blocky = (128 + blockx - 1) / blockx;
|
||||
dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
num_heads);
|
||||
// dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
// num_heads);
|
||||
dim3 grids_merge(sm_count * 4, num_heads);
|
||||
dim3 blocks_merge(blockx, blocky);
|
||||
merge_multi_chunks_v2_kernel<NV_TYPE,
|
||||
vec_size,
|
||||
|
||||
@@ -231,111 +231,6 @@ void CascadeAppendAttentionKernel(
|
||||
enable_prefill,
|
||||
stream,
|
||||
out);
|
||||
} else if (cache_quant_type_str == "cache_int8") {
|
||||
CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
|
||||
qkv,
|
||||
cache_k,
|
||||
cache_v,
|
||||
attn_mask,
|
||||
cache_k_scale,
|
||||
cache_v_scale,
|
||||
cache_k_zp,
|
||||
cache_v_zp,
|
||||
shift_bias,
|
||||
smooth_weight,
|
||||
seq_lens_q,
|
||||
seq_lens_kv,
|
||||
seq_lens_encoder,
|
||||
batch_id_per_token,
|
||||
cu_seqlens_q,
|
||||
block_table,
|
||||
batch_ids,
|
||||
tile_ids_per_batch,
|
||||
num_blocks,
|
||||
block_shape_q,
|
||||
max_seq_len,
|
||||
max_dec_len,
|
||||
quant_max_bound,
|
||||
quant_min_bound,
|
||||
in_scale,
|
||||
max_partition_size,
|
||||
encoder_max_partition_size,
|
||||
speculate_max_draft_token_num,
|
||||
causal,
|
||||
is_decoder,
|
||||
enable_prefill,
|
||||
stream,
|
||||
out);
|
||||
} else if (cache_quant_type_str == "cache_fp8") {
|
||||
CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
|
||||
qkv,
|
||||
cache_k,
|
||||
cache_v,
|
||||
attn_mask,
|
||||
cache_k_scale,
|
||||
cache_v_scale,
|
||||
cache_k_zp,
|
||||
cache_v_zp,
|
||||
shift_bias,
|
||||
smooth_weight,
|
||||
seq_lens_q,
|
||||
seq_lens_kv,
|
||||
seq_lens_encoder,
|
||||
batch_id_per_token,
|
||||
cu_seqlens_q,
|
||||
block_table,
|
||||
batch_ids,
|
||||
tile_ids_per_batch,
|
||||
num_blocks,
|
||||
block_shape_q,
|
||||
max_seq_len,
|
||||
max_dec_len,
|
||||
quant_max_bound,
|
||||
quant_min_bound,
|
||||
in_scale,
|
||||
max_partition_size,
|
||||
encoder_max_partition_size,
|
||||
speculate_max_draft_token_num,
|
||||
causal,
|
||||
is_decoder,
|
||||
enable_prefill,
|
||||
stream,
|
||||
out);
|
||||
} else if (cache_quant_type_str == "cache_int4_zp") {
|
||||
CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
|
||||
qkv,
|
||||
cache_k,
|
||||
cache_v,
|
||||
attn_mask,
|
||||
cache_k_scale,
|
||||
cache_v_scale,
|
||||
cache_k_zp,
|
||||
cache_v_zp,
|
||||
shift_bias,
|
||||
smooth_weight,
|
||||
seq_lens_q,
|
||||
seq_lens_kv,
|
||||
seq_lens_encoder,
|
||||
batch_id_per_token,
|
||||
cu_seqlens_q,
|
||||
block_table,
|
||||
batch_ids,
|
||||
tile_ids_per_batch,
|
||||
num_blocks,
|
||||
block_shape_q,
|
||||
max_seq_len,
|
||||
max_dec_len,
|
||||
quant_max_bound,
|
||||
quant_min_bound,
|
||||
in_scale,
|
||||
max_partition_size,
|
||||
encoder_max_partition_size,
|
||||
speculate_max_draft_token_num,
|
||||
causal,
|
||||
is_decoder,
|
||||
enable_prefill,
|
||||
stream,
|
||||
out);
|
||||
} else {
|
||||
PD_THROW(
|
||||
"cache_quant_type_str should be one of [none, cache_int8, "
|
||||
|
||||
@@ -517,44 +517,44 @@ void DecodeMLAAttentionKernel(
|
||||
block_table, max_seq_len, max_dec_len, rope_scale, rope_theta, softmax_scale, in_scale, out);})})})})})});
|
||||
}
|
||||
|
||||
template void DecodeMLAAttentionKernel<paddle::bfloat16>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor &cache_k,
|
||||
const paddle::Tensor &cache_v,
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>& shift_bias,
|
||||
const paddle::optional<paddle::Tensor>& smooth_weight,
|
||||
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
|
||||
const paddle::Tensor &seq_lens_kv,
|
||||
const paddle::Tensor &batch_id_per_token,
|
||||
const paddle::Tensor &cu_seqlens_q,
|
||||
const paddle::Tensor &block_table,
|
||||
int max_seq_len,
|
||||
int max_dec_len,
|
||||
float softmax_scale,
|
||||
float in_scale,
|
||||
bool causal,
|
||||
cudaStream_t &stream,
|
||||
paddle::Tensor *out);
|
||||
// template void DecodeMLAAttentionKernel<paddle::bfloat16>(
|
||||
// const AppendAttnMetaData& meta_data,
|
||||
// const paddle::Tensor &q, // [token_num, num_heads, head_dim]
|
||||
// const paddle::Tensor &cache_k,
|
||||
// const paddle::Tensor &cache_v,
|
||||
// const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
// const paddle::optional<paddle::Tensor>& shift_bias,
|
||||
// const paddle::optional<paddle::Tensor>& smooth_weight,
|
||||
// const paddle::Tensor &seq_lens_q, // q_seq_len is 1
|
||||
// const paddle::Tensor &seq_lens_kv,
|
||||
// const paddle::Tensor &batch_id_per_token,
|
||||
// const paddle::Tensor &cu_seqlens_q,
|
||||
// const paddle::Tensor &block_table,
|
||||
// int max_seq_len,
|
||||
// int max_dec_len,
|
||||
// float softmax_scale,
|
||||
// float in_scale,
|
||||
// bool causal,
|
||||
// cudaStream_t &stream,
|
||||
// paddle::Tensor *out);
|
||||
|
||||
template void DecodeMLAAttentionKernel<paddle::float16>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor &cache_k,
|
||||
const paddle::Tensor &cache_v,
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>& shift_bias,
|
||||
const paddle::optional<paddle::Tensor>& smooth_weight,
|
||||
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
|
||||
const paddle::Tensor &seq_lens_kv,
|
||||
const paddle::Tensor &batch_id_per_token,
|
||||
const paddle::Tensor &cu_seqlens_q,
|
||||
const paddle::Tensor &block_table,
|
||||
int max_seq_len,
|
||||
int max_dec_len,
|
||||
float softmax_scale,
|
||||
float in_scale,
|
||||
bool causal,
|
||||
cudaStream_t &stream,
|
||||
paddle::Tensor *out);
|
||||
// template void DecodeMLAAttentionKernel<paddle::float16>(
|
||||
// const AppendAttnMetaData& meta_data,
|
||||
// const paddle::Tensor &q, // [token_num, num_heads, head_dim]
|
||||
// const paddle::Tensor &cache_k,
|
||||
// const paddle::Tensor &cache_v,
|
||||
// const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
// const paddle::optional<paddle::Tensor>& shift_bias,
|
||||
// const paddle::optional<paddle::Tensor>& smooth_weight,
|
||||
// const paddle::Tensor &seq_lens_q, // q_seq_len is 1
|
||||
// const paddle::Tensor &seq_lens_kv,
|
||||
// const paddle::Tensor &batch_id_per_token,
|
||||
// const paddle::Tensor &cu_seqlens_q,
|
||||
// const paddle::Tensor &block_table,
|
||||
// int max_seq_len,
|
||||
// int max_dec_len,
|
||||
// float softmax_scale,
|
||||
// float in_scale,
|
||||
// bool causal,
|
||||
// cudaStream_t &stream,
|
||||
// paddle::Tensor *out);
|
||||
|
||||
@@ -226,6 +226,7 @@ __global__ void append_speculate_cache_rope_kernel(
|
||||
|
||||
const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
|
||||
const int block_idx = block_table_now[write_seq_id / block_size];
|
||||
// printf("\nblock_idx:%d\t", block_idx);
|
||||
if (block_idx < 0) {
|
||||
printf(
|
||||
"Fatal Error!!!, block idx %d when write_seq_id is %d\n some key var "
|
||||
@@ -236,6 +237,7 @@ __global__ void append_speculate_cache_rope_kernel(
|
||||
seq_lens_decoder[ori_bi],
|
||||
token_id,
|
||||
cu_seqlens_q[ori_bi]);
|
||||
return ;
|
||||
}
|
||||
const int block_offset = write_seq_id % block_size;
|
||||
|
||||
@@ -283,8 +285,10 @@ __global__ void append_speculate_cache_rope_kernel(
|
||||
}
|
||||
if (hi < num_heads) {
|
||||
// write q
|
||||
// printf("\nwrite q ");
|
||||
Store<T, VecSize>(bias_vec, &q_out[write_q_idx]);
|
||||
} else {
|
||||
// printf("\nwrite k/v ");
|
||||
// write k/v
|
||||
const int kv_head_idx = (hi - num_heads) % gqa_group_size;
|
||||
const int tgt_idx = (block_idx * gqa_group_size * block_size * head_size +
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c4_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::bfloat16>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,58 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c4_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::float8_e4m3fn>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,58 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c4_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, int8_t>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,59 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c4_impl.cuh"
|
||||
|
||||
|
||||
template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float16>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,58 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c4_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float8_e4m3fn>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,58 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c4_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC4Kernel<paddle::float16, int8_t>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,107 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c8_impl.cuh"
|
||||
|
||||
|
||||
template void
|
||||
CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, false>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
|
||||
|
||||
|
||||
template void
|
||||
CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, true>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,102 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c8_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4m3fn, false>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4m3fn, true>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,104 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c8_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, false>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
|
||||
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, true>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,104 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c8_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, false>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
|
||||
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, true>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,103 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c8_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m3fn, false>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m3fn, true>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,103 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../append_attention_c8_impl.cuh"
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, false>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
|
||||
|
||||
template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, true>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_k, // [max_block_num, num_heads, block_size, head_dim]
|
||||
const paddle::Tensor&
|
||||
cache_v, // [max_block_num, num_heads, head_dim, block_size]
|
||||
const paddle::optional<paddle::Tensor>& attn_mask,
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_scale, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_k_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
cache_v_zp, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
shift_bias, // [num_kv_heads, head_dim]
|
||||
const paddle::optional<paddle::Tensor>&
|
||||
smooth_weight, // [num_kv_heads, head_dim]
|
||||
const paddle::Tensor& seq_lens_q,
|
||||
const paddle::Tensor& seq_lens_kv,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids_per_batch,
|
||||
const int num_blocks,
|
||||
const int block_shape_q,
|
||||
const int max_seq_len,
|
||||
const int max_dec_len,
|
||||
const float quant_max_bound,
|
||||
const float quant_min_bound,
|
||||
const float in_scale,
|
||||
const int max_partition_size,
|
||||
const int encoder_max_partition_size,
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool is_decoder,
|
||||
const bool enable_prefill,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* out);
|
||||
@@ -1,49 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../encoder_write_cache_with_rope_kernel.h"
|
||||
|
||||
template void
|
||||
EncoderWriteCacheWithRopeKernel<paddle::bfloat16, paddle::bfloat16>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor&
|
||||
qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
|
||||
// kv_num_heads, head_dim] if GQA)
|
||||
const paddle::Tensor& seq_lens_this_time,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& seq_lens_decoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_tables,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids,
|
||||
const paddle::optional<paddle::Tensor>& rotary_embs,
|
||||
const paddle::optional<paddle::Tensor>& qkv_out_scales,
|
||||
const paddle::optional<paddle::Tensor>& qkv_biases,
|
||||
const paddle::optional<paddle::Tensor>& cache_k_scale,
|
||||
const paddle::optional<paddle::Tensor>& cache_v_scale,
|
||||
const paddle::optional<paddle::Tensor>& cache_k_zp,
|
||||
const paddle::optional<paddle::Tensor>& cache_v_zp,
|
||||
const paddle::optional<paddle::Tensor>& kv_signal_data,
|
||||
const std::string& cache_quant_type_str,
|
||||
const int num_blocks,
|
||||
const int max_seq_len,
|
||||
const bool use_neox_style,
|
||||
const bool rope_3d,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* qkv_out,
|
||||
paddle::Tensor* key_cache_out,
|
||||
paddle::Tensor* value_cache_out,
|
||||
const paddle::optional<paddle::Tensor>& q_norm_weight,
|
||||
const paddle::optional<paddle::Tensor>& k_norm_weight,
|
||||
const float rms_norm_eps);
|
||||
@@ -1,48 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../encoder_write_cache_with_rope_kernel.h"
|
||||
|
||||
template void EncoderWriteCacheWithRopeKernel<paddle::bfloat16, int>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor&
|
||||
qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
|
||||
// kv_num_heads, head_dim] if GQA)
|
||||
const paddle::Tensor& seq_lens_this_time,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& seq_lens_decoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_tables,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids,
|
||||
const paddle::optional<paddle::Tensor>& rotary_embs,
|
||||
const paddle::optional<paddle::Tensor>& qkv_out_scales,
|
||||
const paddle::optional<paddle::Tensor>& qkv_biases,
|
||||
const paddle::optional<paddle::Tensor>& cache_k_scale,
|
||||
const paddle::optional<paddle::Tensor>& cache_v_scale,
|
||||
const paddle::optional<paddle::Tensor>& cache_k_zp,
|
||||
const paddle::optional<paddle::Tensor>& cache_v_zp,
|
||||
const paddle::optional<paddle::Tensor>& kv_signal_data,
|
||||
const std::string& cache_quant_type_str,
|
||||
const int num_blocks,
|
||||
const int max_seq_len,
|
||||
const bool use_neox_style,
|
||||
const bool rope_3d,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* qkv_out,
|
||||
paddle::Tensor* key_cache_out,
|
||||
paddle::Tensor* value_cache_out,
|
||||
const paddle::optional<paddle::Tensor>& q_norm_weight,
|
||||
const paddle::optional<paddle::Tensor>& k_norm_weight,
|
||||
const float rms_norm_eps);
|
||||
@@ -1,48 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../encoder_write_cache_with_rope_kernel.h"
|
||||
|
||||
template void EncoderWriteCacheWithRopeKernel<paddle::float16, paddle::float16>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor&
|
||||
qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
|
||||
// kv_num_heads, head_dim] if GQA)
|
||||
const paddle::Tensor& seq_lens_this_time,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& seq_lens_decoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_tables,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids,
|
||||
const paddle::optional<paddle::Tensor>& rotary_embs,
|
||||
const paddle::optional<paddle::Tensor>& qkv_out_scales,
|
||||
const paddle::optional<paddle::Tensor>& qkv_biases,
|
||||
const paddle::optional<paddle::Tensor>& cache_k_scale,
|
||||
const paddle::optional<paddle::Tensor>& cache_v_scale,
|
||||
const paddle::optional<paddle::Tensor>& cache_k_zp,
|
||||
const paddle::optional<paddle::Tensor>& cache_v_zp,
|
||||
const paddle::optional<paddle::Tensor>& kv_signal_data,
|
||||
const std::string& cache_quant_type_str,
|
||||
const int num_blocks,
|
||||
const int max_seq_len,
|
||||
const bool use_neox_style,
|
||||
const bool rope_3d,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* qkv_out,
|
||||
paddle::Tensor* key_cache_out,
|
||||
paddle::Tensor* value_cache_out,
|
||||
const paddle::optional<paddle::Tensor>& q_norm_weight,
|
||||
const paddle::optional<paddle::Tensor>& k_norm_weight,
|
||||
const float rms_norm_eps);
|
||||
@@ -1,48 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "../encoder_write_cache_with_rope_kernel.h"
|
||||
|
||||
template void EncoderWriteCacheWithRopeKernel<paddle::float16, int>(
|
||||
const AppendAttnMetaData& meta_data,
|
||||
const paddle::Tensor&
|
||||
qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
|
||||
// kv_num_heads, head_dim] if GQA)
|
||||
const paddle::Tensor& seq_lens_this_time,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& seq_lens_decoder,
|
||||
const paddle::Tensor& batch_id_per_token,
|
||||
const paddle::Tensor& cu_seqlens_q,
|
||||
const paddle::Tensor& block_tables,
|
||||
const paddle::Tensor& batch_ids,
|
||||
const paddle::Tensor& tile_ids,
|
||||
const paddle::optional<paddle::Tensor>& rotary_embs,
|
||||
const paddle::optional<paddle::Tensor>& qkv_out_scales,
|
||||
const paddle::optional<paddle::Tensor>& qkv_biases,
|
||||
const paddle::optional<paddle::Tensor>& cache_k_scale,
|
||||
const paddle::optional<paddle::Tensor>& cache_v_scale,
|
||||
const paddle::optional<paddle::Tensor>& cache_k_zp,
|
||||
const paddle::optional<paddle::Tensor>& cache_v_zp,
|
||||
const paddle::optional<paddle::Tensor>& kv_signal_data,
|
||||
const std::string& cache_quant_type_str,
|
||||
const int num_blocks,
|
||||
const int max_seq_len,
|
||||
const bool use_neox_style,
|
||||
const bool rope_3d,
|
||||
cudaStream_t& stream,
|
||||
paddle::Tensor* qkv_out,
|
||||
paddle::Tensor* key_cache_out,
|
||||
paddle::Tensor* value_cache_out,
|
||||
const paddle::optional<paddle::Tensor>& q_norm_weight,
|
||||
const paddle::optional<paddle::Tensor>& k_norm_weight,
|
||||
const float rms_norm_eps);
|
||||
@@ -137,6 +137,10 @@ inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) {
|
||||
*num_blocks = std::max<int>(
|
||||
1, std::min<int64_t>((n + kBlockSize - 1) / kBlockSize,
|
||||
sm_count * tpm / kBlockSize * kNumWaves));
|
||||
// int64_t tmp_num1 = (n + kBlockSize - 1) / kBlockSize; // dynamic
|
||||
// int64_t tmp_num2 = sm_count * tpm / kBlockSize * kNumWaves; // static
|
||||
// *num_blocks = std::max<int>(1,tmp_num2);
|
||||
// printf("\nGetNumBlocks tmp_num1: %d \ttmp_num2: %d", tmp_num1, tmp_num2);
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -539,6 +539,7 @@ class GraphOptimizationConfig:
|
||||
draft_capture_sizes += [32 * i for i in range(17, 33)]
|
||||
|
||||
draft_capture_sizes.append(max_num_seqs)
|
||||
draft_capture_sizes.append(max_num_seqs * 2) # for spec decode
|
||||
self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
|
||||
|
||||
def to_json_string(self):
|
||||
@@ -950,7 +951,7 @@ class FDConfig:
|
||||
# Initialize cuda graph capture list
|
||||
if self.graph_opt_config.cudagraph_capture_sizes is None:
|
||||
self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)
|
||||
self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=self.parallel_config.max_num_seqs)
|
||||
self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=(self.parallel_config.max_num_seqs * 2))
|
||||
|
||||
# TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
|
||||
if self.graph_opt_config.graph_opt_level == 2:
|
||||
|
||||
@@ -77,7 +77,7 @@ class CudaGraphPiecewiseBackend:
|
||||
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
|
||||
real_shape = ids_remove_padding.shape[0]
|
||||
padding_real_shape = self.real_shape_to_captured_size[real_shape]
|
||||
logger.debug(
|
||||
logger.info(
|
||||
f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, "
|
||||
f"The padded shape is :{padding_real_shape}"
|
||||
)
|
||||
@@ -86,7 +86,7 @@ class CudaGraphPiecewiseBackend:
|
||||
assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
|
||||
if entry.runnable is None:
|
||||
entry.runnable = self.runnable
|
||||
logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
|
||||
logger.info(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
|
||||
|
||||
if not entry.use_cudagraph:
|
||||
return entry.runnable(**kwargs)
|
||||
@@ -97,7 +97,7 @@ class CudaGraphPiecewiseBackend:
|
||||
for n in range(entry.num_finished_warmup, self.warm_up_size):
|
||||
entry.num_finished_warmup += 1
|
||||
entry.runnable(**kwargs)
|
||||
logger.debug(
|
||||
logger.info(
|
||||
f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, "
|
||||
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
|
||||
)
|
||||
@@ -122,9 +122,19 @@ class CudaGraphPiecewiseBackend:
|
||||
output._clear
|
||||
|
||||
paddle.device.synchronize()
|
||||
logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
|
||||
self.save_cudagrpah_dot_files(entry)
|
||||
logger.info(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
|
||||
|
||||
# Replay
|
||||
entry.cuda_graph.replay()
|
||||
logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
|
||||
logger.info(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
|
||||
return entry.output_buffer
|
||||
|
||||
def save_cudagrpah_dot_files(self, entry):
|
||||
"""Print CUDAGrpah to dot files"""
|
||||
if entry.cuda_graph:
|
||||
print("save graph")
|
||||
entry.cuda_graph.print_to_dot_files(
|
||||
f"/root/paddlejob/workspace/env_run/output/gongshaotian/FastDeploy/GraphDotFiles/backend{id(self)}_shape{entry.runtime_bs}",
|
||||
1 << 0,
|
||||
) # backend{id(self)}_shape{entry.runtime_bs}
|
||||
|
||||
@@ -115,7 +115,7 @@ class GraphOptBackend:
|
||||
self.runnable = runnable
|
||||
self.fd_config = fd_config
|
||||
|
||||
self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
|
||||
self.max_captre_size = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
|
||||
if self.fd_config.graph_opt_config.graph_opt_level > 0:
|
||||
# 1. Prepare cuda grpah input buffers (contain output of subgraphs)
|
||||
|
||||
@@ -138,9 +138,11 @@ class GraphOptBackend:
|
||||
)
|
||||
|
||||
assert kwargs["forward_meta"].ids_remove_padding is not None
|
||||
batch_size = kwargs["forward_meta"].ids_remove_padding.shape[0]
|
||||
real_shape = kwargs["forward_meta"].ids_remove_padding.shape[0]
|
||||
print(f"[GraphOptBackend] real_shape :{real_shape}")
|
||||
|
||||
if (not kwargs["forward_meta"].step_use_cudagraph) or (batch_size > self.max_captre_batch):
|
||||
if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.max_captre_size):
|
||||
print("[GraphOptBackend] run dynamic graph")
|
||||
return self.runnable(**kwargs)
|
||||
else:
|
||||
return self.cudagraph_piecewise_backend.__call__(**kwargs)
|
||||
|
||||
@@ -1051,6 +1051,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.padding_cudagraph_inputs()
|
||||
|
||||
# 3. Run model
|
||||
print(f"[Dummy] step use cuda graph:{self.forward_meta.step_use_cudagraph}")
|
||||
if self.enable_mm:
|
||||
model_output = self.model(
|
||||
self.share_inputs["ids_remove_padding"],
|
||||
@@ -1315,6 +1316,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
|
||||
# 2. Padding inputs for cuda graph
|
||||
self.padding_cudagraph_inputs()
|
||||
print(f"[Debug] step use cuda graph:{self.forward_meta.step_use_cudagraph}")
|
||||
|
||||
# 3. Execute model
|
||||
if self.enable_mm:
|
||||
@@ -1329,6 +1331,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
ids_remove_padding=self.share_inputs["ids_remove_padding"],
|
||||
forward_meta=self.forward_meta,
|
||||
)
|
||||
paddle.device.synchronize()
|
||||
hidden_states = rebuild_padding(
|
||||
model_output,
|
||||
self.share_inputs["cu_seqlens_q"],
|
||||
@@ -1427,6 +1430,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.proposer.run(full_hidden_states=model_output)
|
||||
else:
|
||||
self.proposer.run(share_inputs=self.share_inputs)
|
||||
print("proposer run")
|
||||
|
||||
# 7. Updata 'infer_seed' and step_cuda()
|
||||
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
|
||||
|
||||
Reference in New Issue
Block a user