Speed up compilation by disabling quantized-cache attention kernels

This commit is contained in:
gongshaotian
2025-09-17 20:12:30 +08:00
parent fdf49de161
commit 834639a7ff
18 changed files with 147 additions and 1324 deletions

View File

@@ -232,113 +232,113 @@ void CascadeAppendAttentionKernel(
enable_prefill,
stream,
out);
} else if (cache_quant_type_str == "cache_int8") {
CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
qkv,
cache_k,
cache_v,
attn_mask,
cache_k_scale,
cache_v_scale,
cache_k_zp,
cache_v_zp,
shift_bias,
smooth_weight,
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
num_blocks,
block_shape_q,
max_seq_len,
max_dec_len,
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
is_decoder,
enable_prefill,
cache_quant_type_str,
stream,
out);
} else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
qkv,
cache_k,
cache_v,
attn_mask,
cache_k_scale,
cache_v_scale,
cache_k_zp,
cache_v_zp,
shift_bias,
smooth_weight,
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
num_blocks,
block_shape_q,
max_seq_len,
max_dec_len,
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
is_decoder,
enable_prefill,
cache_quant_type_str,
stream,
out);
} else if (cache_quant_type_str == "cache_int4_zp") {
CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
qkv,
cache_k,
cache_v,
attn_mask,
cache_k_scale,
cache_v_scale,
cache_k_zp,
cache_v_zp,
shift_bias,
smooth_weight,
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
num_blocks,
block_shape_q,
max_seq_len,
max_dec_len,
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
is_decoder,
enable_prefill,
stream,
out);
// } else if (cache_quant_type_str == "cache_int8") {
// CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
// qkv,
// cache_k,
// cache_v,
// attn_mask,
// cache_k_scale,
// cache_v_scale,
// cache_k_zp,
// cache_v_zp,
// shift_bias,
// smooth_weight,
// seq_lens_q,
// seq_lens_kv,
// seq_lens_encoder,
// batch_id_per_token,
// cu_seqlens_q,
// block_table,
// batch_ids,
// tile_ids_per_batch,
// num_blocks,
// block_shape_q,
// max_seq_len,
// max_dec_len,
// quant_max_bound,
// quant_min_bound,
// in_scale,
// max_partition_size,
// encoder_max_partition_size,
// speculate_max_draft_token_num,
// causal,
// is_decoder,
// enable_prefill,
// cache_quant_type_str,
// stream,
// out);
// } else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
// CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
// qkv,
// cache_k,
// cache_v,
// attn_mask,
// cache_k_scale,
// cache_v_scale,
// cache_k_zp,
// cache_v_zp,
// shift_bias,
// smooth_weight,
// seq_lens_q,
// seq_lens_kv,
// seq_lens_encoder,
// batch_id_per_token,
// cu_seqlens_q,
// block_table,
// batch_ids,
// tile_ids_per_batch,
// num_blocks,
// block_shape_q,
// max_seq_len,
// max_dec_len,
// quant_max_bound,
// quant_min_bound,
// in_scale,
// max_partition_size,
// encoder_max_partition_size,
// speculate_max_draft_token_num,
// causal,
// is_decoder,
// enable_prefill,
// cache_quant_type_str,
// stream,
// out);
// } else if (cache_quant_type_str == "cache_int4_zp") {
// CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
// qkv,
// cache_k,
// cache_v,
// attn_mask,
// cache_k_scale,
// cache_v_scale,
// cache_k_zp,
// cache_v_zp,
// shift_bias,
// smooth_weight,
// seq_lens_q,
// seq_lens_kv,
// seq_lens_encoder,
// batch_id_per_token,
// cu_seqlens_q,
// block_table,
// batch_ids,
// tile_ids_per_batch,
// num_blocks,
// block_shape_q,
// max_seq_len,
// max_dec_len,
// quant_max_bound,
// quant_min_bound,
// in_scale,
// max_partition_size,
// encoder_max_partition_size,
// speculate_max_draft_token_num,
// causal,
// is_decoder,
// enable_prefill,
// stream,
// out);
} else {
PD_THROW(
"cache_quant_type_str should be one of [none, cache_int8, "