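fast compile: disable the quantized KV-cache attention dispatch branches (cache_int8, cache_fp8/block_wise_fp8, cache_int4_zp) to reduce build time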

2025-12-24 13:28:13 +08:00 · 2025-09-17 20:12:30 +08:00
parent fdf49de161
commit 834639a7ff
18 changed files with 147 additions and 1324 deletions
--- a/custom_ops/gpu_ops/append_attn/append_attention_kernel.h
+++ b/custom_ops/gpu_ops/append_attn/append_attention_kernel.h
@@ -232,113 +232,113 @@ void CascadeAppendAttentionKernel(
                                                enable_prefill,
                                                stream,
                                                out);
-    } else if (cache_quant_type_str == "cache_int8") {
-        CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                cache_quant_type_str,
-                                                stream,
-                                                out);
-    } else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
-        CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                cache_quant_type_str,
-                                                stream,
-                                                out);
-    } else if (cache_quant_type_str == "cache_int4_zp") {
-        CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                stream,
-                                                out);
+    // } else if (cache_quant_type_str == "cache_int8") {
+    //     CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
+    //                                             qkv,
+    //                                             cache_k,
+    //                                             cache_v,
+    //                                             attn_mask,
+    //                                             cache_k_scale,
+    //                                             cache_v_scale,
+    //                                             cache_k_zp,
+    //                                             cache_v_zp,
+    //                                             shift_bias,
+    //                                             smooth_weight,
+    //                                             seq_lens_q,
+    //                                             seq_lens_kv,
+    //                                             seq_lens_encoder,
+    //                                             batch_id_per_token,
+    //                                             cu_seqlens_q,
+    //                                             block_table,
+    //                                             batch_ids,
+    //                                             tile_ids_per_batch,
+    //                                             num_blocks,
+    //                                             block_shape_q,
+    //                                             max_seq_len,
+    //                                             max_dec_len,
+    //                                             quant_max_bound,
+    //                                             quant_min_bound,
+    //                                             in_scale,
+    //                                             max_partition_size,
+    //                                             encoder_max_partition_size,
+    //                                             speculate_max_draft_token_num,
+    //                                             causal,
+    //                                             is_decoder,
+    //                                             enable_prefill,
+    //                                             cache_quant_type_str,
+    //                                             stream,
+    //                                             out);
+    // } else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
+    //     CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
+    //                                             qkv,
+    //                                             cache_k,
+    //                                             cache_v,
+    //                                             attn_mask,
+    //                                             cache_k_scale,
+    //                                             cache_v_scale,
+    //                                             cache_k_zp,
+    //                                             cache_v_zp,
+    //                                             shift_bias,
+    //                                             smooth_weight,
+    //                                             seq_lens_q,
+    //                                             seq_lens_kv,
+    //                                             seq_lens_encoder,
+    //                                             batch_id_per_token,
+    //                                             cu_seqlens_q,
+    //                                             block_table,
+    //                                             batch_ids,
+    //                                             tile_ids_per_batch,
+    //                                             num_blocks,
+    //                                             block_shape_q,
+    //                                             max_seq_len,
+    //                                             max_dec_len,
+    //                                             quant_max_bound,
+    //                                             quant_min_bound,
+    //                                             in_scale,
+    //                                             max_partition_size,
+    //                                             encoder_max_partition_size,
+    //                                             speculate_max_draft_token_num,
+    //                                             causal,
+    //                                             is_decoder,
+    //                                             enable_prefill,
+    //                                             cache_quant_type_str,
+    //                                             stream,
+    //                                             out);
+    // } else if (cache_quant_type_str == "cache_int4_zp") {
+    //     CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
+    //                                             qkv,
+    //                                             cache_k,
+    //                                             cache_v,
+    //                                             attn_mask,
+    //                                             cache_k_scale,
+    //                                             cache_v_scale,
+    //                                             cache_k_zp,
+    //                                             cache_v_zp,
+    //                                             shift_bias,
+    //                                             smooth_weight,
+    //                                             seq_lens_q,
+    //                                             seq_lens_kv,
+    //                                             seq_lens_encoder,
+    //                                             batch_id_per_token,
+    //                                             cu_seqlens_q,
+    //                                             block_table,
+    //                                             batch_ids,
+    //                                             tile_ids_per_batch,
+    //                                             num_blocks,
+    //                                             block_shape_q,
+    //                                             max_seq_len,
+    //                                             max_dec_len,
+    //                                             quant_max_bound,
+    //                                             quant_min_bound,
+    //                                             in_scale,
+    //                                             max_partition_size,
+    //                                             encoder_max_partition_size,
+    //                                             speculate_max_draft_token_num,
+    //                                             causal,
+    //                                             is_decoder,
+    //                                             enable_prefill,
+    //                                             stream,
+    //                                             out);
    } else {
        PD_THROW(
            "cache_quant_type_str should be one of [none, cache_int8, "