[Executor] Experimental feature: support prefill in CUDA Graph (#3459)

* Support prefill in Cudagraph
* Refactor GetBlockShapeAndSplitKVBlock Kernel V2
* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.1
* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.2
* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.3
* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.4
* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.5
* Solve problem about encoder_num_blocks_x_cpu
* Add early-exit mechanism for attention kernel
* fix test case about append-attention
* Update testcode, Add annotations to related tensors
* move get_input_length_list
* solve test_code
* Add annotations about early-exit for attention kernel
* Add annotations about early-exit for attention kernel2
* solve comment
* solve mtp

---------

Co-authored-by: RAM <gstian5555@outlook.com>
2025-11-03 11:02:01 +08:00 · 2025-09-08 13:12:24 +08:00
parent 472402bf4e
commit 3d0aaa5923
21 changed files with 528 additions and 260 deletions
--- a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh
@@ -52,6 +52,7 @@ __global__ void multi_query_append_attention_kernel(
    const float quant_min_bound,
    const float in_scale,
    const uint32_t chunk_size,
+    const int num_blocks_x_cpu,
    T *__restrict__ tmp_workspace,  // split kv [token_num, num_chunks,
                                    // num_heads, head_dim]
    float *__restrict__ tmp_m,      // [token_num, num_chunks, num_heads]
@@ -74,6 +75,11 @@ __global__ void multi_query_append_attention_kernel(

  block_table_now = block_table + batch_id * max_block_num_per_seq;

+  // When CUDA Graph captures prefill, more blocks may be launched along gridDim.x than needed; the extra blocks exit here.
+  if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
+    return;
+  }
+
  const uint32_t q_len = seq_lens[batch_id];
  if (q_len <= 0) {
    return;
@@ -422,6 +428,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
    const float quant_min_bound,
    const float in_scale,
    const uint32_t chunk_size,
+    const int num_blocks_x_cpu,
    T *__restrict__ tmp_workspace,  // split kv [token_num, num_chunks,
                                    // num_heads, head_dim]
    float *__restrict__ tmp_m,      // [token_num, num_chunks, num_heads]
@@ -445,6 +452,11 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
  const uint32_t num_rows_per_block = num_frags_x * 16;
  const int *block_table_now = block_table + batch_id * max_block_num_per_seq;

+  // When CUDA Graph captures prefill, more blocks may be launched along gridDim.x than needed; the extra blocks exit here.
+  if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
+    return;
+  }
+
  const uint32_t q_len = seq_lens[batch_id];
  if (q_len <= 0) {
    return;
@@ -902,6 +914,7 @@ void MultiQueryAppendAttention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          nullptr,
          nullptr,
          nullptr,
@@ -960,6 +973,7 @@ void MultiQueryAppendAttention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
          static_cast<float *>(tmp_m->ptr()),
          static_cast<float *>(tmp_d->ptr()),
@@ -1134,6 +1148,7 @@ void MultiQueryAppendAttention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          nullptr,
          nullptr,
          nullptr,
@@ -1206,6 +1221,7 @@ void MultiQueryAppendAttention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
          static_cast<float *>(tmp_m->ptr()),
          static_cast<float *>(tmp_d->ptr()),
--- a/custom_ops/gpu_ops/append_attn/append_attention_c4_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/append_attention_c4_impl.cuh
@@ -57,6 +57,7 @@ __global__ void multi_query_append_attention_c4_kernel(
    const float quant_min_bound,
    const float in_scale,
    const uint32_t chunk_size,
+    const int num_blocks_x_cpu,
    T *__restrict__ tmp_workspace,  // split kv [token_num, num_chunks,
                                    // num_heads, head_dim]
    float *__restrict__ tmp_m,      // [token_num, num_chunks, num_heads]
@@ -85,6 +86,11 @@ __global__ void multi_query_append_attention_c4_kernel(

  block_table_now = block_table + batch_id * max_block_num_per_seq;

+  // When CUDA Graph captures prefill, more blocks may be launched along gridDim.x than needed; the extra blocks exit here.
+  if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
+    return;
+  }
+
  const uint32_t q_len = seq_lens[batch_id];
  if (q_len <= 0) {
    return;
@@ -520,6 +526,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
    const float quant_min_bound,
    const float in_scale,
    const uint32_t chunk_size,
+    const int num_blocks_x_cpu,
    T *__restrict__ tmp_workspace,  // split kv [token_num, num_chunks,
                                    // num_heads, head_dim]
    float *__restrict__ tmp_m,      // [token_num, num_chunks, num_heads]
@@ -549,6 +556,11 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
  const uint32_t num_rows_per_block = num_frags_x * 16;
  const int *block_table_now = block_table + batch_id * max_block_num_per_seq;

+  // When CUDA Graph captures prefill, more blocks may be launched along gridDim.x than needed; the extra blocks exit here.
+  if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
+    return;
+  }
+
  const uint32_t q_len = seq_lens[batch_id];
  if (q_len <= 0) {
    return;
@@ -1107,6 +1119,7 @@ void MultiQueryAppendC4Attention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          nullptr,
          nullptr,
          nullptr,
@@ -1171,6 +1184,7 @@ void MultiQueryAppendC4Attention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
          static_cast<float *>(tmp_m->ptr()),
          static_cast<float *>(tmp_d->ptr()),
@@ -1365,6 +1379,7 @@ void MultiQueryAppendC4Attention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          nullptr,
          nullptr,
          nullptr,
@@ -1445,6 +1460,7 @@ void MultiQueryAppendC4Attention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
          static_cast<float *>(tmp_m->ptr()),
          static_cast<float *>(tmp_d->ptr()),
--- a/custom_ops/gpu_ops/append_attn/append_attention_c8_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/append_attention_c8_impl.cuh
@@ -58,6 +58,7 @@ __global__ void multi_query_append_attention_c8_kernel(
    const float quant_min_bound,
    const float in_scale,
    const uint32_t chunk_size,
+    const int num_blocks_x_cpu,
    T *__restrict__ tmp_workspace,  // split kv [token_num, num_chunks,
                                    // num_heads, head_dim]
    float *__restrict__ tmp_m,      // [token_num, num_chunks, num_heads]
@@ -87,6 +88,11 @@ __global__ void multi_query_append_attention_c8_kernel(

  block_table_now = block_table + batch_id * max_block_num_per_seq;

+  // When CUDA Graph captures prefill, more blocks may be launched along gridDim.x than needed; the extra blocks exit here.
+  if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
+    return;
+  }
+
  const uint32_t q_len = seq_lens[batch_id];
  if (q_len <= 0) {
    return;
@@ -527,6 +533,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
    const float quant_min_bound,
    const float in_scale,
    const uint32_t chunk_size,
+    const int num_blocks_x_cpu,
    T *__restrict__ tmp_workspace,  // split kv [token_num, num_chunks,
                                    // num_heads, head_dim]
    float *__restrict__ tmp_m,      // [token_num, num_chunks, num_heads]
@@ -556,6 +563,11 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
  const uint32_t num_rows_per_block = num_frags_x * 16;
  const int *block_table_now = block_table + batch_id * max_block_num_per_seq;

+  // When CUDA Graph captures prefill, more blocks may be launched along gridDim.x than needed; the extra blocks exit here.
+  if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
+    return;
+  }
+
  const uint32_t q_len = seq_lens[batch_id];
  if (q_len <= 0) {
    return;
@@ -1159,6 +1171,7 @@ void MultiQueryAppendC8Attention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          nullptr,
          nullptr,
          nullptr,
@@ -1217,6 +1230,7 @@ void MultiQueryAppendC8Attention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
          static_cast<float *>(tmp_m->ptr()),
          static_cast<float *>(tmp_d->ptr()),
@@ -1443,6 +1457,7 @@ void MultiQueryAppendC8Attention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          nullptr,
          nullptr,
          nullptr,
@@ -1517,6 +1532,7 @@ void MultiQueryAppendC8Attention(
          quant_min_bound,
          in_scale,
          chunk_size,
+          num_blocks_x_cpu,
          reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
          static_cast<float *>(tmp_m->ptr()),
          static_cast<float *>(tmp_d->ptr()),
--- a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
+++ b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
@@ -191,14 +191,21 @@ get_max_len_kv_ernel(int *max_seq_lens_out, const int *seq_lens_this_time,
  }
 }

-std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
+void GetBlockShapeAndSplitKVBlock(
    const paddle::Tensor &seq_lens_encoder,
    const paddle::Tensor &seq_lens_decoder,
    const paddle::Tensor &seq_lens_this_time,
    paddle::Tensor &decoder_batch_ids,          // Inplace
    paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
    paddle::Tensor &decoder_num_blocks_x_cpu,   // Inplace, Pinned Memory
-    paddle::Tensor &max_len_tensor_cpu,         // Inplace, Pinned Memory
+    paddle::Tensor &max_len_tensor_cpu,         // Inplace, CPU
+    paddle::Tensor &encoder_batch_ids,          // Inplace
+    paddle::Tensor &encoder_tile_ids_per_batch, // Inplace
+    paddle::Tensor &encoder_num_blocks_x_cpu,   // Inplace, CPU
+    paddle::Tensor &kv_batch_ids,               // Inplace
+    paddle::Tensor &kv_tile_ids_per_batch,      // Inplace
+    paddle::Tensor &kv_num_blocks_x_cpu,        // Inplace, CPU
+    paddle::Tensor &max_len_kv_cpu,             // Inplace, CPU
    const int encoder_block_shape_q,
    const int decoder_block_shape_q,
    const int group_size,
@@ -223,13 +230,7 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
  int max_system_len = max_len_cpu_ptr[6];
  int max_just_dec_len_without_system = max_len_cpu_ptr[7];

-  paddle::Tensor encoder_batch_ids;
-  paddle::Tensor encoder_tile_ids_per_batch;
-  paddle::Tensor encoder_num_blocks_x_cpu;  /*cpu*/
-  paddle::Tensor kv_batch_ids;
-  paddle::Tensor kv_tile_ids_per_batch;
-  paddle::Tensor kv_num_blocks_x_cpu;       /*cpu*/
-  paddle::Tensor max_len_kv_cpu;            /*cpu*/
+

  auto max_len_kv =
      GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
@@ -237,17 +238,14 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
      max_len_kv.data<int>(), seq_lens_this_time.data<int>(),
      seq_lens_decoder.data<int>(), bsz);

-  max_len_kv_cpu = max_len_kv.copy_to(paddle::CPUPlace(), false);
+
+  max_len_kv_cpu.copy_(max_len_kv, max_len_kv_cpu.place(), false);

  if (max_enc_len_this_time > 0) {
-    const uint32_t max_tile_size_per_bs_kv =
-        div_up(max_enc_dec_len_this_time, block_size);
-    kv_batch_ids =
-        GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
-                       seq_lens_encoder.place());
-    kv_tile_ids_per_batch =
-        GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
-                       seq_lens_encoder.place());
+    const uint32_t max_tile_size_per_bs_kv = div_up(max_enc_dec_len_this_time, block_size);
+    const uint32_t kv_batch_shape = bsz * max_tile_size_per_bs_kv;
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(kv_batch_ids.data<int>(), 0, kv_batch_shape * sizeof(int32_t), stream));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(kv_tile_ids_per_batch.data<int>(), 0, kv_batch_shape * sizeof(int32_t), stream));
    auto kv_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());

@@ -258,16 +256,12 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
        kv_tile_ids_per_batch.data<int>(), kv_num_blocks_x.data<int>(), bsz,
        block_size, block_size);

-    kv_num_blocks_x_cpu = kv_num_blocks_x.copy_to(paddle::CPUPlace(), false);
-
-    const uint32_t encoder_max_tile_size_per_bs_q =
-        div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
-    encoder_batch_ids =
-        GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
-                       paddle::DataType::INT32, seq_lens_encoder.place());
-    encoder_tile_ids_per_batch =
-        GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
-                       paddle::DataType::INT32, seq_lens_encoder.place());
+    kv_num_blocks_x_cpu.copy_(kv_num_blocks_x, kv_num_blocks_x_cpu.place(), false);
+    // Clear buffer
+    const uint32_t encoder_max_tile_size_per_bs_q = div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
+    const uint32_t encoder_batch_shape = bsz * encoder_max_tile_size_per_bs_q;
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(encoder_batch_ids.data<int>(), 0, encoder_batch_shape * sizeof(int32_t), stream));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(encoder_tile_ids_per_batch.data<int>(), 0, encoder_batch_shape * sizeof(int32_t), stream));
    auto encoder_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
    split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(), nullptr,
@@ -275,21 +269,7 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
                                        encoder_tile_ids_per_batch.data<int>(),
                                        encoder_num_blocks_x.data<int>(), bsz,
                                        encoder_block_shape_q, group_size);
-    encoder_num_blocks_x_cpu =
-        encoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
-  } else {
-    encoder_batch_ids =
-        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
-    encoder_tile_ids_per_batch =
-        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
-    encoder_num_blocks_x_cpu =
-        GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
-    kv_batch_ids =
-        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
-    kv_tile_ids_per_batch =
-        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
-    kv_num_blocks_x_cpu =
-        GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
+    encoder_num_blocks_x_cpu.copy_(encoder_num_blocks_x, encoder_num_blocks_x_cpu.place(), false);
  }

  if (max_just_dec_len_this_time > 0) {
@@ -314,15 +294,6 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
    decoder_num_blocks_x_cpu.copy_(decoder_num_blocks_x, decoder_num_blocks_x_cpu.place(), false);
  }

-  return {
-    encoder_batch_ids,
-    encoder_tile_ids_per_batch,
-    encoder_num_blocks_x_cpu, /*cpu*/
-    kv_batch_ids,
-    kv_tile_ids_per_batch,
-    kv_num_blocks_x_cpu,      /*cpu*/
-    max_len_kv_cpu,           /*cpu*/
-  };
 }

 PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
@@ -333,16 +304,17 @@ PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
      "decoder_batch_ids",
      "decoder_tile_ids_per_batch",
      "decoder_num_blocks_x_cpu",
-      "max_len_tensor_cpu"
+      "max_len_tensor_cpu",
+      "encoder_batch_ids",
+      "encoder_tile_ids_per_batch",
+      "encoder_num_blocks_x_cpu",
+      "kv_batch_ids",
+      "kv_tile_ids_per_batch",
+      "kv_num_blocks_x_cpu",
+      "max_len_kv_cpu"
    })
    .Outputs({
-      paddle::Optional("encoder_batch_ids"),
-      paddle::Optional("encoder_tile_ids_per_batch"),
-      paddle::Optional("encoder_num_blocks_x_cpu"),
-      paddle::Optional("kv_batch_ids"),
-      paddle::Optional("kv_tile_ids_per_batch"),
-      paddle::Optional("kv_num_blocks_x_cpu"),
-      "max_len_kv_cpu"
+
    })
    .Attrs({
      "encoder_block_shape_q: int",
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -299,7 +299,7 @@ paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank, const int device_id,
 paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata,
                                       const int layer_id);

-std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
+void GetBlockShapeAndSplitKVBlock(
    const paddle::Tensor &seq_lens_encoder,
    const paddle::Tensor &seq_lens_decoder,
    const paddle::Tensor &seq_lens_this_time,
@@ -307,6 +307,13 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
    paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
    paddle::Tensor &decoder_num_blocks_x_cpu,   // Inplace, Pinned Memory
    paddle::Tensor &max_len_tensor_cpu,         // Inplace, Pinned Memory
+    paddle::Tensor &encoder_batch_ids,          // Inplace
+    paddle::Tensor &encoder_tile_ids_per_batch, // Inplace
+    paddle::Tensor &encoder_num_blocks_x_cpu,   // Inplace, Pinned Memory
+    paddle::Tensor &kv_batch_ids,               // Inplace
+    paddle::Tensor &kv_tile_ids_per_batch,      // Inplace
+    paddle::Tensor &kv_num_blocks_x_cpu,        // Inplace, Pinned Memory
+    paddle::Tensor &max_len_kv_cpu,             // Inplace, Pinned Memory
    const int encoder_block_shape_q,
    const int decoder_block_shape_q,
    const int group_size,