From 249feca65a354229823e955863c65c2fa4d47acc Mon Sep 17 00:00:00 2001
From: Sunny-bot1 <68891411+Sunny-bot1@users.noreply.github.com>
Date: Fri, 14 Nov 2025 15:52:51 +0800
Subject: [PATCH] [BugFix] Revert skip capture (#5023)

* Revert "[BugFix][Metax] Fix metax compile issue in get_block_shape_and_split_kv_block (#5000)"

This reverts commit 05da8e34c0f4f567aaf0150bc89e3c80321efabc.

* Revert "skip DtoH capture (#4988)"

This reverts commit 5b24013d4609e61970fbaf5eb0ba9f5242a06a36.
---
 .../get_block_shape_and_split_kv_block.cu     | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
index 3368eb620..4a42235f5 100644
--- a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
+++ b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
@@ -15,7 +15,6 @@
 #include "helper.h"
 #include "paddle/extension.h"
 #ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
-#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/core/memory/memcpy.h"
 #endif
 #include "utils.cuh"
@@ -288,13 +287,9 @@ void GetBlockShapeAndSplitKVBlock(
       seq_lens_encoder.data<int>(),
       max_len_tensor_gpu.data<int>(),
       bsz);
-  // Note (sunxin): Skip capturing the DtoH copy (it's time-consuming); CPU data
-  // is only for branching in attention.
-#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
-  if (!phi::backends::gpu::IsCUDAGraphCapturing())
-#endif
-    max_len_tensor_cpu.copy_(
-        max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
+
+  max_len_tensor_cpu.copy_(
+      max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
 
   auto max_len_cpu_ptr = max_len_tensor_cpu.data<int>();
   int max_len_this_time = max_len_cpu_ptr[0];
@@ -403,13 +398,9 @@ void GetBlockShapeAndSplitKVBlock(
           bsz,
           decoder_block_shape_q,
           group_size);
-  // Note (sunxin): Skip capturing the DtoH copy (it's time-consuming); CPU
-  // data is only for branching in attention.
-#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
-  if (!phi::backends::gpu::IsCUDAGraphCapturing())
-#endif
-    decoder_num_blocks_cpu.copy_(
-        decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
+
+  decoder_num_blocks_cpu.copy_(
+      decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
       decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream));
 }