From 249feca65a354229823e955863c65c2fa4d47acc Mon Sep 17 00:00:00 2001
From: Sunny-bot1 <68891411+Sunny-bot1@users.noreply.github.com>
Date: Fri, 14 Nov 2025 15:52:51 +0800
Subject: [PATCH] [BugFix] Revert skip capture (#5023)

* Revert "[BugFix][Metax] Fix metax compile issue in get_block_shape_and_split_kv_block (#5000)"

This reverts commit 05da8e34c0f4f567aaf0150bc89e3c80321efabc.

* Revert "skip DtoH capture (#4988)"

This reverts commit 5b24013d4609e61970fbaf5eb0ba9f5242a06a36.
---
 .../get_block_shape_and_split_kv_block.cu     | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
index 3368eb620..4a42235f5 100644
--- a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
+++ b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu
@@ -15,7 +15,6 @@
 #include "helper.h"
 #include "paddle/extension.h"
 #ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
-#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/core/memory/memcpy.h"
 #endif
 #include "utils.cuh"
@@ -288,13 +287,9 @@ void GetBlockShapeAndSplitKVBlock(
       seq_lens_encoder.data<int>(),
       max_len_tensor_gpu.data<int>(),
       bsz);
-  // Note (sunxin): Skip capturing the DtoH copy (it's time-consuming); CPU data
-  // is only for branching in attention.
-#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
-  if (!phi::backends::gpu::IsCUDAGraphCapturing())
-#endif
-    max_len_tensor_cpu.copy_(
-        max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
+
+  max_len_tensor_cpu.copy_(
+      max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
 
   auto max_len_cpu_ptr = max_len_tensor_cpu.data<int>();
   int max_len_this_time = max_len_cpu_ptr[0];
@@ -403,13 +398,9 @@ void GetBlockShapeAndSplitKVBlock(
           bsz,
           decoder_block_shape_q,
           group_size);
-  // Note (sunxin): Skip capturing the DtoH copy (it's time-consuming); CPU
-  // data is only for branching in attention.
-#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
-  if (!phi::backends::gpu::IsCUDAGraphCapturing())
-#endif
-    decoder_num_blocks_cpu.copy_(
-        decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
+
+  decoder_num_blocks_cpu.copy_(
+      decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
       decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream));
 }