From de8638b1e95ee9e7c4f9964c5e44aa541ed6fa0e Mon Sep 17 00:00:00 2001
From: Yuan Xiaolan <845594810@qq.com>
Date: Tue, 16 Sep 2025 20:21:49 +0800
Subject: [PATCH] fix dynamic Cfp8 computing error (#4119)

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
---
 .../append_attn/decoder_write_cache_with_rope_impl.cuh | 2 +-
 tests/layers/test_append_attention.py                  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh
index 0e4b98966..4c6614152 100644
--- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh
@@ -1049,7 +1049,7 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel(
       local_max = __hmax(local_max, __habs(out_vec2[i]));
     }
 #pragma unroll
-    for (int m_offset = 16; m_offset > 1; m_offset /= 2) {
+    for (int m_offset = 16; m_offset > 0; m_offset /= 2) {
       local_max =
           __hmax(local_max, __shfl_xor_sync(0xffffffff, local_max, m_offset));
     }
diff --git a/tests/layers/test_append_attention.py b/tests/layers/test_append_attention.py
index 31b12e539..6da6681e7 100644
--- a/tests/layers/test_append_attention.py
+++ b/tests/layers/test_append_attention.py
@@ -379,7 +379,7 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
         )
         self.max_enc_len_this_time = paddle.to_tensor([self.max_enc_len_this_time], "int32", place=paddle.CPUPlace())
         self.max_dec_len_this_time = paddle.to_tensor([self.max_dec_len_this_time], "int32", place=paddle.CPUPlace())
-        self.seq_lens_this_time = self.seq_lens_encoder
+        self.seq_lens_this_time = copy.deepcopy(self.seq_lens_encoder)
         decode_max_tile_size = 1024 * self.batch_size * np.ceil((2 * 10) / 12)
         self.decoder_batch_ids = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
         self.decoder_tile_ids_per_batch = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
@@ -652,7 +652,7 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
         )
         # encoder
         # self.seq_lens_encoder,self.seq_lens_decoder,self.max_enc_len_this_time,self.max_dec_len_this_time=get_encoder_decoder_len(self.batch_size,self.seq_len)
-        self.seq_lens_this_time = self.seq_lens_encoder
+        self.seq_lens_this_time = copy.deepcopy(self.seq_lens_encoder)
         if self.use_mask_offset:
             print("encoder mask_offset: ", self.mask_offset)
         self.cmp_append_attention(attn_mask=self.attention_mask)
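
Note on the kernel change (explanatory, not part of the applied patch): the loop
reduces each thread's local absolute maximum across the 32-lane warp with a
butterfly (XOR) shuffle. Under the old condition m_offset > 1 the loop only runs
for offsets 16, 8, 4 and 2, so each lane ends with the maximum over only the 16
lanes of the same parity; even and odd lanes can disagree, and the dynamic cache
quantization scale may be derived from a value that is too small. Iterating down
to offset 1 (m_offset > 0) leaves every lane holding the true warp-wide maximum.
A minimal standalone CUDA sketch of that reduction follows; the helper name
warp_reduce_max_abs is illustrative and is not taken from the patched kernel.

    #include <cuda_fp16.h>

    // Butterfly (XOR) warp reduction: after offsets 16, 8, 4, 2 and 1, every
    // lane of the 32-thread warp holds the maximum absolute value seen by any
    // lane, so any lane can safely derive the quantization scale from it.
    __device__ __forceinline__ half warp_reduce_max_abs(half v) {
      half local_max = __habs(v);
    #pragma unroll
      for (int m_offset = 16; m_offset > 0; m_offset /= 2) {
        local_max =
            __hmax(local_max, __shfl_xor_sync(0xffffffff, local_max, m_offset));
      }
      return local_max;  // identical on all lanes of the warp
    }

The test change replaces the aliasing assignment self.seq_lens_this_time =
self.seq_lens_encoder with copy.deepcopy, so later modifications to one tensor
no longer leak into the other; this assumes the copy module is already imported
in test_append_attention.py, since the patch does not add an import.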