Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-09-27 04:46:16 +08:00.

Commit: fix dynamic Cfp8 computing error (#4119)
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
@@ -1049,7 +1049,7 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel(
         local_max = __hmax(local_max, __habs(out_vec2[i]));
       }
 #pragma unroll
-      for (int m_offset = 16; m_offset > 1; m_offset /= 2) {
+      for (int m_offset = 16; m_offset > 0; m_offset /= 2) {
         local_max = __hmax(local_max, __shfl_xor_sync(0xffffffff, local_max, m_offset));
       }
@@ -379,7 +379,7 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
         )
         self.max_enc_len_this_time = paddle.to_tensor([self.max_enc_len_this_time], "int32", place=paddle.CPUPlace())
         self.max_dec_len_this_time = paddle.to_tensor([self.max_dec_len_this_time], "int32", place=paddle.CPUPlace())
-        self.seq_lens_this_time = self.seq_lens_encoder
+        self.seq_lens_this_time = copy.deepcopy(self.seq_lens_encoder)
         decode_max_tile_size = 1024 * self.batch_size * np.ceil((2 * 10) / 12)
         self.decoder_batch_ids = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
         self.decoder_tile_ids_per_batch = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
@@ -652,7 +652,7 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
         )
         # encoder
         # self.seq_lens_encoder,self.seq_lens_decoder,self.max_enc_len_this_time,self.max_dec_len_this_time=get_encoder_decoder_len(self.batch_size,self.seq_len)
-        self.seq_lens_this_time = self.seq_lens_encoder
+        self.seq_lens_this_time = copy.deepcopy(self.seq_lens_encoder)
         if self.use_mask_offset:
             print("encoder mask_offset: ", self.mask_offset)
         self.cmp_append_attention(attn_mask=self.attention_mask)
Reference in New Issue
Block a user