mirror of https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
Modify mask_offset's format (#3525)
* modify mask_offset in decode
* modify mask_offset unittest

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
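For context (not part of the commit message): the change doubles the mask_offset buffer so that each query token stores a half-open [start, end) window of visible kv indices instead of a single inclusive upper bound. A rough sketch of the old and new layouts, assuming NumPy buffers and illustrative values (total_tokens and the q_start_seq_id value are hypothetical):

import numpy as np

# Old layout: one int32 per query token, the largest kv index that token may attend to.
# New layout: two int32 per query token, a half-open [start, end) window of visible kv indices.
total_tokens = 8                     # hypothetical token count, for illustration only
old_mask_offset = np.zeros(total_tokens, dtype=np.int32)
new_mask_offset = np.zeros(total_tokens * 2, dtype=np.int32)

# Per-sequence view, mirroring the kernels' pointer bump from
# `mask_offset + q_start_seq_id` to `mask_offset + q_start_seq_id * 2`.
q_start_seq_id = 3                   # hypothetical first query token of some sequence
old_this_seq = old_mask_offset[q_start_seq_id:]
new_this_seq = new_mask_offset[q_start_seq_id * 2:]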
@@ -142,7 +142,7 @@ __global__ void multi_query_append_attention_kernel(
   } else {
     o_base_ptr_int8 = out + o_offset;
   }
-  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -511,7 +511,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
                        tid % 8 * num_elems_per_128b<T>();
     }
   }
-  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -173,7 +173,7 @@ __global__ void multi_query_append_attention_c4_kernel(
   } else {
     o_base_ptr_int8 = out + o_offset;
   }
-  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -635,7 +635,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
                        tid % 8 * num_elems_per_128b<T>();
     }
   }
-  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -180,7 +180,7 @@ __global__ void multi_query_append_attention_c8_kernel(
   } else {
     o_base_ptr_int8 = out + o_offset;
   }
-  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -609,7 +609,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
                        tid % 8 * num_elems_per_128b<T>();
     }
   }
-  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -929,7 +929,7 @@ __device__ __forceinline__ void mask_s(const bool* attn_mask,
                        8 * (reg_id / 4) + reg_id % 2;
       bool out_of_boundary;
       if (mask_offset) {
-        out_of_boundary = q_idx < qo_len ? (kv_idx > mask_offset[q_idx]) : true;
+        out_of_boundary = q_idx < qo_len ? (kv_idx >= mask_offset[q_idx * 2 + 1] || kv_idx < mask_offset[q_idx * 2]) : true;
       } else {
         out_of_boundary =
             (causal
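This hunk carries the semantic change: a kv position is now masked out when it falls outside the per-token half-open [start, end) window, rather than when it exceeds a single inclusive upper bound. A minimal Python restatement of the old and new predicates (function names are illustrative, not from the source):

def out_of_boundary_old(mask_offset, q_idx, kv_idx, qo_len):
    # Old check: one inclusive upper bound per query token.
    if q_idx >= qo_len:
        return True
    return kv_idx > mask_offset[q_idx]

def out_of_boundary_new(mask_offset, q_idx, kv_idx, qo_len):
    # New check: half-open [start, end) window per query token.
    if q_idx >= qo_len:
        return True
    start = mask_offset[q_idx * 2]
    end = mask_offset[q_idx * 2 + 1]
    return kv_idx < start or kv_idx >= end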
@@ -407,10 +407,11 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
         self.token_num = self.padding_offset.shape[0]
         self.mask_offset = None
         if self.use_mask_offset:
-            self.mask_offset = paddle.full(self.seq_len * self.batch_size, 0, "int32")
+            self.mask_offset = paddle.full(self.batch_size * self.seq_len * 2, 0, "int32")
             for i in range(self.batch_size):
                 for j in range(self.seq_len):
-                    self.mask_offset[i * self.seq_len + j] = j
+                    self.mask_offset[i * self.seq_len * 2 + j * 2] = 0
+                    self.mask_offset[i * self.seq_len * 2 + j * 2 + 1] = j + 1
 
     def cmp_append_attention(self, naive_cache_k=None, naive_cache_v=None, attn_mask=None):
         paddle.disable_static()
@@ -601,9 +602,10 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
             self.cu_seqlens_k,
         ) = get_padding_offset(self.batch_size, 1, self.seq_lens_this_time)
         if self.use_mask_offset:
-            self.mask_offset = paddle.full(self.batch_size, 0, "int32")
+            self.mask_offset = paddle.full(self.batch_size * 2, 0, "int32")
             for i in range(self.batch_size):
-                self.mask_offset[i] = self.seq_lens_dec[i]
+                self.mask_offset[i * 2] = 0
+                self.mask_offset[i * 2 + 1] = self.seq_lens_dec[i] + 1
             print("decoder mask_offset: ", self.mask_offset)
         self.cmp_append_attention(naive_cache_k, naive_cache_v, None)
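Read together, the two test hunks fill the new buffer with plain causal windows: in the prefill test, query token j of each request gets [0, j + 1); in the decode test, each request's single token gets [0, seq_lens_dec[i] + 1). A standalone sketch of the decode-side fill, using NumPy instead of the Paddle tensor that the test uses (the helper name is illustrative):

import numpy as np

def build_decode_mask_offset(seq_lens_dec):
    # One query token per request during decode, so two ints per request:
    # window start (inclusive) and window end (exclusive).
    mask_offset = np.zeros(len(seq_lens_dec) * 2, dtype=np.int32)
    for i, dec_len in enumerate(seq_lens_dec):
        mask_offset[i * 2] = 0
        mask_offset[i * 2 + 1] = dec_len + 1
    return mask_offset

# Example: three requests with decoded lengths 4, 7, and 2.
print(build_decode_mask_offset([4, 7, 2]))  # -> [0 5 0 8 0 3]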