【Hackathon 9th No.24】add rebuild_padding (#4107)

This commit is contained in:
co63oc
2025-09-24 12:08:17 +08:00
committed by GitHub
parent b455fd39f3
commit a1c5d930bb

View File

@@ -0,0 +1,200 @@
import unittest
import numpy as np
import paddle
from fastdeploy.model_executor.ops.gpu import rebuild_padding
def RebuildPaddingKernel(
    out,
    tmp_out,
    cu_seqlens_q,
    seq_len_this_time,
    seq_lens_decoder,
    seq_lens_encoder,
    bsz,
):
    """Reference kernel: gather one row of ``tmp_out`` per batch into ``out``.

    For each active batch ``bi``, ``out[bi]`` is copied from ``tmp_out`` at
    row ``cu_seqlens_q[bi] + seq_id``, where ``seq_id`` is the last encoder
    position (``seq_lens_encoder[bi] - 1``) for prefill batches and 0 for
    decode batches. Batches with no tokens this step, or with both encoder
    and decoder lengths zero, are skipped and their ``out`` row left as-is.
    """
    for batch_idx in range(bsz):
        # Nothing scheduled for this batch in the current step.
        if seq_len_this_time[batch_idx] == 0:
            continue
        # Batch is completely inactive (neither prefilling nor decoding).
        if seq_lens_encoder[batch_idx] == 0 and seq_lens_decoder[batch_idx] == 0:
            continue
        enc_len = seq_lens_encoder[batch_idx]
        last_pos = enc_len - 1 if enc_len > 0 else 0
        out[batch_idx] = tmp_out[cu_seqlens_q[batch_idx] + last_pos][:]
def RebuildAppendPaddingKernel(
    out,
    tmp_out,
    cu_seqlens_q,
    seq_len_this_time,
    seq_lens_decoder,
    seq_lens_encoder,
    output_padding_offset,
    max_input_length,
    token_num,
    need_delete_token_num,
):
    """Reference kernel for the offset (append-padding) path.

    Maps each of the ``token_num - need_delete_token_num`` surviving output
    tokens back to a row of ``tmp_out`` by undoing the per-token padding
    offset and the batch's position inside the padded layout. Tokens that
    belong to an idle batch (no tokens this step, or both encoder and
    decoder lengths zero) are skipped and their ``out`` row left as-is.
    """
    kept_tokens = token_num - need_delete_token_num
    for out_token in range(kept_tokens):
        batch_idx = out_token // max_input_length
        inactive = seq_lens_decoder[batch_idx] == 0 and seq_lens_encoder[batch_idx] == 0
        if seq_len_this_time[batch_idx] == 0 or inactive:
            continue
        # Token position before the padding rows were removed.
        padded_token = out_token + output_padding_offset[out_token]
        enc_len = seq_lens_encoder[batch_idx]
        last_pos = enc_len - 1 if enc_len > 0 else 0
        # Offset of this batch's slot inside the padded layout.
        batch_offset = batch_idx * max_input_length - cu_seqlens_q[batch_idx]
        src_row = padded_token - batch_offset + last_pos
        out[out_token] = tmp_out[src_row][:]
def rebuild_padding_ref(
    tmp_out,  # [token_num, dim_embed]
    cu_seqlens_q,  # [bsz+1, 1]
    seq_len_this_time,
    seq_lens_decoder,
    seq_lens_encoder,
    output_padding_offset,
    max_input_length,
):
    """NumPy reference implementation of the ``rebuild_padding`` op.

    Dispatches to ``RebuildAppendPaddingKernel`` when ``output_padding_offset``
    is provided, and to ``RebuildPaddingKernel`` (one output row per batch)
    otherwise.

    Args:
        tmp_out: [token_num, dim_embed] packed hidden states.
        cu_seqlens_q: [bsz + 1] cumulative query lengths per batch.
        seq_len_this_time: [bsz] tokens scheduled for each batch this step.
        seq_lens_decoder: [bsz] per-batch decoder lengths.
        seq_lens_encoder: [bsz] per-batch encoder lengths.
        output_padding_offset: optional per-token padding offsets; selects
            the append-padding path when not None.
        max_input_length: padded per-batch capacity (offset path only).

    Returns:
        np.ndarray with the same dtype as ``tmp_out``: shape
        [bsz, dim_embed] without offsets, or
        [token_num - need_delete_token_num, dim_embed] with offsets.
    """
    token_num, dim_embed = tmp_out.shape[0], tmp_out.shape[1]
    bsz = cu_seqlens_q.shape[0] - 1
    if output_padding_offset is not None:
        # Encoder (prefill) batches keep only their last token, so every
        # other encoder position is deleted from the output.
        need_delete_token_num = 0
        for i in range(bsz):
            if seq_lens_encoder[i] > 0:
                need_delete_token_num += seq_lens_encoder[i] - 1
        # Allocate the output once, matching tmp_out's dtype. (The original
        # built a throwaway float64 [bsz, dim_embed] array first and then
        # unconditionally replaced it in both branches.)
        out = np.zeros([token_num - need_delete_token_num, dim_embed], dtype=tmp_out.dtype)
        RebuildAppendPaddingKernel(
            out,
            tmp_out,
            cu_seqlens_q,
            seq_len_this_time,
            seq_lens_decoder,
            seq_lens_encoder,
            output_padding_offset,
            max_input_length,
            token_num,
            need_delete_token_num,
        )
    else:
        out = np.zeros([bsz, dim_embed], dtype=tmp_out.dtype)
        RebuildPaddingKernel(
            out,
            tmp_out,
            cu_seqlens_q,
            seq_len_this_time,
            seq_lens_decoder,
            seq_lens_encoder,
            bsz,
        )
    return out
class TestRebuildPadding(unittest.TestCase):
    """End-to-end checks of the GPU ``rebuild_padding`` op against the
    NumPy reference ``rebuild_padding_ref``.

    The fixture simulates 8 batches alternating between a decode step
    (1 token) and a prefill step (20 tokens); ``cu_seqlens_q`` covers 84 of
    the 100 rows in ``tmp_out`` (the trailing rows are unused padding).
    """

    def _make_inputs(self):
        """Build the shared fixture as NumPy arrays.

        Returns:
            (tmp_out, cu_seqlens_q, seq_len_this_time,
             seq_lens_decoder, seq_lens_encoder)
        """
        # NOTE: paddle.seed() does not seed NumPy, and all random inputs
        # below come from NumPy — seed the RNG that is actually used so the
        # tests are reproducible.
        np.random.seed(42)
        token_num = 100
        dim_embed = 256
        # tmp_out: [token_num, dim_embed] packed hidden states.
        tmp_out = np.random.randn(token_num, dim_embed).astype(np.float32)
        # cu_seqlens_q: [bsz + 1] cumulative token counts -> bsz = 8 batches
        # totalling 84 tokens (rows 84..99 of tmp_out are never read).
        cu_seqlens_q = np.array([0, 1, 21, 22, 42, 43, 63, 64, 84], dtype=np.int32)
        # Even batches are decoding (1 token), odd batches are prefilling (20).
        seq_len_this_time = np.array([1, 20, 1, 20, 1, 20, 1, 20], dtype=np.int32)
        seq_lens_encoder = np.array([0, 20, 0, 20, 0, 20, 0, 20], dtype=np.int32)
        seq_lens_decoder = np.array([21, 0, 21, 0, 21, 0, 21, 0], dtype=np.int32)
        return tmp_out, cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder

    def test_rebuild_padding_no_offset(self):
        """Without an offset table the op gathers one row per batch."""
        (
            tmp_out,
            cu_seqlens_q,
            seq_len_this_time,
            seq_lens_decoder,
            seq_lens_encoder,
        ) = self._make_inputs()
        max_input_length = 512
        out_no_offset_ref = rebuild_padding_ref(
            tmp_out=tmp_out,
            cu_seqlens_q=cu_seqlens_q,
            seq_len_this_time=seq_len_this_time,
            seq_lens_decoder=seq_lens_decoder,
            seq_lens_encoder=seq_lens_encoder,
            output_padding_offset=None,
            max_input_length=max_input_length,
        )
        out_no_offset = rebuild_padding(
            paddle.to_tensor(tmp_out),
            paddle.to_tensor(cu_seqlens_q),
            paddle.to_tensor(seq_len_this_time),
            paddle.to_tensor(seq_lens_decoder),
            paddle.to_tensor(seq_lens_encoder),
            None,
            max_input_length,
        )
        np.testing.assert_allclose(out_no_offset.numpy(), out_no_offset_ref)

    def test_rebuild_padding_with_offset(self):
        """With an offset table the op keeps one row per surviving token."""
        (
            tmp_out,
            cu_seqlens_q,
            seq_len_this_time,
            seq_lens_decoder,
            seq_lens_encoder,
        ) = self._make_inputs()
        max_input_length = 512
        num_output_tokens = 80
        # Deterministic because _make_inputs seeded NumPy's RNG.
        output_padding_offset = np.random.randint(0, 10, [num_output_tokens], dtype=np.int32)
        out_with_offset_ref = rebuild_padding_ref(
            tmp_out=tmp_out,
            cu_seqlens_q=cu_seqlens_q,
            seq_len_this_time=seq_len_this_time,
            seq_lens_decoder=seq_lens_decoder,
            seq_lens_encoder=seq_lens_encoder,
            output_padding_offset=output_padding_offset,
            max_input_length=max_input_length,
        )
        out_with_offset = rebuild_padding(
            paddle.to_tensor(tmp_out),
            paddle.to_tensor(cu_seqlens_q),
            paddle.to_tensor(seq_len_this_time),
            paddle.to_tensor(seq_lens_decoder),
            paddle.to_tensor(seq_lens_encoder),
            paddle.to_tensor(output_padding_offset),
            max_input_length,
        )
        np.testing.assert_allclose(out_with_offset.numpy(), out_with_offset_ref)
# Allow running this test file directly (e.g. `python <file>.py`).
if __name__ == "__main__":
    unittest.main()