import unittest import numpy as np import paddle from fastdeploy.model_executor.ops.gpu import rebuild_padding def RebuildPaddingKernel( out, tmp_out, cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, bsz, ): for bi in range(bsz): seq_id = 0 if seq_len_this_time[bi] == 0: continue if seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0: continue if seq_lens_encoder[bi] > 0: seq_id = seq_lens_encoder[bi] - 1 out[bi] = tmp_out[cu_seqlens_q[bi] + seq_id][:] def RebuildAppendPaddingKernel( out, tmp_out, cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, output_padding_offset, max_input_length, token_num, need_delete_token_num, ): for token_id in range(token_num - need_delete_token_num): bi = int(token_id / max_input_length) if seq_len_this_time[bi] == 0 or (seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0): continue ori_token_id = token_id + output_padding_offset[token_id] seq_id = 0 if seq_lens_encoder[bi] > 0: seq_id = seq_lens_encoder[bi] - 1 cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi] input_token_id = ori_token_id - cum_offset_bi + seq_id out[token_id] = tmp_out[input_token_id][:] def rebuild_padding_ref( tmp_out, # [token_num, dim_embed] cu_seqlens_q, # [bsz+1, 1] seq_len_this_time, seq_lens_decoder, seq_lens_encoder, output_padding_offset, max_input_length, ): tmp_out_shape = tmp_out.shape token_num = tmp_out_shape[0] dim_embed = tmp_out_shape[1] bsz = cu_seqlens_q.shape[0] - 1 out = np.zeros([bsz, dim_embed]) if output_padding_offset is not None: need_delete_token_num = 0 for i in range(bsz): if seq_lens_encoder[i] > 0: need_delete_token_num += seq_lens_encoder[i] - 1 out = np.zeros([token_num - need_delete_token_num, dim_embed]) else: out = np.zeros([bsz, dim_embed]) if output_padding_offset is not None: RebuildAppendPaddingKernel( out, tmp_out, cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, output_padding_offset, max_input_length, token_num, need_delete_token_num, ) else: RebuildPaddingKernel( out, tmp_out, cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, bsz, ) return out class TestRebuildPadding(unittest.TestCase): # test no offset def test_rebuild_padding_no_offset(self): token_num = 100 dim_embed = 256 # bsz = 4 max_input_length = 512 # tmp_out: [token_num, dim_embed] tmp_out = np.random.randn(token_num, dim_embed).astype(np.float32) # cu_seqlens_q: [bsz + 1],accumulate the number of tokens for each batch. cu_seqlens_q = np.array( [0, 1, 21, 22, 42, 43, 63, 64, 84], dtype=np.int32 ) # Assume there are 4 batches, and the total token_num = 100. # Simulated sequence length information seq_len_this_time = np.array([1, 20, 1, 20, 1, 20, 1, 20], dtype=np.int32) seq_lens_encoder = np.array([0, 20, 0, 20, 0, 20, 0, 20], dtype=np.int32) seq_lens_decoder = np.array([21, 0, 21, 0, 21, 0, 21, 0], dtype=np.int32) out_no_offset_ref = rebuild_padding_ref( tmp_out=tmp_out, cu_seqlens_q=cu_seqlens_q, seq_len_this_time=seq_len_this_time, seq_lens_decoder=seq_lens_decoder, seq_lens_encoder=seq_lens_encoder, output_padding_offset=None, max_input_length=max_input_length, ) tmp_out = paddle.to_tensor(tmp_out) cu_seqlens_q = paddle.to_tensor(cu_seqlens_q) seq_len_this_time = paddle.to_tensor(seq_len_this_time) seq_lens_decoder = paddle.to_tensor(seq_lens_decoder) seq_lens_encoder = paddle.to_tensor(seq_lens_encoder) out_no_offset = rebuild_padding( tmp_out, cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, None, max_input_length, ) np.testing.assert_allclose(out_no_offset.numpy(), out_no_offset_ref) # test with offset def test_rebuild_padding_with_offset(self): paddle.seed(42) token_num = 100 dim_embed = 256 # bsz = 4 max_input_length = 512 # tmp_out: [token_num, dim_embed] tmp_out = np.random.randn(token_num, dim_embed).astype(np.float32) # cu_seqlens_q: [bsz + 1],accumulate the number of tokens for each batch. cu_seqlens_q = np.array( [0, 1, 21, 22, 42, 43, 63, 64, 84], dtype=np.int32 ) # Assume there are 4 batches, and the total token_num = 100. # Simulated sequence length information seq_len_this_time = np.array([1, 20, 1, 20, 1, 20, 1, 20], dtype=np.int32) seq_lens_encoder = np.array([0, 20, 0, 20, 0, 20, 0, 20], dtype=np.int32) seq_lens_decoder = np.array([21, 0, 21, 0, 21, 0, 21, 0], dtype=np.int32) num_output_tokens = 80 output_padding_offset = np.random.randint(0, 10, [num_output_tokens], dtype=np.int32) out_with_offset_ref = rebuild_padding_ref( tmp_out=tmp_out, cu_seqlens_q=cu_seqlens_q, seq_len_this_time=seq_len_this_time, seq_lens_decoder=seq_lens_decoder, seq_lens_encoder=seq_lens_encoder, output_padding_offset=output_padding_offset, max_input_length=max_input_length, ) tmp_out = paddle.to_tensor(tmp_out) cu_seqlens_q = paddle.to_tensor(cu_seqlens_q) seq_len_this_time = paddle.to_tensor(seq_len_this_time) seq_lens_decoder = paddle.to_tensor(seq_lens_decoder) seq_lens_encoder = paddle.to_tensor(seq_lens_encoder) output_padding_offset = paddle.to_tensor(output_padding_offset) out_with_offset = rebuild_padding( tmp_out, cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, output_padding_offset, max_input_length, ) np.testing.assert_allclose(out_with_offset.numpy(), out_with_offset_ref) if __name__ == "__main__": unittest.main()