FastDeploy/custom_ops/xpu_ops/test/test_get_infer_param.py
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from fastdeploy.model_executor.ops.xpu import get_infer_param
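
# Simulated mixed batch of 5 sequences:
#   seq 0: prefill (encoder), 100 new tokens, no cached prefix
#   seq 1: decode, 1 new token, 5 tokens already in the KV cache
#   seq 2: idle this step (nothing scheduled)
#   seq 3: decode, 1 new token, 25 tokens already in the KV cache
#   seq 4: prefill, 300 new tokens on top of a 64-token cached prefix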
seq_lens_encoder = paddle.to_tensor([100, 0, 0, 0, 300], dtype="int32")
seq_lens_decoder = paddle.to_tensor([0, 5, 0, 25, 64], dtype="int32")
seq_lens_this_time = paddle.to_tensor([100, 1, 0, 1, 300], dtype="int32")
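# Per-sequence block table: 5 sequences x 8 physical KV-cache blocks (ids 0-39).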
block_table = paddle.arange(0, 40, dtype="int32")
block_table = block_table.reshape((5, 8))
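
# get_infer_param derives per-step scheduling metadata for the XPU attention
# ops: the *_batch_map / *_batch_idx tensors split the mixed batch into encoder
# (prefill) and decoder groups, the *_lod tensors hold cumulative token
# offsets, each device tensor is mirrored by a *_cpu copy, and len_info_cpu
# packs a few scalar summary values (see the expected outputs below).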
(
    encoder_batch_map,
    decoder_batch_map,
    encoder_batch_idx,
    decoder_batch_idx,
    encoder_seq_lod,
    decoder_seq_lod,
    encoder_kv_lod,
    prefix_len,
    decoder_context_len,
    decoder_context_len_cache,
    prefix_block_tables,
    encoder_batch_map_cpu,
    decoder_batch_map_cpu,
    encoder_batch_idx_cpu,
    decoder_batch_idx_cpu,
    encoder_seq_lod_cpu,
    decoder_seq_lod_cpu,
    encoder_kv_lod_cpu,
    prefix_len_cpu,
    decoder_context_len_cpu,
    decoder_context_len_cache_cpu,
    len_info_cpu,
) = get_infer_param(
    seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, block_table, 64
) # block_size
print("block_table", block_table)
print("encoder_batch_map", encoder_batch_map) # [0, 4, 0, 0, 0]
print("decoder_batch_map", decoder_batch_map) # [1, 3, 0, 0, 0]
print("encoder_batch_idx", encoder_batch_idx) # [0, 3, 0, 0, 0]
print("decoder_batch_idx", decoder_batch_idx) # [1, 2, 0, 0, 0]
print("encoder_seq_lod", encoder_seq_lod) # [0, 100, 400 ,0 ,0 ,0]
print("decoder_seq_lod", decoder_seq_lod) # [0, 1, 2 ,0 ,0 ,0]
print("encoder_kv_lod", encoder_kv_lod) # [0, 100, 464, 0, 0, 0]
print("prefix_len", prefix_len) # [0, 64, 0, 0, 0]
print("decoder_context_len", decoder_context_len) # [6, 26, 0, 0, 0]
print("decoder_context_len_cache", decoder_context_len_cache) # [5, 25, 0, 0, 0]
print("prefix_block_tables", prefix_block_tables)
print("encoder_batch_map_cpu", encoder_batch_map_cpu) # [0, 4, 0, 0, 0]
print("decoder_batch_map_cpu", decoder_batch_map_cpu) # [1, 3, 0, 0, 0]
print("encoder_batch_idx_cpu", encoder_batch_idx_cpu) # [0, 3, 0, 0, 0]
print("decoder_batch_idx_cpu", decoder_batch_idx_cpu) # [1, 2, 0, 0, 0]
print("encoder_seq_lod_cpu", encoder_seq_lod_cpu) # [0, 100, 400 ,0 ,0 ,0]
print("decoder_seq_lod_cpu", decoder_seq_lod_cpu) # [0, 1, 2 ,0 ,0 ,0]
print("encoder_kv_lod_cpu", encoder_kv_lod_cpu) # [0, 100, 464, 0, 0, 0]
print("prefix_len_cpu", prefix_len_cpu) # [0, 64, 0, 0, 0]
print("decoder_context_len_cpu", decoder_context_len_cpu) # [6, 26, 0, 0, 0]
print("decoder_context_len_cache_cpu", decoder_context_len_cache_cpu) # [5, 25, 0, 0, 0]
print(
    "len_info_cpu", len_info_cpu
) # {enc_batch, dec_batch, total_enc_len, max_seq_len, max_kv_len, prefix_block_num_per_seq} = [2, 2, 400, 300, 364, 6]
"""
block_table Tensor(shape=[5, 8], dtype=int32, place=Place(xpu:0), stop_gradient=True,
[[0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ],
[8 , 9 , 10, 11, 12, 13, 14, 15],
[16, 17, 18, 19, 20, 21, 22, 23],
[24, 25, 26, 27, 28, 29, 30, 31],
[32, 33, 34, 35, 36, 37, 38, 39]])
prefix_block_tables Tensor(shape=[5, 8], dtype=int32, place=Place(xpu:0), stop_gradient=True,
[[ 0, 1, -1, -1, -1, -1, 32, 33],
[34, 35, 36, 37, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1]])
The prefix_block_tables tensor is allocated with the same shape as block_table to avoid problems with InferShape.
However, the region actually used by prefix_block_tables is [block_bs, prefix_block_num_per_seq], where prefix_block_num_per_seq = ceil(max_kv_len / block_size).
Therefore, do not rely on the tensor shape of prefix_block_tables; derive its effective shape from block_table.dims[0] and len_info_cpu[-1].
"""