Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[Executor] Experimental feature: support prefill in CUDA graph (#3459)
* Support prefill in CUDA graph
* Refactor the GetBlockShapeAndSplitKVBlock kernel (iterations V2 through V2.5)
* Fix the encoder_num_blocks_x_cpu issue
* Add an early-exit mechanism to the attention kernel
* Fix the append-attention test case
* Update test code and annotate the related tensors
* Move get_input_length_list
* Fix the test code
* Annotate the attention kernel's early-exit path
* Address review comments
* Fix MTP

Co-authored-by: RAM <gstian5555@outlook.com>
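The core of the change, visible in the test diff below, is a calling-convention refactor: get_block_shape_and_split_kv_block no longer allocates and returns its outputs; the caller preallocates every output tensor once and the op fills them in place, so tensor addresses stay fixed across CUDA graph replays. A minimal Paddle sketch of that pattern, with a toy computation standing in for the real custom op:

    import paddle

    # Toy stand-in for the refactored custom op: instead of returning fresh
    # tensors, it writes into buffers the caller preallocated, so the same
    # addresses are reused on every step (a requirement for CUDA graph replay).
    def split_kv_blocks_inplace(seq_lens, block_size, num_blocks_out):
        tiles = (seq_lens + block_size - 1) // block_size  # tiles per sequence
        paddle.assign(tiles.sum(keepdim=True).astype("int32"), num_blocks_out)

    seq_lens = paddle.to_tensor([7, 130, 64], dtype="int32")
    num_blocks = paddle.full([1], 0, dtype="int32")  # allocated once, reused
    split_kv_blocks_inplace(seq_lens, 64, num_blocks)
    print(num_blocks.numpy())  # [5] -> 1 + 3 + 1 tiles of 64 tokens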
@@ -382,6 +382,13 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
         self.decoder_tile_ids_per_batch = paddle.full([self.batch_size], 0, dtype="int32")
         self.decoder_num_blocks_cpu = paddle.full([1], 0, dtype="int32").pin_memory()
         self.max_len_tensor_cpu = paddle.full([8], 0, dtype="int32").cpu()
+        self.encoder_batch_ids = paddle.full([self.batch_size], 0, dtype="int32")
+        self.encoder_tile_ids_per_batch = paddle.full([self.batch_size], 0, dtype="int32")
+        self.encoder_num_blocks_x_cpu = paddle.full([1], 0, dtype="int32").cpu()
+        self.kv_batch_ids = paddle.full([self.batch_size], 0, dtype="int32")
+        self.kv_tile_ids_per_batch = paddle.full([self.batch_size], 0, dtype="int32")
+        self.kv_num_blocks_x_cpu = paddle.full([1], 0, dtype="int32").cpu()
+        self.max_len_kv_cpu = paddle.full([1], 0, dtype="int32").cpu()
 
         self.cache_shape = (
             self.max_block_num,
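Note the two kinds of host buffers above: decoder_num_blocks_cpu is allocated with .pin_memory(), while the new encoder/KV counters use .cpu(). A minimal sketch (not FastDeploy code) of the difference, assuming a CUDA build of Paddle:

    import paddle

    # Pinned (page-locked) host memory can receive asynchronous copies from
    # the GPU; pageable .cpu() memory generally implies a synchronizing copy.
    if paddle.device.is_compiled_with_cuda():
        pinned = paddle.full([1], 0, dtype="int32").pin_memory()
        pageable = paddle.full([1], 0, dtype="int32").cpu()
        print(pinned.place)    # e.g. Place(gpu_pinned)
        print(pageable.place)  # Place(cpu)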
@@ -450,15 +457,7 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
             get_block_shape_and_split_kv_block,
         )
 
-        (
-            encoder_batch_ids,
-            encoder_tile_ids_per_batch,
-            encoder_num_blocks,
-            kv_batch_ids,
-            kv_tile_ids_per_batch,
-            kv_num_blocks,
-            max_len_kv,
-        ) = get_block_shape_and_split_kv_block(
+        get_block_shape_and_split_kv_block(
             self.seq_lens_encoder,
             self.seq_lens_decoder,
             self.seq_lens_this_time,
@@ -466,6 +465,13 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
             self.decoder_tile_ids_per_batch,
             self.decoder_num_blocks_cpu,
             self.max_len_tensor_cpu,
+            self.encoder_batch_ids,
+            self.encoder_tile_ids_per_batch,
+            self.encoder_num_blocks_x_cpu,
+            self.kv_batch_ids,
+            self.kv_tile_ids_per_batch,
+            self.kv_num_blocks_x_cpu,
+            self.max_len_kv_cpu,
             64,
             12,
             (self.q_num_head + 2 * self.kv_num_head) // self.kv_num_head,
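The commit message also mentions an early-exit mechanism for the attention kernel. With the block counts mirrored to the host (encoder_num_blocks_x_cpu above), a zero count signals a pure-decode step with no prefill work; inside a captured CUDA graph the check must live in the kernel itself, since replay cannot skip launches. A hedged host-side analogue, with illustrative names:

    import paddle

    # Host-side analogue of the in-kernel early exit: a zero block count
    # means no request is in the prefill phase, so there is nothing to do.
    def launch_encoder_attention(encoder_num_blocks_x_cpu, run_kernel):
        if int(encoder_num_blocks_x_cpu) == 0:  # host read, no device sync
            return                              # early exit: pure-decode step
        run_kernel()

    encoder_num_blocks_x_cpu = paddle.full([1], 0, dtype="int32").cpu()
    launch_encoder_attention(encoder_num_blocks_x_cpu,
                             lambda: print("encoder kernel launched"))  # skipped
    encoder_num_blocks_x_cpu[0] = 3
    launch_encoder_attention(encoder_num_blocks_x_cpu,
                             lambda: print("encoder kernel launched"))  # runs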
@@ -491,17 +497,17 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
             self.padding_offset,
             self.cum_offset,
             self.block_tables,
-            encoder_batch_ids,
-            encoder_tile_ids_per_batch,
-            encoder_num_blocks,
-            kv_batch_ids,
-            kv_tile_ids_per_batch,
-            kv_num_blocks,
+            self.encoder_batch_ids,
+            self.encoder_tile_ids_per_batch,
+            self.encoder_num_blocks_x_cpu,
+            self.kv_batch_ids,
+            self.kv_tile_ids_per_batch,
+            self.kv_num_blocks_x_cpu,
             self.decoder_batch_ids,
             self.decoder_tile_ids_per_batch,
             self.decoder_num_blocks_cpu,
             self.max_len_tensor_cpu,
-            max_len_kv,
+            self.max_len_kv_cpu,
             out,
             self.rope_emb,  # rope_emb
             None,  # attn_mask
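Why all this plumbing: CUDA graph replay re-issues a fixed set of kernel launches on fixed memory addresses, which is why every buffer the captured attention path touches is now preallocated in setUp. A minimal capture/replay sketch using Paddle's CUDAGraph API (a toy computation, not the attention kernel; requires a CUDA build):

    import paddle

    if paddle.device.is_compiled_with_cuda():
        from paddle.device.cuda.graphs import CUDAGraph

        x = paddle.zeros([8], dtype="float32")  # static input buffer
        y = paddle.zeros([8], dtype="float32")  # static output buffer

        g = CUDAGraph()
        g.capture_begin()
        paddle.assign(x * 2.0 + 1.0, y)         # captured in-place write
        g.capture_end()

        paddle.assign(paddle.arange(8, dtype="float32"), x)  # refresh input in place
        g.replay()                              # re-runs the captured kernels
        print(y.numpy())                        # [1. 3. 5. ... 15.]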