Mirror of https://github.com/PaddlePaddle/FastDeploy.git
【Inference Optimize】DeepSeek-V3-model MLA Optimize (#3886)
* support MLA chunk_size auto search & cuda_graph
@@ -380,10 +380,12 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
         self.max_enc_len_this_time = paddle.to_tensor([self.max_enc_len_this_time], "int32", place=paddle.CPUPlace())
         self.max_dec_len_this_time = paddle.to_tensor([self.max_dec_len_this_time], "int32", place=paddle.CPUPlace())
         self.seq_lens_this_time = self.seq_lens_encoder

-        self.decoder_batch_ids = paddle.full([self.batch_size], 0, dtype="int32")
-        self.decoder_tile_ids_per_batch = paddle.full([self.batch_size], 0, dtype="int32")
+        decode_max_tile_size = 1024 * self.batch_size * np.ceil((2 * 10) / 12)
+        self.decoder_batch_ids = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
+        self.decoder_tile_ids_per_batch = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
         self.decoder_num_blocks_cpu = paddle.full([1], 0, dtype="int32").pin_memory()
+        self.decoder_num_blocks_device = paddle.full([1], 0, dtype="int32")
+        self.decoder_chunk_size_device = paddle.full([1], 64, dtype="int32")
         self.max_len_tensor_cpu = paddle.full([8], 0, dtype="int32").cpu()

         self.encoder_batch_ids = paddle.full([self.batch_size], 0, dtype="int32")
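The resized decode-stage buffers above are sized by a fixed tile-count upper bound rather than by the raw batch size. A minimal sketch of that arithmetic follows; the batch_size value is a placeholder chosen for illustration, not the value used by the test fixture:

    import numpy as np
    import paddle

    batch_size = 8  # placeholder; the test takes this from its fixture

    # Upper bound on decode tiles: ceil((2 * 10) / 12) == 2, so the buffers
    # hold 1024 * batch_size * 2 entries (16384 for batch_size == 8).
    decode_max_tile_size = 1024 * batch_size * np.ceil((2 * 10) / 12)

    # Padding the index buffers to this fixed upper bound keeps their shapes
    # static across decode steps, which is a prerequisite for CUDA Graph capture.
    decoder_batch_ids = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
    decoder_tile_ids_per_batch = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")

    print(decoder_batch_ids.shape)  # [16384]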
@@ -484,6 +486,8 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
             self.decoder_batch_ids,
             self.decoder_tile_ids_per_batch,
             self.decoder_num_blocks_cpu,
+            self.decoder_num_blocks_device,
+            self.decoder_chunk_size_device,
             self.max_len_tensor_cpu,
             self.encoder_batch_ids,
             self.encoder_tile_ids_per_batch,
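The second hunk only threads the two new device tensors into the existing argument list; the FastDeploy op receiving them is not shown in this excerpt. As a standalone, hedged sketch, the buffers can be built exactly as in the first hunk and sanity-checked before being passed along:

    import paddle

    # The two tensors added to the call above.
    decoder_num_blocks_device = paddle.full([1], 0, dtype="int32")
    decoder_chunk_size_device = paddle.full([1], 64, dtype="int32")

    # Both are single-element int32 tensors; the chunk size starts at 64 and is
    # presumably updated in place by the chunk_size auto-search path.
    assert decoder_num_blocks_device.shape == [1]
    assert decoder_chunk_size_device.shape == [1]
    assert decoder_chunk_size_device.dtype == paddle.int32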