Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-15 05:01:00 +08:00)
【Inference Optimize】DeepSeek-V3-model MLA Optimize (#3886)
* support MLA chunk_size auto search & cuda_graph
This commit is contained in:
@@ -30,7 +30,9 @@ def get_block_shape_and_split_kv_block(
    seq_lens_this_time: paddle.Tensor,
    decoder_batch_ids: paddle.Tensor,
    decoder_tile_ids_per_batch: paddle.Tensor,
    decoder_num_blocks_x_cpu: paddle.Tensor,
    decoder_num_blocks_cpu: paddle.Tensor,
    decoder_num_blocks_device: paddle.Tensor,
    decoder_chunk_size_device: paddle.Tensor,
    max_len_tensor_cpu: paddle.Tensor,
    encoder_batch_ids: paddle.Tensor,
    encoder_tile_ids_per_batch: paddle.Tensor,
@@ -55,7 +57,9 @@ def get_block_shape_and_split_kv_block(
    seq_lens_this_time,
    decoder_batch_ids,
    decoder_tile_ids_per_batch,
    decoder_num_blocks_x_cpu,
    decoder_num_blocks_cpu,
    decoder_num_blocks_device,
    decoder_chunk_size_device,
    max_len_tensor_cpu,
    encoder_batch_ids,
    encoder_tile_ids_per_batch,
|
Reference in New Issue
Block a user