【Inference Optimize】DeepSeek-V3 model MLA Optimize (#3886)

* support MLA chunk_size auto search & cuda_graph
AIbin
2025-09-11 10:46:09 +08:00
committed by GitHub
parent 637d96c6ae
commit a7392a0ff9
23 changed files with 375 additions and 310 deletions

@@ -99,6 +99,8 @@ class ForwardMeta:
decoder_batch_ids: Optional[paddle.Tensor] = None
# Maps the thread block index (blockIdx.x) to the specific data tile being processed within that batch for the decoder stage in multi_query_append_attention_warp1_4_kernel.
decoder_tile_ids_per_batch: Optional[paddle.Tensor] = None
# The number of blocks that the attention backend can use in the decode stage.
decoder_num_blocks_device: Optional[paddle.Tensor] = None
# The number of CUDA blocks to launch in the x-dimension for the multi_query_append_attention_warp1_4_kernel, defining its grids.x.
decoder_num_blocks_cpu: Optional[paddle.Tensor] = None
# A tensor that holds multiple lengths related to prefill or decode stages.
@@ -118,6 +120,8 @@ class ForwardMeta:
# The maximum sequence length of the KV cache, which may represent the current maximum decoder length.
max_len_kv_cpu: Optional[paddle.Tensor] = None
# The KV-cache chunk size selected by the MLA chunk_size auto search for the decode stage, stored on device.
decoder_chunk_size_device: Optional[paddle.Tensor] = None
# Sequence length of the encoder for every batch.
seq_lens_encoder: Optional[paddle.Tensor] = None
# Sequence length of Encoder for every batch.
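
For context, the sketch below illustrates how an attention backend might populate the new ForwardMeta fields for the MLA decode path. It is not code from this PR: the helper name fill_decode_meta, the tile partitioning, and the chunk-size heuristic are assumptions made for illustration; only the field names (decoder_num_blocks_cpu, decoder_num_blocks_device, decoder_chunk_size_device) come from the diff above.

# Hypothetical sketch, not part of this PR.
import paddle

def fill_decode_meta(forward_meta, seq_lens_decoder, block_size=64, chunk_budget=64):
    # Assume one data tile per `block_size` cached tokens for each decode request.
    tiles_per_batch = (seq_lens_decoder + block_size - 1) // block_size
    num_blocks = int(tiles_per_batch.sum().item())

    # The CPU copy defines grids.x at kernel-launch time; the device copy can be
    # read inside a captured CUDA graph without a host synchronization.
    forward_meta.decoder_num_blocks_cpu = paddle.to_tensor(
        [num_blocks], dtype="int32", place=paddle.CPUPlace()
    )
    forward_meta.decoder_num_blocks_device = paddle.to_tensor([num_blocks], dtype="int32")

    # Illustrative chunk_size auto search: double the chunk size until the longest
    # decode sequence fits within the chunk budget.
    max_kv = int(seq_lens_decoder.max().item())
    chunk_size = 256
    while (max_kv + chunk_size - 1) // chunk_size > chunk_budget:
        chunk_size *= 2
    forward_meta.decoder_chunk_size_device = paddle.to_tensor([chunk_size], dtype="int32")

Keeping both a CPU and a device copy of the block count is one plausible reason for the paired fields: the host value is needed to size the launch grid, while the device value stays valid when the decode step is replayed as a CUDA graph.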