【Inference Optimize】DeepSeek-V3-model MLA Optimize (#3886)

* Support MLA chunk_size auto search and CUDA graph
Author: AIbin
Date: 2025-09-11 10:46:09 +08:00
Committed by: GitHub
Parent: 637d96c6ae
Commit: a7392a0ff9
23 changed files with 375 additions and 310 deletions

@@ -210,6 +210,12 @@ class MTPProposer(Proposer):
self.model_inputs["decoder_num_blocks_cpu"] = paddle.zeros_like(
self.target_model_inputs["decoder_num_blocks_cpu"]
).pin_memory()
self.model_inputs["decoder_num_blocks_device"] = paddle.zeros_like(
self.target_model_inputs["decoder_num_blocks_device"]
)
self.model_inputs["decoder_chunk_size_device"] = paddle.zeros_like(
self.target_model_inputs["decoder_chunk_size_device"]
)
self.model_inputs["max_len_tensor_cpu"] = paddle.zeros_like(
self.target_model_inputs["max_len_tensor_cpu"]
).cpu()
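
Both new entries mirror the target model's tensors on the GPU. A minimal sketch of why a device-resident copy helps under CUDA graph capture, assuming the buffers are 1-element int32 tensors that are allocated once and then overwritten in place (the shapes, dtypes, and the update helper below are assumptions, not taken from this PR):

import paddle

# Allocate once at init time: a pinned-host copy for cheap CPU-side reads and
# device-resident copies that captured kernels can read without a host sync.
decoder_num_blocks_cpu = paddle.zeros([1], dtype="int32").pin_memory()
decoder_num_blocks_device = paddle.zeros([1], dtype="int32")
decoder_chunk_size_device = paddle.zeros([1], dtype="int32")

def refresh_decode_buffers(num_blocks: int, chunk_size: int):
    # paddle.assign writes in place, so the tensors keep the same device
    # addresses that the captured CUDA graph was recorded against.
    paddle.assign(paddle.full([1], num_blocks, dtype="int32"), decoder_num_blocks_device)
    paddle.assign(paddle.full([1], chunk_size, dtype="int32"), decoder_chunk_size_device)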
@@ -338,6 +344,8 @@ class MTPProposer(Proposer):
self.model_inputs["decoder_batch_ids"] = None
self.model_inputs["decoder_tile_ids_per_batch"] = None
self.model_inputs["decoder_num_blocks_cpu"] = None # Pinning Memory
self.model_inputs["decoder_num_blocks_device"] = None
self.model_inputs["decoder_chunk_size_device"] = None
self.model_inputs["max_len_tensor_cpu"] = None # CPU
self.model_inputs["encoder_batch_ids"] = None
self.model_inputs["encoder_tile_ids_per_batch"] = None
@@ -528,6 +536,8 @@ class MTPProposer(Proposer):
             decoder_batch_ids=self.model_inputs["decoder_batch_ids"],
             decoder_tile_ids_per_batch=self.model_inputs["decoder_tile_ids_per_batch"],
             decoder_num_blocks_cpu=self.model_inputs["decoder_num_blocks_cpu"],
+            decoder_num_blocks_device=self.model_inputs["decoder_num_blocks_device"],
+            decoder_chunk_size_device=self.model_inputs["decoder_chunk_size_device"],
             max_len_tensor_cpu=self.model_inputs["max_len_tensor_cpu"],
             seq_lens_encoder=self.model_inputs["seq_lens_encoder"],
             seq_lens_decoder=self.model_inputs["seq_lens_decoder"],
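
The forward call now threads both device buffers through to the attention backend. These hunks do not show how the chunk_size auto search picks a value; a rough sketch of one plausible heuristic (the candidate sizes, the max-length rule, and the helper name are all assumptions, not the PR's actual policy) is:

import paddle

def auto_search_chunk_size(seq_lens_decoder, decoder_chunk_size_device,
                           candidates=(1024, 512, 256, 128, 64)):
    # Pick the largest candidate chunk that still fits the longest decoder
    # context, falling back to the smallest candidate otherwise.
    max_len = int(paddle.max(seq_lens_decoder))
    chunk_size = candidates[-1]
    for candidate in candidates:
        if max_len >= candidate:
            chunk_size = candidate
            break
    # Write the result into the pre-allocated device tensor so the captured
    # CUDA graph picks up the new value without any kernel relaunch.
    paddle.assign(paddle.full([1], chunk_size, dtype="int32"), decoder_chunk_size_device)
    return chunk_size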