mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-04 08:16:42 +08:00
【Inference Optimization】DeepSeek-V3 model MLA optimization (#3886)
* Support MLA chunk_size auto-search and cuda_graph
This commit is contained in:
@@ -210,6 +210,12 @@ class MTPProposer(Proposer):
|
||||
self.model_inputs["decoder_num_blocks_cpu"] = paddle.zeros_like(
|
||||
self.target_model_inputs["decoder_num_blocks_cpu"]
|
||||
).pin_memory()
|
||||
self.model_inputs["decoder_num_blocks_device"] = paddle.zeros_like(
|
||||
self.target_model_inputs["decoder_num_blocks_device"]
|
||||
)
|
||||
self.model_inputs["decoder_chunk_size_device"] = paddle.zeros_like(
|
||||
self.target_model_inputs["decoder_chunk_size_device"]
|
||||
)
|
||||
self.model_inputs["max_len_tensor_cpu"] = paddle.zeros_like(
|
||||
self.target_model_inputs["max_len_tensor_cpu"]
|
||||
).cpu()
|
||||
@@ -338,6 +344,8 @@ class MTPProposer(Proposer):
|
||||
self.model_inputs["decoder_batch_ids"] = None
|
||||
self.model_inputs["decoder_tile_ids_per_batch"] = None
|
||||
self.model_inputs["decoder_num_blocks_cpu"] = None # Pinning Memory
|
||||
self.model_inputs["decoder_num_blocks_device"] = None
|
||||
self.model_inputs["decoder_chunk_size_device"] = None
|
||||
self.model_inputs["max_len_tensor_cpu"] = None # CPU
|
||||
self.model_inputs["encoder_batch_ids"] = None
|
||||
self.model_inputs["encoder_tile_ids_per_batch"] = None
|
||||
@@ -528,6 +536,8 @@ class MTPProposer(Proposer):
|
||||
decoder_batch_ids=self.model_inputs["decoder_batch_ids"],
|
||||
decoder_tile_ids_per_batch=self.model_inputs["decoder_tile_ids_per_batch"],
|
||||
decoder_num_blocks_cpu=self.model_inputs["decoder_num_blocks_cpu"],
|
||||
decoder_num_blocks_device=self.model_inputs["decoder_num_blocks_device"],
|
||||
decoder_chunk_size_device=self.model_inputs["decoder_chunk_size_device"],
|
||||
max_len_tensor_cpu=self.model_inputs["max_len_tensor_cpu"],
|
||||
seq_lens_encoder=self.model_inputs["seq_lens_encoder"],
|
||||
seq_lens_decoder=self.model_inputs["seq_lens_decoder"],
|
||||
|
Reference in New Issue
Block a user