Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 00:06:38 +08:00)
[Executor] Experimental feature: support prefill in CUDA Graph (#3459)
* Support prefill in CUDA Graph
* Refactor GetBlockShapeAndSplitKVBlock kernel (V2 through V2.5)
* Fix issue with encoder_num_blocks_x_cpu
* Add early-exit mechanism to the attention kernel
* Fix test case for append-attention
* Update test code; add annotations to the related tensors
* Move get_input_length_list
* Fix test code
* Add annotations about the attention kernel's early exit
* Address review comments
* Fix MTP path

Co-authored-by: RAM <gstian5555@outlook.com>
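Background for the change: CUDA Graph replays a fixed sequence of kernel launches against fixed device addresses, so every buffer the captured kernels touch must be allocated before capture and then updated in place. That is why the hunks below pre-allocate the encoder/kv tile-mapping buffers once instead of creating them per step. A minimal, hypothetical Paddle sketch of that capture/replay contract (not FastDeploy's actual capture code; assumes a CUDA build of Paddle that exposes paddle.device.cuda.graphs.CUDAGraph):

    import paddle
    from paddle.device.cuda.graphs import CUDAGraph

    # Static buffers: their device addresses must stay fixed across replays.
    static_in = paddle.zeros([8, 128], dtype="float32")
    weight = paddle.randn([128, 128], dtype="float32")

    graph = CUDAGraph()
    graph.capture_begin()
    static_out = paddle.matmul(static_in, weight)  # recorded into the graph, not just executed
    graph.capture_end()

    # Refresh the input in place (same address), then replay the recorded launches.
    paddle.assign(paddle.randn([8, 128], dtype="float32"), output=static_in)
    graph.replay()  # static_out now holds the result for the new input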
@@ -212,6 +212,22 @@ class MTPProposer(Proposer):
             self.target_model_inputs["max_len_tensor_cpu"]
         ).cpu()
 
+        self.model_inputs["encoder_batch_ids"] = paddle.zeros_like(self.target_model_inputs["encoder_batch_ids"])
+        self.model_inputs["encoder_tile_ids_per_batch"] = paddle.zeros_like(
+            self.target_model_inputs["encoder_tile_ids_per_batch"]
+        )
+        self.model_inputs["encoder_num_blocks_x_cpu"] = paddle.zeros_like(
+            self.target_model_inputs["encoder_num_blocks_x_cpu"]
+        ).cpu()
+        self.model_inputs["kv_batch_ids"] = paddle.zeros_like(self.target_model_inputs["kv_batch_ids"])
+        self.model_inputs["kv_tile_ids_per_batch"] = paddle.zeros_like(
+            self.target_model_inputs["kv_tile_ids_per_batch"]
+        )
+        self.model_inputs["kv_num_blocks_x_cpu"] = paddle.zeros_like(
+            self.target_model_inputs["kv_num_blocks_x_cpu"]
+        ).cpu()
+        self.model_inputs["max_len_kv_cpu"] = paddle.zeros_like(self.target_model_inputs["max_len_kv_cpu"]).cpu()
+
         # Get the attention backend
         attn_cls = get_attention_backend()
         attn_backend = attn_cls(
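Each draft-model buffer above mirrors its target-model counterpart: paddle.zeros_like picks up the reference tensor's shape and dtype, and the trailing .cpu() keeps the *_cpu variants on the host so Python can read them (e.g. for the early-exit check) without forcing a device synchronization. A standalone illustration, with a made-up shape:

    import paddle

    ref = paddle.zeros([256], dtype="int32")     # stand-in for a target_model_inputs buffer
    mirror = paddle.zeros_like(ref)              # same shape/dtype, on the default device
    host_mirror = paddle.zeros_like(ref).cpu()   # host-resident copy, cheap to poll from Python
    print(mirror.place, host_mirror.place)       # e.g. Place(gpu:0) vs Place(cpu)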
@@ -321,6 +337,13 @@ class MTPProposer(Proposer):
         self.model_inputs["decoder_tile_ids_per_batch"] = None
         self.model_inputs["decoder_num_blocks_cpu"] = None  # Pinned memory
         self.model_inputs["max_len_tensor_cpu"] = None  # CPU
+        self.model_inputs["encoder_batch_ids"] = None
+        self.model_inputs["encoder_tile_ids_per_batch"] = None
+        self.model_inputs["encoder_num_blocks_x_cpu"] = None  # CPU
+        self.model_inputs["kv_batch_ids"] = None
+        self.model_inputs["kv_tile_ids_per_batch"] = None
+        self.model_inputs["kv_num_blocks_x_cpu"] = None  # CPU
+        self.model_inputs["max_len_kv_cpu"] = None  # CPU
 
         # Input tokens
         self.model_inputs["draft_tokens"] = paddle.full(
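These None placeholders are re-populated later; the trailing comments record where each buffer is meant to live. A sketch of the two placements (shapes illustrative; assumes Paddle's Tensor.pin_memory() is available on a CUDA build):

    import paddle

    # Page-locked host memory ("Pinned memory" above): enables async GPU->CPU copies.
    decoder_num_blocks_cpu = paddle.zeros([1], dtype="int32").pin_memory()
    # Ordinary pageable host memory ("CPU" above).
    max_len_kv_cpu = paddle.zeros([1], dtype="int32").cpu()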
@@ -512,6 +535,13 @@ class MTPProposer(Proposer):
             cu_seqlens_k=self.model_inputs["cu_seqlens_k"],
             block_tables=self.model_inputs["block_tables"],
             caches=self.model_inputs["caches"],
+            encoder_batch_ids=self.model_inputs["encoder_batch_ids"],
+            encoder_tile_ids_per_batch=self.model_inputs["encoder_tile_ids_per_batch"],
+            encoder_num_blocks_x_cpu=self.model_inputs["encoder_num_blocks_x_cpu"],
+            kv_batch_ids=self.model_inputs["kv_batch_ids"],
+            kv_tile_ids_per_batch=self.model_inputs["kv_tile_ids_per_batch"],
+            kv_num_blocks_x_cpu=self.model_inputs["kv_num_blocks_x_cpu"],
+            max_len_kv_cpu=self.model_inputs["max_len_kv_cpu"],
         )
 
         # Initialize attention metadata
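The commit message's "early-exit mechanism" hinges on these host-side counters: because encoder_num_blocks_x_cpu and max_len_kv_cpu live on the CPU, the launcher can check them without a device sync and skip the prefill path entirely when no request is in the prefill phase. A hypothetical Python-level sketch of that guard (the real check sits in the C++/CUDA append-attention launcher; names follow the diff, launch_encoder_attention is invented):

    def run_append_attention(forward_meta):
        # Host-resident counter: reading it does not stall the GPU stream.
        if int(forward_meta.encoder_num_blocks_x_cpu[0]) == 0:
            return  # no prefill tokens this step: skip the encoder kernel launch
        launch_encoder_attention(forward_meta)  # hypothetical launcher for the prefill path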