Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
[ATTENTION] make buffer alloc as a function (#4945)
@@ -38,6 +38,9 @@ from fastdeploy.model_executor.guided_decoding import (
    get_guided_backend,
)
from fastdeploy.model_executor.layers.attention import get_attention_backend
from fastdeploy.model_executor.layers.attention.append_attn_backend import (
    allocate_launch_related_buffer,
)
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
    AttentionBackend,
)
@@ -1497,41 +1500,20 @@ class GPUModelRunner(ModelRunnerBase):
        )
        head_dim = self.model_config.head_dim

        # Initialize AttentionBackend buffers
        encoder_block_shape_q = 64
        decoder_block_shape_q = 16
        decoder_step_token_num = self.speculative_config.num_speculative_tokens + 1
        group_size = np.ceil(num_heads / self.model_config.kv_num_heads)

        # NOTE: (changwenbin) When using auto_chunk,
        # decode_max_tile_size must take into account the maximum case, where *1024 can cover 128K.
        decode_max_tile_size = (
            1024
            * self.scheduler_config.max_num_seqs
            * np.ceil((decoder_step_token_num * group_size) / decoder_block_shape_q)
        )
        res_buffer = allocate_launch_related_buffer(
            max_batch_size=self.scheduler_config.max_num_seqs,
            max_model_len=self.model_config.max_model_len,
            encoder_block_shape_q=encoder_block_shape_q,
            decoder_block_shape_q=decoder_block_shape_q,
            decoder_step_token_num=self.speculative_config.num_speculative_tokens + 1,
            num_heads=num_heads,
            kv_num_heads=self.model_config.kv_num_heads,
            block_size=self.fd_config.cache_config.block_size,
        )
        encode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
            (self.model_config.max_model_len * group_size) / encoder_block_shape_q
        )
        kv_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
            self.model_config.max_model_len / self.fd_config.cache_config.block_size
        )
        self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
        self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
        self.share_inputs["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32").pin_memory()
        # NOTE: (changwenbin) MLA kernel only needs decoder_num_blocks_device in place of GPU tensor,
        # adapted to cudagraph.
        self.share_inputs["decoder_num_blocks_device"] = paddle.full([1], 0, dtype="int32")
        self.share_inputs["decoder_chunk_size_device"] = paddle.full([1], 64, dtype="int32")
        self.share_inputs["max_len_tensor_cpu"] = paddle.full([9], 0, dtype="int32").cpu()

        self.share_inputs["encoder_batch_ids"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32")
        self.share_inputs["encoder_tile_ids_per_batch"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32")
        self.share_inputs["encoder_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu()

        self.share_inputs["kv_batch_ids"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32")
        self.share_inputs["kv_tile_ids_per_batch"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32")
        self.share_inputs["kv_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu()
        self.share_inputs.update(res_buffer)

        # Get the attention backend
        attn_cls = get_attention_backend()
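The inline code touched by this hunk sizes three launch-metadata buffers from worst-case tile counts: decode_max_tile_size for decode steps (the *1024 factor is the headroom the NOTE says auto_chunk needs to cover 128K), encode_max_tile_size for prefill tiles of encoder_block_shape_q query rows, and kv_max_tile_size for KV-cache blocks. A minimal sketch of that arithmetic with illustrative numbers follows; the batch size, sequence length, head counts, and block size below are assumptions for the example, not values taken from this PR.

    import numpy as np

    # Illustrative values only (assumptions, not from the PR).
    max_num_seqs = 128          # scheduler_config.max_num_seqs
    max_model_len = 8192        # model_config.max_model_len
    num_heads, kv_num_heads = 32, 8
    block_size = 64             # cache_config.block_size
    encoder_block_shape_q = 64
    decoder_block_shape_q = 16
    decoder_step_token_num = 1 + 1  # num_speculative_tokens + 1

    # Same formulas as the inline code this commit replaces.
    group_size = np.ceil(num_heads / kv_num_heads)
    decode_max_tile_size = 1024 * max_num_seqs * np.ceil(
        (decoder_step_token_num * group_size) / decoder_block_shape_q
    )
    encode_max_tile_size = max_num_seqs * np.ceil((max_model_len * group_size) / encoder_block_shape_q)
    kv_max_tile_size = max_num_seqs * np.ceil(max_model_len / block_size)

    print(int(decode_max_tile_size), int(encode_max_tile_size), int(kv_max_tile_size))
    # 131072 65536 16384 with the numbers above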
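The point of the commit is to fold those formulas and the per-buffer paddle.full calls into allocate_launch_related_buffer and merge the result into share_inputs through res_buffer. The real implementation lives in append_attn_backend.py and is not part of this hunk; the sketch below is a reconstruction pieced together from the call-site keywords and the inline code shown above, so anything beyond those details is an assumption rather than the library's actual code.

    import numpy as np
    import paddle


    def allocate_launch_related_buffer(
        max_batch_size,
        max_model_len,
        encoder_block_shape_q,
        decoder_block_shape_q,
        decoder_step_token_num,
        num_heads,
        kv_num_heads,
        block_size,
    ):
        """Sketch: allocate the attention launch-metadata buffers previously built inline."""
        group_size = np.ceil(num_heads / kv_num_heads)
        # Worst-case tile counts, mirroring the inline formulas shown in the diff.
        decode_max_tile_size = 1024 * max_batch_size * np.ceil(
            (decoder_step_token_num * group_size) / decoder_block_shape_q
        )
        encode_max_tile_size = max_batch_size * np.ceil((max_model_len * group_size) / encoder_block_shape_q)
        kv_max_tile_size = max_batch_size * np.ceil(max_model_len / block_size)

        return {
            "decoder_batch_ids": paddle.full([int(decode_max_tile_size)], 0, dtype="int32"),
            "decoder_tile_ids_per_batch": paddle.full([int(decode_max_tile_size)], 0, dtype="int32"),
            "decoder_num_blocks_cpu": paddle.full([1], 0, dtype="int32").pin_memory(),
            # Device-side copy used by the MLA kernel under cudagraph (see the NOTE in the diff).
            "decoder_num_blocks_device": paddle.full([1], 0, dtype="int32"),
            "decoder_chunk_size_device": paddle.full([1], 64, dtype="int32"),
            "max_len_tensor_cpu": paddle.full([9], 0, dtype="int32").cpu(),
            "encoder_batch_ids": paddle.full([int(encode_max_tile_size)], 0, dtype="int32"),
            "encoder_tile_ids_per_batch": paddle.full([int(encode_max_tile_size)], 0, dtype="int32"),
            "encoder_num_blocks_x_cpu": paddle.full([1], 0, dtype="int32").cpu(),
            "kv_batch_ids": paddle.full([int(kv_max_tile_size)], 0, dtype="int32"),
            "kv_tile_ids_per_batch": paddle.full([int(kv_max_tile_size)], 0, dtype="int32"),
            "kv_num_blocks_x_cpu": paddle.full([1], 0, dtype="int32").cpu(),
        }

Returning a plain dict lets the runner keep its existing buffer names with a single self.share_inputs.update(res_buffer), and presumably lets other runners or backends request the same consistent set of launch buffers instead of duplicating the allocation logic.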