mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
support c4 attn && fix cache
This commit is contained in:
@@ -217,14 +217,17 @@ class MLAAttentionBackend(AttentionBackend):
|
||||
self.attention_metadata: AttentionMetadata = metadata
|
||||
|
||||
forward_meta.decoder_batch_ids.copy_(metadata.decoder_batch_ids, False)
|
||||
forward_meta.decoder_tile_ids_per_batch.copy_(
|
||||
metadata.decoder_tile_ids_per_batch, False)
|
||||
forward_meta.decoder_tile_ids_per_batch.copy_(metadata.decoder_tile_ids_per_batch, False)
|
||||
|
||||
def get_attntion_meta(self) -> AttentionMetadata:
|
||||
"""get_attntion_meta"""
|
||||
return self.attention_metadata
|
||||
|
||||
def get_kv_cache_shape(self, max_num_blocks: int) -> Tuple[int, int, int, int]:
|
||||
def get_kv_cache_shape(
|
||||
self,
|
||||
max_num_blocks: int,
|
||||
kv_cache_quant_type: str = None,
|
||||
) -> Tuple[int, int, int, int]:
|
||||
"""
|
||||
Calculate kv cache shape for MLA
|
||||
"""
|
||||
|
Reference in New Issue
Block a user