Support C4 attention and fix cache handling

This commit is contained in:
lizhenyun01
2025-07-23 23:51:28 +08:00
parent 832d25334a
commit 29c3292f02
16 changed files with 198 additions and 65 deletions

View File

@@ -217,14 +217,17 @@ class MLAAttentionBackend(AttentionBackend):
self.attention_metadata: AttentionMetadata = metadata
forward_meta.decoder_batch_ids.copy_(metadata.decoder_batch_ids, False)
forward_meta.decoder_tile_ids_per_batch.copy_(
metadata.decoder_tile_ids_per_batch, False)
forward_meta.decoder_tile_ids_per_batch.copy_(metadata.decoder_tile_ids_per_batch, False)
def get_attntion_meta(self) -> AttentionMetadata:
    """Return the attention metadata cached on this backend instance.

    NOTE: the method name keeps the original (misspelled) "attntion"
    spelling because external callers depend on it.
    """
    cached_meta = self.attention_metadata
    return cached_meta
def get_kv_cache_shape(self, max_num_blocks: int) -> Tuple[int, int, int, int]:
def get_kv_cache_shape(
self,
max_num_blocks: int,
kv_cache_quant_type: str = None,
) -> Tuple[int, int, int, int]:
"""
Calculate kv cache shape for MLA
"""