support c4 attn && fix cache

lizhenyun01 committed 2025-07-23 23:51:28 +08:00
commit 29c3292f02 · parent 832d25334a
16 changed files with 198 additions and 65 deletions


@@ -136,16 +136,25 @@ class FlashAttentionBackend(AttentionBackend):
     def get_kv_cache_shape(
         self,
         max_num_blocks: int,
+        kv_cache_quant_type: str = None,
     ):
         """
         Calculate kv cache shape
         """
-        return (
-            max_num_blocks,
-            self.kv_num_heads,
-            self.block_size,
-            self.head_dim,
-        )
+        if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
+            return (
+                max_num_blocks,
+                self.kv_num_heads,
+                self.block_size,
+                self.head_dim // 2,
+            )
+        else:
+            return (
+                max_num_blocks,
+                self.kv_num_heads,
+                self.block_size,
+                self.head_dim,
+            )
 
     def init_attention_metadata(self, forward_meta: ForwardMeta):
         metadata = FlashAttentionMetadata()
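
For context (not part of the commit): under "int4_zp" quantization each cache entry is a 4-bit value with a zero-point, so two values pack into one byte and the last axis of the cache tensor shrinks to head_dim // 2. A minimal sketch of the shape arithmetic follows; the concrete head count, block size, and head dim are illustrative assumptions, not values taken from this diff:

    # Sketch of the kv-cache shape logic; config values below are hypothetical.
    kv_num_heads, block_size, head_dim = 8, 64, 128
    max_num_blocks = 1024

    def kv_cache_shape(kv_cache_quant_type=None):
        # Two int4 values pack into one byte, so the last axis halves.
        last = head_dim // 2 if kv_cache_quant_type == "int4_zp" else head_dim
        return (max_num_blocks, kv_num_heads, block_size, last)

    assert kv_cache_shape() == (1024, 8, 64, 128)
    assert kv_cache_shape("int4_zp") == (1024, 8, 64, 64)

Packing along head_dim (rather than block_size) keeps the per-token layout contiguous, which is presumably why the halving is applied to the final axis.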