Support C4 attention and fix cache handling

This commit is contained in:
lizhenyun01
2025-07-23 23:51:28 +08:00
parent 832d25334a
commit 29c3292f02
16 changed files with 198 additions and 65 deletions

View File

@@ -217,14 +217,17 @@ class MLAAttentionBackend(AttentionBackend):
self.attention_metadata: AttentionMetadata = metadata
forward_meta.decoder_batch_ids.copy_(metadata.decoder_batch_ids, False)
forward_meta.decoder_tile_ids_per_batch.copy_(
metadata.decoder_tile_ids_per_batch, False)
forward_meta.decoder_tile_ids_per_batch.copy_(metadata.decoder_tile_ids_per_batch, False)
def get_attntion_meta(self) -> AttentionMetadata:
    """Return the attention metadata cached on this backend instance.

    NOTE: the method name keeps the original (misspelled) "attntion"
    spelling because external callers depend on it.
    """
    cached_meta = self.attention_metadata
    return cached_meta
def get_kv_cache_shape(self, max_num_blocks: int) -> Tuple[int, int, int, int]:
def get_kv_cache_shape(
self,
max_num_blocks: int,
kv_cache_quant_type: str = None,
) -> Tuple[int, int, int, int]:
"""
Calculate kv cache shape for MLA
"""