support c4 attn && fix cache

2025-11-02 20:54:03 +08:00 · 2025-07-23 23:51:28 +08:00
parent 832d25334a
commit 29c3292f02
16 changed files with 198 additions and 65 deletions
--- a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py
@@ -211,6 +211,7 @@ class GCUFlashAttnBackend(AttentionBackend):
    def get_kv_cache_shape(
        self,
        max_num_blocks: int,
+        kv_cache_quant_type: str = None,
    ):
        """
        Calculate kv cache shape
--- a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py
+++ b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py
@@ -222,6 +222,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
    def get_kv_cache_shape(
        self,
        max_num_blocks: int,
+        kv_cache_quant_type: str = None,
    ):
        """
        Calculate kv cache shape