support c4 attn && fix cache

2025-10-04 08:16:42 +08:00 · 2025-07-23 23:51:28 +08:00
parent 832d25334a
commit 29c3292f02
16 changed files with 198 additions and 65 deletions
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -810,15 +810,19 @@ class GPUModelRunner(ModelRunnerBase):
        # Get kv cache dtype
        cache_type = self.parallel_config.dtype

+        kv_cache_quant_type = None
        if (
            self.quant_config
            and hasattr(self.quant_config, "kv_cache_quant_type")
            and self.quant_config.kv_cache_quant_type is not None
        ):
            cache_type = "uint8"
+            kv_cache_quant_type = self.quant_config.kv_cache_quant_type

        # Get kv cache shape
-        kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=max_block_num)
+        kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
+            max_num_blocks=max_block_num, kv_cache_quant_type=kv_cache_quant_type
+        )
        local_rank = self.local_rank % self.parallel_config.tensor_parallel_size

        if not profile and (