support c4 attn && fix cache

2025-10-04 16:22:57 +08:00 · 2025-07-23 23:51:28 +08:00
parent 832d25334a
commit 29c3292f02
16 changed files with 198 additions and 65 deletions
--- a/fastdeploy/worker/iluvatar_model_runner.py
+++ b/fastdeploy/worker/iluvatar_model_runner.py
@@ -559,15 +559,19 @@ class IluvatarModelRunner(ModelRunnerBase):
        # Get kv cache dtype
        cache_type = self.parallel_config.dtype

+        kv_cache_quant_type = None
        if (
            self.quant_config
            and hasattr(self.quant_config, "kv_cache_quant_type")
            and self.quant_config.kv_cache_quant_type is not None
        ):
            cache_type = "uint8"
+            kv_cache_quant_type = self.quant_config.kv_cache_quant_type

        # Get kv cache shape
-        kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=max_block_num)
+        kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
+            max_num_blocks=max_block_num, kv_cache_quant_type=kv_cache_quant_type
+        )

        if not self.parallel_config.do_profile and (
            self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"