Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 16:22:57 +08:00).
Support c4 attention and fix the KV cache
This commit is contained in:
@@ -559,15 +559,19 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
# Get kv cache dtype
|
||||
cache_type = self.parallel_config.dtype
|
||||
|
||||
kv_cache_quant_type = None
|
||||
if (
|
||||
self.quant_config
|
||||
and hasattr(self.quant_config, "kv_cache_quant_type")
|
||||
and self.quant_config.kv_cache_quant_type is not None
|
||||
):
|
||||
cache_type = "uint8"
|
||||
kv_cache_quant_type = self.quant_config.kv_cache_quant_type
|
||||
|
||||
# Get kv cache shape
|
||||
kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=max_block_num)
|
||||
kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
|
||||
max_num_blocks=max_block_num, kv_cache_quant_type=kv_cache_quant_type
|
||||
)
|
||||
|
||||
if not self.parallel_config.do_profile and (
|
||||
self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"
|
||||
|
Reference in New Issue
Block a user