Unify server-side and model-side Config (Part1) (#3018)

* move cache config

* fix mtp
YuanRisheng
2025-07-28 10:51:52 +08:00
committed by GitHub
parent 8f426c1690
commit 6ccc10ad47
23 changed files with 243 additions and 289 deletions

View File

@@ -186,12 +186,8 @@ class ParallelConfig:
self.dtype: str = "bfloat16"
# Encoder's decoder num
self.enc_dec_block_num: int = 1
# KV cache ratio for input
self.kv_cache_ratio: float = 0.7
# First token id
self.first_token_id: int = 1
# Gpu memory utilization
self.gpu_memory_utilization: float = 0.9
# Process ID of engine
self.engine_pid: Optional[int] = None
# Do profile or not
@@ -200,12 +196,8 @@ class ParallelConfig:
self.pad_token_id: int = -1
#
self.eos_tokens_lens: int = 2
# Enable chunked prefill
self.enable_chunked_prefill: bool = False
self.max_num_batched_tokens: int = 2048
# enable prefix cache
self.enable_prefix_caching = None
# splitwise role
self.splitwise_role: str = "mixed"
# guided decoding backend
@@ -440,10 +432,153 @@ class LoRAConfig:
pass
class KVCacheConfig:
"""KV Cache Config"""
cache_quant_dtype: str = "none"
class CacheConfig:
"""
Configuration for the KV cache.
Attributes:
block_size (int): Size of a cache block in number of tokens.
gpu_memory_utilization (float): Fraction of GPU memory to use for model execution.
cache_dtype (str): Data type for kv cache storage. Default is 'bfloat16'.
num_gpu_blocks_override (Optional[int]): Number of GPU blocks to use.
Overrides profiled num_gpu_blocks if provided.
kv_cache_ratio (float): Ratio for calculating the maximum block number.
enc_dec_block_num (int): Number of encoder-decoder blocks.
prealloc_dec_block_slot_num_threshold (int): Token slot threshold for preallocating the next blocks during decoding.
enable_prefix_caching (bool): Flag to enable prefix caching.
"""
def __init__(self, args):
"""
Initialize the CacheConfig class from a flat argument dict.
Args:
args (dict): Configuration values; recognized keys include:
block_size (int): Size of a cache block in number of tokens.
gpu_memory_utilization (float): Fraction of GPU memory to use.
cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.
num_gpu_blocks_override (Optional[int]): Override for number of GPU blocks.
swap_space (Optional[int]): CPU swap space in GB, used to derive num_cpu_blocks.
kv_cache_ratio (float): Ratio for max block calculation.
enc_dec_block_num (int): Number of encoder-decoder blocks.
prealloc_dec_block_slot_num_threshold (int): Token slot threshold for preallocating the next blocks during decoding; used when ENABLE_V1_KVCACHE_SCHEDULER=1.
enable_prefix_caching (bool): Enable prefix caching.
"""
self.block_size = 64
self.gpu_memory_utilization = 0.9
self.num_gpu_blocks_override = None
self.kv_cache_ratio = 0.75
self.enc_dec_block_num = 2
self.prealloc_dec_block_slot_num_threshold = 5
self.cache_dtype = "bfloat16"
self.model_cfg = None
self.enable_chunked_prefill = False
self.rdma_comm_ports = None
self.cache_transfer_protocol = None
self.pd_comm_port = None
self.enable_prefix_caching = False
self.enable_ssd_cache = False
self.cache_queue_port = None
self.swap_space = None
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
if self.rdma_comm_ports is not None and isinstance(self.rdma_comm_ports, str):
self.rdma_comm_ports = self.rdma_comm_ports.split(",")
if self.pd_comm_port is not None and isinstance(self.pd_comm_port, str):
self.pd_comm_port = [int(port) for port in self.pd_comm_port.split(",")]
if self.swap_space is None:
self.enable_hierarchical_cache = False
else:
self.enable_hierarchical_cache = True
if self.model_cfg is not None:
if hasattr(self.model_cfg, "quantization_config"):
self.cache_dtype = self.model_cfg.quantization_config.get("kv_cache_quant_type", self.cache_dtype)
if (
hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0
):
kv_num_head = int(self.model_cfg.num_key_value_heads)
else:
kv_num_head = self.model_cfg.num_attention_heads
self.model_cfg.kv_num_head = kv_num_head
# TODO check name
if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower():
byte_size = 0.5
self.cache_dtype = "uint8"
elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower():
self.cache_dtype = "uint8"
byte_size = 1
else:
byte_size = 2
self.each_token_cache_space = int(
self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
)
self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
self.bytes_per_layer_per_block = int(
self.block_size
* self.model_cfg.kv_num_head
* self.model_cfg.head_dim
// args["tensor_parallel_size"]
* byte_size
)
if self.swap_space is None:
self.num_cpu_blocks = 0
else:
self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block)
self._verify_args()
def metrics_info(self):
"""Convert cache_config to dict(key: str, value: str) for prometheus metrics info."""
return {key: str(value) for key, value in self.__dict__.items()}
def _verify_args(self):
if self.gpu_memory_utilization > 1.0:
raise ValueError("GPU memory utilization must not be greater than 1.0. Got " f"{self.gpu_memory_utilization}.")
if self.kv_cache_ratio > 1.0:
raise ValueError("KV cache ratio must not be greater than 1.0. Got " f"{self.kv_cache_ratio}.")
def postprocess(self, num_total_tokens, number_of_tasks):
"""
calculate block num
"""
self.dec_token_num = self.enc_dec_block_num * self.block_size
if self.num_gpu_blocks_override is not None:
self.total_block_num = self.num_gpu_blocks_override
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
else:
length = num_total_tokens // number_of_tasks
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
self.total_block_num = block_num * number_of_tasks
self.prefill_kvcache_block_num = self.total_block_num
logger.info(f"Doing profile, the total_block_num:{self.total_block_num}")
def reset(self, num_gpu_blocks):
"""
reset gpu block number
"""
self.total_block_num = num_gpu_blocks
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
logger.info(
f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
)
def print(self):
"""
print all config
"""
logger.info("Cache Configuration Information :")
for k, v in self.__dict__.items():
logger.info("{:<20}:{:<6}{}".format(k, "", v))
logger.info("=============================================================")
class DecodingConfig:
@@ -477,7 +612,7 @@ class FDConfig:
quant_config: Optional[QuantConfigBase] = None
graph_opt_config: Optional[GraphOptimizationConfig] = None
decoding_config: DecodingConfig = field(default=None, init=True) # type: ignore
kv_cache_config: KVCacheConfig = field(default=None, init=True) # type: ignore
cache_config: CacheConfig = field(default=None, init=True) # type: ignore
def __post_init__(self):
# Initialize cuda graph capture list

View File

@@ -19,8 +19,8 @@ from dataclasses import asdict, dataclass
from dataclasses import fields as dataclass_fields
from typing import Any, Dict, List, Optional
from fastdeploy.config import CacheConfig
from fastdeploy.engine.config import (
CacheConfig,
Config,
GraphOptimizationConfig,
ModelConfig,
@@ -770,28 +770,6 @@ class EngineArgs:
load_strategy=self.load_strategy,
)
def create_cache_config(self, model_cfg) -> CacheConfig:
"""
Create and return a CacheConfig object based on the current settings.
"""
return CacheConfig(
block_size=self.block_size,
tensor_parallel_size=self.tensor_parallel_size,
gpu_memory_utilization=self.gpu_memory_utilization,
num_gpu_blocks_override=self.num_gpu_blocks_override,
kv_cache_ratio=self.kv_cache_ratio,
prealloc_dec_block_slot_num_threshold=self.prealloc_dec_block_slot_num_threshold,
enable_prefix_caching=self.enable_prefix_caching,
swap_space=self.swap_space,
cache_queue_port=self.cache_queue_port,
model_cfg=model_cfg,
enable_chunked_prefill=self.enable_chunked_prefill,
enc_dec_block_num=self.static_decode_blocks,
rdma_comm_ports=self.rdma_comm_ports,
cache_transfer_protocol=self.cache_transfer_protocol,
pd_comm_port=self.pd_comm_port,
)
def create_speculative_config(self) -> SpeculativeConfig:
""" """
if self.speculative_config is not None:
@@ -864,12 +842,16 @@ class EngineArgs:
self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce
), "enable_custom_all_reduce must be used with tensor_parallel_size>1"
all_dict = asdict(self)
all_dict["model_cfg"] = model_cfg
cache_cfg = CacheConfig(all_dict)
return Config(
model_name_or_path=self.model,
model_config=model_cfg,
scheduler_config=scheduler_cfg,
tokenizer=self.tokenizer,
cache_config=self.create_cache_config(model_cfg),
cache_config=cache_cfg,
parallel_config=self.create_parallel_config(),
max_model_len=self.max_model_len,
tensor_parallel_size=self.tensor_parallel_size,
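The create_cache_config helper is gone because the new constructor accepts the whole asdict(EngineArgs) payload directly; only attributes CacheConfig already declares are picked up. A hedged sketch of that filtering behaviour, with made-up values:

from fastdeploy.config import CacheConfig

all_dict = {"block_size": 128, "max_num_seqs": 256, "model_cfg": None}
cfg = CacheConfig(all_dict)
assert cfg.block_size == 128             # known key is applied
assert not hasattr(cfg, "max_num_seqs")  # unrelated engine argument is ignored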

View File

@@ -20,6 +20,7 @@ from datetime import datetime
from typing import Any, Dict, List, Literal, Optional
from fastdeploy import envs
from fastdeploy.config import CacheConfig
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.utils import (
@@ -157,170 +158,6 @@ class ModelConfig:
llm_logger.info("=============================================================")
class CacheConfig:
"""
Configuration for the KV cache.
Attributes:
block_size (int): Size of a cache block in number of tokens.
gpu_memory_utilization (float): Fraction of GPU memory to use for model execution.
cache_dtype (str): Data type for kv cache storage. Default is 'bfloat16'.
num_gpu_blocks_override (Optional[int]): Number of GPU blocks to use.
Overrides profiled num_gpu_blocks if provided.
kv_cache_ratio (float): Ratio for calculating the maximum block number.
enc_dec_block_num (int): Number of encoder-decoder blocks.
prealloc_dec_block_slot_num_threshold (int): Number of token slot threadshold to allocate next blocks for decoding.
enable_prefix_caching (bool): Flag to enable prefix caching.
"""
def __init__(
self,
block_size: int,
gpu_memory_utilization: float,
cache_dtype: str = "bfloat16",
num_gpu_blocks_override: Optional[int] = None,
swap_space: Optional[int] = None,
kv_cache_ratio: float = 0.75,
enc_dec_block_num: int = 2,
prealloc_dec_block_slot_num_threshold: int = 5,
tensor_parallel_size: int = 1,
enable_prefix_caching=False,
enable_ssd_cache=False,
model_cfg=None,
cache_queue_port=None,
enable_chunked_prefill=False,
rdma_comm_ports=None,
cache_transfer_protocol=None,
pd_comm_port=None,
):
"""
Initialize the CacheConfig class.
Args:
block_size (int): Size of a cache block in number of tokens.
gpu_memory_utilization (float): Fraction of GPU memory to use.
cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.
num_gpu_blocks_override (Optional[int]): Override for number of GPU blocks.
num_cpu_blocks (Optional[int]): Number of CPU blocks.
kv_cache_ratio (float): Ratio for max block calculation.
enc_dec_block_num (int): Number of encoder-decoder blocks.
prealloc_dec_block_slot_num_threshold (int): Number of token slot threadshold to allocate next blocks for decoding, used when ENABLE_V1_KVCACHE_SCHEDULER=1.
enable_prefix_caching (bool): Enable prefix caching.
"""
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
self.num_gpu_blocks_override = num_gpu_blocks_override
self.kv_cache_ratio = kv_cache_ratio
self.enc_dec_block_num = enc_dec_block_num
self.prealloc_dec_block_slot_num_threshold = prealloc_dec_block_slot_num_threshold
self.cache_dtype = cache_dtype
if hasattr(model_cfg, "quantization_config"):
self.cache_dtype = model_cfg.quantization_config.get("kv_cache_quant_type", cache_dtype)
self.enable_chunked_prefill = enable_chunked_prefill
self.rdma_comm_ports = rdma_comm_ports
self.cache_transfer_protocol = cache_transfer_protocol
self.pd_comm_port = pd_comm_port
if rdma_comm_ports is not None and isinstance(rdma_comm_ports, str):
self.rdma_comm_ports = rdma_comm_ports.split(",")
if pd_comm_port is not None and isinstance(pd_comm_port, str):
self.pd_comm_port = [int(port) for port in pd_comm_port.split(",")]
self.enable_prefix_caching = enable_prefix_caching
if swap_space is None:
self.enable_hierarchical_cache = False
else:
self.enable_hierarchical_cache = True
self.enable_ssd_cache = enable_ssd_cache
self.model_cfg = model_cfg
self.cache_queue_port = cache_queue_port
self.swap_space = swap_space
if (
hasattr(self.model_cfg, "num_key_value_heads")
and hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0
):
kv_num_head = int(self.model_cfg.num_key_value_heads)
else:
kv_num_head = self.model_cfg.num_attention_heads
self.model_cfg.kv_num_head = kv_num_head
# TODO check name
if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower():
byte_size = 0.5
self.cache_dtype = "uint8"
elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower():
self.cache_dtype = "uint8"
byte_size = 1
else:
byte_size = 2
self.each_token_cache_space = int(
self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
)
self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
self.bytes_per_layer_per_block = int(
self.block_size * self.model_cfg.kv_num_head * self.model_cfg.head_dim // tensor_parallel_size * byte_size
)
if self.swap_space is None:
self.num_cpu_blocks = 0
else:
self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block)
self._verify_args()
def metrics_info(self):
"""Convert cache_config to dict(key: str, value: str) for prometheus metrics info."""
return {key: str(value) for key, value in self.__dict__.items()}
def _verify_args(self):
if self.gpu_memory_utilization > 1.0:
raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.")
if self.kv_cache_ratio > 1.0:
raise ValueError("KV cache ratio must be less than 1.0. Got " f"{self.kv_cache_ratio}.")
def postprocess(self, num_total_tokens, number_of_tasks):
"""
calculate block num
"""
self.dec_token_num = self.enc_dec_block_num * self.block_size
if self.num_gpu_blocks_override is not None:
self.total_block_num = self.num_gpu_blocks_override
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
else:
length = num_total_tokens // number_of_tasks
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
self.total_block_num = block_num * number_of_tasks
self.prefill_kvcache_block_num = self.total_block_num
llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}")
def reset(self, num_gpu_blocks):
"""
reset gpu block number
"""
self.total_block_num = num_gpu_blocks
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
llm_logger.info(
f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
)
def print(self):
"""
print all config
"""
llm_logger.info("Cache Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info("=============================================================")
class SpeculativeConfig:
"""
Speculative Decoding Configuration class.

View File

@@ -95,7 +95,7 @@ class AppendAttentionBackend(AttentionBackend):
"""
super().__init__()
self.attention_metadata: AppendAttentionMetadata = None
self.block_size: int = fd_config.parallel_config.block_size
self.block_size: int = fd_config.cache_config.block_size
self.max_seq_len: int = fd_config.parallel_config.max_model_len
self.rope_theta: float = (
10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta

View File

@@ -85,7 +85,7 @@ class BlockAttentionBackend(AttentionBackend):
"""
super().__init__()
self.attention_metadata: BlockAttentionMetadata = None
self.block_size = fd_config.parallel_config.block_size
self.block_size = fd_config.cache_config.block_size
self.max_seq_len = fd_config.parallel_config.max_model_len
self.rope_theta = 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
self.rank = fd_config.parallel_config.tensor_parallel_rank
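For out-of-tree backends, the only change implied by these hunks is where block_size is looked up; a hypothetical custom backend (MyAttentionBackend is an invented name for illustration) would now read it like this:

class MyAttentionBackend:
    def __init__(self, fd_config):
        # KV cache layout now lives on cache_config ...
        self.block_size = fd_config.cache_config.block_size
        # ... while sequence-length limits stay on parallel_config.
        self.max_seq_len = fd_config.parallel_config.max_model_len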

View File

@@ -113,7 +113,7 @@ class FlashAttentionBackend(AttentionBackend):
self.num_heads = num_heads
self.head_dim = fd_config.model_config.head_dim
self.attn_outputsize_tp = self.num_heads * self.head_dim
self.block_size = fd_config.parallel_config.block_size
self.block_size = fd_config.cache_config.block_size
self.num_layers: int = fd_config.model_config.num_hidden_layers
self.speculative_method = fd_config.speculative_config.method

View File

@@ -94,8 +94,8 @@ class IluvatarAttnBackend(AttentionBackend):
):
super().__init__()
self.attention_metadata = IluvatarAttentionMetadata()
self.attention_metadata.block_size = llm_config.parallel_config.block_size
assert llm_config.parallel_config.enc_dec_block_num == 0, "Iluvatar does not support yet"
self.attention_metadata.block_size = llm_config.cache_config.block_size
assert llm_config.cache_config.enc_dec_block_num == 0, "Iluvatar does not support yet"
self.attention_metadata.max_context_len = llm_config.parallel_config.max_model_len
self.attention_metadata.causal = getattr(llm_config.model_config, "causal", True)

View File

@@ -113,7 +113,7 @@ class MLAAttentionBackend(AttentionBackend):
self.attention_metadata: MLAAttentionMetadata = None
# Basic configuration
self.block_size: int = fd_config.parallel_config.block_size
self.block_size: int = fd_config.cache_config.block_size
self.max_seq_len: int = fd_config.parallel_config.max_model_len
self.rope_theta: float = (
10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta

View File

@@ -92,7 +92,7 @@ class XPUAttentionBackend(AttentionBackend):
super().__init__()
self.attention_metadata: XPUAttentionMetadata = None
# TODO(gongshaotian): Use fd_config parameters in the correct location
self.block_size: int = fd_config.parallel_config.block_size
self.block_size: int = fd_config.cache_config.block_size
self.max_seq_len: int = fd_config.parallel_config.max_model_len
self.rope_theta: float = (
10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta

View File

@@ -82,7 +82,7 @@ class GCUFlashAttnBackend(AttentionBackend):
"""
super().__init__()
self.attention_metadata: GCUFlashAttnMetadata = None
self.block_size = fd_config.parallel_config.block_size
self.block_size = fd_config.cache_config.block_size
self.max_seq_len = fd_config.parallel_config.max_model_len
self.max_num_seqs = fd_config.parallel_config.max_num_seqs

View File

@@ -80,7 +80,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
"""
super().__init__()
self.attention_metadata: GCUMemEfficientAttnMetadata = None
self.block_size = fd_config.parallel_config.block_size
self.block_size = fd_config.cache_config.block_size
self.max_seq_len = fd_config.parallel_config.max_model_len
self.max_num_seqs = fd_config.parallel_config.max_num_seqs

View File

@@ -38,7 +38,7 @@ class Proposer(ABC):
self.parallel_config = self.cfg.parallel_config
self.model_config = self.cfg.model_config
self.speculative_config = self.cfg.speculative_config
self.kv_cache_config = self.cfg.kv_cache_config
self.cache_config = self.cfg.cache_config
self.quant_config = self.cfg.quant_config
self.max_num_seqs = self.parallel_config.max_num_seqs

View File

@@ -97,10 +97,10 @@ class MTPProposer(Proposer):
num_tokens // batch_size,
self.parallel_config.max_model_len - max_dec_len,
)
input_length = int(full_length * self.parallel_config.kv_cache_ratio)
input_length = int(full_length * self.cache_config.kv_cache_ratio)
block_num = (
input_length + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
input_length + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
for i in range(batch_size):
idx = i
@@ -141,7 +141,7 @@ class MTPProposer(Proposer):
max_num_blocks=self.num_gpu_blocks, kv_cache_quant_type=kv_cache_quant_type
)
if not self.parallel_config.do_profile and (
self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"
self.cache_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"
):
cache_kvs_list = []
for i in range(
@@ -219,14 +219,14 @@ class MTPProposer(Proposer):
self.main_model_num_gpu_blocks = num_gpu_blocks
self.num_gpu_blocks = int(num_gpu_blocks * self.speculative_config.num_gpu_block_expand_ratio)
if not (self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"):
if not (self.cache_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"):
self.initialize_kv_cache()
# Reset free list
free_list = list(
range(
self.num_gpu_blocks - 1,
int(self.main_model_num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1,
int(self.main_model_num_gpu_blocks * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
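The arithmetic in the block-number update above, worked through with made-up numbers (the expand ratio comes from speculative_config, the ratio from the new cache_config):

main_model_num_gpu_blocks = 1000
num_gpu_block_expand_ratio = 1.2   # speculative_config, assumed value
kv_cache_ratio = 0.75              # cache_config.kv_cache_ratio

num_gpu_blocks = int(main_model_num_gpu_blocks * num_gpu_block_expand_ratio)  # 1200
free_list = list(range(num_gpu_blocks - 1,
                       int(main_model_num_gpu_blocks * kv_cache_ratio) - 1,
                       -1))
len(free_list)  # 450 draft blocks (indices 1199 down to 750) start out free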
@@ -299,7 +299,7 @@ class MTPProposer(Proposer):
self.free_list = list(
range(
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1,
int(self.parallel_config.total_block_num * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
@@ -371,7 +371,7 @@ class MTPProposer(Proposer):
]
self.model_inputs["pre_ids"][idx : idx + 1] = -1
self.model_inputs["step_idx"][idx : idx + 1] = 0
if self.parallel_config.enable_chunked_prefill:
if self.cache_config.enable_chunked_prefill:
token_chunk_size = request.prefill_chunk_info[0]
self.model_inputs["seq_lens_encoder"][idx : idx + 1] = token_chunk_size
self.model_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size
@@ -640,7 +640,7 @@ class MTPProposer(Proposer):
self.model_inputs["used_list_len"],
self.model_inputs["free_list"],
self.model_inputs["free_list_len"],
self.parallel_config.block_size,
self.cache_config.block_size,
self.max_draft_token_num,
)

View File

@@ -89,9 +89,7 @@ class DcuWorker(GpuWorker):
paddle.device.cuda.empty_cache()
paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run
available_kv_cache_memory = (
total_gpu_memory * self.parallel_config.gpu_memory_utilization
- after_used_gpu_memory
- paddle_peak_increase
total_gpu_memory * self.cache_config.gpu_memory_utilization - after_used_gpu_memory - paddle_peak_increase
)
available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num

View File

@@ -207,7 +207,7 @@ class GCUModelRunner(ModelRunnerBase):
self.share_inputs["prompt_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids)
# Use chunked prefill
if self.parallel_config.enable_chunked_prefill:
if self.cache_config.enable_chunked_prefill:
request.set("chunk_idx", 1)
logger.info(f"prefill_chunk_info: {request.prefill_chunk_info}")
token_chunk_size = request.prefill_chunk_info[0]
@@ -287,10 +287,10 @@ class GCUModelRunner(ModelRunnerBase):
num_tokens // batch_size,
self.parallel_config.max_model_len - max_dec_len,
)
input_length = int(full_length * self.parallel_config.kv_cache_ratio)
input_length = int(full_length * self.cache_config.kv_cache_ratio)
block_num = (
input_length + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
input_length + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
for i in range(batch_size):
idx = i
@@ -417,15 +417,15 @@ class GCUModelRunner(ModelRunnerBase):
# Set block tables
pre_max_block_num = (
self.parallel_config.max_model_len + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
self.parallel_config.max_model_len + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
# Initialize free list
free_list = list(
range(
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1,
int(self.parallel_config.total_block_num * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
@@ -608,9 +608,7 @@ class GCUModelRunner(ModelRunnerBase):
)
# local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
if not profile and (
self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"
):
if not profile and (self.cache_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"):
raise NotImplementedError("prefix_caching is not support by GCUModelRunner.")
else:
for i in range(self.model_config.num_hidden_layers):
@@ -795,7 +793,7 @@ class GCUModelRunner(ModelRunnerBase):
"""
Update chunked prefill related parameters
"""
if not self.parallel_config.enable_chunked_prefill:
if not self.cache_config.enable_chunked_prefill:
return
for task in tasks:
if task.get("prefill_chunk_info", None) is None:
@@ -861,7 +859,7 @@ class GCUModelRunner(ModelRunnerBase):
A list of indices corresponding to the requests that need to be skipped.
"""
skip_idx_list = []
if not self.parallel_config.enable_chunked_prefill or self.guided_backend is None:
if not self.cache_config.enable_chunked_prefill or self.guided_backend is None:
return skip_idx_list
for task in model_forward_batch:
@@ -1079,7 +1077,7 @@ class GCUModelRunner(ModelRunnerBase):
free_list = list(
range(
self.num_gcu_blocks - 1,
int(self.num_gcu_blocks * self.parallel_config.kv_cache_ratio) - 1,
int(self.num_gcu_blocks * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
@@ -1123,7 +1121,7 @@ class GCUModelRunner(ModelRunnerBase):
if self.speculative_method in ["mtp"]
else self.model_config.num_hidden_layers
)
required_memory = byte_of_dtype * 2 * (self.parallel_config.block_size * hidden_dim) * num_layers # k + v
required_memory = byte_of_dtype * 2 * (self.cache_config.block_size * hidden_dim) * num_layers # k + v
return required_memory
def not_need_stop(self) -> bool:

View File

@@ -339,7 +339,7 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["prompt_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids)
# Use chunked prefill
if self.parallel_config.enable_chunked_prefill:
if self.cache_config.enable_chunked_prefill:
request.set("chunk_idx", 1)
logger.info(f"prefill_chunk_info: {request.prefill_chunk_info}")
token_chunk_size = request.prefill_chunk_info[0]
@@ -467,10 +467,10 @@ class GPUModelRunner(ModelRunnerBase):
num_tokens // batch_size,
self.parallel_config.max_model_len - max_dec_len,
)
input_length = int(full_length * self.parallel_config.kv_cache_ratio)
input_length = int(full_length * self.cache_config.kv_cache_ratio)
block_num = (
input_length + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
input_length + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
for i in range(batch_size):
idx = i
@@ -602,15 +602,15 @@ class GPUModelRunner(ModelRunnerBase):
# Set block tables
pre_max_block_num = (
self.parallel_config.max_model_len + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
self.parallel_config.max_model_len + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
# Initialize free list
free_list = list(
range(
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1,
int(self.parallel_config.total_block_num * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
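Worked through with illustrative numbers, the block-table sizing above becomes:

max_model_len = 8192      # parallel_config.max_model_len
block_size = 64           # cache_config.block_size
enc_dec_block_num = 2     # cache_config.enc_dec_block_num
kv_cache_ratio = 0.75     # cache_config.kv_cache_ratio
total_block_num = 2000    # parallel_config.total_block_num

pre_max_block_num = (max_model_len + block_size - 1) // block_size + enc_dec_block_num  # 130
free_list = list(range(total_block_num - 1, int(total_block_num * kv_cache_ratio) - 1, -1))
len(free_list)  # 500 blocks above the kv_cache_ratio watermark start out free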
@@ -689,7 +689,7 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["step_seq_lens_decoder"],
self.share_inputs["block_tables"],
self.share_inputs["is_block_step"],
self.parallel_config.block_size,
self.cache_config.block_size,
)
# Remove padding
@@ -833,9 +833,7 @@ class GPUModelRunner(ModelRunnerBase):
)
local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
if not profile and (
self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"
):
if not profile and (self.cache_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"):
cache_kvs_list = []
for i in range(self.model_config.num_hidden_layers):
key_cache = paddle.empty(shape=[], dtype=cache_type)
@@ -1015,7 +1013,7 @@ class GPUModelRunner(ModelRunnerBase):
sampler_output=sampler_output,
model_output=model_output_data,
share_inputs=self.share_inputs,
block_size=self.parallel_config.block_size,
block_size=self.cache_config.block_size,
speculative_decoding=self.speculative_decoding,
skip_save_output=True,
)
@@ -1031,10 +1029,10 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
step_cuda(
self.share_inputs,
self.parallel_config.block_size,
self.parallel_config.enc_dec_block_num,
self.cache_config.block_size,
self.cache_config.enc_dec_block_num,
self.speculative_config,
self.parallel_config.enable_prefix_caching,
self.cache_config.enable_prefix_caching,
)
if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
@@ -1044,7 +1042,7 @@ class GPUModelRunner(ModelRunnerBase):
"""
Update chunked prefill related parameters
"""
if not self.parallel_config.enable_chunked_prefill:
if not self.cache_config.enable_chunked_prefill:
return
for task in tasks:
if task.get("prefill_chunk_info", None) is None:
@@ -1144,7 +1142,7 @@ class GPUModelRunner(ModelRunnerBase):
A list of indices corresponding to the requests that need to be skipped.
"""
skip_idx_list = []
if not self.parallel_config.enable_chunked_prefill or self.guided_backend is None:
if not self.cache_config.enable_chunked_prefill or self.guided_backend is None:
return skip_idx_list
for task in model_forward_batch:
@@ -1283,7 +1281,7 @@ class GPUModelRunner(ModelRunnerBase):
sampler_output=sampler_output,
model_output=model_output_data,
share_inputs=self.share_inputs,
block_size=self.parallel_config.block_size,
block_size=self.cache_config.block_size,
save_each_rank=self.parallel_config.use_ep,
speculative_decoding=self.speculative_decoding,
skip_save_output=skip_save_output,
@@ -1302,10 +1300,10 @@ class GPUModelRunner(ModelRunnerBase):
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
step_cuda(
self.share_inputs,
self.parallel_config.block_size,
self.parallel_config.enc_dec_block_num,
self.cache_config.block_size,
self.cache_config.enc_dec_block_num,
self.speculative_config,
self.parallel_config.enable_prefix_caching,
self.cache_config.enable_prefix_caching,
)
self._update_chunked_prefill(model_forward_batch)
@@ -1379,7 +1377,7 @@ class GPUModelRunner(ModelRunnerBase):
free_list = list(
range(
self.num_gpu_blocks - 1,
int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1,
int(self.num_gpu_blocks * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
@@ -1425,7 +1423,7 @@ class GPUModelRunner(ModelRunnerBase):
if self.speculative_method in ["mtp"]
else self.model_config.num_hidden_layers
)
required_memory = byte_of_dtype * 2 * (self.parallel_config.block_size * hidden_dim) * num_layers # k + v
required_memory = byte_of_dtype * 2 * (self.cache_config.block_size * hidden_dim) * num_layers # k + v
return required_memory
def not_need_stop(self) -> bool:

View File

@@ -137,7 +137,7 @@ class GpuWorker(WorkerBase):
pynvml.nvmlShutdown()
available_kv_cache_memory = (
after_run_meminfo.total * self.parallel_config.gpu_memory_utilization
after_run_meminfo.total * self.cache_config.gpu_memory_utilization
- after_run_meminfo.used
- paddle_peak_increase
)
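With made-up numbers, the budget computed above works out as follows; the utilization cap is now read from cache_config:

GiB = 1024**3
total = 80 * GiB                  # after_run_meminfo.total
used = 30 * GiB                   # after_run_meminfo.used
paddle_peak_increase = 2 * GiB
gpu_memory_utilization = 0.9      # cache_config.gpu_memory_utilization

available_kv_cache_memory = total * gpu_memory_utilization - used - paddle_peak_increase
available_kv_cache_memory / GiB   # 40.0 GiB left for the KV cache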

View File

@@ -189,7 +189,7 @@ class IluvatarModelRunner(ModelRunnerBase):
self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids)
# Use chunked prefill
if self.parallel_config.enable_chunked_prefill:
if self.cache_config.enable_chunked_prefill:
request.set("chunk_idx", 1)
logger.info(f"prefill_chunk_info: {request.prefill_chunk_info}")
token_chunk_size = request.prefill_chunk_info[0]
@@ -257,10 +257,10 @@ class IluvatarModelRunner(ModelRunnerBase):
num_tokens // batch_size,
self.parallel_config.max_model_len - max_dec_len,
)
input_length = int(full_length * self.parallel_config.kv_cache_ratio)
input_length = int(full_length * self.cache_config.kv_cache_ratio)
block_num = (
input_length + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
input_length + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
for i in range(batch_size):
idx = i
@@ -383,15 +383,15 @@ class IluvatarModelRunner(ModelRunnerBase):
# Set block tables
pre_max_block_num = (
self.parallel_config.max_model_len + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
self.parallel_config.max_model_len + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
# Initialize free list
free_list = list(
range(
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1,
int(self.parallel_config.total_block_num * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
@@ -574,7 +574,7 @@ class IluvatarModelRunner(ModelRunnerBase):
)
if not self.parallel_config.do_profile and (
self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"
self.cache_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"
):
raise NotImplementedError("Iluvatar does not support yet")
else:
@@ -733,10 +733,10 @@ class IluvatarModelRunner(ModelRunnerBase):
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
step_cuda(
self.share_inputs,
self.parallel_config.block_size,
self.parallel_config.enc_dec_block_num,
self.cache_config.block_size,
self.cache_config.enc_dec_block_num,
self.speculative_config,
self.parallel_config.enable_prefix_caching,
self.cache_config.enable_prefix_caching,
)
if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
@@ -746,7 +746,7 @@ class IluvatarModelRunner(ModelRunnerBase):
"""
Update chunked prefill related parameters
"""
if not self.parallel_config.enable_chunked_prefill:
if not self.cache_config.enable_chunked_prefill:
return
for task in tasks:
@@ -815,7 +815,7 @@ class IluvatarModelRunner(ModelRunnerBase):
A list of indices corresponding to the requests that need to be skipped.
"""
skip_idx_list = []
if not self.parallel_config.enable_chunked_prefill or self.guided_backend is None:
if not self.cache_config.enable_chunked_prefill or self.guided_backend is None:
return skip_idx_list
for task in model_forward_batch:
@@ -952,10 +952,10 @@ class IluvatarModelRunner(ModelRunnerBase):
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
step_cuda(
self.share_inputs,
self.parallel_config.block_size,
self.parallel_config.enc_dec_block_num,
self.cache_config.block_size,
self.cache_config.enc_dec_block_num,
self.speculative_config,
self.parallel_config.enable_prefix_caching,
self.cache_config.enable_prefix_caching,
)
self._update_chunked_prefill(model_forward_batch)
@@ -1023,7 +1023,7 @@ class IluvatarModelRunner(ModelRunnerBase):
free_list = list(
range(
self.num_gpu_blocks - 1,
int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1,
int(self.num_gpu_blocks * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
@@ -1066,7 +1066,7 @@ class IluvatarModelRunner(ModelRunnerBase):
if self.speculative_method in ["mtp"]
else self.model_config.num_hidden_layers
)
required_memory = byte_of_dtype * 2 * (self.parallel_config.block_size * hidden_dim) * num_layers # k + v
required_memory = byte_of_dtype * 2 * (self.cache_config.block_size * hidden_dim) * num_layers # k + v
return required_memory
def not_need_stop(self) -> bool:

View File

@@ -44,6 +44,7 @@ class ModelRunnerBase(ABC):
self.parallel_config = fd_config.parallel_config
self.graph_opt_config = fd_config.graph_opt_config
self.quant_config = fd_config.quant_config
self.cache_config = fd_config.cache_config
# ... config
self.device = device

View File

@@ -50,6 +50,7 @@ class WorkerBase(ABC):
self.load_config = fd_config.load_config
self.parallel_config = fd_config.parallel_config
self.device_config = fd_config.device_config
self.cache_config = fd_config.cache_config
# ... config
# Device and Runner

View File

@@ -25,6 +25,7 @@ import paddle.distributed as dist
from paddle.distributed import fleet
from fastdeploy.config import (
CacheConfig,
DecodingConfig,
DeviceConfig,
ErnieArchitectures,
@@ -140,6 +141,7 @@ class PaddleDisWorkerProc:
self.local_rank = local_rank
self.fd_config = fd_config
self.parallel_config = fd_config.parallel_config
self.cache_config = fd_config.cache_config
# TODO(gongshaotian): Use worker factory to get worker
self.worker = get_worker(fd_config=fd_config, local_rank=self.local_rank, rank=self.ranks)
@@ -404,7 +406,7 @@ class PaddleDisWorkerProc:
logger.info(f"------- num_blocks_global: {num_blocks_local} --------")
# wait engine launch cache_manager
if self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed":
if self.cache_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed":
launched_cache_manager_signal_data = np.zeros([1], dtype=np.int32)
self.launched_cache_manager_signal = IPCSignal(
name="launched_cache_manager_signal",
@@ -607,6 +609,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
decoding_config = DecodingConfig(vars(args))
speculative_config = SpeculativeConfig(vars(args))
parallel_config = ParallelConfig(vars(args))
cache_config = CacheConfig(vars(args))
parallel_config.tensor_parallel_size = args.tensor_parallel_size
parallel_config.tensor_parallel_rank = local_rank % args.tensor_parallel_size
parallel_config.expert_parallel_size = args.expert_parallel_size
@@ -707,6 +710,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
decoding_config=decoding_config,
quant_config=quant_config,
graph_opt_config=graph_opt_config,
cache_config=cache_config,
)
update_fd_config_for_mm(fd_config)

View File

@@ -428,15 +428,15 @@ class XPUModelRunner(ModelRunnerBase):
# Set block tables
pre_max_block_num = (
self.parallel_config.max_model_len + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
self.parallel_config.max_model_len + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
# Initialize free list
free_list = list(
range(
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1,
int(self.parallel_config.total_block_num * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
@@ -598,8 +598,8 @@ class XPUModelRunner(ModelRunnerBase):
full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - 10)
input_length = int(full_length - 512)
block_num = (
input_length + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
input_length + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
for i in range(batch_size):
idx = i
@@ -707,8 +707,8 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
step_paddle(
self.share_inputs,
self.parallel_config.block_size,
self.parallel_config.enc_dec_block_num,
self.cache_config.block_size,
self.cache_config.enc_dec_block_num,
)
return None
@@ -764,7 +764,7 @@ class XPUModelRunner(ModelRunnerBase):
required_memory = (
byte_of_dtype
* 2 # k + v
* (self.parallel_config.block_size * hidden_dim)
* (self.cache_config.block_size * hidden_dim)
* self.model_config.num_hidden_layers
)
return required_memory
@@ -784,7 +784,7 @@ class XPUModelRunner(ModelRunnerBase):
free_list = list(
range(
self.num_gpu_blocks - 1,
int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1,
int(self.num_gpu_blocks * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)

View File

@@ -104,7 +104,7 @@ class XpuWorker(WorkerBase):
self.model_runner.prepare_profile()
self.model_runner.profile_run()
total_available_memory = int(total_memory * self.parallel_config.gpu_memory_utilization)
total_available_memory = int(total_memory * self.cache_config.gpu_memory_utilization)
used_memory = xpu_get_used_global_memory(self.local_rank)
available_kv_cache_memory = total_available_memory - used_memory
model_block_memory_used = self.cal_theortical_kvcache()