Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-16 05:30:58 +08:00)
Unify server-side and model-side Config (Part1) (#3018)
* move cache config
* fix mtp
@@ -20,6 +20,7 @@ from datetime import datetime
from typing import Any, Dict, List, Literal, Optional

from fastdeploy import envs
from fastdeploy.config import CacheConfig
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.utils import (
@@ -157,170 +158,6 @@ class ModelConfig:
        llm_logger.info("=============================================================")


class CacheConfig:
    """
    Configuration for the KV cache.

    Attributes:
        block_size (int): Size of a cache block in number of tokens.
        gpu_memory_utilization (float): Fraction of GPU memory to use for model execution.
        cache_dtype (str): Data type for kv cache storage. Default is 'bfloat16'.
        num_gpu_blocks_override (Optional[int]): Number of GPU blocks to use.
            Overrides profiled num_gpu_blocks if provided.
        kv_cache_ratio (float): Ratio for calculating the maximum block number.
        enc_dec_block_num (int): Number of encoder-decoder blocks.
        prealloc_dec_block_slot_num_threshold (int): Token slot count threshold for preallocating the next blocks during decoding.
        enable_prefix_caching (bool): Flag to enable prefix caching.
    """

    def __init__(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        cache_dtype: str = "bfloat16",
        num_gpu_blocks_override: Optional[int] = None,
        swap_space: Optional[int] = None,
        kv_cache_ratio: float = 0.75,
        enc_dec_block_num: int = 2,
        prealloc_dec_block_slot_num_threshold: int = 5,
        tensor_parallel_size: int = 1,
        enable_prefix_caching=False,
        enable_ssd_cache=False,
        model_cfg=None,
        cache_queue_port=None,
        enable_chunked_prefill=False,
        rdma_comm_ports=None,
        cache_transfer_protocol=None,
        pd_comm_port=None,
    ):
        """
        Initialize the CacheConfig class.

        Args:
            block_size (int): Size of a cache block in number of tokens.
            gpu_memory_utilization (float): Fraction of GPU memory to use.
            cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.
            num_gpu_blocks_override (Optional[int]): Override for number of GPU blocks.
            num_cpu_blocks (Optional[int]): Number of CPU blocks.
            kv_cache_ratio (float): Ratio for max block calculation.
            enc_dec_block_num (int): Number of encoder-decoder blocks.
            prealloc_dec_block_slot_num_threshold (int): Token slot count threshold for preallocating the next blocks during decoding; used when ENABLE_V1_KVCACHE_SCHEDULER=1.
            enable_prefix_caching (bool): Enable prefix caching.
        """
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.num_gpu_blocks_override = num_gpu_blocks_override
        self.kv_cache_ratio = kv_cache_ratio
        self.enc_dec_block_num = enc_dec_block_num
        self.prealloc_dec_block_slot_num_threshold = prealloc_dec_block_slot_num_threshold
        self.cache_dtype = cache_dtype
        if hasattr(model_cfg, "quantization_config"):
            self.cache_dtype = model_cfg.quantization_config.get("kv_cache_quant_type", cache_dtype)

        self.enable_chunked_prefill = enable_chunked_prefill
        self.rdma_comm_ports = rdma_comm_ports
        self.cache_transfer_protocol = cache_transfer_protocol
        self.pd_comm_port = pd_comm_port

        if rdma_comm_ports is not None and isinstance(rdma_comm_ports, str):
            self.rdma_comm_ports = rdma_comm_ports.split(",")

        if pd_comm_port is not None and isinstance(pd_comm_port, str):
            self.pd_comm_port = [int(port) for port in pd_comm_port.split(",")]

        self.enable_prefix_caching = enable_prefix_caching
        if swap_space is None:
            self.enable_hierarchical_cache = False
        else:
            self.enable_hierarchical_cache = True

        self.enable_ssd_cache = enable_ssd_cache
        self.model_cfg = model_cfg
        self.cache_queue_port = cache_queue_port
        self.swap_space = swap_space

        if (
            hasattr(self.model_cfg, "num_key_value_heads")
            and self.model_cfg.num_key_value_heads is not None
            and int(self.model_cfg.num_key_value_heads) > 0
        ):
            kv_num_head = int(self.model_cfg.num_key_value_heads)
        else:
            kv_num_head = self.model_cfg.num_attention_heads
        self.model_cfg.kv_num_head = kv_num_head

        # TODO check name
        if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower():
            byte_size = 0.5
            self.cache_dtype = "uint8"
        elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower():
            self.cache_dtype = "uint8"
            byte_size = 1
        else:
            byte_size = 2

        self.each_token_cache_space = int(
            self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
        )
        self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
        self.bytes_per_layer_per_block = int(
            self.block_size * self.model_cfg.kv_num_head * self.model_cfg.head_dim // tensor_parallel_size * byte_size
        )

        if self.swap_space is None:
            self.num_cpu_blocks = 0
        else:
            self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block)
        self._verify_args()

    def metrics_info(self):
        """Convert cache_config to dict(key: str, value: str) for prometheus metrics info."""
        return {key: str(value) for key, value in self.__dict__.items()}

    def _verify_args(self):
        if self.gpu_memory_utilization > 1.0:
            raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.")
        if self.kv_cache_ratio > 1.0:
            raise ValueError("KV cache ratio must be less than 1.0. Got " f"{self.kv_cache_ratio}.")

    def postprocess(self, num_total_tokens, number_of_tasks):
        """
        calculate block num
        """
        self.dec_token_num = self.enc_dec_block_num * self.block_size
        if self.num_gpu_blocks_override is not None:
            self.total_block_num = self.num_gpu_blocks_override
            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
        else:
            length = num_total_tokens // number_of_tasks
            block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
            self.total_block_num = block_num * number_of_tasks
            self.prefill_kvcache_block_num = self.total_block_num
        llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}")

    def reset(self, num_gpu_blocks):
        """
        reset gpu block number
        """
        self.total_block_num = num_gpu_blocks
        self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
        llm_logger.info(
            f"Reset block num, the total_block_num:{self.total_block_num},"
            f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
        )

    def print(self):
        """
        print all config
        """
        llm_logger.info("Cache Configuration Information :")
        for k, v in self.__dict__.items():
            llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
        llm_logger.info("=============================================================")

class SpeculativeConfig:
    """
    Speculative Decoding Configuration class.
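
With this change, CacheConfig is imported from fastdeploy.config (see the import added in the first hunk). Assuming the moved class keeps the constructor shown above, a minimal usage sketch might look like the following; the model_cfg stand-in and all argument values are hypothetical, chosen only for illustration.

from types import SimpleNamespace

from fastdeploy.config import CacheConfig  # import path added by this commit

# Hypothetical stand-in for the fields CacheConfig reads from model_cfg.
model_cfg = SimpleNamespace(
    num_layers=32,
    num_key_value_heads=8,
    num_attention_heads=32,
    head_dim=128,
)

cache_cfg = CacheConfig(
    block_size=64,
    gpu_memory_utilization=0.9,
    model_cfg=model_cfg,
)

# Profile-time sizing: derive block counts from an assumed token budget.
cache_cfg.postprocess(num_total_tokens=8192, number_of_tasks=1)

# Once the real number of GPU blocks is known, reset the totals.
cache_cfg.reset(num_gpu_blocks=2048)

# Export everything as strings, e.g. for Prometheus metrics labels.
print(cache_cfg.metrics_info())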