Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-16 05:30:58 +08:00)
Unify server-side and model-side Config (Part1) (#3018)
* move cache config
* fix mtp
@@ -20,6 +20,7 @@ from datetime import datetime
from typing import Any, Dict, List, Literal, Optional

from fastdeploy import envs
from fastdeploy.config import CacheConfig
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.utils import (
@@ -157,170 +158,6 @@ class ModelConfig:
        llm_logger.info("=============================================================")


class CacheConfig:
    """
    Configuration for the KV cache.

    Attributes:
        block_size (int): Size of a cache block in number of tokens.
        gpu_memory_utilization (float): Fraction of GPU memory to use for model execution.
        cache_dtype (str): Data type for kv cache storage. Default is 'bfloat16'.
        num_gpu_blocks_override (Optional[int]): Number of GPU blocks to use.
            Overrides profiled num_gpu_blocks if provided.
        kv_cache_ratio (float): Ratio for calculating the maximum block number.
        enc_dec_block_num (int): Number of encoder-decoder blocks.
        prealloc_dec_block_slot_num_threshold (int): Token slot count threshold for preallocating the next blocks during decoding.
        enable_prefix_caching (bool): Flag to enable prefix caching.
    """

    def __init__(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        cache_dtype: str = "bfloat16",
        num_gpu_blocks_override: Optional[int] = None,
        swap_space: Optional[int] = None,
        kv_cache_ratio: float = 0.75,
        enc_dec_block_num: int = 2,
        prealloc_dec_block_slot_num_threshold: int = 5,
        tensor_parallel_size: int = 1,
        enable_prefix_caching=False,
        enable_ssd_cache=False,
        model_cfg=None,
        cache_queue_port=None,
        enable_chunked_prefill=False,
        rdma_comm_ports=None,
        cache_transfer_protocol=None,
        pd_comm_port=None,
    ):
        """
        Initialize the CacheConfig class.

        Args:
            block_size (int): Size of a cache block in number of tokens.
            gpu_memory_utilization (float): Fraction of GPU memory to use.
            cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.
            num_gpu_blocks_override (Optional[int]): Override for number of GPU blocks.
            num_cpu_blocks (Optional[int]): Number of CPU blocks.
            kv_cache_ratio (float): Ratio for max block calculation.
            enc_dec_block_num (int): Number of encoder-decoder blocks.
            prealloc_dec_block_slot_num_threshold (int): Token slot count threshold for preallocating the next blocks during decoding; used when ENABLE_V1_KVCACHE_SCHEDULER=1.
            enable_prefix_caching (bool): Enable prefix caching.
        """
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.num_gpu_blocks_override = num_gpu_blocks_override
        self.kv_cache_ratio = kv_cache_ratio
        self.enc_dec_block_num = enc_dec_block_num
        self.prealloc_dec_block_slot_num_threshold = prealloc_dec_block_slot_num_threshold
        self.cache_dtype = cache_dtype
        if hasattr(model_cfg, "quantization_config"):
            self.cache_dtype = model_cfg.quantization_config.get("kv_cache_quant_type", cache_dtype)

        self.enable_chunked_prefill = enable_chunked_prefill
        self.rdma_comm_ports = rdma_comm_ports
        self.cache_transfer_protocol = cache_transfer_protocol
        self.pd_comm_port = pd_comm_port

        if rdma_comm_ports is not None and isinstance(rdma_comm_ports, str):
            self.rdma_comm_ports = rdma_comm_ports.split(",")

        if pd_comm_port is not None and isinstance(pd_comm_port, str):
            self.pd_comm_port = [int(port) for port in pd_comm_port.split(",")]

        self.enable_prefix_caching = enable_prefix_caching
        if swap_space is None:
            self.enable_hierarchical_cache = False
        else:
            self.enable_hierarchical_cache = True

        self.enable_ssd_cache = enable_ssd_cache
        self.model_cfg = model_cfg
        self.cache_queue_port = cache_queue_port
        self.swap_space = swap_space

        if (
            hasattr(self.model_cfg, "num_key_value_heads")
            and self.model_cfg.num_key_value_heads is not None
            and int(self.model_cfg.num_key_value_heads) > 0
        ):
            kv_num_head = int(self.model_cfg.num_key_value_heads)
        else:
            kv_num_head = self.model_cfg.num_attention_heads
        self.model_cfg.kv_num_head = kv_num_head

        # TODO check name
        if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower():
            byte_size = 0.5
            self.cache_dtype = "uint8"
        elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower():
            self.cache_dtype = "uint8"
            byte_size = 1
        else:
            byte_size = 2

        self.each_token_cache_space = int(
            self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
        )
        self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
        self.bytes_per_layer_per_block = int(
            self.block_size * self.model_cfg.kv_num_head * self.model_cfg.head_dim // tensor_parallel_size * byte_size
        )

        if self.swap_space is None:
            self.num_cpu_blocks = 0
        else:
            self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block)
        self._verify_args()

    def metrics_info(self):
        """Convert cache_config to dict(key: str, value: str) for prometheus metrics info."""
        return {key: str(value) for key, value in self.__dict__.items()}

    def _verify_args(self):
        if self.gpu_memory_utilization > 1.0:
            raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.")
        if self.kv_cache_ratio > 1.0:
            raise ValueError("KV cache ratio must be less than 1.0. Got " f"{self.kv_cache_ratio}.")

    def postprocess(self, num_total_tokens, number_of_tasks):
        """
        calculate block num
        """
        self.dec_token_num = self.enc_dec_block_num * self.block_size
        if self.num_gpu_blocks_override is not None:
            self.total_block_num = self.num_gpu_blocks_override
            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
        else:
            length = num_total_tokens // number_of_tasks
            block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
            self.total_block_num = block_num * number_of_tasks
            self.prefill_kvcache_block_num = self.total_block_num
        llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}")

    def reset(self, num_gpu_blocks):
        """
        reset gpu block number
        """
        self.total_block_num = num_gpu_blocks
        self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
        llm_logger.info(
            f"Reset block num, the total_block_num:{self.total_block_num},"
            f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
        )

    def print(self):
        """
        print all config
        """
        llm_logger.info("Cache Configuration Information :")
        for k, v in self.__dict__.items():
            llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
        llm_logger.info("=============================================================")

class SpeculativeConfig:
    """
    Speculative Decoding Configuration class.
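
With this change, CacheConfig is imported from fastdeploy.config (see the import added in the first hunk). Assuming the moved class keeps the constructor shown above, a minimal usage sketch might look like the following; the model_cfg stand-in and all argument values are hypothetical, chosen only for illustration.

from types import SimpleNamespace

from fastdeploy.config import CacheConfig  # import path added by this commit

# Hypothetical stand-in for the fields CacheConfig reads from model_cfg.
model_cfg = SimpleNamespace(
    num_layers=32,
    num_key_value_heads=8,
    num_attention_heads=32,
    head_dim=128,
)

cache_cfg = CacheConfig(
    block_size=64,
    gpu_memory_utilization=0.9,
    model_cfg=model_cfg,
)

# Profile-time sizing: derive block counts from an assumed token budget.
cache_cfg.postprocess(num_total_tokens=8192, number_of_tasks=1)

# Once the real number of GPU blocks is known, reset the totals.
cache_cfg.reset(num_gpu_blocks=2048)

# Export everything as strings, e.g. for Prometheus metrics labels.
print(cache_cfg.metrics_info())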