diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py
index 0ac34ad6a..f033a565c 100644
--- a/fastdeploy/cache_manager/prefix_cache_manager.py
+++ b/fastdeploy/cache_manager/prefix_cache_manager.py
@@ -64,7 +64,10 @@ class PrefixCacheManager:
         self.speculative_config = config.speculative_config
         self.local_data_parallel_id = local_data_parallel_id
 
-        self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.num_gpu_blocks = self.cache_config.total_block_num
+        else:
+            self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
         self.num_cpu_blocks = self.cache_config.num_cpu_blocks
         self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
         if self.num_cpu_blocks > 0:
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 99278f7d1..68863777c 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -731,7 +731,10 @@ class CacheConfig:
         self.block_size = 64
         self.gpu_memory_utilization = 0.9
         self.num_gpu_blocks_override = None
-        self.kv_cache_ratio = 0.75
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.kv_cache_ratio = 1.0
+        else:
+            self.kv_cache_ratio = 0.75
         self.enc_dec_block_num = 0 if current_platform.is_iluvatar() else 2
         self.prealloc_dec_block_slot_num_threshold = 5
         self.cache_dtype = "bfloat16"
@@ -816,7 +819,10 @@ class CacheConfig:
         self.dec_token_num = self.enc_dec_block_num * self.block_size
         if self.num_gpu_blocks_override is not None:
             self.total_block_num = self.num_gpu_blocks_override
-            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
+            if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+                self.prefill_kvcache_block_num = self.total_block_num
+            else:
+                self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
         else:
             length = num_total_tokens // number_of_tasks
             block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
@@ -829,7 +835,10 @@ class CacheConfig:
         reset gpu block number
         """
         self.total_block_num = num_gpu_blocks
-        self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.prefill_kvcache_block_num = self.total_block_num
+        else:
+            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
         logger.info(
             f"Reset block num, the total_block_num:{self.total_block_num},"
             f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index ba712ed0c..1086334c7 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -18,6 +18,7 @@ import json
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
+import os
 
 from fastdeploy.config import (
     CacheConfig,
@@ -884,7 +885,10 @@ class EngineArgs:
         if self.enable_chunked_prefill:
             self.max_num_batched_tokens = 2048
         else:
-            self.max_num_batched_tokens = self.max_model_len
+            if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
+                self.max_num_batched_tokens = self.max_model_len
+            else:
+                self.max_num_batched_tokens = 8192  # defaulting to max_model_len easily leads to OOM
 
         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py
index 035cea96c..eb3da60dd 100644
--- a/fastdeploy/engine/config.py
+++ b/fastdeploy/engine/config.py
@@ -245,7 +245,10 @@ class Config:
         if self.cache_config.enable_chunked_prefill:
             self.max_num_batched_tokens = 2048
         else:
-            self.max_num_batched_tokens = self.max_model_len
+            if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
+                self.max_num_batched_tokens = self.max_model_len
+            else:
+                self.max_num_batched_tokens = 8192  # defaulting to max_model_len easily leads to OOM
 
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -293,10 +296,11 @@ class Config:
         )
 
         if not self.cache_config.enable_chunked_prefill:
-            assert self.max_num_batched_tokens >= self.max_model_len, (
-                f"max_num_batched_tokens: {self.max_num_batched_tokens} "
-                f"should be larger than or equal to max_model_len: {self.max_model_len}"
-            )
+            if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
+                assert self.max_num_batched_tokens >= self.max_model_len, (
+                    f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+                    f"should be larger than or equal to max_model_len: {self.max_model_len}"
+                )
         else:
             assert self.max_num_batched_tokens >= self.cache_config.block_size, (
                 f"max_num_batched_tokens: {self.max_num_batched_tokens} "
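Taken together, the `CacheConfig`/`PrefixCacheManager` hunks mean the V1 scheduler no longer carves out a `kv_cache_ratio` slice (default 0.75) of the GPU block pool for prefill: with the flag set, `kv_cache_ratio` becomes 1.0 and `prefill_kvcache_block_num` collapses to `total_block_num`. Below is a minimal standalone sketch of that accounting; the helper is hypothetical (the real logic lives in `CacheConfig` and `PrefixCacheManager`, and reads the flag via `fastdeploy.envs` rather than `os.environ`):

```python
import os

# Hypothetical mirror of the patched block accounting, for illustration only.
ENABLE_V1_KVCACHE_SCHEDULER = bool(int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")))


def derive_block_nums(total_block_num: int, kv_cache_ratio: float = 0.75):
    """Return (prefill_kvcache_block_num, num_gpu_blocks) under each scheduler."""
    if ENABLE_V1_KVCACHE_SCHEDULER:
        # V1: no prefill/decode split -- the whole pool goes to the prefix cache manager.
        prefill_blocks = total_block_num
        num_gpu_blocks = total_block_num
    else:
        # Legacy: reserve kv_cache_ratio of all blocks for prefill.
        prefill_blocks = int(total_block_num * kv_cache_ratio)
        num_gpu_blocks = prefill_blocks
    return prefill_blocks, num_gpu_blocks


# e.g. with 1000 total blocks: legacy -> (750, 750), V1 -> (1000, 1000)
print(derive_block_nums(1000))
```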
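The `EngineArgs`/`Config` hunks only change the *default* for `max_num_batched_tokens` when chunked prefill is disabled, and relax the matching assert: the legacy path still requires `max_num_batched_tokens >= max_model_len`, while the V1 path caps the default at 8192 so long-context models do not OOM at startup. A hedged restatement of that selection (the function name is illustrative, not part of FastDeploy):

```python
import os


def default_max_num_batched_tokens(enable_chunked_prefill: bool, max_model_len: int) -> int:
    """Illustrative restatement of the patched default-selection logic."""
    if enable_chunked_prefill:
        return 2048
    if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
        return max_model_len  # legacy scheduler: must cover a full sequence
    return 8192  # V1 scheduler: a full max_model_len budget OOMs too easily


# Legacy (flag unset): a 128k-context model gets 131072 batched tokens;
# V1 (flag=1): the same model gets the safer 8192 default.
print(default_max_num_batched_tokens(False, 131072))
```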