diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py
index b403d3944..ab9dabf05 100644
--- a/fastdeploy/cache_manager/prefix_cache_manager.py
+++ b/fastdeploy/cache_manager/prefix_cache_manager.py
@@ -64,7 +64,10 @@ class PrefixCacheManager:
         self.speculative_config = config.speculative_config
         self.local_data_parallel_id = local_data_parallel_id
 
-        self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.num_gpu_blocks = self.cache_config.total_block_num
+        else:
+            self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
         self.num_cpu_blocks = self.cache_config.num_cpu_blocks
         self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
         if self.num_cpu_blocks > 0:
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 613831a76..c31543ee1 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -18,6 +18,7 @@ import json
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
+import os
 
 from fastdeploy.engine.config import (
     CacheConfig,
@@ -854,7 +855,10 @@ class EngineArgs:
             if self.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                self.max_num_batched_tokens = self.max_model_len
+                if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192
         scheduler_cfg = self.create_scheduler_config()
         speculative_cfg = self.create_speculative_config()
         graph_opt_cfg = self.create_graph_optimization_config()
diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py
index 0e44ecd0e..25dcc19c3 100644
--- a/fastdeploy/engine/config.py
+++ b/fastdeploy/engine/config.py
@@ -211,6 +211,8 @@ class CacheConfig:
         self.gpu_memory_utilization = gpu_memory_utilization
         self.num_gpu_blocks_override = num_gpu_blocks_override
         self.kv_cache_ratio = kv_cache_ratio
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.kv_cache_ratio = 1.0
         self.enc_dec_block_num = enc_dec_block_num
         self.prealloc_dec_block_slot_num_threshold = prealloc_dec_block_slot_num_threshold
         self.cache_dtype = cache_dtype
@@ -291,7 +293,10 @@ class CacheConfig:
         self.dec_token_num = self.enc_dec_block_num * self.block_size
         if self.num_gpu_blocks_override is not None:
             self.total_block_num = self.num_gpu_blocks_override
-            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
+            if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+                self.prefill_kvcache_block_num = self.total_block_num
+            else:
+                self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
         else:
             length = num_total_tokens // number_of_tasks
             block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
@@ -304,7 +309,10 @@ class CacheConfig:
         reset gpu block number
         """
         self.total_block_num = num_gpu_blocks
-        self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.prefill_kvcache_block_num = self.total_block_num
+        else:
+            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
         llm_logger.info(
             f"Reset block num, the total_block_num:{self.total_block_num},"
             f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
@@ -796,7 +804,10 @@ class Config:
             if self.cache_config.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                self.max_num_batched_tokens = self.max_model_len
+                if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192
 
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -844,10 +855,11 @@ class Config:
             )
 
         if not self.cache_config.enable_chunked_prefill:
-            assert self.max_num_batched_tokens >= self.max_model_len, (
-                f"max_num_batched_tokens: {self.max_num_batched_tokens} "
-                f"should be larger than or equal to max_model_len: {self.max_model_len}"
-            )
+            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                assert self.max_num_batched_tokens >= self.max_model_len, (
+                    f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+                    f"should be larger than or equal to max_model_len: {self.max_model_len}"
+                )
         else:
             assert self.max_num_batched_tokens >= self.cache_config.block_size, (
                 f"max_num_batched_tokens: {self.max_num_batched_tokens} "
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index 61b34c468..5b147c4aa 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -234,8 +234,7 @@ class ResourceManagerV1(ResourceManager):
                 llm_logger.debug(
                     f"scheduler prefill task: {request} request.need_prefill_tokens {request.need_prefill_tokens} request.num_computed_tokens {request.num_computed_tokens}"
                 )
-                num_new_tokens = request.prompt_token_ids_len - request.num_computed_tokens
-                num_new_tokens = min(num_new_tokens, token_budget)
+                num_new_tokens = self._get_num_new_tokens(request, token_budget)
                 num_new_block = self.get_new_block_nums(request, num_new_tokens)
                 # Allocate blocks to prefill
                 if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 844f80e61..cbd9f6c2d 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -208,11 +208,15 @@ class GPUModelRunner(ModelRunnerBase):
             request = req_dicts[i]
             idx = request.idx
             if request.task_type.value == RequestType.PREFILL.value:  # prefill task
-                logger.debug(f"Handle prefill request {request} at idx {idx}")
                 prefill_start_index = request.prefill_start_index
                 prefill_end_index = request.prefill_end_index
                 length = prefill_end_index - prefill_start_index
                 input_ids = request.prompt_token_ids + request.output_token_ids
+                logger.debug(
+                    f"Handle prefill request {request} at idx {idx}, "
+                    f"{prefill_start_index=}, {prefill_end_index=}, "
+                    f"need_prefilled_token_num={len(input_ids)}"
+                )
                 self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
                     input_ids[prefill_start_index:prefill_end_index]
                 )
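
For reference, a minimal standalone sketch (not part of the FastDeploy codebase) of the block-accounting toggle these hunks introduce: when the V1 KV-cache scheduler is enabled, kv_cache_ratio is effectively forced to 1.0 and the prefill pool spans every GPU block; otherwise only the kv_cache_ratio fraction of total_block_num is reserved for prefill. The helper name and variables below are illustrative assumptions, not identifiers from the patch.

import os

def effective_prefill_block_num(total_block_num: int, kv_cache_ratio: float) -> int:
    # Illustrative helper, not FastDeploy code: mirrors how the patch sizes the
    # prefill KV-cache pool depending on ENABLE_V1_KVCACHE_SCHEDULER.
    if int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
        # V1 scheduler path: the whole pool is usable for prefill (ratio treated as 1.0).
        return total_block_num
    # Legacy path: only the kv_cache_ratio fraction is reserved for prefill.
    return int(total_block_num * kv_cache_ratio)

# Example: with 1000 total blocks and kv_cache_ratio=0.75, the legacy path reserves
# 750 prefill blocks, while the V1 scheduler path makes all 1000 available.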