[Bug fix] fix block num setting in scheduler v1 for develop (#3303)

* fix block num setting in scheduler v1

* fix block num setting in scheduler v1

* fix max_block_num and max_num_batched_tokens setting

* fix max_block_num and max_num_batched_tokens setting

* fix max_block_num and max_num_batched_tokens setting

* fix max_block_num and max_num_batched_tokens setting
This commit is contained in:
chenjian
2025-08-12 10:38:51 +08:00
committed by GitHub
parent 183e3863e8
commit b21272d9ff
4 changed files with 30 additions and 10 deletions

View File

@@ -64,7 +64,10 @@ class PrefixCacheManager:
        self.speculative_config = config.speculative_config
        self.local_data_parallel_id = local_data_parallel_id
        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
            self.num_gpu_blocks = self.cache_config.total_block_num
        else:
            self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
        self.num_cpu_blocks = self.cache_config.num_cpu_blocks
        self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
        if self.num_cpu_blocks > 0:

View File

@@ -731,7 +731,10 @@ class CacheConfig:
        self.block_size = 64
        self.gpu_memory_utilization = 0.9
        self.num_gpu_blocks_override = None
        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
            self.kv_cache_ratio = 1.0
        else:
            self.kv_cache_ratio = 0.75
        self.enc_dec_block_num = 0 if current_platform.is_iluvatar() else 2
        self.prealloc_dec_block_slot_num_threshold = 5
        self.cache_dtype = "bfloat16"
@@ -816,7 +819,10 @@ class CacheConfig:
        self.dec_token_num = self.enc_dec_block_num * self.block_size
        if self.num_gpu_blocks_override is not None:
            self.total_block_num = self.num_gpu_blocks_override
            if envs.ENABLE_V1_KVCACHE_SCHEDULER:
                self.prefill_kvcache_block_num = self.total_block_num
            else:
                self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
        else:
            length = num_total_tokens // number_of_tasks
            block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
@@ -829,7 +835,10 @@ class CacheConfig:
        reset gpu block number
        """
        self.total_block_num = num_gpu_blocks
        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
            self.prefill_kvcache_block_num = self.total_block_num
        else:
            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
        logger.info(
            f"Reset block num, the total_block_num:{self.total_block_num},"
            f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"

View File

@@ -18,6 +18,7 @@ import json
from dataclasses import asdict, dataclass
from dataclasses import fields as dataclass_fields
from typing import Any, Dict, List, Optional
import os

from fastdeploy.config import (
    CacheConfig,
@@ -884,7 +885,10 @@ class EngineArgs:
        if self.enable_chunked_prefill:
            self.max_num_batched_tokens = 2048
        else:
            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
                self.max_num_batched_tokens = self.max_model_len
            else:
                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM

        all_dict = asdict(self)
        all_dict["model_cfg"] = model_cfg

View File

@@ -245,7 +245,10 @@ class Config:
        if self.cache_config.enable_chunked_prefill:
            self.max_num_batched_tokens = 2048
        else:
            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
                self.max_num_batched_tokens = self.max_model_len
            else:
                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM

        if self.long_prefill_token_threshold == 0:
            self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -293,10 +296,11 @@ class Config:
        )
        if not self.cache_config.enable_chunked_prefill:
            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
                assert self.max_num_batched_tokens >= self.max_model_len, (
                    f"max_num_batched_tokens: {self.max_num_batched_tokens} "
                    f"should be larger than or equal to max_model_len: {self.max_model_len}"
                )
        else:
            assert self.max_num_batched_tokens >= self.cache_config.block_size, (
                f"max_num_batched_tokens: {self.max_num_batched_tokens} "