Revert "[Optim] Remove limitation of number of kvcache blocks (#5612)" (#5702)

This reverts commit 9da89a374b.
This commit is contained in:
Divano
2025-12-23 15:41:33 +08:00
committed by GitHub
parent 0bef9b684f
commit c1aa66df02
7 changed files with 12 additions and 20 deletions

View File

@@ -21,7 +21,6 @@ import time
import numpy as np
import paddle
from fastdeploy import envs
from fastdeploy.config import FDConfig
from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.utils import get_logger, set_random_seed
@@ -127,10 +126,11 @@ class IluvatarPaddleDisWorkerProc(PaddleDisWorkerProc):
# 2. Calculate the appropriate number of blocks
model_block_memory_used = self.worker.cal_theortical_kvcache()
num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
if envs.FD_MAX_KVCACHE_BLOCKS > 0 and num_blocks_local > envs.FD_MAX_KVCACHE_BLOCKS:
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to {envs.FD_MAX_KVCACHE_BLOCKS}")
num_blocks_local = envs.FD_MAX_KVCACHE_BLOCKS
# NOTE(liuzichang): Too many blocks will lead to illegal memory access.
# We will develop dynamic limits in the future.
if num_blocks_local > 40000:
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000")
num_blocks_local = min(40000, num_blocks_local)
logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------")
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")

View File

@@ -530,9 +530,11 @@ class PaddleDisWorkerProc:
# 2. Calculate the appropriate number of blocks
model_block_memory_used = self.worker.cal_theortical_kvcache()
num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
if envs.FD_MAX_KVCACHE_BLOCKS > 0 and num_blocks_local > envs.FD_MAX_KVCACHE_BLOCKS:
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to {envs.FD_MAX_KVCACHE_BLOCKS}")
num_blocks_local = envs.FD_MAX_KVCACHE_BLOCKS
# NOTE(liuzichang): Too many blocks will lead to illegal memory access.
# We will develop dynamic limits in the future.
if num_blocks_local > 40000:
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000")
num_blocks_local = min(40000, num_blocks_local)
logger.info(f"------- model_block_memory_used:{model_block_memory_used / 1024**3} GB --------")
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")