From 6250c686cca24aa8ead868027fbe4922e1241e59 Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Date: Tue, 23 Dec 2025 15:42:10 +0800 Subject: [PATCH] =?UTF-8?q?Revert=20"Revert=20"[Optim]=20Remove=20limitati?= =?UTF-8?q?on=20of=20number=20of=20kvcache=20blocks=20(#5612)=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit c1aa66df02dbb2fb257581ab4e36db4bfd85d4bf. --- docs/usage/environment_variables.md | 4 ++++ docs/zh/usage/environment_variables.md | 3 +++ fastdeploy/envs.py | 3 +++ fastdeploy/worker/iluvatar_worker.py | 10 +++++----- fastdeploy/worker/worker_process.py | 8 +++----- tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 2 +- tests/e2e/test_EB_VL_Lite_serving.py | 2 +- 7 files changed, 20 insertions(+), 12 deletions(-) diff --git a/docs/usage/environment_variables.md b/docs/usage/environment_variables.md index c4c319f83..17fe91aee 100644 --- a/docs/usage/environment_variables.md +++ b/docs/usage/environment_variables.md @@ -88,5 +88,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # Count for cache_transfer_manager process error "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), + + # Max allocated KV cache blocks. Use this to limit how many KV cache blocks the engine is allowed to allocate. + # Set to -1 (default) for no limit, or a positive integer to cap the maximum number of blocks that can be allocated. + "FD_MAX_KVCACHE_BLOCKS": lambda: int(os.getenv("FD_MAX_KVCACHE_BLOCKS", "-1")), } ``` diff --git a/docs/zh/usage/environment_variables.md b/docs/zh/usage/environment_variables.md index b0a162a8a..ad3cdad62 100644 --- a/docs/zh/usage/environment_variables.md +++ b/docs/zh/usage/environment_variables.md @@ -88,4 +88,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # cache_transfer_manager 进程残留时连续错误阈值 "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),} + + # KVCache Block块分配值的上限。此变量限制引擎分配的块数上限。当为默认值-1时表示不设限 + "FD_MAX_KVCACHE_BLOCKS": lambda: int(os.getenv("FD_MAX_KVCACHE_BLOCKS", "-1")), ``` diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 6d294a0c8..c74f46205 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -119,6 +119,9 @@ environment_variables: dict[str, Callable[[], Any]] = { "FD_EP_BATCHED_TOKEN_TIMEOUT": lambda: float(os.getenv("FD_EP_BATCHED_TOKEN_TIMEOUT", "0.1")), # Max pre-fetch requests number in PD "FD_EP_MAX_PREFETCH_TASK_NUM": lambda: int(os.getenv("FD_EP_MAX_PREFETCH_TASK_NUM", "8")), + # Max allocated KV cache blocks. Use this to limit how many KV cache blocks the engine is allowed to allocate. + # Set to -1 (default) for no limit, or a positive integer to cap the maximum number of blocks that can be allocated. + "FD_MAX_KVCACHE_BLOCKS": lambda: int(os.getenv("FD_MAX_KVCACHE_BLOCKS", "-1")), # Enable or disable model caching. # When enabled, the quantized model is stored as a cache for future inference to improve loading efficiency. "FD_ENABLE_MODEL_LOAD_CACHE": lambda: bool(int(os.getenv("FD_ENABLE_MODEL_LOAD_CACHE", "0"))), diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py index 625aca86d..6ac65c4b7 100644 --- a/fastdeploy/worker/iluvatar_worker.py +++ b/fastdeploy/worker/iluvatar_worker.py @@ -21,6 +21,7 @@ import time import numpy as np import paddle +from fastdeploy import envs from fastdeploy.config import FDConfig from fastdeploy.inter_communicator import IPCSignal from fastdeploy.utils import get_logger, set_random_seed @@ -126,11 +127,10 @@ class IluvatarPaddleDisWorkerProc(PaddleDisWorkerProc): # 2. Calculate the appropriate number of blocks model_block_memory_used = self.worker.cal_theortical_kvcache() num_blocks_local = int(available_kv_cache_memory // model_block_memory_used) - # NOTE(liuzichang): Too many block will lead to illegal memory access - # We will develop dynamic limits in future. - if num_blocks_local > 40000: - logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000") - num_blocks_local = min(40000, num_blocks_local) + + if envs.FD_MAX_KVCACHE_BLOCKS > 0 and num_blocks_local > envs.FD_MAX_KVCACHE_BLOCKS: + logger.info(f"------- Reset num_blocks_local {num_blocks_local} to {envs.FD_MAX_KVCACHE_BLOCKS}") + num_blocks_local = envs.FD_MAX_KVCACHE_BLOCKS logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------") logger.info(f"------- num_blocks_local:{num_blocks_local} --------") diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index c3a3b5076..74bf185bd 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -530,11 +530,9 @@ class PaddleDisWorkerProc: # 2. Calculate the appropriate number of blocks model_block_memory_used = self.worker.cal_theortical_kvcache() num_blocks_local = int(available_kv_cache_memory // model_block_memory_used) - # NOTE(liuzichang): Too many block will lead to illegal memory access - # We will develop dynamic limits in future. - if num_blocks_local > 40000: - logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000") - num_blocks_local = min(40000, num_blocks_local) + if envs.FD_MAX_KVCACHE_BLOCKS > 0 and num_blocks_local > envs.FD_MAX_KVCACHE_BLOCKS: + logger.info(f"------- Reset num_blocks_local {num_blocks_local} to {envs.FD_MAX_KVCACHE_BLOCKS}") + num_blocks_local = envs.FD_MAX_KVCACHE_BLOCKS logger.info(f"------- model_block_memory_used:{model_block_memory_used / 1024**3} GB --------") logger.info(f"------- num_blocks_local:{num_blocks_local} --------") diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 686c53779..5c28fa67b 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -879,7 +879,7 @@ def test_structured_outputs_grammar(openai_client): def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过5%""" log_file = "./log/config.log" - baseline = 40000 + baseline = 65565 if not os.path.exists(log_file): pytest.fail(f"Log file not found: {log_file}") diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index fed152d0e..4a01f718a 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -636,7 +636,7 @@ def test_chat_with_reasoning_max_tokens(openai_client): def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过5%""" log_file = "./log/config.log" - baseline = 40000 + baseline = 65565 if not os.path.exists(log_file): pytest.fail(f"Log file not found: {log_file}")