mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
This reverts commit 9da89a374b.
This commit is contained in:
@@ -88,9 +88,5 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
|
|
||||||
# Count for cache_transfer_manager process error
|
# Count for cache_transfer_manager process error
|
||||||
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),
|
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),
|
||||||
|
|
||||||
# Max allocated KV cache blocks. Use this to limit how many KV cache blocks the engine is allowed to allocate.
|
|
||||||
# Set to -1 (default) for no limit, or a positive integer to cap the maximum number of blocks that can be allocated.
|
|
||||||
"FD_MAX_KVCACHE_BLOCKS": lambda: int(os.getenv("FD_MAX_KVCACHE_BLOCKS", "-1")),
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -88,7 +88,4 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
|
|
||||||
# cache_transfer_manager 进程残留时连续错误阈值
|
# cache_transfer_manager 进程残留时连续错误阈值
|
||||||
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),}
|
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),}
|
||||||
|
|
||||||
# KVCache Block块分配值的上限。此变量限制引擎分配的块数上限。当为默认值-1时表示不设限
|
|
||||||
"FD_MAX_KVCACHE_BLOCKS": lambda: int(os.getenv("FD_MAX_KVCACHE_BLOCKS", "-1")),
|
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -119,9 +119,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"FD_EP_BATCHED_TOKEN_TIMEOUT": lambda: float(os.getenv("FD_EP_BATCHED_TOKEN_TIMEOUT", "0.1")),
|
"FD_EP_BATCHED_TOKEN_TIMEOUT": lambda: float(os.getenv("FD_EP_BATCHED_TOKEN_TIMEOUT", "0.1")),
|
||||||
# Max pre-fetch requests number in PD
|
# Max pre-fetch requests number in PD
|
||||||
"FD_EP_MAX_PREFETCH_TASK_NUM": lambda: int(os.getenv("FD_EP_MAX_PREFETCH_TASK_NUM", "8")),
|
"FD_EP_MAX_PREFETCH_TASK_NUM": lambda: int(os.getenv("FD_EP_MAX_PREFETCH_TASK_NUM", "8")),
|
||||||
# Max allocated KV cache blocks. Use this to limit how many KV cache blocks the engine is allowed to allocate.
|
|
||||||
# Set to -1 (default) for no limit, or a positive integer to cap the maximum number of blocks that can be allocated.
|
|
||||||
"FD_MAX_KVCACHE_BLOCKS": lambda: int(os.getenv("FD_MAX_KVCACHE_BLOCKS", "-1")),
|
|
||||||
# Enable or disable model caching.
|
# Enable or disable model caching.
|
||||||
# When enabled, the quantized model is stored as a cache for future inference to improve loading efficiency.
|
# When enabled, the quantized model is stored as a cache for future inference to improve loading efficiency.
|
||||||
"FD_ENABLE_MODEL_LOAD_CACHE": lambda: bool(int(os.getenv("FD_ENABLE_MODEL_LOAD_CACHE", "0"))),
|
"FD_ENABLE_MODEL_LOAD_CACHE": lambda: bool(int(os.getenv("FD_ENABLE_MODEL_LOAD_CACHE", "0"))),
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ import time
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import paddle
|
import paddle
|
||||||
|
|
||||||
from fastdeploy import envs
|
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.inter_communicator import IPCSignal
|
from fastdeploy.inter_communicator import IPCSignal
|
||||||
from fastdeploy.utils import get_logger, set_random_seed
|
from fastdeploy.utils import get_logger, set_random_seed
|
||||||
@@ -127,10 +126,11 @@ class IluvatarPaddleDisWorkerProc(PaddleDisWorkerProc):
|
|||||||
# 2. Calculate the appropriate number of blocks
|
# 2. Calculate the appropriate number of blocks
|
||||||
model_block_memory_used = self.worker.cal_theortical_kvcache()
|
model_block_memory_used = self.worker.cal_theortical_kvcache()
|
||||||
num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
|
num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
|
||||||
|
# NOTE(liuzichang): Too many blocks will lead to illegal memory access
|
||||||
if envs.FD_MAX_KVCACHE_BLOCKS > 0 and num_blocks_local > envs.FD_MAX_KVCACHE_BLOCKS:
|
# We will develop dynamic limits in the future.
|
||||||
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to {envs.FD_MAX_KVCACHE_BLOCKS}")
|
if num_blocks_local > 40000:
|
||||||
num_blocks_local = envs.FD_MAX_KVCACHE_BLOCKS
|
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000")
|
||||||
|
num_blocks_local = min(40000, num_blocks_local)
|
||||||
logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------")
|
logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------")
|
||||||
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")
|
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")
|
||||||
|
|
||||||
|
|||||||
@@ -530,9 +530,11 @@ class PaddleDisWorkerProc:
|
|||||||
# 2. Calculate the appropriate number of blocks
|
# 2. Calculate the appropriate number of blocks
|
||||||
model_block_memory_used = self.worker.cal_theortical_kvcache()
|
model_block_memory_used = self.worker.cal_theortical_kvcache()
|
||||||
num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
|
num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
|
||||||
if envs.FD_MAX_KVCACHE_BLOCKS > 0 and num_blocks_local > envs.FD_MAX_KVCACHE_BLOCKS:
|
# NOTE(liuzichang): Too many blocks will lead to illegal memory access
|
||||||
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to {envs.FD_MAX_KVCACHE_BLOCKS}")
|
# We will develop dynamic limits in the future.
|
||||||
num_blocks_local = envs.FD_MAX_KVCACHE_BLOCKS
|
if num_blocks_local > 40000:
|
||||||
|
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000")
|
||||||
|
num_blocks_local = min(40000, num_blocks_local)
|
||||||
logger.info(f"------- model_block_memory_used:{model_block_memory_used / 1024**3} GB --------")
|
logger.info(f"------- model_block_memory_used:{model_block_memory_used / 1024**3} GB --------")
|
||||||
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")
|
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")
|
||||||
|
|
||||||
|
|||||||
@@ -879,7 +879,7 @@ def test_structured_outputs_grammar(openai_client):
|
|||||||
def test_profile_reset_block_num():
|
def test_profile_reset_block_num():
|
||||||
"""测试profile reset_block_num功能,与baseline diff不能超过5%"""
|
"""测试profile reset_block_num功能,与baseline diff不能超过5%"""
|
||||||
log_file = "./log/config.log"
|
log_file = "./log/config.log"
|
||||||
baseline = 65565
|
baseline = 40000
|
||||||
|
|
||||||
if not os.path.exists(log_file):
|
if not os.path.exists(log_file):
|
||||||
pytest.fail(f"Log file not found: {log_file}")
|
pytest.fail(f"Log file not found: {log_file}")
|
||||||
|
|||||||
@@ -636,7 +636,7 @@ def test_chat_with_reasoning_max_tokens(openai_client):
|
|||||||
def test_profile_reset_block_num():
|
def test_profile_reset_block_num():
|
||||||
"""测试profile reset_block_num功能,与baseline diff不能超过5%"""
|
"""测试profile reset_block_num功能,与baseline diff不能超过5%"""
|
||||||
log_file = "./log/config.log"
|
log_file = "./log/config.log"
|
||||||
baseline = 65565
|
baseline = 40000
|
||||||
|
|
||||||
if not os.path.exists(log_file):
|
if not os.path.exists(log_file):
|
||||||
pytest.fail(f"Log file not found: {log_file}")
|
pytest.fail(f"Log file not found: {log_file}")
|
||||||
|
|||||||
Reference in New Issue
Block a user