mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
[Bug fix] fix block num setting in scheduler v1 for develop (#3303)
* fix block num setting in scheduler v1
* fix block num setting in scheduler v1
* fix max_block_num and max_num_batched_tokens setting
* fix max_block_num and max_num_batched_tokens setting
* fix max_block_num and max_num_batched_tokens setting
* fix max_block_num and max_num_batched_tokens setting
This commit is contained in:
@@ -64,6 +64,9 @@ class PrefixCacheManager:
|
|||||||
self.speculative_config = config.speculative_config
|
self.speculative_config = config.speculative_config
|
||||||
self.local_data_parallel_id = local_data_parallel_id
|
self.local_data_parallel_id = local_data_parallel_id
|
||||||
|
|
||||||
|
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||||
|
self.num_gpu_blocks = self.cache_config.total_block_num
|
||||||
|
else:
|
||||||
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
|
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
|
||||||
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
|
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
|
||||||
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
|
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
|
||||||
|
@@ -731,6 +731,9 @@ class CacheConfig:
|
|||||||
self.block_size = 64
|
self.block_size = 64
|
||||||
self.gpu_memory_utilization = 0.9
|
self.gpu_memory_utilization = 0.9
|
||||||
self.num_gpu_blocks_override = None
|
self.num_gpu_blocks_override = None
|
||||||
|
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||||
|
self.kv_cache_ratio = 1.0
|
||||||
|
else:
|
||||||
self.kv_cache_ratio = 0.75
|
self.kv_cache_ratio = 0.75
|
||||||
self.enc_dec_block_num = 0 if current_platform.is_iluvatar() else 2
|
self.enc_dec_block_num = 0 if current_platform.is_iluvatar() else 2
|
||||||
self.prealloc_dec_block_slot_num_threshold = 5
|
self.prealloc_dec_block_slot_num_threshold = 5
|
||||||
@@ -816,6 +819,9 @@ class CacheConfig:
|
|||||||
self.dec_token_num = self.enc_dec_block_num * self.block_size
|
self.dec_token_num = self.enc_dec_block_num * self.block_size
|
||||||
if self.num_gpu_blocks_override is not None:
|
if self.num_gpu_blocks_override is not None:
|
||||||
self.total_block_num = self.num_gpu_blocks_override
|
self.total_block_num = self.num_gpu_blocks_override
|
||||||
|
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||||
|
self.prefill_kvcache_block_num = self.total_block_num
|
||||||
|
else:
|
||||||
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
|
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
|
||||||
else:
|
else:
|
||||||
length = num_total_tokens // number_of_tasks
|
length = num_total_tokens // number_of_tasks
|
||||||
@@ -829,6 +835,9 @@ class CacheConfig:
|
|||||||
reset gpu block number
|
reset gpu block number
|
||||||
"""
|
"""
|
||||||
self.total_block_num = num_gpu_blocks
|
self.total_block_num = num_gpu_blocks
|
||||||
|
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||||
|
self.prefill_kvcache_block_num = self.total_block_num
|
||||||
|
else:
|
||||||
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
|
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Reset block num, the total_block_num:{self.total_block_num},"
|
f"Reset block num, the total_block_num:{self.total_block_num},"
|
||||||
|
@@ -18,6 +18,7 @@ import json
|
|||||||
from dataclasses import asdict, dataclass
|
from dataclasses import asdict, dataclass
|
||||||
from dataclasses import fields as dataclass_fields
|
from dataclasses import fields as dataclass_fields
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
import os
|
||||||
|
|
||||||
from fastdeploy.config import (
|
from fastdeploy.config import (
|
||||||
CacheConfig,
|
CacheConfig,
|
||||||
@@ -884,7 +885,10 @@ class EngineArgs:
|
|||||||
if self.enable_chunked_prefill:
|
if self.enable_chunked_prefill:
|
||||||
self.max_num_batched_tokens = 2048
|
self.max_num_batched_tokens = 2048
|
||||||
else:
|
else:
|
||||||
|
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
|
||||||
self.max_num_batched_tokens = self.max_model_len
|
self.max_num_batched_tokens = self.max_model_len
|
||||||
|
else:
|
||||||
|
self.max_num_batched_tokens = 8192 # setting this to max_model_len can easily cause OOM
|
||||||
|
|
||||||
all_dict = asdict(self)
|
all_dict = asdict(self)
|
||||||
all_dict["model_cfg"] = model_cfg
|
all_dict["model_cfg"] = model_cfg
|
||||||
|
@@ -245,7 +245,10 @@ class Config:
|
|||||||
if self.cache_config.enable_chunked_prefill:
|
if self.cache_config.enable_chunked_prefill:
|
||||||
self.max_num_batched_tokens = 2048
|
self.max_num_batched_tokens = 2048
|
||||||
else:
|
else:
|
||||||
|
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
|
||||||
self.max_num_batched_tokens = self.max_model_len
|
self.max_num_batched_tokens = self.max_model_len
|
||||||
|
else:
|
||||||
|
self.max_num_batched_tokens = 8192 # setting this to max_model_len can easily cause OOM
|
||||||
|
|
||||||
if self.long_prefill_token_threshold == 0:
|
if self.long_prefill_token_threshold == 0:
|
||||||
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
|
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
|
||||||
@@ -293,6 +296,7 @@ class Config:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if not self.cache_config.enable_chunked_prefill:
|
if not self.cache_config.enable_chunked_prefill:
|
||||||
|
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
|
||||||
assert self.max_num_batched_tokens >= self.max_model_len, (
|
assert self.max_num_batched_tokens >= self.max_model_len, (
|
||||||
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
|
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
|
||||||
f"should be larger than or equal to max_model_len: {self.max_model_len}"
|
f"should be larger than or equal to max_model_len: {self.max_model_len}"
|
||||||
|
Reference in New Issue
Block a user