Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-06 00:57:33 +08:00
[Bug fix] Fix block num in scheduler v1 for release2.0.4 (#3314)
* fix bug for scheduler v0
* fix block num setting in scheduler v1
* fix block num setting in scheduler v1
* fix block num setting in scheduler v1
* fix block num setting in scheduler v1
* fix block num setting in scheduler v1
@@ -64,6 +64,9 @@ class PrefixCacheManager:
         self.speculative_config = config.speculative_config
         self.local_data_parallel_id = local_data_parallel_id

-        self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.num_gpu_blocks = self.cache_config.total_block_num
+        else:
+            self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
         self.num_cpu_blocks = self.cache_config.num_cpu_blocks
         self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
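The gate above decides which block pool the prefix cache manager owns. A minimal sketch of the resulting sizing rule, using the relationship prefill_kvcache_block_num = int(total_block_num * kv_cache_ratio) from the CacheConfig hunks below; the standalone helper function is ours, not part of FastDeploy:

def resolve_num_gpu_blocks(total_block_num: int, kv_cache_ratio: float,
                           enable_v1_kvcache_scheduler: bool) -> int:
    # Sketch only: mirrors the gate introduced in this commit.
    if enable_v1_kvcache_scheduler:
        # v1 scheduler: the cache manager owns the whole pool (CacheConfig also
        # pins kv_cache_ratio to 1.0 below, so both views agree).
        return total_block_num
    # v0 scheduler: only the prefill share, i.e. prefill_kvcache_block_num.
    return int(total_block_num * kv_cache_ratio)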
@@ -18,6 +18,7 @@ import json
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
+import os

 from fastdeploy.engine.config import (
     CacheConfig,
@@ -854,7 +855,10 @@ class EngineArgs:
         if self.enable_chunked_prefill:
             self.max_num_batched_tokens = 2048
         else:
-            self.max_num_batched_tokens = self.max_model_len
+            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                self.max_num_batched_tokens = self.max_model_len
+            else:
+                self.max_num_batched_tokens = 8192
         scheduler_cfg = self.create_scheduler_config()
         speculative_cfg = self.create_speculative_config()
         graph_opt_cfg = self.create_graph_optimization_config()
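A hedged sketch of the new default policy for max_num_batched_tokens (the same branching is repeated in Config further down); the standalone function and its parameters are illustrative only, while the env-var read matches the diff:

import os

def default_max_num_batched_tokens(enable_chunked_prefill: bool, max_model_len: int) -> int:
    # Illustrative helper only; the env-var read matches the patched code.
    if enable_chunked_prefill:
        return 2048
    if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
        # v0 scheduler: a single batch must be able to hold a full-length prompt.
        return max_model_len
    # v1 scheduler: a fixed 8192-token cap instead of max_model_len.
    return 8192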
@@ -211,6 +211,8 @@ class CacheConfig:
         self.gpu_memory_utilization = gpu_memory_utilization
         self.num_gpu_blocks_override = num_gpu_blocks_override
         self.kv_cache_ratio = kv_cache_ratio
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.kv_cache_ratio = 1.0
         self.enc_dec_block_num = enc_dec_block_num
         self.prealloc_dec_block_slot_num_threshold = prealloc_dec_block_slot_num_threshold
         self.cache_dtype = cache_dtype
@@ -291,6 +293,9 @@ class CacheConfig:
         self.dec_token_num = self.enc_dec_block_num * self.block_size
         if self.num_gpu_blocks_override is not None:
             self.total_block_num = self.num_gpu_blocks_override
-            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
+            if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+                self.prefill_kvcache_block_num = self.total_block_num
+            else:
+                self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
         else:
             length = num_total_tokens // number_of_tasks
@@ -304,6 +309,9 @@ class CacheConfig:
         reset gpu block number
         """
         self.total_block_num = num_gpu_blocks
-        self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.prefill_kvcache_block_num = self.total_block_num
+        else:
+            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
         llm_logger.info(
             f"Reset block num, the total_block_num:{self.total_block_num},"
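As a worked example with made-up numbers: resetting to 1000 GPU blocks with kv_cache_ratio = 0.75 now yields the full 1000 prefill blocks under the v1 scheduler instead of the 750 the v0 formula would give.

total_block_num = 1000
kv_cache_ratio = 0.75
prefill_v0 = int(total_block_num * kv_cache_ratio)  # v0 path: 750 blocks reserved for prefill
prefill_v1 = total_block_num                        # v1 path: all 1000 blocks
assert (prefill_v0, prefill_v1) == (750, 1000)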
@@ -796,7 +804,10 @@ class Config:
         if self.cache_config.enable_chunked_prefill:
             self.max_num_batched_tokens = 2048
         else:
-            self.max_num_batched_tokens = self.max_model_len
+            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                self.max_num_batched_tokens = self.max_model_len
+            else:
+                self.max_num_batched_tokens = 8192

         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -844,6 +855,7 @@ class Config:
             )

         if not self.cache_config.enable_chunked_prefill:
-            assert self.max_num_batched_tokens >= self.max_model_len, (
-                f"max_num_batched_tokens: {self.max_num_batched_tokens} "
-                f"should be larger than or equal to max_model_len: {self.max_model_len}"
+            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                assert self.max_num_batched_tokens >= self.max_model_len, (
+                    f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+                    f"should be larger than or equal to max_model_len: {self.max_model_len}"
@@ -234,8 +234,7 @@ class ResourceManagerV1(ResourceManager):
                 llm_logger.debug(
                     f"scheduler prefill task: {request} request.need_prefill_tokens {request.need_prefill_tokens} request.num_computed_tokens {request.num_computed_tokens}"
                 )
-                num_new_tokens = request.prompt_token_ids_len - request.num_computed_tokens
-                num_new_tokens = min(num_new_tokens, token_budget)
+                num_new_tokens = self._get_num_new_tokens(request, token_budget)
                 num_new_block = self.get_new_block_nums(request, num_new_tokens)
                 # Allocate blocks to prefill
                 if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
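The two removed statements are folded into a single _get_num_new_tokens call. A minimal sketch of what such a helper plausibly computes, assuming it only merges the subtraction and the token-budget clamp (the real method may handle additional cases):

def _get_num_new_tokens(self, request, token_budget: int) -> int:
    # Hedged sketch, not the actual FastDeploy implementation.
    num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
    return min(num_new_tokens, token_budget)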
@@ -208,11 +208,15 @@ class GPUModelRunner(ModelRunnerBase):
             request = req_dicts[i]
             idx = request.idx
             if request.task_type.value == RequestType.PREFILL.value:  # prefill task
-                logger.debug(f"Handle prefill request {request} at idx {idx}")
                 prefill_start_index = request.prefill_start_index
                 prefill_end_index = request.prefill_end_index
                 length = prefill_end_index - prefill_start_index
                 input_ids = request.prompt_token_ids + request.output_token_ids
+                logger.debug(
+                    f"Handle prefill request {request} at idx {idx}, "
+                    f"{prefill_start_index=}, {prefill_end_index=}, "
+                    f"need_prefilled_token_num={len(input_ids)}"
+                )
                 self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
                     input_ids[prefill_start_index:prefill_end_index]
                 )