Sync v2.0 version of code to github repo

Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions


@@ -14,110 +14,116 @@
# limitations under the License.
"""
import copy
import os
import math
import random
import threading
import time
import numpy as np
from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager
from fastdeploy.metrics.metrics import main_process_metrics
from fastdeploy.utils import llm_logger
class ResourceManager(object):
"""Manages and allocates computational resources for the inference engine.
This class handles the allocation and recycling of memory blocks for KV cache,
manages task scheduling, and tracks resource utilization.
"""
def __init__(self, max_num_seqs, config, tensor_parallel_size, splitwise_role, local_data_parallel_id=0):
"""Initializes the resource manager with the given configuration and sets up the
data structures used to manage tasks and KV cache blocks.
Args:
max_num_seqs (int): Maximum number of concurrent sequences the engine can handle
config (Config): Engine configuration; its cache_config carries
prefill_kvcache_block_num, block_size and dec_token_num
tensor_parallel_size (int): Tensor parallel degree
splitwise_role (str): Role in splitwise deployment, e.g. "prefill" or "decode"
local_data_parallel_id (int): Local data parallel rank, defaults to 0
Returns:
None
"""
self.cfg = config.cache_config
self.max_num_seqs = max_num_seqs
self.stop_flags = [True] * max_num_seqs
self.free_list = list(range(self.cfg.prefill_kvcache_block_num - 1, -1, -1))
self.enable_prefix_cache = config.cache_config.enable_prefix_caching
self.cache_manager = PrefixCacheManager(config, tensor_parallel_size,
splitwise_role,
local_data_parallel_id)
self.tasks_list = [None] * max_num_seqs
self.req_dict = dict()
# current batch status of the engine
self.real_bsz = 0
llm_logger.info(f"{self.info()}")
def reset_cache_config(self, cfg):
"""Updates the cache configuration with new parameters.
Args:
cfg (Config): New cache configuration object
"""
reset cache config
"""
self.cfg = cfg
self.free_list = list(range(self.cfg.prefill_kvcache_block_num - 1, -1, -1))
self.cache_manager.update_cache_config(cfg)
def get_required_block_number(self, input_token_num):
"""Calculates the total number of blocks needed for a sequence.
Includes both encoder and decoder requirements.
Args:
input_token_num (int): Number of tokens in the input sequence
Returns:
int: Total number of blocks required (rounded up)
"""
block_num = (input_token_num + self.cfg.block_size - 1 + self.cfg.dec_token_num) // self.cfg.block_size
Calculate Block resources are needed
Args:
input_token_num (int): input token number
Returns:
int: block number
"""
block_num = (input_token_num + self.cfg.block_size - 1 +
self.cfg.dec_token_num) // self.cfg.block_size
return block_num
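# A rough worked example of the rounding above (block_size=64 and
# dec_token_num=32 are illustrative values, not taken from a real config):
# a 100-token prompt needs (100 + 64 - 1 + 32) // 64 == 3 blocks, i.e.
# ceil((100 + 32) / 64): prompt tokens plus the reserved decode tokens,
# rounded up to whole blocks.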
def get_encoder_block_number(self, input_token_num):
"""Calculates the number of blocks needed for encoder inputs only.
Args:
input_token_num (int): Number of tokens in the encoder input
Returns:
int: Number of blocks required for encoder (rounded up)
"""
enc_block_num = (input_token_num + self.cfg.block_size - 1) // self.cfg.block_size
get the number of blocks for the encoder
Args:
input_token_num (int): input token number
Returns:
int: encoder block number
"""
enc_block_num = (input_token_num + self.cfg.block_size -
1) // self.cfg.block_size
return enc_block_num
def get_decoder_block_number(self):
"""Calculates the number of blocks needed for decoder outputs.
Returns:
int: Number of blocks required for decoder (rounded up)
"""
return (self.cfg.dec_token_num + self.cfg.block_size - 1) // self.cfg.block_size
get the number of blocks for the decoder
Returns:
int: decoder block number
"""
return (self.cfg.dec_token_num + self.cfg.block_size -
1) // self.cfg.block_size
def total_block_number(self):
"""Gets the total number of pre-allocated KV cache blocks.
Returns:
int: Total number of blocks available in the pool
"""
return self.cfg.prefill_kvcache_block_num
the number of pre allocated blocks at service startup
Returns:
int: total block number
"""
return self.cache_manager.num_gpu_blocks
def _get_block_tables(self, input_token_num, required_type="all"):
"""Allocates memory blocks from the free pool.
"""
allocate memory resources
Args:
input_token_num (int): Number of input tokens
required_type (str): Type of blocks needed:
- "all": Both encoder and decoder blocks
- "encoder": Encoder blocks only
- "decoder": Decoder blocks only
input_token_num (int): input token number
required_type (str): required type
Returns:
list: List of allocated block IDs
Raises:
ValueError: If unknown required_type is specified
list: block list
"""
if required_type == "all":
block_num = self.get_required_block_number(input_token_num)
@@ -129,50 +135,75 @@ class ResourceManager(object):
raise ValueError('unknown required type')
block_list = list()
current_block_num = self.available_block_num()
if block_num > current_block_num:
llm_logger.error("block_num:{0} > free_list len:{1}".format(block_num, current_block_num))
return block_list
block_list = self.cache_manager.allocate_gpu_blocks(block_num)
llm_logger.debug(f"dispatch {len(block_list)} blocks.")
return block_list
def check_and_free_block_tables(self):
"""
Check and free block tables, only in prefix caching mode.
If the number of free blocks falls below the per-sequence threshold, free blocks up to that threshold.
"""
if self.enable_prefix_cache:
if self.available_block_num() < self.cfg.max_block_num_per_seq:
self.free_block_tables(self.cfg.max_block_num_per_seq)
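# Illustrative numbers for the threshold above (max_block_num_per_seq=128 is
# an assumed value, not a real config default): with only 96 free GPU blocks,
# prefix-caching mode calls free_block_tables(128), asking the cache manager
# to recycle cached blocks back into the free pool up to that reserve.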
def _recycle_block_tables(self, task):
"""Returns a task's memory blocks to the free pool for reuse.
Args:
task (Request or list): finished request whose block_tables will be recycled, or a raw list of block IDs
"""
if self.enable_prefix_cache:
self.cache_manager.release_block_ids_async(task)
else:
req_id = task.request_id
if isinstance(task, list):
block_tables = task
else:
block_tables = task.block_tables
ori_number = self.available_block_num()
self.cache_manager.recycle_gpu_blocks(block_tables)
cur_number = self.available_block_num()
main_process_metrics.gpu_cache_usage_perc.set(
self.get_gpu_cache_usage_perc())
llm_logger.info(
f"recycle {req_id} {cur_number - ori_number} blocks.")
def available_batch(self):
"""Gets the number of available sequence slots.
"""
available batch size for engine
Returns:
int: Number of available sequence slots in the batch
int: available batch size
"""
return np.sum(self.stop_flags)
def available_block_num(self):
"""Gets the number of available memory blocks.
Returns:
int: Number of free blocks in the pool
"""
return len(self.free_list)
available block size for engine
Returns:
int: available block size
"""
return len(self.cache_manager.gpu_free_block_list)
def is_resource_sufficient(self, input_token_num):
"""Checks if sufficient resources are available for a new sequence.
"""
check current available resources meet the new requirements
Args:
input_token_num (int): Number of tokens in the new sequence
input_token_num (int): input token number
Returns:
bool: True if both batch slots and memory blocks are available
bool: whether current available resources meet the new requirements
"""
if self.available_batch() < 1:
return False
@@ -181,19 +212,21 @@ class ResourceManager(object):
return False
return True
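# A minimal sketch of the admission test above (the branch elided by the hunk
# is assumed to compare block demand against the free pool):
#     self.available_batch() >= 1
#     and self.get_required_block_number(input_token_num) <= self.available_block_num()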
def free_block_tables(self, need_reserved_block_num):
"""
回收block到可用资源池
"""
return self.cache_manager.free_block_ids_async(need_reserved_block_num)
def allocate_resources_for_new_tasks(self, tasks):
"""Assigns resources to new inference tasks.
"""
allocate resources for new tasks
Args:
tasks (list): List of Request objects needing resources
tasks (list): task list
Returns:
list: List of successfully allocated Request objects
Note:
- Assigns sequence slots and memory blocks
- Sets initial timestamps and metadata
- Updates real-time batch size statistics
list: processed task list
"""
allocated_position = 0
@@ -205,7 +238,8 @@ class ResourceManager(object):
can_insert = False
while allocated_position + 1 <= self.max_num_seqs:
if sum(self.stop_flags[allocated_position:allocated_position + 1]) == 1:
can_insert = True
break
allocated_position += 1
@@ -215,14 +249,61 @@ class ResourceManager(object):
task = tasks[processing_task_index]
if task.get("seed") is None:
task.set("seed", random.randint(0, 9223372036854775807))
task.set("seed",
random.randint(0, 9223372036854775807))
task.idx = allocated_position
if self.enable_prefix_cache:
cache_prepare_time = time.time()
common_block_ids, unique_block_ids, hit_info = self.cache_manager.request_block_ids(
task, self.cfg.block_size, self.cfg.dec_token_num)
if unique_block_ids is None:
llm_logger.warning(
"req_id: {0} not enough blocks available".
format(task["req_id"]))
return
cached_len = self._record_request_cache_info(
task, common_block_ids, unique_block_ids, hit_info)
task.cache_prepare_time = time.time() - cache_prepare_time
if task.disaggregate_info is not None:
if task.disaggregate_info['role'] == "prefill":
self.req_dict[task.request_id] = allocated_position
task.disaggregate_info['block_tables'] = task.block_tables
self._delete_cached_data(task, cached_len)
elif task.disaggregate_info['role'] == "decode":
self.req_dict[task.request_id] = allocated_position
task.disaggregate_info['block_tables'] = task.need_block_tables
else:
self._delete_cached_data(task, cached_len)
else:
block_tables = self._get_block_tables(task.prompt_token_ids_len)
if not block_tables:
llm_logger.error(
"req_id: {0} block_tables is empty".format(
task.request_id))
continue
else:
task.block_tables = block_tables
task.need_block_tables = task.block_tables
if task.disaggregate_info is not None:
task.disaggregate_info['block_tables'] = block_tables
if task.disaggregate_info['role'] == "prefill":
self.req_dict[task.request_id] = allocated_position
elif task.disaggregate_info['role'] == "decode":
self.req_dict[task.request_id] = allocated_position
processed_tasks.append(task)
self.stop_flags[allocated_position] = False
@@ -230,9 +311,10 @@ class ResourceManager(object):
task.inference_time_cost = -1.0
task.tokens_all_num = int(0)
self.tasks_list[allocated_position] = task
llm_logger.info(f"Allocate request: {task.request_id}, "
f"allocated_position:{allocated_position}, "
f"length of prompt token: {task.prompt_token_ids_len}")
llm_logger.info(
f"Allocate request: {task.request_id}, "
f"allocated_position:{allocated_position}, "
f"length of prompt token: {task.prompt_token_ids_len}")
allocated_position += 1
processing_task_index += 1
@@ -242,20 +324,70 @@ class ResourceManager(object):
self.real_bsz = i + 1
break
llm_logger.info(f"Number of allocated requests: {len(tasks)}, number of "
f"running requests in worker: {self.real_bsz}")
llm_logger.info(
f"Number of allocated requests: {len(tasks)}, number of "
f"running requests in worker: {self.real_bsz}")
llm_logger.info(f"{self.info()}")
main_process_metrics.gpu_cache_usage_perc.set(
self.get_gpu_cache_usage_perc())
return processed_tasks
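# Hypothetical caller-side sketch (the variable names are assumptions, not
# part of this file): the engine typically checks is_resource_sufficient()
# per request and then admits a batch in one call, e.g.
#     ready = [t for t in waiting if manager.is_resource_sufficient(t.prompt_token_ids_len)]
#     scheduled = manager.allocate_resources_for_new_tasks(ready)
# Each returned task has idx, block_tables and timing fields populated.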
def _delete_cached_data(self, task, cached_len):
"""
Delete cached data from the task's prompt token ids based on the cached length.
"""
if cached_len == len(task.prompt_token_ids):
task.prompt_token_ids = task.prompt_token_ids[cached_len - 1:]
task.seq_lens_decoder = cached_len - 1
else:
task.prompt_token_ids = task.prompt_token_ids[cached_len:]
task.seq_lens_decoder = cached_len
task.prompt_token_ids_len = len(task.prompt_token_ids)
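# Worked illustration of the slicing above (token ids are made up): with a
# 6-token prompt [t0..t5] and cached_len=4, the task keeps [t4, t5] and sets
# seq_lens_decoder=4; if the whole prompt is cached (cached_len=6), one token
# [t5] is kept so prefill still has input, and seq_lens_decoder becomes 5.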
def _record_request_cache_info(self, task, common_block_ids,
unique_block_ids, hit_info):
"""
Record the cache information for a given task and its corresponding block IDs.
"""
cache_block_num = len(common_block_ids)
no_cache_block_num = math.ceil(len(task.prompt_token_ids) / self.cfg.block_size - cache_block_num)
task.num_cached_tokens = cache_block_num * self.cfg.block_size
task.gpu_cache_token_num = hit_info["gpu_cache_blocks"] * self.cfg.block_size
task.cpu_cache_token_num = hit_info["cpu_cache_blocks"] * self.cfg.block_size
task.cache_info = (cache_block_num, no_cache_block_num)
cached_len = len(common_block_ids) * self.cfg.block_size
task.block_tables = common_block_ids + unique_block_ids
task.need_block_tables = unique_block_ids
llm_logger.debug(f"common: {common_block_ids} ")
llm_logger.debug(f"unique: {unique_block_ids} ")
return cached_len
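# Rough worked example of the bookkeeping above (all numbers are illustrative):
# with block_size=64, a 200-token prompt and 2 prefix-cache hits in
# common_block_ids, num_cached_tokens = 2 * 64 = 128,
# no_cache_block_num = ceil(200 / 64 - 2) = 2, cached_len = 128, and those
# 128 tokens are later trimmed from the prompt by _delete_cached_data().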
def info(self):
"""Generates a summary of current resource status.
"""
get resource manager info
Returns:
str: Formatted string showing:
- Total blocks/batch slots
- Available blocks/batch slots
str: resource manager info
"""
info = f"ResourceManager info, " \
f"total_block_number: {self.total_block_number()}, total_batch_number: {len(self.stop_flags)}, " \
f"available_block_num: {self.available_block_num()}, available_batch: {self.available_batch()}"
return info
def get_gpu_cache_usage_perc(self):
"""
Calculate GPU KV-cache usage
Returns:
float: GPU KV-cache usage (0.0 - 1.0)
"""
num_total_gpu = self.total_block_number()
num_free_gpu = len(self.cache_manager.gpu_free_block_list)
if num_total_gpu > 0:
return 1.0 - (num_free_gpu / num_total_gpu)
return 0.0
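# Worked example of the ratio above (numbers are illustrative): with 1000
# GPU blocks in total and 250 still free, usage = 1.0 - 250 / 1000 = 0.75;
# this is the value pushed to main_process_metrics.gpu_cache_usage_perc
# elsewhere in this class.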