""" # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ import copy import os import random import threading import time import numpy as np from fastdeploy.utils import llm_logger class ResourceManager(object): """Manages and allocates computational resources for the inference engine. This class handles the allocation and recycling of memory blocks for KV cache, manages task scheduling, and tracks resource utilization. """ def __init__(self, max_num_seqs, cache_config): """Initializes the resource manager with configuration parameters. Args: max_num_seqs (int): Maximum number of concurrent sequences the engine can handle cache_config (Config): Configuration object containing: - prefill_kvcache_block_num: Number of pre-allocated KV cache blocks - block_size: Size of each memory block in tokens - dec_token_num: Number of decoder tokens """ self.cfg = cache_config self.max_num_seqs = max_num_seqs self.stop_flags = [True] * max_num_seqs self.free_list = list(range(self.cfg.prefill_kvcache_block_num - 1, -1, -1)) self.tasks_list = [None] * max_num_seqs # current batch status of the engine self.real_bsz = 0 llm_logger.info(f"{self.info()}") def reset_cache_config(self, cfg): """Updates the cache configuration with new parameters. Args: cfg (Config): New cache configuration object """ self.cfg = cfg self.free_list = list(range(self.cfg.prefill_kvcache_block_num - 1, -1, -1)) def get_required_block_number(self, input_token_num): """Calculates the total number of blocks needed for a sequence. Includes both encoder and decoder requirements. Args: input_token_num (int): Number of tokens in the input sequence Returns: int: Total number of blocks required (rounded up) """ block_num = (input_token_num + self.cfg.block_size - 1 + self.cfg.dec_token_num) // self.cfg.block_size return block_num def get_encoder_block_number(self, input_token_num): """Calculates the number of blocks needed for encoder inputs only. Args: input_token_num (int): Number of tokens in the encoder input Returns: int: Number of blocks required for encoder (rounded up) """ enc_block_num = (input_token_num + self.cfg.block_size - 1) // self.cfg.block_size return enc_block_num def get_decoder_block_number(self): """Calculates the number of blocks needed for decoder outputs. Returns: int: Number of blocks required for decoder (rounded up) """ return (self.cfg.dec_token_num + self.cfg.block_size - 1) // self.cfg.block_size def total_block_number(self): """Gets the total number of pre-allocated KV cache blocks. Returns: int: Total number of blocks available in the pool """ return self.cfg.prefill_kvcache_block_num def _get_block_tables(self, input_token_num, required_type="all"): """Allocates memory blocks from the free pool. 
    def _get_block_tables(self, input_token_num, required_type="all"):
        """Allocates memory blocks from the free pool.

        Args:
            input_token_num (int): Number of input tokens
            required_type (str): Type of blocks needed:
                - "all": Both encoder and decoder blocks
                - "encoder": Encoder blocks only
                - "decoder": Decoder blocks only

        Returns:
            list: List of allocated block IDs

        Raises:
            ValueError: If an unknown required_type is specified
        """
        if required_type == "all":
            block_num = self.get_required_block_number(input_token_num)
        elif required_type == "encoder":
            block_num = self.get_encoder_block_number(input_token_num)
        elif required_type == "decoder":
            block_num = self.get_decoder_block_number()
        else:
            raise ValueError(f"unknown required type: {required_type}")

        block_list = []
        if block_num > len(self.free_list):
            llm_logger.error(f"block_num: {block_num} > free_list len: {len(self.free_list)}")
            return block_list
        for _ in range(block_num):
            used_block_id = self.free_list.pop()
            block_list.append(used_block_id)
        llm_logger.debug(f"dispatch {len(block_list)} blocks.")
        return block_list

    def _recycle_block_tables(self, block_tables):
        """Returns memory blocks to the free pool for reuse.

        Args:
            block_tables (list): List of block IDs to recycle
        """
        ori_number = len(self.free_list)
        self.free_list.extend(block_tables)
        cur_number = len(self.free_list)
        llm_logger.info(f"recycle {cur_number - ori_number} blocks.")

    def available_batch(self):
        """Gets the number of available sequence slots.

        Returns:
            int: Number of available sequence slots in the batch
        """
        return np.sum(self.stop_flags)

    def available_block_num(self):
        """Gets the number of available memory blocks.

        Returns:
            int: Number of free blocks in the pool
        """
        return len(self.free_list)

    def is_resource_sufficient(self, input_token_num):
        """Checks whether sufficient resources are available for a new sequence.

        Args:
            input_token_num (int): Number of tokens in the new sequence

        Returns:
            bool: True if both a batch slot and enough memory blocks are available
        """
        if self.available_batch() < 1:
            return False
        block_num = self.get_required_block_number(input_token_num)
        if block_num > self.available_block_num():
            return False
        return True
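    # Typical admission flow (illustrative sketch, not part of this class's
    # API; `manager` and `waiting_queue` are hypothetical names for the
    # caller's objects):
    #
    #     while waiting_queue:
    #         task = waiting_queue[0]
    #         if not manager.is_resource_sufficient(task.prompt_token_ids_len):
    #             break  # wait for running requests to finish and recycle blocks
    #         manager.allocate_resources_for_new_tasks([waiting_queue.pop(0)])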
    def allocate_resources_for_new_tasks(self, tasks):
        """Assigns resources to new inference tasks.

        Args:
            tasks (list): List of Request objects needing resources

        Returns:
            list: List of successfully allocated Request objects

        Note:
            - Assigns sequence slots and memory blocks
            - Sets initial timestamps and metadata
            - Updates real-time batch size statistics
        """
        allocated_position = 0
        processing_task_index = 0
        processed_tasks = []
        while allocated_position < self.max_num_seqs:
            if processing_task_index >= len(tasks):
                break

            can_insert = False
            # advance to the next free slot (stop_flags[i] is True when slot i is idle)
            while allocated_position + 1 <= self.max_num_seqs:
                if sum(self.stop_flags[allocated_position : allocated_position + 1]) == 1:
                    can_insert = True
                    break
                allocated_position += 1
            if can_insert:
                if self.stop_flags[allocated_position]:
                    task = tasks[processing_task_index]

                    if task.get("seed") is None:
                        task.set("seed", random.randint(0, 9223372036854775807))
                    task.idx = allocated_position

                    block_tables = self._get_block_tables(task.prompt_token_ids_len)
                    if not block_tables:
                        llm_logger.error(f"req_id: {task.request_id} block_tables is empty")
                        # no free blocks left; retrying the same task would loop forever
                        break
                    else:
                        task.block_tables = block_tables
                        processed_tasks.append(task)
                        self.stop_flags[allocated_position] = False
                        task.inference_start_time = time.time()
                        task.inference_time_cost = -1.0
                        task.tokens_all_num = 0
                        self.tasks_list[allocated_position] = task
                        llm_logger.info(
                            f"Allocate request: {task.request_id}, "
                            f"allocated_position: {allocated_position}, "
                            f"length of prompt token: {task.prompt_token_ids_len}"
                        )
            allocated_position += 1
            processing_task_index += 1

        # record the real batch size seen by the engine: highest occupied slot + 1
        for i in range(self.max_num_seqs - 1, -1, -1):
            if not self.stop_flags[i]:
                self.real_bsz = i + 1
                break

        llm_logger.info(
            f"Number of allocated requests: {len(tasks)}, number of "
            f"running requests in worker: {self.real_bsz}"
        )
        llm_logger.info(f"{self.info()}")
        return processed_tasks

    def info(self):
        """Generates a summary of the current resource status.

        Returns:
            str: Formatted string showing:
                - Total blocks/batch slots
                - Available blocks/batch slots
        """
        info = (
            f"ResourceManager info, "
            f"total_block_number: {self.total_block_number()}, total_batch_number: {len(self.stop_flags)}, "
            f"available_block_num: {self.available_block_num()}, available_batch: {self.available_batch()}"
        )
        return info
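

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch. Everything below is illustrative: `_StubRequest`
# and the SimpleNamespace config are hypothetical stand-ins with just enough
# surface for ResourceManager to run; the real engine supplies its own
# Request and cache-config objects.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    class _StubRequest:
        """Hypothetical Request stand-in exposing the attributes used above."""

        def __init__(self, request_id, prompt_token_ids_len):
            self.request_id = request_id
            self.prompt_token_ids_len = prompt_token_ids_len
            self._extra = {}

        def get(self, key, default=None):
            return self._extra.get(key, default)

        def set(self, key, value):
            self._extra[key] = value

    cfg = SimpleNamespace(prefill_kvcache_block_num=16, block_size=64, dec_token_num=32)
    manager = ResourceManager(max_num_seqs=4, cache_config=cfg)

    req = _StubRequest("req-0", prompt_token_ids_len=100)
    if manager.is_resource_sufficient(req.prompt_token_ids_len):
        allocated = manager.allocate_resources_for_new_tasks([req])
        print(f"allocated: {[t.request_id for t in allocated]}")
        # When a request finishes, the engine is expected to free its slot and
        # return its blocks (assumed here; the release path is not shown above).
        manager.stop_flags[req.idx] = True
        manager.tasks_list[req.idx] = None
        manager._recycle_block_tables(req.block_tables)
        print(manager.info())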