[LLM] First commit the llm deployment code
fastdeploy/engine/resource_manager.py | 261 lines (new file)
@@ -0,0 +1,261 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import random
import time

import numpy as np

from fastdeploy.utils import llm_logger

class ResourceManager(object):
    """Manages and allocates computational resources for the inference engine.

    This class handles the allocation and recycling of memory blocks for the
    KV cache, manages task scheduling, and tracks resource utilization.
    """

    def __init__(self, max_num_seqs, cache_config):
        """Initializes the resource manager with configuration parameters.

        Args:
            max_num_seqs (int): Maximum number of concurrent sequences the
                engine can handle.
            cache_config (Config): Configuration object containing:
                - prefill_kvcache_block_num: Number of pre-allocated KV cache blocks
                - block_size: Size of each memory block in tokens
                - dec_token_num: Number of tokens reserved for decoding
        """
        self.cfg = cache_config
        self.max_num_seqs = max_num_seqs
        # stop_flags[i] is True when batch slot i is free.
        self.stop_flags = [True] * max_num_seqs
        # Free block IDs in descending order, so pop() hands out the lowest ID first.
        self.free_list = list(range(self.cfg.prefill_kvcache_block_num - 1, -1, -1))
        self.tasks_list = [None] * max_num_seqs
        # Current batch size of the engine (index of the highest occupied slot + 1).
        self.real_bsz = 0
        llm_logger.info(f"{self.info()}")

    def reset_cache_config(self, cfg):
        """Updates the cache configuration with new parameters.

        Args:
            cfg (Config): New cache configuration object.
        """
        self.cfg = cfg
        self.free_list = list(range(self.cfg.prefill_kvcache_block_num - 1, -1, -1))

    def get_required_block_number(self, input_token_num):
        """Calculates the total number of blocks needed for a sequence.

        Covers both the prompt (encoder) tokens and the reserved decoder tokens.

        Args:
            input_token_num (int): Number of tokens in the input sequence.

        Returns:
            int: Total number of blocks required (rounded up).
        """
        # Ceiling division over the prompt length plus dec_token_num reserved decode tokens.
        block_num = (input_token_num + self.cfg.block_size - 1 + self.cfg.dec_token_num) // self.cfg.block_size
        return block_num
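
    # Illustrative arithmetic (hypothetical numbers, not from any real config):
    # with block_size=64 and dec_token_num=32, a 100-token prompt needs
    # (100 + 64 - 1 + 32) // 64 = 3 blocks.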

    def get_encoder_block_number(self, input_token_num):
        """Calculates the number of blocks needed for encoder inputs only.

        Args:
            input_token_num (int): Number of tokens in the encoder input.

        Returns:
            int: Number of blocks required for the encoder (rounded up).
        """
        enc_block_num = (input_token_num + self.cfg.block_size - 1) // self.cfg.block_size
        return enc_block_num

    def get_decoder_block_number(self):
        """Calculates the number of blocks needed for decoder outputs.

        Returns:
            int: Number of blocks required for the decoder (rounded up).
        """
        return (self.cfg.dec_token_num + self.cfg.block_size - 1) // self.cfg.block_size

    def total_block_number(self):
        """Gets the total number of pre-allocated KV cache blocks.

        Returns:
            int: Total number of blocks available in the pool.
        """
        return self.cfg.prefill_kvcache_block_num

    def _get_block_tables(self, input_token_num, required_type="all"):
        """Allocates memory blocks from the free pool.

        Args:
            input_token_num (int): Number of input tokens.
            required_type (str): Type of blocks needed:
                - "all": Both encoder and decoder blocks
                - "encoder": Encoder blocks only
                - "decoder": Decoder blocks only

        Returns:
            list: List of allocated block IDs; empty if the pool cannot
                satisfy the request.

        Raises:
            ValueError: If an unknown required_type is specified.
        """
        if required_type == "all":
            block_num = self.get_required_block_number(input_token_num)
        elif required_type == "encoder":
            block_num = self.get_encoder_block_number(input_token_num)
        elif required_type == "decoder":
            block_num = self.get_decoder_block_number()
        else:
            raise ValueError("unknown required type")

        block_list = list()
        if block_num > len(self.free_list):
            # Not enough free blocks: allocate nothing rather than partially.
            llm_logger.error("block_num:{0} > free_list len:{1}".format(block_num, len(self.free_list)))
            return block_list
        for _ in range(block_num):
            used_block_id = self.free_list.pop()
            block_list.append(used_block_id)
        llm_logger.debug(f"dispatch {len(block_list)} blocks.")
        return block_list

    def _recycle_block_tables(self, block_tables):
        """Returns memory blocks to the free pool for reuse.

        Args:
            block_tables (list): List of block IDs to recycle.
        """
        ori_number = len(self.free_list)
        self.free_list.extend(block_tables)
        cur_number = len(self.free_list)
        llm_logger.info(f"recycle {cur_number - ori_number} blocks.")

    def available_batch(self):
        """Gets the number of available sequence slots.

        Returns:
            int: Number of available sequence slots in the batch.
        """
        # True flags mark free slots, so summing the booleans counts them.
        return np.sum(self.stop_flags)

    def available_block_num(self):
        """Gets the number of available memory blocks.

        Returns:
            int: Number of free blocks in the pool.
        """
        return len(self.free_list)

    def is_resource_sufficient(self, input_token_num):
        """Checks whether sufficient resources are available for a new sequence.

        Args:
            input_token_num (int): Number of tokens in the new sequence.

        Returns:
            bool: True if both a batch slot and enough memory blocks are available.
        """
        if self.available_batch() < 1:
            return False
        block_num = self.get_required_block_number(input_token_num)
        if block_num > self.available_block_num():
            return False
        return True
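
    # A scheduler might gate admission on this check (illustrative only;
    # `task` and `prompt_token_ids` are hypothetical names):
    #   if rm.is_resource_sufficient(len(prompt_token_ids)):
    #       rm.allocate_resources_for_new_tasks([task])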

    def allocate_resources_for_new_tasks(self, tasks):
        """Assigns resources to new inference tasks.

        Args:
            tasks (list): List of Request objects needing resources.

        Returns:
            list: List of successfully allocated Request objects.

        Note:
            - Assigns sequence slots and memory blocks
            - Sets initial timestamps and metadata
            - Updates real-time batch size statistics
        """
        allocated_position = 0
        processing_task_index = 0
        processed_tasks = list()
        while allocated_position < self.max_num_seqs:
            if processing_task_index >= len(tasks):
                break

            # Scan forward for the next free batch slot.
            can_insert = False
            while allocated_position + 1 <= self.max_num_seqs:
                if self.stop_flags[allocated_position]:
                    can_insert = True
                    break
                allocated_position += 1
            if can_insert:
                task = tasks[processing_task_index]

                if task.get("seed") is None:
                    task.set("seed", random.randint(0, 9223372036854775807))  # 2**63 - 1
                task.idx = allocated_position
                block_tables = self._get_block_tables(task.prompt_token_ids_len)
                if not block_tables:
                    llm_logger.error("req_id: {0} block_tables is empty".format(task.request_id))
                    # No free blocks left, so later tasks cannot be served either;
                    # stop here instead of retrying the same task forever.
                    break
                task.block_tables = block_tables

                processed_tasks.append(task)
                self.stop_flags[allocated_position] = False
                task.inference_start_time = time.time()
                task.inference_time_cost = -1.0
                task.tokens_all_num = 0
                self.tasks_list[allocated_position] = task
                llm_logger.info(f"Allocate request: {task.request_id}, "
                                f"allocated_position:{allocated_position}, "
                                f"length of prompt token: {task.prompt_token_ids_len}")
            allocated_position += 1
            processing_task_index += 1

        # Update the engine's current batch size: highest occupied slot index + 1.
        for i in range(self.max_num_seqs - 1, -1, -1):
            if not self.stop_flags[i]:
                self.real_bsz = i + 1
                break

        llm_logger.info(f"Number of allocated requests: {len(tasks)}, number of "
                        f"running requests in worker: {self.real_bsz}")
        llm_logger.info(f"{self.info()}")
        return processed_tasks

    def info(self):
        """Generates a summary of the current resource status.

        Returns:
            str: Formatted string showing total and available blocks and batch slots.
        """
        info = f"ResourceManager info, " \
               f"total_block_number: {self.total_block_number()}, total_batch_number: {len(self.stop_flags)}, " \
               f"available_block_num: {self.available_block_num()}, available_batch: {self.available_batch()}"
        return info
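

# A minimal usage sketch. `_DummyCacheConfig` and all numbers below are
# hypothetical stand-ins for the engine's real cache Config object.
if __name__ == "__main__":
    class _DummyCacheConfig:
        prefill_kvcache_block_num = 1000  # pre-allocated KV cache blocks
        block_size = 64                   # tokens per block
        dec_token_num = 32                # tokens reserved for decoding

    rm = ResourceManager(max_num_seqs=8, cache_config=_DummyCacheConfig())
    if rm.is_resource_sufficient(100):
        blocks = rm._get_block_tables(100)  # 3 blocks for a 100-token prompt
        rm._recycle_block_tables(blocks)    # hand them back to the pool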