Files
FastDeploy/fastdeploy/engine/resource_manager.py
李泳桦 98e03fb4ea
[feat] add metrics for yiyan adapter (#3219) (#3614)
* [feat] add metrics for yiyan adapter

* [fix] fix metrics num_requests_waiting and num_requests_running

* [fix] fix metrics gpu_cache_usage_perc

* [refactor] change where requests_number increases

* [chore] rename xxx_block_num as xxx_gpu_block_num, and update their values accordingly

* [chore] delete useless code
2025-08-30 23:20:58 +08:00


"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import math
import random
import time
import numpy as np
from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager
from fastdeploy.metrics.metrics import main_process_metrics
from fastdeploy.utils import llm_logger
class ResourceManager:
"""
record and allocate resources for the engine
"""
def __init__(
self,
max_num_seqs,
config,
tensor_parallel_size,
splitwise_role,
local_data_parallel_id=0,
):
"""
Args:
cfg (Config): config object containing parameters for the engine
initialization
Returns:
None
Initializes the engine with the given configuration and sets up necessary
data structures to manage tasks and blocks.
"""
self.cfg = config.cache_config
self.max_num_seqs = max_num_seqs
self.stop_flags = [True] * max_num_seqs # flag set to true if the slot has not been taken
self.enable_prefix_cache = config.cache_config.enable_prefix_caching
self.cache_manager = PrefixCacheManager(config, tensor_parallel_size, splitwise_role, local_data_parallel_id)
self.tasks_list = [None] * max_num_seqs # task slots
self.req_dict = dict()
# current batch status of the engine
self.real_bsz = 0
llm_logger.info(f"{self.info()}")
main_process_metrics.max_batch_size.set(max_num_seqs)
def reset_cache_config(self, cfg):
"""
reset cache config
"""
self.cfg = cfg
self.cache_manager.update_cache_config(cfg)
def get_required_block_number(self, input_token_num):
"""
Calculate the number of blocks required for a request, including the reserved decoder tokens
Args:
input_token_num (int): input token number
Returns:
int: block number
"""
block_num = (input_token_num + self.cfg.block_size - 1 + self.cfg.dec_token_num) // self.cfg.block_size
return block_num
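# Illustrative worked example (not part of the original source; the values
# below are hypothetical):
#   block_size=64, dec_token_num=128, input_token_num=1000
#   -> (1000 + 64 - 1 + 128) // 64 = 1191 // 64 = 18 blocks,
# i.e. enough blocks for the prompt plus the reserved decoder tokens.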
def get_encoder_block_number(self, input_token_num):
"""
get the number of blocks for the encoder
Args:
input_token_num (int): input token number
Returns:
int: encoder block number
"""
enc_block_num = (input_token_num + self.cfg.block_size - 1) // self.cfg.block_size
return enc_block_num
def get_decoder_block_number(self):
"""
get the number of blocks for the decoder
Returns:
int: decoder block number
"""
return (self.cfg.dec_token_num + self.cfg.block_size - 1) // self.cfg.block_size
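# Illustrative example with the same hypothetical values as above
# (block_size=64, dec_token_num=128): the decoder reserves
# (128 + 64 - 1) // 64 = 2 blocks per sequence.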
def total_block_number(self):
"""
the number of pre-allocated blocks at service startup
Returns:
int: total block number
"""
return self.cache_manager.num_gpu_blocks
def _get_block_tables(self, input_token_num, required_type="all"):
"""
allocate memory resources
Args:
input_token_num (int): input token number
required_type (str): one of "all", "encoder" or "decoder"
Returns:
list: block list
"""
if required_type == "all":
block_num = self.get_required_block_number(input_token_num)
elif required_type == "encoder":
block_num = self.get_encoder_block_number(input_token_num)
elif required_type == "decoder":
block_num = self.get_decoder_block_number()
else:
raise ValueError("unknown required type")
block_list = list()
current_block_num = self.available_block_num()
if block_num > current_block_num:
llm_logger.error(f"block_num:{block_num} > free_list len:{current_block_num}")
return block_list
block_list = self.cache_manager.allocate_gpu_blocks(block_num)
llm_logger.debug(f"dispatch {len(block_list)} blocks.")
return block_list
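# Sketch of the allocation behaviour (hypothetical numbers): requesting blocks
# for a 1000-token prompt with block_size=64 and dec_token_num=128 asks the
# cache manager for 18 blocks; if fewer than 18 blocks are free, an empty list
# is returned and the caller must treat the allocation as failed.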
def check_and_free_block_tables(self):
"""
Check and free block tables only in prefix caching mode.
If the number of free blocks is less than a certain threshold, free up to the threshold.
"""
if self.enable_prefix_cache:
if self.available_block_num() < self.cfg.max_block_num_per_seq:
self.free_block_tables(self.cfg.max_block_num_per_seq)
def _recycle_block_tables(self, task):
"""
Recycle a task's memory blocks back into the free pool
Args:
task: finished task (or a raw block-table list) whose blocks are recycled
"""
if self.enable_prefix_cache:
self.cache_manager.release_block_ids_async(task)
else:
req_id = task.request_id
if isinstance(task, list):
block_tables = task
else:
block_tables = task.block_tables
ori_number = self.available_block_num()
self.cache_manager.recycle_gpu_blocks(block_tables)
cur_number = self.available_block_num()
main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
llm_logger.info(f"recycle {req_id} {cur_number - ori_number} blocks.")
def available_batch(self):
"""
number of free task slots available to the engine
Returns:
int: available batch size
"""
return np.sum(self.stop_flags)
def available_block_num(self):
"""
number of free cache blocks available to the engine
Returns:
int: available block number
"""
return len(self.cache_manager.gpu_free_block_list)
def is_resource_sufficient(self, input_token_num):
"""
check whether the currently available resources can satisfy a new request
Args:
input_token_num (int): input token number
Returns:
bool: whether current available resources meet the new requirements
"""
if self.available_batch() < 1:
return False
block_num = self.get_required_block_number(input_token_num)
if block_num > self.available_block_num():
return False
return True
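# Example (hypothetical state): with 1 free slot and 20 free blocks, a
# 1000-token request needing 18 blocks is accepted, while a 1400-token request
# needing 24 blocks is rejected because 24 > 20.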
def free_block_tables(self, need_reserved_block_num):
"""
Recycle blocks back into the available resource pool
"""
return self.cache_manager.free_block_ids_async(need_reserved_block_num)
def allocate_resources_for_new_tasks(self, tasks):
"""
allocate resources for new tasks
Args:
tasks (list): task list
Returns:
list: processed task list
"""
llm_logger.debug(f"Allocating resources for a batch of new tasks: {tasks}")
allocated_position = 0 # number of tasks that have been allocated, also the position in request slots
processing_task_index = 0 # current task
processed_tasks = list()
while allocated_position < self.max_num_seqs: # scan slots until every task is handled or no slot remains
if processing_task_index >= len(tasks): # once every task has been tried, don't give it a second pass
break
can_insert = False
while allocated_position < self.max_num_seqs:
if sum(self.stop_flags[allocated_position : allocated_position + 1]) == 1:
can_insert = True # if there is an empty slot, try to allocate resources for the current task
break
allocated_position += 1
if can_insert:
task = tasks[processing_task_index]
if task.get("seed") is None:
task.set("seed", random.randint(0, 9223372036854775807))
task.idx = allocated_position
if self.enable_prefix_cache: # if prefix caching is enabled
# 1. request for enough blocks for current task
cache_prepare_time = time.time()
common_block_ids, unique_block_ids, hit_info = self.cache_manager.request_block_ids(
task,
self.cfg.block_size,
self.cfg.dec_token_num,
)
if unique_block_ids is None:
llm_logger.warning("req_id: {0} not enough blocks available".format(task["req_id"]))
return
# 2. record cache hit information, and return the number of tokens already in cache
cached_len = self._record_request_cache_info(task, common_block_ids, unique_block_ids, hit_info)
task.cache_prepare_time = time.time() - cache_prepare_time
# 3. if prefill/decode disaggregation is enabled
if task.disaggregate_info is not None:
if task.disaggregate_info["role"] == "prefill":
# record the slot position for current task, indexed by request id
self.req_dict[task.request_id] = allocated_position
task.disaggregate_info["block_tables"] = task.block_tables
self._delete_cached_data(task, cached_len)
elif task.disaggregate_info["role"] == "decode":
self.req_dict[task.request_id] = allocated_position
task.disaggregate_info["block_tables"] = task.need_block_tables
else:
# remove cached tokens from prompt token ids to avoid kv recomputation
self._delete_cached_data(task, cached_len)
else: # if prefix caching is disabled
# 1. directly allocate free blocks from the cache manager, if enough are available
block_tables = self._get_block_tables(task.prompt_token_ids_len)
if not block_tables:
llm_logger.error(f"req_id: {task.request_id} block_tables is empty")
continue # retry
else:
task.block_tables = block_tables
task.need_block_tables = task.block_tables
# 2. if prefill/decode disaggregation is enabled
if task.disaggregate_info is not None:
task.disaggregate_info["block_tables"] = block_tables
if task.disaggregate_info["role"] == "prefill":
self.req_dict[task.request_id] = allocated_position
elif task.disaggregate_info["role"] == "decode":
self.req_dict[task.request_id] = allocated_position
processed_tasks.append(task) # add current task
self.stop_flags[allocated_position] = False # mark the slot as occupied
task.inference_start_time = time.time()
task.inference_time_cost = -1.0
task.tokens_all_num = 0
self.tasks_list[allocated_position] = task
llm_logger.info(
f"Allocate request: {task.request_id}, "
f"allocated_position:{allocated_position}, "
f"length of prompt token: {task.prompt_token_ids_len}"
)
allocated_position += 1
processing_task_index += 1
# record the effective batch size the engine is running with:
# scan from the end; real_bsz is (index of the last occupied slot) + 1
for i in range(self.max_num_seqs - 1, -1, -1):
if not self.stop_flags[i]:
self.real_bsz = i + 1
break
# update Prometheus metrics: free blocks, running batch size and KV-cache usage
task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
llm_logger.info(
f"Number of allocated requests: {len(tasks)}, number of " f"running requests in worker: {self.real_bsz}"
)
llm_logger.info(f"{self.info()}")
main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
return processed_tasks
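# Worked example of the batch-size bookkeeping above (hypothetical state):
# with max_num_seqs=8 and occupied slots {0, 1, 4}, the backward scan stops at
# slot 4, so real_bsz = 5, while the batch_size metric reports
# 8 - available_batch() = 8 - 5 = 3 running requests.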
def _delete_cached_data(self, task, cached_len):
"""
Delete cached data from the task's prompt token ids based on the cached length.
"""
if cached_len == len(task.prompt_token_ids):
task.prompt_token_ids = task.prompt_token_ids[cached_len - 1 :]
task.seq_lens_decoder = cached_len - 1
else:
task.prompt_token_ids = task.prompt_token_ids[cached_len:]
task.seq_lens_decoder = cached_len
task.prompt_token_ids_len = len(task.prompt_token_ids)
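# Example of the slicing above (hypothetical prompt of 256 tokens,
# block_size=64): if all 256 tokens are cached, the last token is kept so the
# model still has something to prefill (seq_lens_decoder = 255); if only the
# first 128 tokens are cached, they are dropped and seq_lens_decoder = 128.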
def _record_request_cache_info(self, task, common_block_ids, unique_block_ids, hit_info):
"""
Record the cache information for a given task and its corresponding block IDs.
"""
cache_block_num = len(common_block_ids)
no_cache_block_num = math.ceil(len(task.prompt_token_ids) / self.cfg.block_size - cache_block_num)
task.num_cached_tokens = cache_block_num * self.cfg.block_size
task.gpu_cache_token_num = hit_info["gpu_cache_blocks"] * self.cfg.block_size
task.cpu_cache_token_num = hit_info["cpu_cache_blocks"] * self.cfg.block_size
task.cache_info = (cache_block_num, no_cache_block_num)
# Report the number of cached tokens to Prometheus metrics
main_process_metrics.prefix_cache_token_num.inc(task.num_cached_tokens)
main_process_metrics.prefix_gpu_cache_token_num.inc(task.gpu_cache_token_num)
main_process_metrics.prefix_cpu_cache_token_num.inc(task.cpu_cache_token_num)
cached_len = len(common_block_ids) * self.cfg.block_size
task.block_tables = common_block_ids + unique_block_ids
task.need_block_tables = unique_block_ids
llm_logger.debug(f"common: {common_block_ids} ")
llm_logger.debug(f"unique: {unique_block_ids} ")
return cached_len
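# Worked example (hypothetical values, block_size=64): a 1000-token prompt with
# 6 common (cached) blocks gives num_cached_tokens = 6 * 64 = 384,
# cached_len = 384, and no_cache_block_num = ceil(1000 / 64 - 6)
# = ceil(9.625) = 10.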
def info(self):
"""
get resource manager info
Returns:
str: resource manager info
"""
info = (
f"ResourceManager info, "
f"total_block_number: {self.total_block_number()}, total_batch_number: {len(self.stop_flags)}, "
f"available_block_num: {self.available_block_num()}, available_batch: {self.available_batch()}"
)
return info
def get_gpu_cache_usage_perc(self):
"""
Calculate GPU KV-cache usage
Returns:
float: GPU KV-cache usage (0.0 - 1.0)
"""
num_total_gpu = self.total_block_number()
num_free_gpu = len(self.cache_manager.gpu_free_block_list)
if num_total_gpu > 0:
return 1.0 - (num_free_gpu / num_total_gpu)
return 0.0
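# Example (hypothetical pool): with 1024 GPU blocks in total and 256 of them
# free, the reported KV-cache usage is 1.0 - 256 / 1024 = 0.75.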