""" # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ from enum import Enum from fastdeploy.utils import get_logger logger = get_logger("prefix_cache_manager", "prefix_cache_manager.log") class CacheStatus(Enum): """ cache status enum class """ GPU = 0 SWAP2CPU = 1 SWAP2GPU = 2 CPU = 3 class BlockNode: """ BlockNode: store the information of a block node """ def __init__( self, node_id, input_ids, input_hash_value, depth, block_id, token_num, hash_value, last_used_time, parent=None, shared_count=1, reverved_dec_block_ids=[], cache_status=CacheStatus.GPU, is_persistent=False, persistent_shared_count=0, ): """ Args: node_id: Unique identifier of the node depth: Depth of the node block_id: Assigned block ID (CPU block ID if on CPU, GPU block ID if on GPU) token_num: Number of tokens in the current block hash_value: Hash value of the current block last_used_time: Timestamp of last usage parent: Parent node shared_count: Reference count of requests currently using this node reserved_dec_block_ids: Pre-allocated block IDs reserved for decoding, formatted as [block_id, block_id,...] cache_status: Current cache state (USING, SWAP2CPU, SWAP2GPU, FREE) is_persistent: Whether the node is persistently stored persistent_shared_count: Reference count of persistent cache requests """ self.node_id = node_id self.depth = depth self.parent = parent self.hash_value = hash_value self.token_num = token_num self.input_ids = input_ids self.input_hash_value = input_hash_value self.children = {} self.shared_count = shared_count self.last_used_time = last_used_time self.block_id = block_id self.reverved_dec_block_ids = reverved_dec_block_ids self.cache_status = cache_status self.is_persistent = is_persistent self.persistent_shared_count = persistent_shared_count self.req_id_set = set() def __lt__(self, other): """ override the less than operator """ if self.last_used_time < other.last_used_time: return True elif self.last_used_time > other.last_used_time: return False else: return self.depth > other.depth def __str__(self): """ return node info """ if self.parent is not None: parent_node_id = self.parent.node_id else: parent_node_id = None return ( f"node_id {self.node_id}: depth {self.depth} hash_value {self.hash_value}" + f" shared_count {self.shared_count} is_gpu_leaf_node {self.is_gpu_leaf_node}" + f" is_cpu_leaf_node {self.is_cpu_leaf_node} block_id {self.block_id} " + f"has_in_gpu {self.has_in_gpu} " + f"cache_status {self.cache_status} parent {parent_node_id} with children number " + f"{len(self.children)} req_id_set {self.req_id_set}" ) @property def has_in_gpu(self): """ check if the node has been allocated in GPU """ return self.cache_status == CacheStatus.GPU def increment_shared_count(self): """ increment shared count """ self.shared_count += 1 def decrement_shared_count(self): """ decrement shared count """ self.shared_count -= 1 @property def is_cpu_leaf_node(self): """ check if the node is a leaf node in CPU """ if (self.cache_status == CacheStatus.CPU) 
and (len(self.children) == 0): return True return False @property def is_gpu_leaf_node(self): """ check if the node is a leaf node in GPU """ if self.has_in_gpu is False: return False else: if len(self.children) == 0: return True for child in self.children.values(): if child.has_in_gpu is True: return False return True
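

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the cache-manager logic). It only shows
# how the leaf/ordering bookkeeping above behaves; the node IDs, block IDs,
# token IDs, hash values, and the way children are keyed below are made-up
# placeholders, not the real prefix_cache_manager conventions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import time

    now = time.time()

    # Root block resident on GPU, covering the first four prompt tokens.
    root = BlockNode(
        node_id=0,
        input_ids=[1, 2, 3, 4],
        input_hash_value=hash((1, 2, 3, 4)),
        depth=0,
        block_id=0,
        token_num=4,
        hash_value=hash((1, 2, 3, 4)),
        last_used_time=now,
    )

    # Child block that has been swapped out to CPU.
    child = BlockNode(
        node_id=1,
        input_ids=[1, 2, 3, 4, 5, 6, 7, 8],
        input_hash_value=hash((1, 2, 3, 4, 5, 6, 7, 8)),
        depth=1,
        block_id=3,
        token_num=4,
        hash_value=hash((5, 6, 7, 8)),
        last_used_time=now + 1,
        parent=root,
        cache_status=CacheStatus.CPU,
    )
    # Keying children by the child's block hash is an illustrative choice here.
    root.children[child.hash_value] = child

    # The root is on GPU and none of its children are on GPU, so it is a GPU
    # leaf; the childless CPU node is a CPU leaf.
    print(f"root is_gpu_leaf_node: {root.is_gpu_leaf_node}")  # True
    print(f"child is_cpu_leaf_node: {child.is_cpu_leaf_node}")  # True

    # __lt__ orders nodes by last_used_time (older first), breaking ties by
    # depth (deeper first), which suits LRU-style eviction heaps.
    print(f"root ordered before child: {root < child}")  # True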