Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[Optimize] optimize prefix cache in develop (#3890)
* optimize prefix cache in release22
* fix
* fix
* fix
* add ci for v1
* add unit test

Co-authored-by: xiegegege <46314656+xiegegege@users.noreply.github.com>
@@ -257,7 +257,12 @@ class PrefixCacheManager:
         Check if num_blocks gpu blocks can be allocated.
         """
         if len(self.gpu_free_block_list) < num_blocks:
-            return False
+            if self.cache_config.enable_prefix_caching:
+                self.free_block_ids(num_blocks)
+            if len(self.gpu_free_block_list) < num_blocks:
+                return False
+            else:
+                return True
         else:
             return True

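Note on the hunk above: previously the check failed outright when the free list was short; with prefix caching enabled it now tries to reclaim cached blocks once before re-checking. A minimal standalone sketch of that pattern (hypothetical names, not the FastDeploy API):

    def can_allocate(free_blocks, num_blocks, prefix_caching, reclaim):
        # Fast path: enough blocks are already free.
        if len(free_blocks) >= num_blocks:
            return True
        if prefix_caching:
            reclaim(num_blocks)  # evict reclaimable cached blocks into the free list
        return len(free_blocks) >= num_blocks

    free = [0, 1]
    print(can_allocate(free, 3, True, lambda n: free.append(2)))  # True after reclaim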
@@ -448,7 +453,7 @@ class PrefixCacheManager:
         """
         return (input_token_num + block_size - 1) // block_size

-    def update_cache_blocks(self, task, block_size):
+    def update_cache_blocks(self, task, block_size, num_computed_tokens):
         """
         update cache blocks for a task.
         # TODO(chengyanfu): support async update
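The helper kept as context above rounds a token count up to whole blocks: `(input_token_num + block_size - 1) // block_size` is the usual integer ceiling division, so a partial block still reserves a full block. For example:

    block_size = 64
    for n in (1, 64, 65, 128):
        print(n, (n + block_size - 1) // block_size)  # 1 -> 1, 64 -> 1, 65 -> 2, 128 -> 2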
@@ -459,12 +464,19 @@ class PrefixCacheManager:
         """
         try:
             req_id = task.request_id
-            num_cached_tokens = task.num_cached_tokens
             block_tables = task.block_tables

-            last_node, input_ids = self.cache_info[req_id]
-            left_input_ids = input_ids[num_cached_tokens:]
+            last_node, num_cached_tokens = self.cache_info[req_id]
+            if isinstance(task.prompt_token_ids, np.ndarray):
+                prompt_token_ids = task.prompt_token_ids.tolist()
+            else:
+                prompt_token_ids = task.prompt_token_ids
+            input_ids = prompt_token_ids + task.output_token_ids
+            can_cache_computed_tokens = num_computed_tokens - num_computed_tokens % block_size
+            left_input_ids = input_ids[num_cached_tokens:can_cache_computed_tokens]
             gpu_extra_block_ids = block_tables[num_cached_tokens // block_size :]
             if req_id in self.leaf_req_map[last_node]:  # delete old leaf record, update later
                 self.leaf_req_map[last_node].remove(req_id)

             with self.request_release_lock:
                 current_time = time.time()
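The key line in this hunk is `can_cache_computed_tokens = num_computed_tokens - num_computed_tokens % block_size`: only fully computed blocks are inserted into the radix tree, so the count is floored to a block boundary before slicing. For example:

    block_size = 64
    num_computed_tokens = 150
    can_cache = num_computed_tokens - num_computed_tokens % block_size
    print(can_cache)  # 128: two full blocks are cacheable, the 22 trailing tokens are not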
@@ -480,7 +492,8 @@ class PrefixCacheManager:
             )
             self.req_leaf_map[req_id] = leaf_node
             self.leaf_req_map[leaf_node].add(req_id)
-            self.cache_info[req_id] = (leaf_node, input_ids)
+            self.cache_info[req_id] = (leaf_node, can_cache_computed_tokens)
+            task.cached_block_num = can_cache_computed_tokens // block_size
         except Exception as e:
             logger.error(f"update_cache_blocks, error: {type(e)} {e}, {str(traceback.format_exc())}")
             raise e
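With this hunk, `cache_info[req_id]` stores the count of already-cached tokens instead of the full token list, and `task.cached_block_num` is derived from it. A rough sketch of the new bookkeeping shape (stand-in node object, not the real tree node):

    block_size = 64
    leaf_node = object()                      # stand-in for a radix-tree node
    cache_info = {"req-0": (leaf_node, 128)}  # (leaf node, cached token count)
    print(cache_info["req-0"][1] // block_size)  # 2 cached blocks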
@@ -508,7 +521,11 @@ class PrefixCacheManager:
             hit_info["gpu_cache_blocks"] = 0
             hit_info["cpu_cache_blocks"] = 0
             self.metrics.req_count += 1
-            input_ids = task.prompt_token_ids
+            if isinstance(task.prompt_token_ids, np.ndarray):
+                prompt_token_ids = task.prompt_token_ids.tolist()
+            else:
+                prompt_token_ids = task.prompt_token_ids
+            input_ids = prompt_token_ids + task.output_token_ids
             req_id = task.request_id
             logger.info(f"request_match_blocks: start to allocate blocks for req_id {req_id}")
             input_token_num = len(input_ids)
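The `isinstance(..., np.ndarray)` guard in this hunk (and in update_cache_blocks above) matters because `+` concatenates Python lists but broadcasts element-wise addition on numpy arrays:

    import numpy as np

    prompt = np.array([1, 2, 3])
    output = [4, 5, 6]
    print(prompt + output)           # array([5, 7, 9]): element-wise sum, not intended
    print(prompt.tolist() + output)  # [1, 2, 3, 4, 5, 6]: concatenation, as needed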
@@ -546,9 +563,6 @@ class PrefixCacheManager:
                     "request_match_blocks: Not enough GPU memory to allocate cache for matched CPU Cache"
                 )

-            # record request cache info
-            self.cache_info[req_id] = (match_block_node, input_ids)
-
             # 3. update metrics
             matched_token_num = gpu_match_token_num + cpu_match_token_num
             common_block_ids = match_gpu_block_ids + gpu_recv_block_ids
@@ -571,6 +585,9 @@ class PrefixCacheManager:
             # set leaf node temporarily, then update it in update_cache_blocks
             self.req_leaf_map[req_id] = match_block_node
             self.leaf_req_map[match_block_node].add(req_id)
+            # record request cache info
+            self.cache_info[req_id] = (match_block_node, matched_token_num)
+            task.cached_block_num = matched_token_num // block_size
             return common_block_ids, matched_token_num, hit_info
         except Exception as e:
             logger.error(f"request_match_blocks: request_block_ids: error: {type(e)} {e}")
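Together with the deletion two hunks up, this moves the cache-info record to after the temporary leaf is set, and stores `matched_token_num` (an int) rather than the token ids. A small sketch of the resulting accounting, assuming a 64-token block size:

    block_size = 64
    matched_token_num = 192                 # gpu_match_token_num + cpu_match_token_num
    match_block_node = object()             # stand-in for the deepest matched node
    cache_info = {"req-0": (match_block_node, matched_token_num)}
    print(matched_token_num // block_size)  # 3 fully matched blocks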
@@ -687,6 +704,11 @@ class PrefixCacheManager:
         """
         return self.executor_pool.submit(self.release_block_ids, task)

+    def free_block_ids(self, need_block_num):
+        self.free_block_ids_async(need_block_num)
+        while (self.gpu_free_task_future is not None) and (not self.gpu_free_task_future.done()):
+            time.sleep(0.001)
+
     def release_block_ids(self, task):
         """
         release block ids
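The new `free_block_ids` is a synchronous wrapper: it kicks off the async free task and polls its future until completion. A self-contained sketch of that pattern with concurrent.futures (the real code tracks the future in `self.gpu_free_task_future`):

    import time
    from concurrent.futures import ThreadPoolExecutor

    pool = ThreadPoolExecutor(max_workers=1)
    future = pool.submit(time.sleep, 0.01)  # stand-in for the real async free task
    while future is not None and not future.done():
        time.sleep(0.001)                   # same 1 ms poll interval as the diff
    print("blocks freed")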
@@ -1108,15 +1130,6 @@ class PrefixCacheManager:
                 node.req_id_set.add(req_id)
                 node = node.parent

-    def decrease_request_share_count(self, req_id):
-        """
-        Decrease node shared count
-        """
-        node, input_ids = self.cache_info[req_id]
-        while node != self.radix_tree_root:
-            node.decrement_shared_count()
-            node = node.parent
-
     def build_path(
         self,
         req_id,
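The deleted `decrease_request_share_count` unpacked the old `(node, input_ids)` shape of `cache_info` (replaced elsewhere in this commit) and walked from the request's node up to the radix-tree root, decrementing each node's shared count. A minimal sketch of that parent walk (hypothetical Node class):

    class Node:
        def __init__(self, parent=None):
            self.parent = parent
            self.shared_count = 1

        def decrement_shared_count(self):
            self.shared_count -= 1

    root = Node()
    node = Node(parent=Node(parent=root))  # a two-deep leaf
    while node is not root:
        node.decrement_shared_count()
        node = node.parent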