mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
[Bug fix] Fix prefix cache in V1 (#3715)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
* [Bug fix] Fix prefix cache in V1
* fix code style
This commit is contained in:
@@ -510,7 +510,7 @@ class PrefixCacheManager:
|
||||
self.metrics.req_count += 1
|
||||
input_ids = task.prompt_token_ids
|
||||
req_id = task.request_id
|
||||
logger.info(f"request_block_ids: start to allocate blocks for req_id {req_id}")
|
||||
logger.info(f"request_match_blocks: start to allocate blocks for req_id {req_id}")
|
||||
input_token_num = len(input_ids)
|
||||
common_block_ids = []
|
||||
# 1. match block
|
||||
@@ -542,7 +542,9 @@ class PrefixCacheManager:
|
||||
cpu_recv_block_ids=[],
|
||||
)
|
||||
else:
|
||||
raise Exception("Not enough GPU memory to allocate cache for matched CPU Cache")
|
||||
raise Exception(
|
||||
"request_match_blocks: Not enough GPU memory to allocate cache for matched CPU Cache"
|
||||
)
|
||||
|
||||
# record request cache info
|
||||
self.cache_info[req_id] = (match_block_node, input_ids)
|
||||
@@ -564,11 +566,14 @@ class PrefixCacheManager:
|
||||
if self.metrics.req_count % 10000 == 0:
|
||||
self.metrics.reset_metrics()
|
||||
logger.info(
|
||||
f"request_block_ids: request block for req_id {req_id}: common_block_ids {common_block_ids}"
|
||||
f"request_match_blocks: request block for req_id {req_id}: common_block_ids {common_block_ids}"
|
||||
)
|
||||
# set leaf node temporarily, then update it in update_cache_blocks
|
||||
self.req_leaf_map[req_id] = match_block_node
|
||||
self.leaf_req_map[match_block_node].add(req_id)
|
||||
return common_block_ids, matched_token_num, hit_info
|
||||
except Exception as e:
|
||||
logger.error(f"request_block_ids: error: {type(e)} {e}, {str(traceback.format_exc())}")
|
||||
logger.error(f"request_match_blocks: request_block_ids: error: {type(e)} {e}")
|
||||
raise e
|
||||
|
||||
def request_block_ids(self, task, block_size, dec_token_num, *args):
|
||||
@@ -725,6 +730,41 @@ class PrefixCacheManager:
|
||||
logger.error(f"release_block_ids: error: {type(e)} {e}, {str(traceback.format_exc())}")
|
||||
raise e
|
||||
|
||||
def free_nodes_directly(self, node):
# Free `node` and then walk up through `node.parent`, all while holding
# `self.request_release_lock`.
#
# Per loop iteration, as far as the visible statements show:
#   1. If the current node is in `self.gpu_lru_leaf_heap`, it is removed from
#      both the heap list and `self.gpu_lru_leaf_set`.
#   2. If the node has `shared_count == 0` and `is_gpu_leaf_node` is truthy,
#      it is handed to `self._handle_free_gpu_node_without_cpu`, logged, and
#      its `hash_value` entry is deleted from the parent's `children`.
#   3. When the parent is left with no children it is either revisited
#      (already in the leaf set) or pushed onto the LRU heap/set, provided it
#      is not the radix-tree root, is unshared, is a GPU leaf and is not
#      persistent.
# Any exception is logged and re-raised.
#
# NOTE(review): this rendering has lost all indentation, so which `if` each
# of the two trailing `else: break` clauses belongs to cannot be determined
# from here -- confirm against the real file before changing the loop.
# NOTE(review): `list.remove` on a heapq-managed list may not preserve the
# heap invariant -- confirm whether `heapq.heapify` is needed afterwards.
# NOTE(review): `total_gpu_free_count` is incremented but never read in the
# visible code.
|
||||
with self.request_release_lock:
|
||||
try:
|
||||
total_gpu_free_count = 0
|
||||
while True:
|
||||
if node in self.gpu_lru_leaf_heap:
|
||||
self.gpu_lru_leaf_heap.remove(node)
|
||||
self.gpu_lru_leaf_set.remove(node)
|
||||
if node.shared_count == 0 and node.is_gpu_leaf_node: # reclaim directly
|
||||
self._handle_free_gpu_node_without_cpu(node)
|
||||
logger.info(f"free_nodes_directly: node {node}")
|
||||
total_gpu_free_count += 1
|
||||
cur_node = node
|
||||
node = node.parent
|
||||
if cur_node.hash_value in node.children:
|
||||
del node.children[cur_node.hash_value]
|
||||
if not node.children:
|
||||
if node in self.gpu_lru_leaf_set:
|
||||
continue
|
||||
if (
|
||||
node != self.radix_tree_root
|
||||
and node.shared_count == 0
|
||||
and node.is_gpu_leaf_node
|
||||
and node.is_persistent is False
|
||||
):
|
||||
heapq.heappush(self.gpu_lru_leaf_heap, node)
|
||||
self.gpu_lru_leaf_set.add(node)
|
||||
else:
|
||||
break
|
||||
else:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"free_nodes_directly: error: {type(e)} {e}")
|
||||
raise e
|
||||
|
||||
def _handle_free_gpu_node_without_cpu(self, node):
|
||||
"""
|
||||
GPU node eviction
|
||||
@@ -1068,6 +1108,15 @@ class PrefixCacheManager:
|
||||
node.req_id_set.add(req_id)
|
||||
node = node.parent
|
||||
|
||||
def decrease_request_share_count(self, req_id):
    """
    Decrease the shared count of every node on a request's cached path.

    Looks up the node recorded for ``req_id`` in ``self.cache_info`` and
    follows ``parent`` links upward, calling ``decrement_shared_count()`` on
    each node until ``self.radix_tree_root`` is reached. The root itself is
    never decremented.

    Args:
        req_id: Key of the request in ``self.cache_info``; the stored value
            is unpacked as a ``(node, input_ids)`` pair.

    Raises:
        Whatever ``self.cache_info[req_id]`` raises for an unknown request
        (``KeyError`` when ``cache_info`` is a dict).
    """
    # Only the matched node is needed here; the stored input ids are unused.
    node, _ = self.cache_info[req_id]
    while node != self.radix_tree_root:
        node.decrement_shared_count()
        node = node.parent
|
||||
|
||||
def build_path(
|
||||
self,
|
||||
req_id,
|
||||
|
Reference in New Issue
Block a user