mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
[Feature] Support block scheduler v1 for FD (#2928)
* Support FD block scheduler v1
* Support FD block scheduler v1
* Support FD block scheduler v1
* Fix according to copilot review
* Fix according to review
* Remove is_dummy
* Fix bug when real_bsz=1
* Fix infer first token cost time

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
@@ -210,8 +210,16 @@ class PrefixCacheManager:
|
||||
update cache config
|
||||
"""
|
||||
self.cache_config = cache_config
|
||||
self.num_gpu_blocks = cache_config.prefill_kvcache_block_num
|
||||
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1)) # 服务端管理的GPU上剩余的block id
|
||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
self.num_gpu_blocks = cache_config.total_block_num
|
||||
self.gpu_free_block_list = list(
|
||||
range(self.num_gpu_blocks - 1, -1, -1)
|
||||
) # All gpu blocks are managed by cache manager
|
||||
else:
|
||||
self.num_gpu_blocks = cache_config.prefill_kvcache_block_num
|
||||
self.gpu_free_block_list = list(
|
||||
range(self.num_gpu_blocks - 1, -1, -1)
|
||||
) # Only block table divided for prefill managed by server
|
||||
|
||||
heapq.heapify(self.gpu_free_block_list)
|
||||
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
|
||||
@@ -231,6 +239,15 @@ class PrefixCacheManager:
|
||||
self.transfer_recv_thread = threading.Thread(target=self.recv_data_transfer_result)
|
||||
self.transfer_recv_thread.start()
|
||||
|
||||
def can_allocate_gpu_blocks(self, num_blocks: int) -> bool:
    """
    Check if num_blocks gpu blocks can be allocated.

    Only compares num_blocks with the current length of
    self.gpu_free_block_list; the free list itself is not modified.

    Args:
        num_blocks: number of gpu blocks the caller wants to allocate.

    Returns:
        True if the free list holds at least num_blocks entries, else False.
    """
    # The comparison already yields the bool; no if/else branch is needed.
    return len(self.gpu_free_block_list) >= num_blocks
|
||||
|
||||
def allocate_gpu_blocks(self, num_blocks):
|
||||
"""
|
||||
allocate gpu blocks.
|
||||
|
Reference in New Issue
Block a user