Mirror of https://github.com/PaddlePaddle/FastDeploy.git
@@ -591,8 +591,8 @@ class GPUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
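For reference, a minimal standalone sketch of the free-list arithmetic after the rename. The scalar values below are illustrative assumptions, not values read from the config; only the parameter names (total_block_num, kv_cache_ratio) and the range construction come from the hunk above.

import paddle

# Illustrative assumptions; in FastDeploy these come from parallel_config.
total_block_num = 1000
kv_cache_ratio = 0.75

# Blocks in [int(total_block_num * kv_cache_ratio), total_block_num) start out
# free, listed in descending order.
free_list = list(
    range(
        total_block_num - 1,
        int(total_block_num * kv_cache_ratio) - 1,
        -1,
    )
)

free_list_len = len(free_list)              # 250 with the values above
free_list_tensor = paddle.to_tensor(free_list)
print(free_list_len, free_list[0], free_list[-1])   # 250 999 750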
@@ -1295,7 +1295,7 @@ class GPUModelRunner(ModelRunnerBase):
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         # TODO(gongshaotian): Optimize the management logic of kvcache
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache