mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-04 08:16:42 +08:00
@@ -480,8 +480,8 @@ class GCUModelRunner(ModelRunnerBase):
|
||||
# Initialize free list
|
||||
free_list = list(
|
||||
range(
|
||||
- self.parallel_config.max_block_num - 1,
|
||||
- int(self.parallel_config.max_block_num *
|
||||
+ self.parallel_config.total_block_num - 1,
|
||||
+ int(self.parallel_config.total_block_num *
|
||||
self.parallel_config.kv_cache_ratio) - 1, -1))
|
||||
self.free_list_len = len(free_list)
|
||||
self.share_inputs["free_list"] = paddle.to_tensor(free_list,
|
||||
@@ -1114,7 +1114,7 @@ class GCUModelRunner(ModelRunnerBase):
|
||||
"""Execute a forward pass with dummy inputs to profile the memory usage of the model."""
|
||||
|
||||
# Initialize kv cache for profile run. After profile run kv cache will be reset.
|
||||
- self.num_gcu_blocks = self.parallel_config.max_block_num
|
||||
+ self.num_gcu_blocks = self.parallel_config.total_block_num
|
||||
self.initialize_kv_cache()
|
||||
|
||||
# 1. Profile with multimodal encoder & encoder cache
|
||||
|
Reference in New Issue
Block a user