[Feature] Support using prefix-caching + cudagraph for inference (#2924)
* fix the bug in cudagraph + prefix-caching (profiling still has a remaining issue)
  Change-Id: Ibf2ba3f2e3b08641d03f4b1391d7c862c3efa397
* add a signal to make sure the cache manager has launched
* fix the condition check
* remove useless control
* update the control stream
* update
* fix xpu
* change the do_profile flag
* update
* add new threads to init cache_manager

Co-authored-by: RAM <gstian5555@outlook.com>
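Two of the bullets above ("add a signal to make sure the cache manager has launched", "add new threads to init cache_manager") describe a launch handshake: the cache manager is started on its own thread, and the worker blocks on a signal before it proceeds to profiling or cudagraph capture. Below is a minimal sketch of that pattern, assuming a threading.Event-based handshake; CacheManagerLauncher, launch, and wait_until_ready are hypothetical names, not FastDeploy's actual API.

import threading

class CacheManagerLauncher:
    """Hypothetical sketch: start the KV-cache manager on a background
    thread and signal the worker once it is actually ready, so later
    steps do not race ahead of cache setup."""

    def __init__(self):
        # The "signal" from the commit message, modeled as an Event.
        self.ready_event = threading.Event()
        self._thread = None

    def launch(self, init_fn):
        def _run():
            init_fn()               # e.g. allocate KV blocks, start transfer loops
            self.ready_event.set()  # tell waiters the cache manager is live
        self._thread = threading.Thread(target=_run, daemon=True)
        self._thread.start()

    def wait_until_ready(self, timeout=None):
        if not self.ready_event.wait(timeout):
            raise TimeoutError("cache manager failed to launch in time")

The ordering matters because CUDA graph capture records fixed buffer addresses: the KV-cache buffers the cache manager allocates must exist, and stay put, before capture begins, which is presumably why the worker needs an explicit readiness signal rather than a fire-and-forget thread start.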
@@ -165,9 +165,10 @@ class GpuWorker(WorkerBase):
         """Get current model"""
         return self.model_runner.get_model()
 
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
-        """Initialize the KV Cache"""
-        pass
+    def initialize_cache(self, num_gpu_blocks: int) -> None:
+        """Initialize the KV Cache with accurate num_gpu_blocks"""
+        # accurate cache size
+        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
 
     def execute_model(
         self,
@@ -198,7 +199,3 @@ class GpuWorker(WorkerBase):
     def cal_theortical_kvcache(self) -> int:
         """Calculate the block memory required"""
         return self.model_runner.cal_theortical_kvcache()
-
-    def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
-        """Reinitialize the kv cache using the parameters from the profile"""
-        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
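Taken together, the two hunks collapse a two-step cache setup into one: initialize_cache is no longer a stub that a later reinitialize_kv_cache call had to patch up; it now resizes the shared-input KV block tables directly with the block count measured during profiling, and reinitialize_kv_cache is deleted. A rough sketch of the resulting call order follows; determine_num_available_blocks and capture_cudagraphs are hypothetical stand-ins for the profiling and graph-capture steps, not FastDeploy API.

def bring_up_worker(worker):
    """Hypothetical driver showing the post-commit flow: profile first,
    then size the KV cache exactly once with the measured block count."""
    # 1. Measure how many KV blocks fit in free GPU memory.
    #    (determine_num_available_blocks is a stand-in, not FastDeploy API.)
    num_gpu_blocks = worker.determine_num_available_blocks()

    # 2. One accurate sizing pass; replaces the old stub + reinitialize pair.
    worker.initialize_cache(num_gpu_blocks=num_gpu_blocks)

    # 3. Capture cudagraphs only after the cache shape is final, since a
    #    captured graph pins the buffer addresses it was recorded against.
    worker.capture_cudagraphs()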