[Feature] Support using prefix-caching + cudagraph for inference (#2924)

* fix the bug in cudagraph + prefix-caching, though some bugs remain with profiling

Change-Id: Ibf2ba3f2e3b08641d03f4b1391d7c862c3efa397

* add a signal to make sure the cache manager has launched (sketched below, after the commit metadata)

* fix the condition check

* remove useless control logic

* update control stream

* update

* fix xpu

* change the do_profile flag

* update

* add new threads to initialize the cache_manager

---------

Co-authored-by: RAM <gstian5555@outlook.com>
Author: Zero Rains
Date: 2025-07-22 15:59:45 +08:00
Committed by: GitHub
Parent: 48e6a0ca26
Commit: 89a485b69f
11 changed files with 63 additions and 65 deletions
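
The "signal" and "new threads" bullets in the commit message describe a launch handshake: cache-manager initialization moves onto its own thread, and a signal lets the rest of startup wait until it has actually launched. Below is a minimal sketch of that pattern using Python's threading module; this is not the actual FastDeploy code, and the names cache_manager_ready and launch_cache_manager are hypothetical.

    import threading

    # Hypothetical stand-ins for the commit's cache-manager launch handshake.
    cache_manager_ready = threading.Event()

    def launch_cache_manager(num_gpu_blocks: int) -> None:
        # ... allocate KV cache blocks, start cache-manager processes, etc. ...
        cache_manager_ready.set()  # signal: the cache manager is now up

    # The commit moves this work onto a dedicated thread so startup can proceed.
    init_thread = threading.Thread(
        target=launch_cache_manager, args=(4096,), daemon=True
    )
    init_thread.start()

    # Dependent code blocks here until the cache manager has actually launched.
    cache_manager_ready.wait()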


@@ -165,9 +165,10 @@ class GpuWorker(WorkerBase):
         """Get current model"""
         return self.model_runner.get_model()
 
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
-        """Initialize the KV Cache"""
-        pass
+    def initialize_cache(self, num_gpu_blocks: int) -> None:
+        """Initialize the KV Cache with the accurate num_gpu_blocks"""
+        # accurate cache size
+        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
 
     def execute_model(
         self,
@@ -198,7 +199,3 @@ class GpuWorker(WorkerBase):
     def cal_theortical_kvcache(self) -> int:
         """Calculate the block memory required"""
         return self.model_runner.cal_theortical_kvcache()
-
-    def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
-        """Reinitialize the kv cache using the parameters from the profile"""
-        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
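
After this diff, the worker has a single initialize_cache(num_gpu_blocks) entry point: the accurate block count measured during profiling is passed in directly, and the separate reinitialize_kv_cache step is gone. The following toy, self-contained sketch shows the intended call order, tying in the do_profile flag from the commit message; ToyWorker, setup_cache, and all numbers are hypothetical stand-ins, not the real GpuWorker API.

    # Toy sketch (hypothetical names, illustrative numbers) of the flow this
    # diff moves to: profile once, derive an accurate num_gpu_blocks, then
    # call the single initialize_cache entry point.

    class ToyWorker:
        def cal_theortical_kvcache(self) -> int:
            """Per-block KV cache footprint in bytes (stand-in value)."""
            return 2 * 1024 * 1024  # 2 MiB per block, purely illustrative

        def initialize_cache(self, num_gpu_blocks: int) -> None:
            """Initialize the KV cache with the accurate block count."""
            print(f"allocating {num_gpu_blocks} KV cache blocks")

    def setup_cache(worker: ToyWorker, do_profile: bool,
                    free_gpu_bytes: int, default_blocks: int = 2048) -> None:
        if do_profile:
            # Accurate count: blocks that fit in the memory left after profiling.
            num_gpu_blocks = free_gpu_bytes // worker.cal_theortical_kvcache()
        else:
            num_gpu_blocks = default_blocks
        worker.initialize_cache(num_gpu_blocks=num_gpu_blocks)

    setup_cache(ToyWorker(), do_profile=True, free_gpu_bytes=8 * 1024**3)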