[Feature] Support using prefix-caching + cudagraph for inference (#2924)

* fix the bug in cudagraph + prefix-caching; a bug with profiling still remains

Change-Id: Ibf2ba3f2e3b08641d03f4b1391d7c862c3efa397

* add a signal to make sure the cache manager has launched

* fix judge condition

* remove useless control

* update control stream

* update

* fix xpu

* change the do_profile flag

* update

* add new threads to init cache_manager

---------

Co-authored-by: RAM <gstian5555@outlook.com>
This commit is contained in:
Zero Rains
2025-07-22 15:59:45 +08:00
committed by GitHub
parent 48e6a0ca26
commit 89a485b69f
11 changed files with 63 additions and 65 deletions

View File

@@ -98,9 +98,9 @@ class GcuWorker(WorkerBase):
""" """
return self.model_runner.get_model()
def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
def initialize_cache(self, num_gpu_blocks: int) -> None:
""" """
pass
self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
def execute_model(
self,
@@ -134,7 +134,3 @@ class GcuWorker(WorkerBase):
def cal_theortical_kvcache(self) -> int:
""" """
return self.model_runner.cal_theortical_kvcache()
def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
""" """
self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)