[Feature] Support using prefix-caching + cudagraph for inference (#2924)

* fix the bug in cudagraph+prefix-caching but still have some bug with profile Change-Id: Ibf2ba3f2e3b08641d03f4b1391d7c862c3efa397 * add the signal to make sure cache manager launched * fix judge condition * reomove useless control * update control stream * update * fix xpu * change the do_profile flag * update * add new threads to init cache_manager --------- Co-authored-by: RAM <gstian5555@outlook.com>
2025-10-05 08:37:06 +08:00 · 2025-07-22 15:59:45 +08:00
parent 48e6a0ca26
commit 89a485b69f
11 changed files with 63 additions and 65 deletions
--- a/fastdeploy/worker/gcu_worker.py
+++ b/fastdeploy/worker/gcu_worker.py
@@ -98,9 +98,9 @@ class GcuWorker(WorkerBase):
        """ """
        return self.model_runner.get_model()

-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
+    def initialize_cache(self, num_gpu_blocks: int) -> None:
        """ """
-        pass
+        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)

    def execute_model(
        self,
@@ -134,7 +134,3 @@ class GcuWorker(WorkerBase):
    def cal_theortical_kvcache(self) -> int:
        """ """
        return self.model_runner.cal_theortical_kvcache()
-
-    def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
-        """ """
-        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)