[Feature] Support using prefix-caching + cudagraph for inference (#2924)

* fix the bug in cudagraph + prefix-caching; some bugs remain in the profile path

Change-Id: Ibf2ba3f2e3b08641d03f4b1391d7c862c3efa397

* add a signal to make sure the cache manager has launched (see the sketch after this message)

* fix the judgment condition

* remove useless control logic

* update control stream

* update

* fix XPU

* change the do_profile flag

* update

* add new threads to initialize the cache_manager

---------

Co-authored-by: RAM <gstian5555@outlook.com>
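
Two of the bullets above (the launch signal and the init threads) describe one mechanism: cache-manager startup moves to a background thread, and a signal gates anything that depends on it. A minimal sketch of that pattern, using hypothetical names (launch_cache_manager, cache_ready_signal) rather than the repository's actual ones:

import threading

cache_ready_signal = threading.Event()

def launch_cache_manager() -> None:
    # ... start the KV cache manager process here (assumed) ...
    cache_ready_signal.set()  # signal that the cache manager has launched

threading.Thread(target=launch_cache_manager, daemon=True).start()
cache_ready_signal.wait()  # block until the manager is up before proceeding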
Zero Rains authored on 2025-07-22 15:59:45 +08:00; committed by GitHub
parent 48e6a0ca26
commit 89a485b69f
11 changed files with 63 additions and 65 deletions


@@ -151,8 +151,6 @@ class GCUModelRunner(ModelRunnerBase):
"""
Process inputs for prefill tasks and insert it to share_inputs buffer
"""
if "caches" not in self.share_inputs:
self.initialize_kv_cache()
if req_dicts[-1].disaggregate_info is not None and req_dicts[-1].disaggregate_info["role"] == "prefill":
os.environ["PREFILL_NODE_ONE_STEP_STOP"] = "1"
@@ -561,7 +559,7 @@ class GCUModelRunner(ModelRunnerBase):
         self.initialize_kv_cache()
         self.dynamic_weight_manager._log_memory("dynamic weight manager update all memory")

-    def initialize_kv_cache(self) -> None:
+    def initialize_kv_cache(self, profile: bool = False) -> None:
         """
         Initialize kv cache
         """
@@ -582,7 +580,7 @@ class GCUModelRunner(ModelRunnerBase):
         kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=max_block_num)
         # local_rank = self.local_rank % self.parallel_config.tensor_parallel_size

-        if not self.parallel_config.do_profile and (
+        if not profile and (
             self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"
         ):
             raise NotImplementedError("prefix_caching is not support by GCUModelRunner.")
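
Taken together, the two hunks above are the core refactor: profiling stops being mutable shared state (parallel_config.do_profile) and becomes an explicit keyword argument, so the guard reads the caller's intent directly. A self-contained sketch of the resulting shape (_RunnerSketch and its config stub are stand-ins, not the real GCUModelRunner):

from types import SimpleNamespace

class _RunnerSketch:
    # Toy stand-in for GCUModelRunner, only to show the new call shape.
    def __init__(self) -> None:
        self.parallel_config = SimpleNamespace(
            enable_prefix_caching=True, splitwise_role="mixed"
        )

    def initialize_kv_cache(self, profile: bool = False) -> None:
        # The guard now reads the explicit argument, not a mutable flag.
        if not profile and (
            self.parallel_config.enable_prefix_caching
            or self.parallel_config.splitwise_role != "mixed"
        ):
            raise NotImplementedError("prefix_caching is not supported here")
        # ... allocate the KV cache tensors here (elided) ...

_RunnerSketch().initialize_kv_cache(profile=True)  # profile runs opt in explicitly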
@@ -1012,7 +1010,7 @@ class GCUModelRunner(ModelRunnerBase):
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         self.num_gcu_blocks = self.parallel_config.total_block_num
-        self.initialize_kv_cache()
+        self.initialize_kv_cache(profile=True)

         # 1. Profile with multimodal encoder & encoder cache
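
With the new signature, the profile entry point states its mode at the call site instead of toggling a flag beforehand. Roughly, under assumed names and with the dummy measurement pass elided:

def profile_run_entry(runner) -> None:
    # Size the cache to the full block budget for the measurement pass;
    # it is rebuilt at the measured size afterwards (next hunk).
    runner.num_gcu_blocks = runner.parallel_config.total_block_num
    runner.initialize_kv_cache(profile=True)
    # ... run a dummy forward pass to measure peak memory (assumed) ...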
@@ -1038,8 +1036,7 @@ class GCUModelRunner(ModelRunnerBase):
         self.num_gcu_blocks = num_gpu_blocks

         # Reset block table and kv cache with global block num
-        if not (self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"):
-            self.initialize_kv_cache()
+        self.initialize_kv_cache()

         # Reset free list
         free_list = list(
@@ -1057,8 +1054,6 @@ class GCUModelRunner(ModelRunnerBase):
             }
         )

-        self.parallel_config.do_profile = False
-
         if self.speculative_method in ["mtp"]:
             self.proposer.update_block_num(num_gpu_blocks)
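
The last two hunks close the loop: after profiling, the KV cache is rebuilt unconditionally (the old guard that skipped this under prefix caching is gone), and nothing resets a do_profile flag anymore, which is what lets prefix caching and CUDA graph capture share one initialization path. A condensed sketch under those assumptions (finish_profile and runner are hypothetical):

def finish_profile(runner, num_gpu_blocks: int) -> None:
    runner.num_gcu_blocks = num_gpu_blocks
    # Rebuild the KV cache with the measured block count, with no
    # prefix-caching exemption.
    runner.initialize_kv_cache()
    # Absent by design: `runner.parallel_config.do_profile = False`;
    # profile mode is now an argument, not persistent state.
    if runner.speculative_method in ["mtp"]:
        runner.proposer.update_block_num(num_gpu_blocks)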