[feat] support prefix cache clearing when /clear_load_weight is called (#4008)

* [feat] support clearing prefix cache (cherry-picked from release/2.1)
* [fix] fix ipc suffix, use port instead
* [fix] fix prefix caching not enabled
* [fix] fix key/value_cache_scales indent
* [fix] fix ep group all-reduce
* [fix] fix clear/update lock not working when workers > 1
* [chore] add preemption triggered info log
* [fix] fix code style
* [fix] fix max_num_seqs config
* [fix] do not force enable_prefix_caching=False in dynamic loading
* [fix] fix ci
* Revert "[fix] fix ci"
  This reverts commit 0bc6d55cc8.
* [fix] initialize available_gpu_block_num with max_gpu_block_num
* [fix] fix config splitwise_role
* [fix] fix clearing caches synchronization and add more logs
* [chore] print cache_ready_signal in log
* [fix] fix scheduler_config.splitwise_role
* [fix] fix cache_messager cache_ready_signal create=True
* [fix] stop cache messager from launching in mixed deployment
2025-10-06 17:17:14 +08:00 · 2025-09-28 19:42:53 +08:00
parent 59313ed7f9
commit 6265f4385f
20 changed files with 697 additions and 213 deletions
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -403,6 +403,7 @@ class EngineArgs:
        """
        Post-initialization processing to set default tokenizer if not provided.
        """
+
        if not self.tokenizer:
            self.tokenizer = self.model
        if self.splitwise_role == "decode":
@@ -411,8 +412,8 @@ class EngineArgs:
            self.enable_prefix_caching = False
        if not current_platform.is_cuda():
            self.enable_prefix_caching = False
-        if self.dynamic_load_weight:
-            self.enable_prefix_caching = False
+        # if self.dynamic_load_weight:
+        #     self.enable_prefix_caching = False
        if self.enable_logprob:
            if self.speculative_config is not None:
                raise NotImplementedError("Logprob does not support speculation_config.")