[Feature] Enable prefix caching as default (#3816)

* [Feature] Enable prefix caching as default
* [Feature] Enable prefix caching as default
* Set prefix caching as default
* skip dynamic load
* fix kill bug
* fix kill bug
* fix kill bug
* fix ci
* fix

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-10-21 15:49:31 +08:00 · 2025-09-06 09:51:34 +08:00
parent 11b18e5ef0
commit 41cd3e24c9
6 changed files with 37 additions and 5 deletions
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """

+import argparse
 import json
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
@@ -190,7 +191,7 @@ class EngineArgs:
    """
    Flag to indicate whether to use warm-up before inference.
    """
-    enable_prefix_caching: bool = False
+    enable_prefix_caching: bool = True
    """
    Flag to enable prefix caching.
    """
@@ -387,6 +388,16 @@ class EngineArgs:
        """
        if not self.tokenizer:
            self.tokenizer = self.model
+        if self.splitwise_role == "decode":
+            self.enable_prefix_caching = False
+        if self.speculative_config is not None:
+            self.enable_prefix_caching = False
+        if self.enable_mm:
+            self.enable_prefix_caching = False
+        if not current_platform.is_cuda():
+            self.enable_prefix_caching = False
+        if self.dynamic_load_weight:
+            self.enable_prefix_caching = False
        if self.enable_logprob:
            if self.speculative_config is not None:
                raise NotImplementedError("Logprob does not support speculation_config.")
@@ -725,7 +736,7 @@ class EngineArgs:
        perf_group = parser.add_argument_group("Performance Tuning")
        perf_group.add_argument(
            "--enable-prefix-caching",
-            action="store_true",
+            action=argparse.BooleanOptionalAction,
            default=EngineArgs.enable_prefix_caching,
            help="Flag to enable prefix caching.",
        )
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -342,7 +342,8 @@ class LLMEngine:
            for p in self.cache_manager_processes:
                llm_logger.info(f"Killing cache manager process {p.pid}")
                try:
-                    os.killpg(p.pid, signal.SIGTERM)
+                    pgid = os.getpgid(p.pid)
+                    os.killpg(pgid, signal.SIGTERM)
                except Exception as e:
                    console_logger.error(
                        f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}"