[Feature] Support caching KV cache for output tokens (#4535)

* [Feature] Support caching KV cache for output tokens * fix bug * fix ci bug * improve coverage * enable output caching by default * fix ci --------- Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-12-24 13:28:13 +08:00 · 2025-12-04 20:53:08 +08:00
parent b6f8069b36
commit 3878a99b69
7 changed files with 94 additions and 2 deletions
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -237,6 +237,10 @@ class EngineArgs:
    """
    Flag to enable prefix caching.
    """
+    enable_output_caching: bool = True
+    """
+    Flag to enable kv cache for output tokens, only valid in V1 scheduler.
+    """

    disable_custom_all_reduce: bool = False
    """
@@ -955,6 +959,13 @@ class EngineArgs:
            help="Flag to enable prefix caching.",
        )

+        perf_group.add_argument(
+            "--enable-output-caching",
+            action=argparse.BooleanOptionalAction,
+            default=EngineArgs.enable_output_caching,
+            help="Flag to enable output caching.",
+        )
+
        perf_group.add_argument(
            "--enable-chunked-prefill",
            action="store_true",