[Feature] Set prefix caching as default (#3814)

* Set prefix caching as default

* Set prefix caching as default

* Set prefix caching as default

* skip the dynamic weight loading scenario

* fix kill bug

* fix kill bug

* fix kill bug

* fix

* fix

* fix ci
This commit is contained in:
chenjian
2025-09-16 20:34:27 +08:00
committed by GitHub
parent de8638b1e9
commit 67e6d8c691
5 changed files with 23 additions and 8 deletions

View File

@@ -14,6 +14,7 @@
# limitations under the License.
"""
import argparse
import json
from dataclasses import asdict, dataclass
from dataclasses import fields as dataclass_fields
@@ -190,7 +191,7 @@ class EngineArgs:
"""
Flag to indicate whether to use warm-up before inference.
"""
enable_prefix_caching: bool = False
enable_prefix_caching: bool = True
"""
Flag to enable prefix caching.
"""
@@ -387,6 +388,16 @@ class EngineArgs:
"""
if not self.tokenizer:
self.tokenizer = self.model
if self.splitwise_role == "decode":
self.enable_prefix_caching = False
if self.speculative_config is not None:
self.enable_prefix_caching = False
if self.enable_mm:
self.enable_prefix_caching = False
if not current_platform.is_cuda():
self.enable_prefix_caching = False
if self.dynamic_load_weight:
self.enable_prefix_caching = False
if self.enable_logprob:
if self.speculative_config is not None:
raise NotImplementedError("Logprob does not support speculation_config.")
@@ -725,7 +736,7 @@ class EngineArgs:
perf_group = parser.add_argument_group("Performance Tuning")
perf_group.add_argument(
"--enable-prefix-caching",
action="store_true",
action=argparse.BooleanOptionalAction,
default=EngineArgs.enable_prefix_caching,
help="Flag to enable prefix caching.",
)