From 4178c110d23d1a62fcc70e6705da56a2e6c96390 Mon Sep 17 00:00:00 2001 From: ApplEOFDiscord <31272106+ApplEOFDiscord@users.noreply.github.com> Date: Thu, 16 Oct 2025 11:10:33 +0800 Subject: [PATCH] [Bug Fix] fix outdated doc and disable mm model prefix caching (#4425) * fix outdated doc and disable mm model prefix caching * fix outdated doc and disable mm model prefix caching --------- Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> --- docs/zh/offline_inference.md | 2 +- fastdeploy/config.py | 2 ++ fastdeploy/engine/args_utils.py | 2 -- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/zh/offline_inference.md b/docs/zh/offline_inference.md index adac6ff28..7cd46a45d 100644 --- a/docs/zh/offline_inference.md +++ b/docs/zh/offline_inference.md @@ -92,7 +92,7 @@ from PIL import Image from fastdeploy.entrypoints.llm import LLM from fastdeploy.engine.sampling_params import SamplingParams -from fastdeploy.input.ernie_tokenizer import Ernie4_5Tokenizer +from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer PATH = "baidu/ERNIE-4.5-VL-28B-A3B-Paddle" tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index ce83df4f2..653d3813c 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1257,6 +1257,8 @@ class FDConfig: self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs) self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) + if self.model_config is not None and self.model_config.enable_mm: + self.cache_config.enable_prefix_caching = False if self.guided_decoding_backend == "auto": if self.model_config.enable_mm: diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 21ebd6de1..11da1a8fb 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -394,8 +394,6 @@ class EngineArgs: self.enable_prefix_caching = False if self.speculative_config is not None: self.enable_prefix_caching = False - if self.enable_mm: - self.enable_prefix_caching = False if not current_platform.is_cuda(): self.enable_prefix_caching = False # if self.dynamic_load_weight: