diff --git a/docs/zh/offline_inference.md b/docs/zh/offline_inference.md
index adac6ff28..7cd46a45d 100644
--- a/docs/zh/offline_inference.md
+++ b/docs/zh/offline_inference.md
@@ -92,7 +92,7 @@
 from PIL import Image
 from fastdeploy.entrypoints.llm import LLM
 from fastdeploy.engine.sampling_params import SamplingParams
-from fastdeploy.input.ernie_tokenizer import Ernie4_5Tokenizer
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
 
 PATH = "baidu/ERNIE-4.5-VL-28B-A3B-Paddle"
 tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index ce83df4f2..653d3813c 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1257,6 +1257,8 @@ class FDConfig:
         self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs)
         self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
 
+        if self.model_config is not None and self.model_config.enable_mm:
+            self.cache_config.enable_prefix_caching = False
 
         if self.guided_decoding_backend == "auto":
             if self.model_config.enable_mm:
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 21ebd6de1..11da1a8fb 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -394,8 +394,6 @@ class EngineArgs:
             self.enable_prefix_caching = False
         if self.speculative_config is not None:
             self.enable_prefix_caching = False
-        if self.enable_mm:
-            self.enable_prefix_caching = False
         if not current_platform.is_cuda():
             self.enable_prefix_caching = False
         # if self.dynamic_load_weight: