diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py
index 03dffe906..14c7bcb00 100644
--- a/fastdeploy/cache_manager/prefix_cache_manager.py
+++ b/fastdeploy/cache_manager/prefix_cache_manager.py
@@ -230,7 +230,7 @@ class PrefixCacheManager:
         while np.sum(self.cache_ready_signal.value) != tensor_parallel_size:
             time.sleep(1)
 
-        if cache_config.swap_space is not None and cache_config.swap_space > 0:
+        if cache_config.enable_hierarchical_cache and self.num_cpu_blocks > 0:
             while np.sum(self.swap_space_ready_signal.value) != tensor_parallel_size:
                 time.sleep(1)
 
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 4f0ee57cb..2a1c9dc5a 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -385,6 +385,7 @@ class EngineArgs:
         """
         Post-initialization processing to set default tokenizer if not provided.
         """
+        from fastdeploy.utils import llm_logger
         if not self.tokenizer:
             self.tokenizer = self.model
         if self.enable_logprob:
diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index 3905b3fe6..f8f17eda4 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -163,15 +163,7 @@ class LLMEngine:
             self._stop_profile()
         elif self.cfg.cache_config.enable_prefix_caching:
             device_ids = self.cfg.device_ids.split(",")
-            self.cache_manager_processes = self.resource_manager.cache_manager.launch_cache_manager(
-                cache_config=self.cfg.cache_config,
-                tensor_parallel_size=self.cfg.tensor_parallel_size,
-                device_ids=device_ids,
-                pod_ip=self.cfg.master_ip,
-                engine_worker_queue_port=self.cfg.engine_worker_queue_port,
-                pid_suffix=self.ipc_signal_suffix,
-                create_cache_tensor=False,
-            )
+            self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, False)
 
         # Launch components: scheduler, cache_manager, expert_service et.al.
         self.launch_components()
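
Note: the engine.py hunk replaces the inline launch_cache_manager call with self.engine.start_cache_service(...). The sketch below is an assumption about what that helper could look like, reconstructed from the arguments of the removed call; it is not part of this diff and the actual implementation may differ.

    def start_cache_service(self, device_ids, ipc_signal_suffix, create_cache_tensor):
        # Hypothetical wrapper (illustration only): forwards to the cache manager
        # with the same arguments the removed LLMEngine code passed directly.
        return self.resource_manager.cache_manager.launch_cache_manager(
            cache_config=self.cfg.cache_config,
            tensor_parallel_size=self.cfg.tensor_parallel_size,
            device_ids=device_ids,
            pod_ip=self.cfg.master_ip,
            engine_worker_queue_port=self.cfg.engine_worker_queue_port,
            pid_suffix=ipc_signal_suffix,
            create_cache_tensor=create_cache_tensor,
        )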