diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index c6811e351..306423aaf 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -398,6 +398,8 @@ class EngineArgs: envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 if not current_platform.is_cuda(): envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 + if self.guided_decoding_backend != "off": + envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 8ba805023..8a0ff6f09 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -755,6 +755,9 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: if not current_platform.is_cuda(): logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.") envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 + if parallel_config.guided_decoding_backend != "off": + logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported guided_decoding.") + envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 fd_config = FDConfig( model_config=model_config,