diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 8a6bd1378..2f3710bfa 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -1028,7 +1028,10 @@ class EngineArgs:
                 if paddle.is_compiled_with_xpu():
                     self.max_num_batched_tokens = self.max_model_len
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    if speculative_cfg is not None and speculative_cfg.method is not None:
+                        self.max_num_batched_tokens = self.max_model_len
+                    else:
+                        self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.enable_chunked_prefill:
                     self.max_num_batched_tokens = 2048
diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py
index 12eb0c687..73585ef77 100644
--- a/fastdeploy/output/token_processor.py
+++ b/fastdeploy/output/token_processor.py
@@ -332,6 +332,9 @@ class TokenProcessor:
                             + accept_num[i]
                         ].tolist()
                         if len(token_ids) == 0 or token_ids[-1] <= 0:
+                            if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+                                if task_id in self.resource_manager.to_be_rescheduled_request_id_set:
+                                    self.resource_manager.reschedule_preempt_task(task_id)
                             continue
                         else:
                             token_id = int(tokens[i, 0])
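The first hunk changes how `EngineArgs` defaults `max_num_batched_tokens`: when a speculative decoding method is configured, the per-step token budget now defaults to `max_model_len` instead of the fixed 8192 cap. Below is a minimal standalone sketch of the resulting decision table, not the actual method; the outer-branch condition (`use_v1_kvcache_scheduler`) and the final fall-through default are assumptions, since the hunk truncates the surrounding context.

```python
from typing import Optional


def default_max_num_batched_tokens(
    max_model_len: int,
    enable_chunked_prefill: bool,
    is_xpu: bool,
    speculative_method: Optional[str],
    use_v1_kvcache_scheduler: bool,  # ASSUMPTION: outer condition not visible in the hunk
) -> int:
    """Sketch of the post-PR defaulting logic for max_num_batched_tokens."""
    if use_v1_kvcache_scheduler:
        if is_xpu:
            return max_model_len
        if speculative_method is not None:
            # New in this PR: speculative decoding verifies a window of draft
            # tokens per step, so the budget must cover the full context.
            return max_model_len
        return 8192  # non-speculative GPU default; max_model_len easily OOMs
    if enable_chunked_prefill:
        return 2048  # prompts are processed in small chunks
    return max_model_len  # ASSUMPTION: fall-through not shown in the hunk


# Example: a 32k-context model with a speculative method (e.g. "mtp") now
# gets a 32k token budget instead of the old 8192 cap.
assert default_max_num_batched_tokens(32768, False, False, "mtp", True) == 32768
assert default_max_num_batched_tokens(32768, False, False, None, True) == 8192
```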
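The second hunk plugs a leak in `TokenProcessor`: under the V1 KV cache scheduler, a request that produced no valid tokens in a step was skipped with `continue`, so a preempted request marked for rescheduling was never re-queued. The added guard hands it back via `reschedule_preempt_task`. The toy below illustrates that re-queue pattern under stated assumptions; only `to_be_rescheduled_request_id_set` and `reschedule_preempt_task` come from the diff, while `ToyResourceManager`, its `waiting` queue, and `preempt` are hypothetical stand-ins.

```python
from collections import deque


class ToyResourceManager:
    """Hypothetical stand-in for the V1 scheduler's resource manager."""

    def __init__(self) -> None:
        self.waiting: deque = deque()
        self.to_be_rescheduled_request_id_set: set = set()

    def preempt(self, task_id: str) -> None:
        # The scheduler evicted this request's KV cache; mark it for replay.
        self.to_be_rescheduled_request_id_set.add(task_id)

    def reschedule_preempt_task(self, task_id: str) -> None:
        # Hand the preempted request back to the waiting queue so its
        # prefill is re-run, instead of the request being dropped.
        if task_id in self.to_be_rescheduled_request_id_set:
            self.to_be_rescheduled_request_id_set.discard(task_id)
            self.waiting.append(task_id)


rm = ToyResourceManager()
rm.preempt("req-1")

# The token processor sees an empty (or invalid) token batch for req-1:
token_ids = []
if not token_ids or token_ids[-1] <= 0:
    if "req-1" in rm.to_be_rescheduled_request_id_set:
        rm.reschedule_preempt_task("req-1")  # this hand-back was missing before this PR

assert list(rm.waiting) == ["req-1"]
```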