From 6961130e046e2e400a33c8493089f6dd0200daba Mon Sep 17 00:00:00 2001
From: Yonghua Li <39643373+liyonghua0910@users.noreply.github.com>
Date: Fri, 5 Dec 2025 21:51:59 +0800
Subject: [PATCH] [Cherry-Pick] [BugFix] fix scheduler hang when input length is very close to max_model_len (#5394)

* [fix] fix scheduler hang when input length is very close to max_model_len

* [fix] update local_scheduler for v1 scheduler

* [fix] code style
---
 fastdeploy/engine/common_engine.py      | 2 +-
 fastdeploy/scheduler/local_scheduler.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index 3b77737d9..f2a37821f 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -574,7 +574,7 @@ class EngineSevice:
                 tasks = self.scheduler.get_requests(
                     available_blocks=available_blocks,
                     block_size=self.cfg.cache_config.block_size,
-                    reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
+                    reserved_output_blocks=0,  # self.cfg.cache_config.enc_dec_block_num,
                     max_num_batched_tokens=self.cfg.max_model_len,
                     batch=num_prefill_batch,
                 )
diff --git a/fastdeploy/scheduler/local_scheduler.py b/fastdeploy/scheduler/local_scheduler.py
index 159dd447d..0b4feae96 100644
--- a/fastdeploy/scheduler/local_scheduler.py
+++ b/fastdeploy/scheduler/local_scheduler.py
@@ -18,6 +18,7 @@ import threading
 import time
 from typing import Dict, List, Optional, Tuple
 
+from fastdeploy import envs
 from fastdeploy.engine.request import Request, RequestOutput
 from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
 from fastdeploy.utils import scheduler_logger
@@ -253,12 +254,11 @@ class LocalScheduler:
             for request_id in batch_ids:
                 request = self.requests[request_id]
                 required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size)
-                current_prefill_tokens += request.prompt_tokens_ids_len
                 required_total_blocks += required_input_blocks + reserved_output_blocks
                 if required_total_blocks > available_blocks:
                     break
 
-                if self.enable_chunked_prefill:
+                if not envs.ENABLE_V1_KVCACHE_SCHEDULER and self.enable_chunked_prefill:
                     if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
                         # 长请求
                         long_partial_requests += 1
@@ -274,6 +274,7 @@ class LocalScheduler:
                         break
 
                 requests.append(request.raw)
+                current_prefill_tokens += request.prompt_tokens_ids_len
             self.ids_read_cursor += len(requests)
 
         if len(batch_ids) > 0 and len(requests) == 0: