diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 3b77737d9..f2a37821f 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -574,7 +574,7 @@ class EngineSevice: tasks = self.scheduler.get_requests( available_blocks=available_blocks, block_size=self.cfg.cache_config.block_size, - reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, + reserved_output_blocks=0, # self.cfg.cache_config.enc_dec_block_num, max_num_batched_tokens=self.cfg.max_model_len, batch=num_prefill_batch, ) diff --git a/fastdeploy/scheduler/local_scheduler.py b/fastdeploy/scheduler/local_scheduler.py index 159dd447d..0b4feae96 100644 --- a/fastdeploy/scheduler/local_scheduler.py +++ b/fastdeploy/scheduler/local_scheduler.py @@ -18,6 +18,7 @@ import threading import time from typing import Dict, List, Optional, Tuple +from fastdeploy import envs from fastdeploy.engine.request import Request, RequestOutput from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse from fastdeploy.utils import scheduler_logger @@ -253,12 +254,11 @@ class LocalScheduler: for request_id in batch_ids: request = self.requests[request_id] required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size) - current_prefill_tokens += request.prompt_tokens_ids_len required_total_blocks += required_input_blocks + reserved_output_blocks if required_total_blocks > available_blocks: break - if self.enable_chunked_prefill: + if not envs.ENABLE_V1_KVCACHE_SCHEDULER and self.enable_chunked_prefill: if request.prompt_tokens_ids_len > self.long_prefill_token_threshold: # 长请求 long_partial_requests += 1 @@ -274,6 +274,7 @@ class LocalScheduler: break requests.append(request.raw) + current_prefill_tokens += request.prompt_tokens_ids_len self.ids_read_cursor += len(requests) if len(batch_ids) > 0 and len(requests) == 0: