[Cherry-Pick] [BugFix] fix scheduler hang when input length is very close to max_model_len (#5394)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled

* [fix] fix scheduler hang when input length is very close to max_model_len

* [fix] update local_scheduler for v1 scheduler

* [fix] code style
This commit is contained in:
Yonghua Li
2025-12-05 21:51:59 +08:00
committed by GitHub
parent bce3739a57
commit 6961130e04
2 changed files with 4 additions and 3 deletions

View File

@@ -574,7 +574,7 @@ class EngineSevice:
         tasks = self.scheduler.get_requests(
             available_blocks=available_blocks,
             block_size=self.cfg.cache_config.block_size,
-            reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
+            reserved_output_blocks=0,  # self.cfg.cache_config.enc_dec_block_num,
             max_num_batched_tokens=self.cfg.max_model_len,
             batch=num_prefill_batch,
         )

View File

@@ -18,6 +18,7 @@
 import threading
 import time
 from typing import Dict, List, Optional, Tuple

+from fastdeploy import envs
 from fastdeploy.engine.request import Request, RequestOutput
 from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
 from fastdeploy.utils import scheduler_logger
@@ -253,12 +254,11 @@ class LocalScheduler:
         for request_id in batch_ids:
             request = self.requests[request_id]
             required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size)
-            current_prefill_tokens += request.prompt_tokens_ids_len
             required_total_blocks += required_input_blocks + reserved_output_blocks
             if required_total_blocks > available_blocks:
                 break
-            if self.enable_chunked_prefill:
+            if not envs.ENABLE_V1_KVCACHE_SCHEDULER and self.enable_chunked_prefill:
                 if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
                     # 长请求 (long request)
                     long_partial_requests += 1
@@ -274,6 +274,7 @@ class LocalScheduler:
                     break
             requests.append(request.raw)
+            current_prefill_tokens += request.prompt_tokens_ids_len
         self.ids_read_cursor += len(requests)

         if len(batch_ids) > 0 and len(requests) == 0: