From dbedb0797b242b935edd74f29fd22f95d62d39b8 Mon Sep 17 00:00:00 2001
From: Daci <15625257+ST-XX@users.noreply.github.com>
Date: Fri, 12 Dec 2025 17:43:29 +0800
Subject: [PATCH] [BugFix] reschedule_preempt_task append waiting & PREEMPTED
 blocksize (#5506)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* bugfix reschedule_preempt_task append waiting & PREEMPTED blocksize

* bugfix reschedule_preempt_task append waiting & PREEMPTED blocksize

* comments

* [bugfix] PREEMPTED task blocksize

* Apply suggestion from @Copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 fastdeploy/engine/sched/resource_manager_v1.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index 0d4c062af..e5d1aee80 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -261,6 +261,7 @@ class ResourceManagerV1(ResourceManager):
                 self.running.insert(0, preempted_req)
                 continue
             preempted_req.status = RequestStatus.PREEMPTED
+            preempted_req.last_preempted_blocksize = len(preempted_req.block_tables)
             preempted_req.num_computed_tokens = 0
             if self.config.scheduler_config.splitwise_role == "decode":
                 self.tasks_list[preempted_req.idx] = None
@@ -735,6 +736,14 @@ class ResourceManagerV1(ResourceManager):
                     break
                 num_new_tokens = self._get_num_new_tokens(request, token_budget)
                 num_new_block = self.get_new_block_nums(request, num_new_tokens)
+                # If num_new_block is less than the last preempted block size, use the last preempted block size instead.
+                # For normal requests, when allocating blocks, we reserve two extra blocks for decoding.
+                # In the request rescheduling scenario, we currently only consider the number of tokens already generated,
+                # which might lead to allocating fewer blocks than the previous allocation, causing repeated rescheduling.
+                # This adjustment ensures we at least allocate as many blocks as before to avoid this issue.
+                last_preempted_blocksize = getattr(request, "last_preempted_blocksize", 0)
+                if num_new_block < last_preempted_blocksize:
+                    num_new_block = last_preempted_blocksize
                 # Allocate blocks to prefill
                 if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
                     if not request.get("skip_allocate", False):
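
For context, a minimal runnable sketch of the bookkeeping this patch introduces: record how many blocks a request held when it was preempted, then clamp the block count requested on reschedule so it is never smaller than that. The SimpleRequest class, preempt(), blocks_for_reschedule(), and the 64-token block size below are illustrative assumptions, not FastDeploy's actual ResourceManagerV1 or cache-manager API; only the attribute names block_tables and last_preempted_blocksize mirror the diff.

    # Illustrative sketch only; names other than `block_tables` and
    # `last_preempted_blocksize` are hypothetical stand-ins.
    class SimpleRequest:
        def __init__(self, request_id):
            self.request_id = request_id
            self.block_tables = []          # GPU blocks currently held by the request
            self.num_computed_tokens = 0
            self.last_preempted_blocksize = 0

    def preempt(request):
        """Record how many blocks the request held before they are reclaimed."""
        request.last_preempted_blocksize = len(request.block_tables)
        request.num_computed_tokens = 0
        request.block_tables = []

    def blocks_for_reschedule(request, num_new_tokens, block_size=64):
        """Blocks to request when bringing the task back, never fewer than before.

        Without the clamp, a rescheduled request that previously received extra
        decode blocks could be granted fewer blocks than it held before, get
        preempted again as soon as decoding resumes, and loop.
        """
        num_new_block = -(-num_new_tokens // block_size)  # ceil division
        last = getattr(request, "last_preempted_blocksize", 0)
        return max(num_new_block, last)

    if __name__ == "__main__":
        req = SimpleRequest("req-0")
        req.block_tables = list(range(10))  # e.g. prompt blocks plus reserved decode blocks
        preempt(req)
        # 500 tokens alone would need only 8 blocks of 64, but the request
        # previously held 10, so at least 10 are requested on reschedule.
        print(blocks_for_reschedule(req, num_new_tokens=500))  # -> 10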