From 3da9f01e1942fe02aca1ef088d60da837fe1d969 Mon Sep 17 00:00:00 2001 From: Yonghua Li <39643373+liyonghua0910@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:50:38 +0800 Subject: [PATCH] [BugFix] fix num_requests_running after clear_data (#4989) * [BugFix] fix num_requests_running after clear_data * [fix] fix tasks_list and stop flags not cleared when _free_blocks failed --- .../engine/sched/resource_manager_v1.py | 26 ++++++++++++------- fastdeploy/output/token_processor.py | 2 +- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index fdff2c0ba..433b9b8c5 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -473,14 +473,7 @@ class ResourceManagerV1(ResourceManager): if scheduled_reqs: llm_logger.debug(f"schedued_reqs: {scheduled_reqs}") - # Update metrics - num_tasks = sum([1 if task else 0 for task in self.tasks_list]) - num_blocks_used_by_tasks = sum([len(task.block_tables) if task else 0 for task in self.tasks_list]) - main_process_metrics.available_gpu_block_num.set(self.total_block_number() - num_blocks_used_by_tasks) - main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch()) - main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc()) - main_process_metrics.num_requests_running.set(len(self.running)) - main_process_metrics.num_requests_waiting.set(num_tasks - len(self.running)) + self.update_metrics() return scheduled_reqs @@ -570,7 +563,10 @@ class ResourceManagerV1(ResourceManager): if request in self.running: # normally run and finished self.running.remove(request) request.status = RequestStatus.FINISHED - self._free_blocks(request) + try: + self._free_blocks(request) + except Exception as e: + llm_logger.warning(f"release block failed {req_id}: {e}") if ( request.request_id in self.to_be_rescheduled_request_id_set ): # finished after preempted, blocks have been recycled. @@ -587,7 +583,19 @@ class ResourceManagerV1(ResourceManager): del self.requests[req_id] except Exception as e: llm_logger.error(f"finish_request err: {e}, {str(traceback.format_exc())}") + finally: + self.update_metrics() def clear_data(self): self.waiting: deque[Request] = deque() self.to_be_rescheduled_request_id_set = set() + + def update_metrics(self): + # Update metrics + num_tasks = sum([1 if task else 0 for task in self.tasks_list]) + num_blocks_used_by_tasks = sum([len(task.block_tables) if task else 0 for task in self.tasks_list]) + main_process_metrics.available_gpu_block_num.set(self.total_block_number() - num_blocks_used_by_tasks) + main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch()) + main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc()) + main_process_metrics.num_requests_running.set(len(self.running)) + main_process_metrics.num_requests_waiting.set(num_tasks - len(self.running)) diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 9383ab3e0..78d887c96 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -626,7 +626,7 @@ class TokenProcessor: def clear_data(self): if envs.ENABLE_V1_KVCACHE_SCHEDULER: self.resource_manager.clear_data() - for i in range(self.cfg.max_num_seqs): + for i in range(self.resource_manager.max_num_seqs): if self.resource_manager.stop_flags[i]: continue task = self.resource_manager.tasks_list[i]