From 52338255624246291684e65a1194cc7bfe85f708 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 3 Nov 2025 20:12:14 +0800 Subject: [PATCH] test scheduler (#4739) --- fastdeploy/scheduler/global_scheduler.py | 41 +++++++++--------- fastdeploy/scheduler/local_scheduler.py | 53 +++++++++++------------- 2 files changed, 44 insertions(+), 50 deletions(-) diff --git a/fastdeploy/scheduler/global_scheduler.py b/fastdeploy/scheduler/global_scheduler.py index 1debc0a11..9f6d06446 100644 --- a/fastdeploy/scheduler/global_scheduler.py +++ b/fastdeploy/scheduler/global_scheduler.py @@ -533,33 +533,32 @@ class GlobalScheduler: continue request: ScheduledRequest = ScheduledRequest.unserialize(serialized_request) - if not envs.ENABLE_V1_KVCACHE_SCHEDULER: - required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size) + required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size) - current_prefill_tokens += request.prompt_tokens_ids_len - required_total_blocks += required_input_blocks + reserved_output_blocks + current_prefill_tokens += request.prompt_tokens_ids_len + required_total_blocks += required_input_blocks + reserved_output_blocks - if required_total_blocks > available_blocks: - remaining_request.append((request_queue_name, serialized_request)) - continue + if required_total_blocks > available_blocks: + remaining_request.append((request_queue_name, serialized_request)) + continue - if not envs.FD_ENABLE_MAX_PREFILL: - if self.enable_chunked_prefill: - if request.prompt_tokens_ids_len > self.long_prefill_token_threshold: - long_partial_requests += 1 - if long_partial_requests > self.max_long_partial_prefills: - remaining_request.append((request_queue_name, serialized_request)) - continue - else: - short_partial_requests += 1 - - if short_partial_requests + long_partial_requests > self.max_num_partial_prefills: + if not envs.FD_ENABLE_MAX_PREFILL: + if self.enable_chunked_prefill: + if request.prompt_tokens_ids_len > self.long_prefill_token_threshold: + long_partial_requests += 1 + if long_partial_requests > self.max_long_partial_prefills: remaining_request.append((request_queue_name, serialized_request)) continue else: - if current_prefill_tokens > max_num_batched_tokens: - remaining_request.append((request_queue_name, serialized_request)) - continue + short_partial_requests += 1 + + if short_partial_requests + long_partial_requests > self.max_num_partial_prefills: + remaining_request.append((request_queue_name, serialized_request)) + continue + else: + if current_prefill_tokens > max_num_batched_tokens: + remaining_request.append((request_queue_name, serialized_request)) + continue scheduled_requests.append(request) diff --git a/fastdeploy/scheduler/local_scheduler.py b/fastdeploy/scheduler/local_scheduler.py index 26989f3dc..b246ca09c 100644 --- a/fastdeploy/scheduler/local_scheduler.py +++ b/fastdeploy/scheduler/local_scheduler.py @@ -247,38 +247,33 @@ class LocalScheduler: ) requests: List[Request] = [] - if not envs.ENABLE_V1_KVCACHE_SCHEDULER: - required_total_blocks = 0 - current_prefill_tokens = 0 - long_partial_requests, short_partial_requests = 0, 0 - for request_id in batch_ids: - request = self.requests[request_id] - required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size) - current_prefill_tokens += request.prompt_tokens_ids_len - required_total_blocks += required_input_blocks + reserved_output_blocks - if required_total_blocks > available_blocks: - break + required_total_blocks = 0 + current_prefill_tokens = 0 + long_partial_requests, short_partial_requests = 0, 0 + for request_id in batch_ids: + request = self.requests[request_id] + required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size) + current_prefill_tokens += request.prompt_tokens_ids_len + required_total_blocks += required_input_blocks + reserved_output_blocks + if required_total_blocks > available_blocks: + break - if not envs.FD_ENABLE_MAX_PREFILL: - if self.enable_chunked_prefill: - if request.prompt_tokens_ids_len > self.long_prefill_token_threshold: - # 长请求 - long_partial_requests += 1 - if long_partial_requests > self.max_long_partial_prefills: - break - else: - short_partial_requests += 1 - - if short_partial_requests + long_partial_requests > self.max_num_partial_prefills: + if not envs.FD_ENABLE_MAX_PREFILL: + if self.enable_chunked_prefill: + if request.prompt_tokens_ids_len > self.long_prefill_token_threshold: + # 长请求 + long_partial_requests += 1 + if long_partial_requests > self.max_long_partial_prefills: break else: - if current_prefill_tokens > max_num_batched_tokens: - break - requests.append(request.raw) - else: - for request_id in batch_ids: - request = self.requests[request_id] - requests.append(request.raw) + short_partial_requests += 1 + + if short_partial_requests + long_partial_requests > self.max_num_partial_prefills: + break + else: + if current_prefill_tokens > max_num_batched_tokens: + break + requests.append(request.raw) self.ids_read_cursor += len(requests)