test scheduler (#4739)

kevin
2025-11-03 20:12:14 +08:00
committed by GitHub
parent 35a6969a44
commit 5233825562
2 changed files with 44 additions and 50 deletions


@@ -533,33 +533,32 @@ class GlobalScheduler:
    continue
request: ScheduledRequest = ScheduledRequest.unserialize(serialized_request)
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
    required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size)
    current_prefill_tokens += request.prompt_tokens_ids_len
    required_total_blocks += required_input_blocks + reserved_output_blocks
    if required_total_blocks > available_blocks:
        remaining_request.append((request_queue_name, serialized_request))
        continue
    if not envs.FD_ENABLE_MAX_PREFILL:
        if self.enable_chunked_prefill:
            if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
                long_partial_requests += 1
                if long_partial_requests > self.max_long_partial_prefills:
                    remaining_request.append((request_queue_name, serialized_request))
                    continue
            else:
                short_partial_requests += 1
                if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
                    remaining_request.append((request_queue_name, serialized_request))
                    continue
        else:
            if current_prefill_tokens > max_num_batched_tokens:
                remaining_request.append((request_queue_name, serialized_request))
                continue
scheduled_requests.append(request)
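
Read together, the guards in this hunk reduce to one admission predicate: a request is taken off the queue only if its KV-cache block demand fits the remaining block budget, and (in the non-chunked branch) the running prefill-token total stays under max_num_batched_tokens. A minimal standalone sketch of that predicate, assuming calc_required_blocks is a ceiling division over the block size; the names SchedulerLimits, required_blocks, and admit are illustrative, not FastDeploy API:

from dataclasses import dataclass

@dataclass
class SchedulerLimits:
    # Illustrative container for the limits referenced in the hunk above.
    block_size: int
    reserved_output_blocks: int
    available_blocks: int
    max_num_batched_tokens: int

def required_blocks(prompt_len: int, block_size: int) -> int:
    # Assumed semantics of calc_required_blocks: ceiling division of
    # prompt tokens over the KV-cache block size.
    return (prompt_len + block_size - 1) // block_size

def admit(prompt_len: int, used_blocks: int, batched_tokens: int, lim: SchedulerLimits) -> bool:
    # Mirrors the two budget guards in the diff: the block budget first,
    # then the batched-token budget from the non-chunked-prefill branch.
    total = used_blocks + required_blocks(prompt_len, lim.block_size) + lim.reserved_output_blocks
    if total > lim.available_blocks:
        return False
    return batched_tokens + prompt_len <= lim.max_num_batched_tokens

Note that reserved_output_blocks is charged per request up front, so a queue of short prompts can still exhaust the block budget through output reservations alone.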


@@ -247,38 +247,33 @@ class LocalScheduler:
)
requests: List[Request] = []
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
    required_total_blocks = 0
    current_prefill_tokens = 0
    long_partial_requests, short_partial_requests = 0, 0
    for request_id in batch_ids:
        request = self.requests[request_id]
        required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size)
        current_prefill_tokens += request.prompt_tokens_ids_len
        required_total_blocks += required_input_blocks + reserved_output_blocks
        if required_total_blocks > available_blocks:
            break
        if not envs.FD_ENABLE_MAX_PREFILL:
            if self.enable_chunked_prefill:
                if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
                    # long request
                    long_partial_requests += 1
                    if long_partial_requests > self.max_long_partial_prefills:
                        break
                else:
                    short_partial_requests += 1
                    if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
                        break
            else:
                if current_prefill_tokens > max_num_batched_tokens:
                    break
        requests.append(request.raw)
else:
    for request_id in batch_ids:
        request = self.requests[request_id]
        requests.append(request.raw)
self.ids_read_cursor += len(requests)
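
The chunked-prefill branch in both schedulers also caps concurrent partial prefills: prompts above long_prefill_token_threshold count against max_long_partial_prefills, and long and short prefills together count against max_num_partial_prefills. A hedged sketch of that counting, with a hypothetical helper name (within_partial_prefill_caps) that does not exist in the codebase:

from typing import List

def within_partial_prefill_caps(
    prompt_lens: List[int],
    long_prefill_token_threshold: int,
    max_long_partial_prefills: int,
    max_num_partial_prefills: int,
) -> bool:
    # Restates the counting in the chunked-prefill branch above: requests
    # longer than the threshold count as "long", and all partial prefills
    # (long + short) share one overall cap.
    long_count = short_count = 0
    for n in prompt_lens:
        if n > long_prefill_token_threshold:
            long_count += 1
            if long_count > max_long_partial_prefills:
                return False
        else:
            short_count += 1
            if short_count + long_count > max_num_partial_prefills:
                return False
    return True

The counters are re-initialized before the batch loop in the LocalScheduler hunk, so these caps bound concurrency within a single scheduling pass, not across passes.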