Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00
test scheduler (#4739)
@@ -533,33 +533,32 @@ class GlobalScheduler:
                 continue
 
             request: ScheduledRequest = ScheduledRequest.unserialize(serialized_request)
-            if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size)
+            required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size)
 
-                current_prefill_tokens += request.prompt_tokens_ids_len
-                required_total_blocks += required_input_blocks + reserved_output_blocks
+            current_prefill_tokens += request.prompt_tokens_ids_len
+            required_total_blocks += required_input_blocks + reserved_output_blocks
 
-                if required_total_blocks > available_blocks:
-                    remaining_request.append((request_queue_name, serialized_request))
-                    continue
+            if required_total_blocks > available_blocks:
+                remaining_request.append((request_queue_name, serialized_request))
+                continue
 
-                if not envs.FD_ENABLE_MAX_PREFILL:
-                    if self.enable_chunked_prefill:
-                        if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
-                            long_partial_requests += 1
-                            if long_partial_requests > self.max_long_partial_prefills:
-                                remaining_request.append((request_queue_name, serialized_request))
-                                continue
-                        else:
-                            short_partial_requests += 1
+            if not envs.FD_ENABLE_MAX_PREFILL:
+                if self.enable_chunked_prefill:
+                    if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
+                        long_partial_requests += 1
+                        if long_partial_requests > self.max_long_partial_prefills:
+                            remaining_request.append((request_queue_name, serialized_request))
+                            continue
+                    else:
+                        short_partial_requests += 1
 
-                        if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
-                            remaining_request.append((request_queue_name, serialized_request))
-                            continue
-                    else:
-                        if current_prefill_tokens > max_num_batched_tokens:
-                            remaining_request.append((request_queue_name, serialized_request))
-                            continue
+                    if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
+                        remaining_request.append((request_queue_name, serialized_request))
+                        continue
+                else:
+                    if current_prefill_tokens > max_num_batched_tokens:
+                        remaining_request.append((request_queue_name, serialized_request))
+                        continue
 
             scheduled_requests.append(request)
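For reference, a minimal, self-contained sketch of the block accounting this loop performs, assuming calc_required_blocks is a ceiling division of the prompt length by the KV-cache block size (the helper's internals are not shown in this diff, and the function names below are illustrative, not the project's API):

def calc_required_blocks(token_num: int, block_size: int) -> int:
    # Ceiling division: number of KV-cache blocks needed to hold token_num tokens.
    return (token_num + block_size - 1) // block_size


def admitted_count(prompt_lens, available_blocks, block_size, reserved_output_blocks):
    # Walk the candidate prompts in order and stop once the running block demand
    # (input blocks plus reserved output blocks per request) would exceed the
    # available pool, mirroring the required_total_blocks check above.
    required_total_blocks = 0
    admitted = 0
    for prompt_len in prompt_lens:
        required_input_blocks = calc_required_blocks(prompt_len, block_size)
        required_total_blocks += required_input_blocks + reserved_output_blocks
        if required_total_blocks > available_blocks:
            break
        admitted += 1
    return admitted


if __name__ == "__main__":
    print(calc_required_blocks(130, 64))  # 3 blocks for a 130-token prompt
    print(admitted_count([130, 4096, 8192], available_blocks=100, block_size=64, reserved_output_blocks=10))  # 2

With block_size=64 and reserved_output_blocks=10, a 130-token prompt costs 3 input blocks plus the reservation, so only the first two example prompts fit into a 100-block budget; the scheduler defers the rest back to the queue.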
@@ -247,38 +247,33 @@ class LocalScheduler:
         )
 
         requests: List[Request] = []
-        if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-            required_total_blocks = 0
-            current_prefill_tokens = 0
-            long_partial_requests, short_partial_requests = 0, 0
-            for request_id in batch_ids:
-                request = self.requests[request_id]
-                required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size)
-                current_prefill_tokens += request.prompt_tokens_ids_len
-                required_total_blocks += required_input_blocks + reserved_output_blocks
-                if required_total_blocks > available_blocks:
-                    break
+        required_total_blocks = 0
+        current_prefill_tokens = 0
+        long_partial_requests, short_partial_requests = 0, 0
+        for request_id in batch_ids:
+            request = self.requests[request_id]
+            required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size)
+            current_prefill_tokens += request.prompt_tokens_ids_len
+            required_total_blocks += required_input_blocks + reserved_output_blocks
+            if required_total_blocks > available_blocks:
+                break
 
-                if not envs.FD_ENABLE_MAX_PREFILL:
-                    if self.enable_chunked_prefill:
-                        if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
-                            # long request
-                            long_partial_requests += 1
-                            if long_partial_requests > self.max_long_partial_prefills:
-                                break
-                        else:
-                            short_partial_requests += 1
+            if not envs.FD_ENABLE_MAX_PREFILL:
+                if self.enable_chunked_prefill:
+                    if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
+                        # long request
+                        long_partial_requests += 1
+                        if long_partial_requests > self.max_long_partial_prefills:
+                            break
+                    else:
+                        short_partial_requests += 1
 
-                        if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
-                            break
-                    else:
-                        if current_prefill_tokens > max_num_batched_tokens:
-                            break
-                requests.append(request.raw)
-        else:
-            for request_id in batch_ids:
-                request = self.requests[request_id]
-                requests.append(request.raw)
+                    if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
+                        break
+                else:
+                    if current_prefill_tokens > max_num_batched_tokens:
+                        break
+            requests.append(request.raw)
 
         self.ids_read_cursor += len(requests)
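Both hunks gate admission on the same chunked-prefill limits. Below is a minimal standalone sketch of that policy, under the assumption that the knobs mean what their names suggest (long prompts count against max_long_partial_prefills, every admitted prompt counts against max_num_partial_prefills, and without chunked prefill the running token total is capped by max_num_batched_tokens); it ignores the FD_ENABLE_MAX_PREFILL switch and is illustrative rather than the schedulers' actual API:

from typing import List


def select_for_prefill(
    prompt_lens: List[int],
    enable_chunked_prefill: bool,
    long_prefill_token_threshold: int,
    max_long_partial_prefills: int,
    max_num_partial_prefills: int,
    max_num_batched_tokens: int,
) -> List[int]:
    # Indices of prompts that pass the partial-prefill limits, in arrival order.
    selected = []
    current_prefill_tokens = 0
    long_partial_requests, short_partial_requests = 0, 0
    for i, prompt_len in enumerate(prompt_lens):
        current_prefill_tokens += prompt_len
        if enable_chunked_prefill:
            if prompt_len > long_prefill_token_threshold:
                long_partial_requests += 1
                if long_partial_requests > max_long_partial_prefills:
                    break  # too many long partial prefills in one batch
            else:
                short_partial_requests += 1
            if short_partial_requests + long_partial_requests > max_num_partial_prefills:
                break  # too many concurrent partial prefills overall
        else:
            if current_prefill_tokens > max_num_batched_tokens:
                break  # without chunked prefill, cap the total prefill tokens
        selected.append(i)
    return selected


if __name__ == "__main__":
    print(select_for_prefill([256, 2048, 4096], True, 1024, 1, 4, 8192))  # [0, 1]

In the example call, the 2048-token prompt is admitted as the single allowed long partial prefill and the 4096-token prompt is cut off, so the function returns [0, 1].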