Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 00:06:38 +08:00)
Sync v2.0 version of code to github repo
@@ -14,29 +14,66 @@
 # limitations under the License.
 """

-from typing import Dict, List, Optional, Tuple
 import threading
 import time
+from typing import Dict, List, Optional, Tuple

+from fastdeploy.metrics.metrics import main_process_metrics
+from fastdeploy.utils import llm_logger
 from fastdeploy.engine.request import Request, RequestOutput
 from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
-from fastdeploy.utils import llm_logger


 class LocalScheduler(object):
     """
     LocalScheduler Class
+    A local in-memory task scheduler for request/response management.
+
+    This class provides functionality for:
+    - Enqueuing and dequeuing requests
+    - Managing request lifecycle with TTL
+    - Handling request/response flow
+    - Thread-safe operations with condition variables
     """

-    def __init__(self,
-                 max_size: int,
-                 ttl: int,
-                 wait_response_timeout: float):
+    def __init__(
+        self,
+        max_size: int,
+        ttl: int,
+        enable_chunked_prefill: bool,
+        max_num_partial_prefills: int,
+        max_long_partial_prefills: int,
+        long_prefill_token_threshold: int,
+    ):
+        """
+        Initializes a local in-memory scheduler for managing inference requests.
+
+        Args:
+            max_size: Maximum number of concurrent requests the scheduler can handle (0 for unlimited)
+            ttl: Time-to-live in seconds for requests before automatic timeout
+            enable_chunked_prefill: Whether to enable chunked prefill processing
+            max_num_partial_prefills: Maximum number of partial prefill operations allowed
+            max_long_partial_prefills: Maximum number of long-running partial prefill operations
+            long_prefill_token_threshold: Token count threshold to classify as long prefill
+
+        Initializes:
+            - Thread synchronization primitives (mutex, condition variables)
+            - Request and response tracking structures
+            - Chunked prefill configuration parameters
+            - Request queue management system
+
+        Note:
+            - Uses thread-safe operations for concurrent access
+            - Automatically recycles expired requests based on TTL
+            - Supports both batched and individual request processing
+        """
         self.max_size = max_size
         self.ttl = ttl
         self.mutex = threading.Lock()

+        self.enable_chunked_prefill = enable_chunked_prefill
+        self.max_num_partial_prefills = max_num_partial_prefills
+        self.max_long_partial_prefills = max_long_partial_prefills
+        self.long_prefill_token_threshold = long_prefill_token_threshold
+
         self.ids_read_cursor = 0
         self.ids: List[str] = list()
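For orientation, here is a minimal sketch of how the reworked constructor might be called after this change. The argument values are illustrative assumptions, not taken from this commit, and the module path is inferred from the imports above:

from fastdeploy.scheduler.local_scheduler import LocalScheduler  # path assumed, not shown in this diff

scheduler = LocalScheduler(
    max_size=0,                         # 0 means no cap on queued requests
    ttl=900,                            # recycle requests not pulled within 900 seconds
    enable_chunked_prefill=True,        # gate admissions by partial-prefill limits
    max_num_partial_prefills=4,         # at most 4 partial prefills admitted per pull
    max_long_partial_prefills=1,        # at most 1 "long" prefill admitted per pull
    long_prefill_token_threshold=1024,  # prompts above this many tokens count as long
)

Note that wait_response_timeout is no longer a constructor argument; the next hunk shows it hard-coded to 0.001 seconds.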
@@ -44,14 +81,49 @@ class LocalScheduler(object):
         self.responses: Dict[str, List[ScheduledResponse]] = dict()

         self.wait_request_timeout = 10
-        self.wait_response_timeout = wait_response_timeout
+        self.wait_response_timeout = 0.001

         self.requests_not_empty = threading.Condition(self.mutex)
         self.responses_not_empty = threading.Condition(self.mutex)

+    def reset(self):
+        """
+        Reset the local scheduler to its initial empty state by:
+        1. Resetting the request ID tracking cursor to 0
+        2. Clearing all stored request IDs
+        3. Clearing all pending requests
+        4. Clearing all cached responses
+
+        This method is thread-safe and should be called when:
+        - The scheduler needs to be cleanly restarted
+        - Recovering from critical errors
+        - Preparing for graceful shutdown
+
+        Effects:
+        - Resets the ids_read_cursor to 0 (request processing position)
+        - Clears the ids list tracking all request IDs
+        - Clears the requests dictionary tracking pending requests
+        - Clears the responses dictionary tracking received responses
+
+        Note:
+        - Uses the scheduler's mutex to ensure thread safety
+        - Does not affect the scheduler's configuration parameters (max_size, ttl, etc.)
+        - After reset, the scheduler will be empty but still operational
+        """
+        with self.mutex:
+            self.ids_read_cursor = 0
+            self.ids = list()
+            self.requests = dict()
+            self.responses = dict()
+            llm_logger.info("Scheduler has been reset")
+
     def _recycle(self, request_id: Optional[str] = None):
         """
-        recycle memory
+        Clean up expired or completed requests to free memory.
+
+        Args:
+            request_id: Optional specific request ID to remove.
+                If None, removes all expired requests.
         """
         if request_id is not None:
             self.requests.pop(request_id, None)
@@ -70,9 +142,9 @@ class LocalScheduler(object):
         expired_ids = []
         for request_id in self.ids:
             request = self.requests[request_id]
-            if (now - request.scheduled_time < self.ttl):
+            if (now - request.schedule_time < self.ttl):
                 break
-            expired_ids.append(request.id)
+            expired_ids.append(request.request_id)

         for i, expired_id in enumerate(expired_ids):
             self.requests.pop(expired_id, None)
@@ -85,14 +157,22 @@ class LocalScheduler(object):
             else:
                 self.ids_read_cursor -= len(expired_ids)

-    def put_requests(self, requests: List[Request]) -> List[Tuple[str, Optional[str]]]:
-        """ submit requests to scheduler
-        Args:
-            requests: List[Request]
-        """
+    def put_requests(
+            self, requests: List[Request]) -> List[Tuple[str, Optional[str]]]:
+        """
+        Add new requests to the scheduler queue.
+
+        Args:
+            requests: List of Request objects to enqueue
+
+        Returns:
+            List of tuples containing (request_id, error_message) for each request.
+            error_message is None for successful enqueues.
+        """
         with self.mutex:
             self._recycle()
-            if self.max_size > 0 and len(self.requests) + len(requests) > self.max_size:
+            if self.max_size > 0 and len(
+                    self.requests) + len(requests) > self.max_size:
                 msg = f"Exceeding the max length of the local scheduler (max_size={self.max_size})"
                 return [(request.request_id, msg) for request in requests]
@@ -103,38 +183,55 @@ class LocalScheduler(object):
                     duplicated_ids.append(request.request_id)
                 else:
                     scheduled_request = ScheduledRequest(request)
-                    self.requests[scheduled_request.id] = scheduled_request
-                    valid_ids.append(scheduled_request.id)
+                    self.requests[
+                        scheduled_request.request_id] = scheduled_request
+                    valid_ids.append(scheduled_request.request_id)

             self.ids += valid_ids
             self.requests_not_empty.notify_all()

-            llm_logger.info(
-                f"Scheduler has put some requests: {valid_ids}")
+            main_process_metrics.num_requests_waiting.inc(len(valid_ids))
+            llm_logger.info(f"Scheduler has enqueued some requests: {valid_ids}")

             if len(duplicated_ids) > 0:
                 llm_logger.warning(
-                    f"Scheduler has received some duplicated requests: {duplicated_ids}")
+                    f"Scheduler has received some duplicated requests: {duplicated_ids}"
+                )

             results = [(request_id, None) for request_id in valid_ids]
             results += [(request_id, "duplicated request_id")
-                       for request_id in duplicated_ids]
+                        for request_id in duplicated_ids]
             return results

     def calc_required_blocks(self, token_num, block_size):
-        """calculate required blocks for given token number"""
+        """
+        Calculate the number of blocks needed for a given number of tokens.
+
+        Args:
+            token_num: Number of tokens
+            block_size: Size of each block
+
+        Returns:
+            Number of blocks required (rounded up)
+        """
         return (token_num + block_size - 1) // block_size
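The return expression above is integer ceiling division. A quick worked example with illustrative numbers:

# (token_num + block_size - 1) // block_size rounds up to whole blocks
# token_num=130, block_size=64 -> (130 + 63) // 64 = 193 // 64 = 3 blocks
# token_num=128, block_size=64 -> (128 + 63) // 64 = 191 // 64 = 2 blocks (exact fit, no extra block)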
-    def get_requests(self, available_blocks, block_size,
-                     reserved_output_blocks, max_num_batched_tokens, batch=1) -> List[Request]:
-        """get requests from local cache
-        Args:
-            available_blocks: int
-            block_size: int
-            reserved_output_blocks: int
-            max_num_batched_tokens: int
-            batch: int
-        """
+    def get_requests(self,
+                     available_blocks,
+                     block_size,
+                     reserved_output_blocks,
+                     max_num_batched_tokens,
+                     batch=1) -> List[Request]:
+        """
+        Retrieve requests from the scheduler based on available resources.
+
+        Args:
+            available_blocks: Number of available processing blocks
+            block_size: Size of each processing block
+            reserved_output_blocks: Blocks reserved for output
+            max_num_batched_tokens: Maximum tokens that can be batched
+            batch: Preferred batch size
+
+        Returns:
+            List of Request objects ready for processing
+        """
         if available_blocks <= reserved_output_blocks or batch < 1:
             llm_logger.debug(
@@ -145,62 +242,112 @@ class LocalScheduler(object):

         with self.requests_not_empty:
             batch_ids = self.requests_not_empty.wait_for(
-                lambda: self.ids[self.ids_read_cursor:
-                                 self.ids_read_cursor + batch], self.wait_request_timeout)
+                lambda: self.ids[self.ids_read_cursor:self.ids_read_cursor +
+                                 batch], self.wait_request_timeout)

             required_total_blocks = 0
             current_prefill_tokens = 0
             requests: List[Request] = []
+            long_partial_requests, short_partial_requests = 0, 0
             for request_id in batch_ids:
                 request = self.requests[request_id]
                 required_input_blocks = self.calc_required_blocks(
-                    request.size, block_size)
-                current_prefill_tokens += request.size
+                    request.prompt_tokens_ids_len, block_size)
+                current_prefill_tokens += request.prompt_tokens_ids_len
                 required_total_blocks += required_input_blocks + reserved_output_blocks
-                if required_total_blocks > available_blocks or current_prefill_tokens > max_num_batched_tokens:
+                if required_total_blocks > available_blocks:
                     break
+
+                if self.enable_chunked_prefill:
+                    if request.prompt_tokens_ids_len > self.long_prefill_token_threshold:
+                        # long request
+                        long_partial_requests += 1
+                        if long_partial_requests > self.max_long_partial_prefills:
+                            break
+                    else:
+                        short_partial_requests += 1
+
+                    if short_partial_requests + long_partial_requests > self.max_num_partial_prefills:
+                        break
+                else:
+                    if current_prefill_tokens > max_num_batched_tokens:
+                        break
+
                 requests.append(request.raw)
             self.ids_read_cursor += len(requests)

             if len(batch_ids) > 0 and len(requests) == 0:
                 llm_logger.debug(
                     f"Scheduler has put all just-pulled request into the queue: {len(batch_ids)}"
                 )

             if len(requests) > 0:
+                main_process_metrics.num_requests_waiting.dec(len(requests))
+                main_process_metrics.num_requests_running.inc(len(requests))
                 llm_logger.info(
-                    f"Scheduler has pulled some request: {[request.request_id for request in requests]}")
+                    f"Scheduler has pulled some request: {[request.request_id for request in requests]}"
+                )

         return requests
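The chunked-prefill branch above is the main behavioural addition to get_requests: it caps how many partial prefills, and how many long ones, may be admitted in a single pull. A standalone sketch of that admission rule, using hypothetical names that do not exist in the repository:

def may_admit(prompt_len: int, long_threshold: int,
              long_count: int, short_count: int,
              max_long: int, max_total: int):
    # Illustrative only: returns (admit, long_count, short_count) after
    # counting the current request as a long or short partial prefill.
    if prompt_len > long_threshold:
        long_count += 1
        if long_count > max_long:
            return False, long_count, short_count  # too many long prefills this pull
    else:
        short_count += 1
    # long and short partial prefills together must stay within the overall cap
    return (short_count + long_count) <= max_total, long_count, short_count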
     def put_results(self, results: List[RequestOutput]):
-        """put results into local cache"""
+        """
+        Add processing results back to the scheduler.
+
+        Args:
+            results: List of RequestOutput objects containing results
+        """
         responses: List[ScheduledResponse] = [
-            ScheduledResponse(result) for result in results]
+            ScheduledResponse(result) for result in results
+        ]

         finished_responses = [
-            response.id for response in responses if response.finished]
+            response.request_id for response in responses if response.finished
+        ]
         if len(finished_responses) > 0:
             llm_logger.info(
-                f"Scheduler has received a finished response: {finished_responses}")
+                f"Scheduler has received some finished responses: {finished_responses}"
+            )

         with self.mutex:
             for response in responses:
-                if response.id not in self.requests:
+                if response.request_id not in self.requests:
                     llm_logger.warning(
-                        f"Scheduler has received a expired response: {[response.id]}")
+                        f"Scheduler has received a expired response: {[response.request_id]}"
+                    )
                     continue

-                if response.id not in self.responses:
-                    self.responses[response.id] = [response]
+                if response.request_id not in self.responses:
+                    self.responses[response.request_id] = [response]
                     continue
-                self.responses[response.id].append(response)
+                self.responses[response.request_id].append(response)
             self.responses_not_empty.notify_all()

-    def get_results(self, request_ids: List[str]) -> Dict[str, List[RequestOutput]]:
-        """get results from local cache"""
+    def get_results(self) -> Dict[str, List[RequestOutput]]:
+        """
+        Retrieve all available results from the scheduler and clean up completed requests.
+
+        This method:
+        - Waits for new responses using a condition variable
+        - Returns all currently available responses
+        - Automatically removes completed requests from the scheduler
+        - Logs finished requests
+
+        Returns:
+            Dict[str, List[RequestOutput]]:
+                A dictionary where:
+                - Key is the request ID
+                - Value is a list of RequestOutput objects for that request
+                Completed requests are automatically removed from the scheduler
+
+        Note:
+            - Thread-safe operation using condition variables
+            - Has a short timeout (0.001s) to avoid blocking
+            - Automatically recycles completed requests to free memory
+            - Logs finished requests via llm_logger
+        """

         def _get_results():
-            responses = dict()
-            for request_id in request_ids:
-                if request_id not in responses:
-                    responses[request_id] = []
-                responses[request_id] += self.responses.pop(request_id, [])
+            responses = self.responses
+            self.responses = dict()
             return responses

         with self.responses_not_empty:
@@ -218,5 +365,6 @@ class LocalScheduler(object):
                 if finished:
                     self._recycle(request_id)
                     llm_logger.info(
-                        f"Scheduler has pulled a finished response: {[request_id]}")
+                        f"Scheduler has pulled a finished response: {[request_id]}"
+                    )
         return results
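Taken together, the updated class keeps its original produce/consume shape while renaming id to request_id, switching size to prompt_tokens_ids_len, and dropping the request_ids argument from get_results. A hedged end-to-end sketch of the flow; the resource numbers are illustrative and the Request/RequestOutput construction is elided because their fields are not part of this diff:

# Assumes `scheduler` was built as in the earlier constructor sketch,
# `reqs` is a List[Request] and `outputs` is a List[RequestOutput].

ack = scheduler.put_requests(reqs)          # [(request_id, None | error_message), ...]

pulled = scheduler.get_requests(            # pull as much as the KV-cache budget allows
    available_blocks=1000,
    block_size=64,
    reserved_output_blocks=100,
    max_num_batched_tokens=8192,
    batch=8,
)

scheduler.put_results(outputs)              # worker pushes generated outputs back

results = scheduler.get_results()           # drains all buffered responses, keyed by request_id;
                                            # finished requests are recycled automatically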