Sync v2.0 version of code to GitHub repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

View File

@@ -18,35 +18,68 @@ import redis
from fastdeploy.utils import llm_logger
from .global_scheduler import GlobalScheduler
from .local_scheduler import LocalScheduler
from .splitwise_scheduler import SplitWiseScheduler, SplitWiseSchedulerConfig
class LocalSchedulerConfig:
"""
LocalSchedulerConfig class
Configuration class for LocalScheduler.
Attributes:
max_size: Maximum number of concurrent requests (-1 for unlimited)
ttl: Time-to-live in seconds for request expiration
"""
def __init__(self,
max_size: int = -1,
ttl: int = 900,
wait_response_timeout: float = 1,
max_model_len: int = 8192,
enable_chunked_prefill: bool = False,
max_num_partial_prefills: int = 1,
max_long_partial_prefills: int = 1,
long_prefill_token_threshold: int = 0,
**kwargs
):
"""
Initialize LocalScheduler configuration.
Args:
max_size: Maximum concurrent requests (-1 for unlimited, 0 for disabled)
ttl: Time-to-live in seconds for request expiration (default 900s)
max_model_len: Maximum model context length in tokens
enable_chunked_prefill: Whether to enable chunked prefill processing
max_num_partial_prefills: Max partial prefill operations allowed
max_long_partial_prefills: Max long-running partial prefill ops
long_prefill_token_threshold: Token count threshold for long prefill
**kwargs: Additional unused arguments (for forward compatibility)
Note:
- If long_prefill_token_threshold is 0, it's auto-calculated as 4% of max_model_len
- See LocalScheduler class for implementation details
"""
self.max_size = max_size
self.ttl = ttl
self.wait_response_timeout = wait_response_timeout
self.max_model_len = max_model_len
self.enable_chunked_prefill = enable_chunked_prefill
self.max_num_partial_prefills = max_num_partial_prefills
self.max_long_partial_prefills = max_long_partial_prefills
self.long_prefill_token_threshold = long_prefill_token_threshold
if self.long_prefill_token_threshold == 0:
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
def check(self):
"""
check config
Validate the configuration values.
Currently performs no validation as all values are acceptable.
"""
assert self.wait_response_timeout > 0, \
"LocalScheduler: `wait_response_timeout` must be greater than zero"
assert self.ttl > self.wait_response_timeout, \
"LocalScheduler: `ttl` must be greater than `wait_response_timeout`"
pass
def print(self):
"""
print config
Print the current configuration to logs.
"""
llm_logger.info("LocalScheduler Configuration Information :")
for k, v in self.__dict__.items():
@@ -57,7 +90,15 @@ class LocalSchedulerConfig:
class GlobalSchedulerConfig:
"""
GlobalSchedulerConfig class
Configuration class for GlobalScheduler (Redis-based).
Attributes:
host: Redis server hostname
port: Redis server port
db: Redis database number
password: Optional Redis password
topic: Namespace prefix for queues
ttl: Time-to-live in seconds for Redis keys
"""
def __init__(self,
@@ -67,31 +108,69 @@ class GlobalSchedulerConfig:
password=None,
topic: str = "default",
ttl: int = 900,
wait_response_timeout: float = 1,
remote_write_time: int = 3,
min_load_score: float = 3,
max_model_len: int = 8192,
load_shrads_num: int = 1,
enable_chunked_prefill: bool = False,
max_num_partial_prefills: int = 1,
max_long_partial_prefills: int = 1,
long_prefill_token_threshold: int = 0,
**kwargs
):
"""
Initialize GlobalScheduler (Redis-based) configuration.
Args:
host: Redis server hostname (default "127.0.0.1")
port: Redis server port (default 6379)
db: Redis database number (default 0)
password: Optional Redis password
topic: Namespace prefix for queues (default "default")
ttl: Time-to-live in seconds for Redis keys (default 900s)
min_load_score: Minimum load score for task assignment (default 3)
max_model_len: Maximum model context length in tokens
load_shrads_num: Number of load balancing shards
enable_chunked_prefill: Whether to enable chunked prefill processing
max_num_partial_prefills: Max partial prefill operations allowed
max_long_partial_prefills: Max long-running partial prefill ops
long_prefill_token_threshold: Token count threshold for long prefill
**kwargs: Additional unused arguments (for forward compatibility)
Note:
- If long_prefill_token_threshold is 0, it's auto-calculated as 4% of max_model_len
- See GlobalScheduler class for implementation details
"""
self.host = host
self.port = port
self.db = db
self.password = password
self.topic = topic
self.ttl = ttl
self.wait_response_timeout = wait_response_timeout
self.remote_write_time = remote_write_time
self.min_load_score = min_load_score
self.load_shrads_num = load_shrads_num
self.max_model_len = max_model_len
self.enable_chunked_prefill = enable_chunked_prefill
self.max_num_partial_prefills = max_num_partial_prefills
self.max_long_partial_prefills = max_long_partial_prefills
self.long_prefill_token_threshold = long_prefill_token_threshold
if self.long_prefill_token_threshold == 0:
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
def check(self):
"""
check config
Validate the configuration by testing Redis connection.
Raises:
Exception: If connection to Redis fails
"""
assert self.wait_response_timeout > 0, \
"GlobalScheduler: `wait_response_timeout` must be greater than zero"
assert self.remote_write_time > 0, \
"GlobalScheduler: `remote_write_time` must be greater than zero"
assert self.ttl > self.remote_write_time, \
"GlobalScheduler: `ttl` must be greater than `remote_write_time`"
assert self.ttl > self.wait_response_timeout, \
"GlobalScheduler: `ttl` must be greater than `wait_response_timeout`"
if self.ttl <= 0:
raise ValueError("ttl should be greater than 60")
if self.min_load_score < 1:
raise ValueError("min_load_score should be greater than 0")
if self.load_shrads_num < 1:
raise ValueError("load_shrads_num should be greater than 0")
r = redis.Redis(self.host, self.port, self.db, self.password)
try:
@@ -103,21 +182,40 @@ class GlobalSchedulerConfig:
def print(self):
    """
    Print the current configuration to logs.

    The Redis password is masked in the output. Masking is done at log
    time rather than by temporarily overwriting `self.password`: the
    original in-place mutation was not exception-safe (a failure while
    logging would leave the real password permanently replaced by
    "******").
    """
    llm_logger.info("GlobalScheduler Configuration Information :")
    for k, v in self.__dict__.items():
        shown = "******" if k == "password" else v
        llm_logger.info("{:<20}:{:<6}{}".format(k, "", shown))
    llm_logger.info(
        "=============================================================")
class SchedulerConfig:
"""
SchedulerConfig class
Factory class for scheduler configurations.
Creates appropriate config based on scheduler type (local/global).
"""
def __init__(self, name="local", **kwargs):
"""
Initialize scheduler configuration factory.
Args:
name: Scheduler type ("local" for LocalScheduler or "global" for GlobalScheduler)
**kwargs: Configuration parameters for the specific scheduler type
Initializes:
- Appropriate config object based on scheduler type
- Validates configuration parameters
Raises:
Exception: If invalid scheduler type is specified
"""
self.name = name
self.config = None
@@ -126,26 +224,34 @@ class SchedulerConfig:
if name == "global":
self.config = GlobalSchedulerConfig(**kwargs)
if name == "splitwise":
self.config = SplitWiseSchedulerConfig(**kwargs)
def check(self):
    """
    Validate the configuration.

    Ensures the scheduler type is one of the supported backends and then
    delegates to the backend config object's own `check()`.

    Raises:
        Exception: If an unknown scheduler type is specified, or if the
            backend configuration itself is invalid.
    """
    # The stale local/global-only guard has been removed: it raised for
    # "splitwise" before the newer three-way check could accept it, even
    # though splitwise is a supported scheduler type (see __init__ and
    # scheduler()).
    if self.name not in ["local", "global", "splitwise"]:
        raise Exception(f'Unknown scheduler type {self.name}')
    self.config.check()
def print(self):
    """
    Print the current configuration to logs.

    Delegates to the underlying config object's `print()` method —
    whichever of LocalSchedulerConfig / GlobalSchedulerConfig /
    SplitWiseSchedulerConfig was selected by `name` at construction time.
    """
    self.config.print()
def scheduler(self):
"""
create scheduler by config
Create a scheduler instance based on the configuration.
Returns:
Initialized scheduler instance (LocalScheduler or GlobalScheduler)
"""
if self.name == "global":
@@ -155,10 +261,19 @@ class SchedulerConfig:
password=self.config.password,
topic=self.config.topic,
ttl=self.config.ttl,
remote_write_time=self.config.remote_write_time,
wait_response_timeout=self.config.wait_response_timeout)
min_load_score=self.config.min_load_score,
load_shrads_num=self.config.load_shrads_num,
enable_chunked_prefill=self.config.enable_chunked_prefill,
max_num_partial_prefills=self.config.max_num_partial_prefills,
max_long_partial_prefills=self.config.max_long_partial_prefills,
long_prefill_token_threshold=self.config.long_prefill_token_threshold,)
if self.name == "splitwise":
return SplitWiseScheduler(self.config)
return LocalScheduler(max_size=self.config.max_size,
ttl=self.config.ttl,
wait_response_timeout=self.config.wait_response_timeout
)
enable_chunked_prefill=self.config.enable_chunked_prefill,
max_num_partial_prefills=self.config.max_num_partial_prefills,
max_long_partial_prefills=self.config.max_long_partial_prefills,
long_prefill_token_threshold=self.config.long_prefill_token_threshold,)