diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py
index 433ebd1fc..8a0b1afbc 100644
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -348,7 +348,7 @@ def reset_scheduler():
     if llm_engine is None:
         return Response("Engine not loaded", status_code=500)
-    llm_engine.scheduler.reset_scheduler()
+    llm_engine.scheduler.reset()
     return Response("Scheduler Reset Successfully", status_code=200)
@@ -366,7 +366,7 @@ def control_scheduler(request: ControlSchedulerRequest):
         return JSONResponse(content=content.model_dump(), status_code=500)

     if request.reset:
-        llm_engine.scheduler.reset_scheduler()
+        llm_engine.scheduler.reset()

     if request.load_shards_num or request.reallocate_shard:
         if hasattr(llm_engine.scheduler, "update_config") and callable(llm_engine.scheduler.update_config):
diff --git a/fastdeploy/scheduler/config.py b/fastdeploy/scheduler/config.py
index 297577d28..f6cab4b2d 100644
--- a/fastdeploy/scheduler/config.py
+++ b/fastdeploy/scheduler/config.py
@@ -110,7 +110,7 @@ class GlobalSchedulerConfig:
         ttl: int = 900,
         min_load_score: float = 3,
         max_model_len: int = 8192,
-        load_shrads_num: int = 1,
+        load_shards_num: int = 1,
         enable_chunked_prefill: bool = False,
         max_num_partial_prefills: int = 1,
         max_long_partial_prefills: int = 1,
@@ -129,7 +129,7 @@ class GlobalSchedulerConfig:
             ttl: Time-to-live in seconds for Redis keys (default 900s)
             min_load_score: Minimum load score for task assignment (default 3)
             max_model_len: Maximum model context length in tokens
-            load_shrads_num: Number of load balancing shards
+            load_shards_num: Number of load balancing shards
             enable_chunked_prefill: Whether to enable chunked prefill processing
             max_num_partial_prefills: Max partial prefill operations allowed
             max_long_partial_prefills: Max long-running partial prefill ops
@@ -147,7 +147,7 @@ class GlobalSchedulerConfig:
         self.topic = topic
         self.ttl = ttl
         self.min_load_score = min_load_score
-        self.load_shrads_num = load_shrads_num
+        self.load_shards_num = load_shards_num
        self.max_model_len = max_model_len
         self.enable_chunked_prefill = enable_chunked_prefill
@@ -169,8 +169,8 @@ class GlobalSchedulerConfig:
             raise ValueError("ttl should be greater than 60")
         if self.min_load_score < 1:
             raise ValueError("min_load_score should be greater than 0")
-        if self.load_shrads_num < 1:
-            raise ValueError("load_shrads_num should be greater than 0")
+        if self.load_shards_num < 1:
+            raise ValueError("load_shards_num should be greater than 0")

         r = redis.Redis(self.host, self.port, self.db, self.password)
         try:
@@ -262,7 +262,7 @@ class SchedulerConfig:
             topic=self.config.topic,
             ttl=self.config.ttl,
             min_load_score=self.config.min_load_score,
-            load_shrads_num=self.config.load_shrads_num,
+            load_shards_num=self.config.load_shards_num,
             enable_chunked_prefill=self.config.enable_chunked_prefill,
             max_num_partial_prefills=self.config.max_num_partial_prefills,
             max_long_partial_prefills=self.config.max_long_partial_prefills,
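
For reference, a minimal client-side sketch of how the two touched handlers behave after this change. The endpoint paths and server address below are assumptions for illustration (the route decorators are not part of this diff); the `reset` and `load_shards_num` fields mirror how `ControlSchedulerRequest` is used in `control_scheduler()` above.

```python
# Illustrative sketch only. BASE and both URL paths are assumed, not taken
# from this diff; the JSON field names come from ControlSchedulerRequest
# usage in control_scheduler().
import requests

BASE = "http://localhost:8000"  # assumed api_server address

# Reset handler: now delegates to llm_engine.scheduler.reset()
resp = requests.post(f"{BASE}/controller/reset_scheduler")  # path assumed
print(resp.status_code, resp.text)  # 200 / "Scheduler Reset Successfully" on success

# Control handler: note the corrected shard-count spelling, load_shards_num
resp = requests.post(
    f"{BASE}/controller/scheduler",  # path assumed
    json={"reset": True, "load_shards_num": 2},
)
print(resp.status_code, resp.text)
```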