diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 4beaffc2c..6fa029ac2 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -275,7 +275,6 @@ class ParallelConfig: From old wersion worker args TODO(gongshaotian): Reclassify """ - self.max_num_seqs: int = 34 # Set default block num for profile run self.total_block_num: int = 2000 # block size @@ -297,7 +296,6 @@ class ParallelConfig: # Do profile or not self.do_profile: bool = False - self.max_num_batched_tokens: int = 2048 # splitwise role self.splitwise_role: str = "mixed" # guided decoding backend @@ -1109,8 +1107,6 @@ class FDConfig: speculative_config: SpeculativeConfig = None, tokenizer: str = None, max_model_len: int = 8192, - max_num_seqs: int = 8, - max_num_batched_tokens: Optional[int] = None, ips: str = None, use_warmup: bool = False, engine_worker_queue_port: str = "8002", @@ -1143,19 +1139,18 @@ class FDConfig: self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config # Initialize cuda graph capture list if self.graph_opt_config.cudagraph_capture_sizes is None: - self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs) + self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs) if self.graph_opt_config.cudagraph_only_prefill: self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512) else: - self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.parallel_config.max_num_seqs) + self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs) # TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn if self.graph_opt_config.graph_opt_level == 2: self.graph_opt_config.graph_opt_level = 1 self.tokenizer = tokenizer - self.max_num_batched_tokens = max_num_batched_tokens self.ips = ips self.tool_parser = tool_parser @@ -1177,7 +1172,6 @@ class FDConfig: self.node_rank = idx self.max_model_len = max_model_len - self.max_num_seqs = max_num_seqs self.limit_mm_per_prompt = limit_mm_per_prompt self.mm_processor_kwargs = mm_processor_kwargs self.use_warmup = use_warmup @@ -1243,22 +1237,22 @@ class FDConfig: self.paddle_commit_id = paddle.version.commit - if self.max_num_batched_tokens is None: + if self.scheduler_config.max_num_batched_tokens is None: if int(envs.ENABLE_V1_KVCACHE_SCHEDULER): if paddle.is_compiled_with_xpu(): - self.max_num_batched_tokens = self.max_model_len + self.scheduler_config.max_num_batched_tokens = self.max_model_len else: - self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM + self.scheduler_config.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM else: if self.cache_config.enable_chunked_prefill: - self.max_num_batched_tokens = 2048 + self.scheduler_config.max_num_batched_tokens = 2048 else: - self.max_num_batched_tokens = self.max_model_len + self.scheduler_config.max_num_batched_tokens = self.max_model_len if self.long_prefill_token_threshold == 0: self.long_prefill_token_threshold = int(self.max_model_len * 0.04) - self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs) + self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs) self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) if self.guided_decoding_backend == "auto": @@ -1272,19 +1266,24 @@ class FDConfig: """ check the legality of config """ - assert self.max_num_seqs <= 256, ( - 
"The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}." + assert self.scheduler_config.max_num_seqs <= 256, ( + "The parameter `max_num_seqs` is not allowed to exceed 256, " + f"but now it's {self.scheduler_config.max_num_seqs}." ) assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1" assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16" - assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1" - assert self.max_num_batched_tokens >= self.max_num_seqs, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} " - f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}" + assert ( + self.scheduler_config.max_num_seqs >= 1 + ), f"max_num_seqs: {self.scheduler_config.max_num_seqs} should be larger than 1" + assert self.scheduler_config.max_num_batched_tokens >= self.scheduler_config.max_num_seqs, ( + f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} " + f"should be larger than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs}" ) - assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger" - f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}" + assert ( + self.scheduler_config.max_num_batched_tokens <= self.max_model_len * self.scheduler_config.max_num_seqs + ), ( + f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} should be larger" + f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.max_model_len}" ) assert ( self.max_num_partial_prefills >= 1 @@ -1305,13 +1304,13 @@ class FDConfig: if not self.cache_config.enable_chunked_prefill: if not envs.ENABLE_V1_KVCACHE_SCHEDULER: - assert self.max_num_batched_tokens >= self.max_model_len, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} " + assert self.scheduler_config.max_num_batched_tokens >= self.max_model_len, ( + f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} " f"should be larger than or equal to max_model_len: {self.max_model_len}" ) else: - assert self.max_num_batched_tokens >= self.cache_config.block_size, ( - f"max_num_batched_tokens: {self.max_num_batched_tokens} " + assert self.scheduler_config.max_num_batched_tokens >= self.cache_config.block_size, ( + f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} " f"should be larger than or equal to block_size: {self.cache_config.block_size}" ) diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 2d1532ae5..4040a6f92 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -943,23 +943,15 @@ class EngineArgs: """ prefix = "scheduler_" prefix_len = len(prefix) - extra_params = [ - "max_model_len", - "enable_chunked_prefill", - "max_num_partial_prefills", - "max_long_partial_prefills", - "long_prefill_token_threshold", - ] all = asdict(self) params = dict() for k, v in all.items(): if k[:prefix_len] == prefix: params[k[prefix_len:]] = v - elif k in extra_params: + else: params[k] = v - - return SchedulerConfig(**params) + return SchedulerConfig(params) def create_graph_optimization_config(self) -> GraphOptimizationConfig: """ @@ -1059,9 +1051,7 @@ class EngineArgs: load_config=load_cfg, parallel_config=parallel_cfg, max_model_len=self.max_model_len, - max_num_seqs=self.max_num_seqs, 
speculative_config=speculative_cfg, - max_num_batched_tokens=self.max_num_batched_tokens, ips=self.ips, use_warmup=self.use_warmup, engine_worker_queue_port=self.engine_worker_queue_port, diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 4375452b2..7b9b0bdc6 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -71,7 +71,7 @@ class EngineService: if envs.ENABLE_V1_KVCACHE_SCHEDULER: self.resource_manager = ResourceManagerV1( - cfg.max_num_seqs, + cfg.scheduler_config.max_num_seqs, cfg, cfg.parallel_config.tensor_parallel_size, cfg.splitwise_role, @@ -83,7 +83,7 @@ class EngineService: ) else: self.resource_manager = ResourceManager( - cfg.max_num_seqs, + cfg.scheduler_config.max_num_seqs, cfg, cfg.parallel_config.tensor_parallel_size, cfg.splitwise_role, @@ -109,7 +109,7 @@ class EngineService: self.partial_chunked_tokens = [0] * (self.cfg.max_num_partial_prefills + 1) for idx in range(1, self.cfg.max_num_partial_prefills + 1): self.partial_chunked_tokens[idx] = ( - (self.cfg.max_num_batched_tokens // idx) + (self.cfg.scheduler_config.max_num_batched_tokens // idx) // self.cfg.cache_config.block_size * self.cfg.cache_config.block_size ) @@ -356,7 +356,7 @@ class EngineService: requests_chunk = [[] for _ in range(len(requests))] chunk_request_num = len(current_request_size) while chunk_request_num >= 1: - remain_batched_tokens = self.cfg.max_num_batched_tokens + remain_batched_tokens = self.cfg.scheduler_config.max_num_batched_tokens for idx in range(len(current_request_size)): if current_request_size[idx] <= 0: continue @@ -496,7 +496,7 @@ class EngineService: available_blocks=self.resource_manager.available_block_num(), block_size=self.cfg.cache_config.block_size, reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.cfg.max_num_batched_tokens, + max_num_batched_tokens=self.cfg.scheduler_config.max_num_batched_tokens, batch=num_prefill_batch, ) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 623dec456..482ccbf74 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -469,7 +469,7 @@ class LLMEngine: ips = ",".join(self.cfg.ips) arguments = ( f" --devices {self.cfg.device_ids} {py_script}" - f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}" + f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.max_model_len}" f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}" f" --model {self.cfg.model_config.model!s}" f" --device_ids {self.cfg.device_ids}" @@ -482,7 +482,7 @@ class LLMEngine: f" --eos_tokens_lens {self.data_processor.eos_token_id_len}" f" --pad_token_id {self.data_processor.pad_token_id}" f" --engine_pid {self.cfg.engine_worker_queue_port[0]}" - f" --max_num_batched_tokens {self.cfg.max_num_batched_tokens}" + f" --max_num_batched_tokens {self.cfg.scheduler_config.max_num_batched_tokens}" f" --splitwise_role {self.cfg.splitwise_role}" f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}" f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}" diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index f1a663d35..254ba478a 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -289,7 +289,7 @@ class ResourceManagerV1(ResourceManager): with self.lock: scheduled_reqs: list[Request] = [] 
preempted_reqs: list[Request] = [] - token_budget = self.config.max_num_batched_tokens + token_budget = self.config.scheduler_config.max_num_batched_tokens # First, schedule the RUNNING requests. req_index = 0 diff --git a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py index d32d57f3c..4a72ccf3e 100644 --- a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py +++ b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py @@ -210,7 +210,7 @@ class XGrammarBackend(BackendBase): ): super().__init__(fd_config=fd_config) self.vocab_size = fd_config.model_config.vocab_size - self.batch_size = fd_config.parallel_config.max_num_seqs + self.batch_size = fd_config.scheduler_config.max_num_seqs self.any_whitespace = not fd_config.parallel_config.disable_any_whitespace diff --git a/fastdeploy/model_executor/layers/attention/attention.py b/fastdeploy/model_executor/layers/attention/attention.py index ce9c04fbf..d3730c9f3 100644 --- a/fastdeploy/model_executor/layers/attention/attention.py +++ b/fastdeploy/model_executor/layers/attention/attention.py @@ -152,7 +152,7 @@ class Attention(nn.Layer): self.cache_k_block_means = paddle.zeros( [ - fd_config.parallel_config.max_num_seqs, + fd_config.scheduler_config.max_num_seqs, moba_max_seq_length // moba_block_size, self.kv_num_heads, self.head_dim, diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index 6038fe4ca..15750d090 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -156,7 +156,7 @@ class FlashAttentionBackend(AttentionBackend): self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False) self.max_partition_size: int = int(os.getenv("FLAGS_max_partition_size", "32768")) self.zero_seq_enc_lens_for_decode = paddle.zeros( - shape=[fd_config.parallel_config.max_num_seqs, 1], dtype=paddle.int32 + shape=[fd_config.scheduler_config.max_num_seqs, 1], dtype=paddle.int32 ) def get_attntion_meta(self): diff --git a/fastdeploy/model_executor/layers/attention/moba_attention_backend.py b/fastdeploy/model_executor/layers/attention/moba_attention_backend.py index 024e97ee2..47c65624f 100644 --- a/fastdeploy/model_executor/layers/attention/moba_attention_backend.py +++ b/fastdeploy/model_executor/layers/attention/moba_attention_backend.py @@ -77,7 +77,7 @@ class MobaAttentionBackend(AttentionBackend): assert fd_config.moba_attention_config is not None, "moba_attention_config is None" self.block_size = fd_config.parallel_config.block_size self.max_seq_len = fd_config.parallel_config.max_model_len - self.max_num_seqs = fd_config.parallel_config.max_num_seqs + self.max_num_seqs = fd_config.scheduler_config.max_num_seqs self.kv_num_heads = kv_num_heads self.num_heads = num_heads self.head_dim = fd_config.model_config.head_dim diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py index a77c2f255..a7135a0e0 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py @@ -86,7 +86,7 @@ class GCUFlashAttnBackend(AttentionBackend): self.attention_metadata: GCUFlashAttnMetadata = None self.block_size = fd_config.cache_config.block_size self.max_seq_len = 
fd_config.parallel_config.max_model_len - self.max_num_seqs = fd_config.parallel_config.max_num_seqs + self.max_num_seqs = fd_config.scheduler_config.max_num_seqs self.causal = getattr(fd_config.model_config, "causal", True) diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py index 6af54ee9a..4901ecce2 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py @@ -84,7 +84,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend): self.attention_metadata: GCUMemEfficientAttnMetadata = None self.block_size = fd_config.cache_config.block_size self.max_seq_len = fd_config.parallel_config.max_model_len - self.max_num_seqs = fd_config.parallel_config.max_num_seqs + self.max_num_seqs = fd_config.scheduler_config.max_num_seqs self.causal = getattr(fd_config.model_config, "causal", True) diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index 51ae0aec4..60d2e663b 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -221,7 +221,7 @@ class Sampler(nn.Layer): ): early_stopper_cls = get_early_stopper_cls_from_stragegy(fd_config.early_stop_config.strategy) self.early_stopper = early_stopper_cls() - self.early_stopper.initialize(fd_config.parallel_config.max_num_seqs, fd_config.early_stop_config) + self.early_stopper.initialize(fd_config.scheduler_config.max_num_seqs, fd_config.early_stop_config) def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None): """set reasoning parser""" diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index 9058cda4a..07ca23e5d 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -607,9 +607,11 @@ class DeepseekV3ForCausalLM(ModelForCasualLM): num_embeddings=fd_config.model_config.vocab_size, prefix="lm_head", ) - self.position_ids_buffer = paddle.empty([fd_config.parallel_config.max_num_batched_tokens], dtype=paddle.int32) + self.position_ids_buffer = paddle.empty( + [fd_config.scheduler_config.max_num_batched_tokens], dtype=paddle.int32 + ) self.mask_encoder_batch_buffer = paddle.empty( - [fd_config.parallel_config.max_num_batched_tokens, 1], dtype=paddle.int32 + [fd_config.scheduler_config.max_num_batched_tokens, 1], dtype=paddle.int32 ) @classmethod diff --git a/fastdeploy/scheduler/config.py b/fastdeploy/scheduler/config.py index cd0a72af1..c9d65dfca 100644 --- a/fastdeploy/scheduler/config.py +++ b/fastdeploy/scheduler/config.py @@ -202,13 +202,12 @@ class SchedulerConfig: Creates appropriate config based on scheduler type (local/global). """ - def __init__(self, name="local", **kwargs): + def __init__(self, args): """ Initialize scheduler configuration factory. 
Args: - name: Scheduler type ("local" for LocalScheduler or "global" for GlobalScheduler) - **kwargs: Configuration parameters for the specific scheduler type + args: Configuration parameters for the specific scheduler type Initializes: - Appropriate config object based on scheduler type @@ -217,17 +216,23 @@ class SchedulerConfig: Raises: Exception: If invalid scheduler type is specified """ - self.name = name + self.name = "local" # "local" for LocalScheduler or "global" for GlobalScheduler + self.max_num_batched_tokens = 2048 + self.max_num_seqs = 34 self.config = None - if name == "local": - self.config = LocalSchedulerConfig(**kwargs) + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) - if name == "global": - self.config = GlobalSchedulerConfig(**kwargs) + if self.name == "local": + self.config = LocalSchedulerConfig(**args) - if name == "splitwise": - self.config = SplitWiseSchedulerConfig(**kwargs) + if self.name == "global": + self.config = GlobalSchedulerConfig(**args) + + if self.name == "splitwise": + self.config = SplitWiseSchedulerConfig(**args) def check(self): """ diff --git a/fastdeploy/spec_decode/base.py b/fastdeploy/spec_decode/base.py index 114bcc00c..5c395b0d5 100644 --- a/fastdeploy/spec_decode/base.py +++ b/fastdeploy/spec_decode/base.py @@ -50,8 +50,9 @@ class Proposer(ABC): self.speculative_config = self.cfg.speculative_config self.cache_config = self.cfg.cache_config self.quant_config = self.cfg.quant_config + self.scheduler_config = self.cfg.scheduler_config - self.max_num_seqs = self.parallel_config.max_num_seqs + self.max_num_seqs = self.scheduler_config.max_num_seqs self.max_model_len = self.parallel_config.max_model_len self.speculative_method = self.speculative_config.method self.max_draft_token_num = self.speculative_config.num_speculative_tokens diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index ee56d032e..3a1e425e3 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -89,9 +89,9 @@ class GCUModelRunner(ModelRunnerBase): self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes # Initialize share inputs - self._init_share_inputs(self.parallel_config.max_num_seqs) + self._init_share_inputs(self.scheduler_config.max_num_seqs) self.infer_seed_increment = paddle.full( - shape=[self.parallel_config.max_num_seqs, 1], + shape=[self.scheduler_config.max_num_seqs, 1], fill_value=4, dtype="int64", ).cpu() @@ -689,13 +689,13 @@ class GCUModelRunner(ModelRunnerBase): decoder_step_token_num = self.speculative_config.num_speculative_tokens + 1 group_size = np.ceil(num_heads / self.model_config.kv_num_heads) - decode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil( + decode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( (decoder_step_token_num * group_size) / decoder_block_shape_q ) - encode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil( + encode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( (self.model_config.max_model_len * group_size) / encoder_block_shape_q ) - kv_max_tile_size = self.parallel_config.max_num_seqs * np.ceil( + kv_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( self.model_config.max_model_len / self.fd_config.cache_config.block_size ) self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32") @@ -914,7 +914,7 @@ class GCUModelRunner(ModelRunnerBase): capture_sizes = self.cudagraph_capture_sizes.copy() for 
batch_size in sorted(capture_sizes, reverse=True): self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, + num_tokens=self.scheduler_config.max_num_batched_tokens, batch_size=batch_size, in_capturing=True, expected_decode_len=expected_decode_len, @@ -929,7 +929,7 @@ class GCUModelRunner(ModelRunnerBase): start_time = time.perf_counter() for batch_size in self.sot_warmup_sizes: self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, + num_tokens=self.scheduler_config.max_num_batched_tokens, batch_size=batch_size, ) logger.info(f"SOT warmup the model with the batch size:{batch_size}") @@ -1140,8 +1140,8 @@ class GCUModelRunner(ModelRunnerBase): # 2. Dummy run self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, - batch_size=min(self.parallel_config.max_num_seqs, 3), + num_tokens=self.scheduler_config.max_num_batched_tokens, + batch_size=min(self.scheduler_config.max_num_seqs, 3), ) # 3. gc diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 690740175..4883b8697 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -145,9 +145,9 @@ class GPUModelRunner(ModelRunnerBase): self.cudagraph_only_prefill = self.graph_opt_config.cudagraph_only_prefill # Initialize share inputs - self._init_share_inputs(self.parallel_config.max_num_seqs) + self._init_share_inputs(self.scheduler_config.max_num_seqs) self.infer_seed_increment = paddle.full( - shape=[self.parallel_config.max_num_seqs, 1], + shape=[self.scheduler_config.max_num_seqs, 1], fill_value=4, dtype="int64", ).cpu() @@ -1208,13 +1208,13 @@ class GPUModelRunner(ModelRunnerBase): # decode_max_tile_size must take into account the maximum case, where *1024 can cover 128K. 
decode_max_tile_size = ( 1024 - * self.parallel_config.max_num_seqs + * self.scheduler_config.max_num_seqs * np.ceil((decoder_step_token_num * group_size) / decoder_block_shape_q) ) - encode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil( + encode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( (self.model_config.max_model_len * group_size) / encoder_block_shape_q ) - kv_max_tile_size = self.parallel_config.max_num_seqs * np.ceil( + kv_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( self.model_config.max_model_len / self.fd_config.cache_config.block_size ) self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32") @@ -1508,7 +1508,7 @@ class GPUModelRunner(ModelRunnerBase): for num_tokens in sorted(capture_sizes, reverse=True): self._dummy_run( num_tokens=num_tokens, - batch_size=self.parallel_config.max_num_seqs, + batch_size=self.scheduler_config.max_num_seqs, in_capturing=True, expected_decode_len=expected_decode_len, capture_prefill=True, @@ -1519,7 +1519,7 @@ class GPUModelRunner(ModelRunnerBase): else: for batch_size in sorted(capture_sizes, reverse=True): self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, + num_tokens=self.scheduler_config.max_num_batched_tokens, batch_size=batch_size, in_capturing=True, expected_decode_len=expected_decode_len, @@ -1536,7 +1536,7 @@ class GPUModelRunner(ModelRunnerBase): start_time = time.perf_counter() for batch_size in self.sot_warmup_sizes: self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, + num_tokens=self.scheduler_config.max_num_batched_tokens, batch_size=batch_size, ) logger.info(f"SOT warmup the model with the batch size:{batch_size}") @@ -1815,8 +1815,8 @@ class GPUModelRunner(ModelRunnerBase): # 2. Dummy run self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, - batch_size=min(self.parallel_config.max_num_seqs, 3), + num_tokens=self.scheduler_config.max_num_batched_tokens, + batch_size=min(self.scheduler_config.max_num_seqs, 3), ) # 3. 
gc diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index b16d3f984..e92d43d7d 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -121,9 +121,9 @@ class MetaxModelRunner(ModelRunnerBase): self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes # Initialize share inputs - self._init_share_inputs(self.parallel_config.max_num_seqs) + self._init_share_inputs(self.scheduler_config.max_num_seqs) self.infer_seed_increment = paddle.full( - shape=[self.parallel_config.max_num_seqs, 1], + shape=[self.scheduler_config.max_num_seqs, 1], fill_value=4, dtype="int64", ).cpu() @@ -995,7 +995,7 @@ class MetaxModelRunner(ModelRunnerBase): encoder_block_shape_q = 64 decoder_block_shape_q = 16 decoder_step_token_num = self.speculative_config.num_speculative_tokens + 1 - decode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil( + decode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( (decoder_step_token_num * np.ceil(num_heads / self.model_config.kv_num_heads)) / decoder_block_shape_q ) self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32") @@ -1242,7 +1242,7 @@ class MetaxModelRunner(ModelRunnerBase): capture_sizes = self.cudagraph_capture_sizes.copy() for batch_size in sorted(capture_sizes, reverse=True): self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, + num_tokens=self.scheduler_config.max_num_batched_tokens, batch_size=batch_size, in_capturing=True, expected_decode_len=expected_decode_len, @@ -1257,7 +1257,7 @@ class MetaxModelRunner(ModelRunnerBase): start_time = time.perf_counter() for batch_size in self.sot_warmup_sizes: self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, + num_tokens=self.scheduler_config.max_num_batched_tokens, batch_size=batch_size, ) logger.info(f"SOT warmup the model with the batch size:{batch_size}") @@ -1489,8 +1489,8 @@ class MetaxModelRunner(ModelRunnerBase): # 2. Dummy run self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, - batch_size=min(self.parallel_config.max_num_seqs, 3), + num_tokens=self.scheduler_config.max_num_batched_tokens, + batch_size=min(self.scheduler_config.max_num_seqs, 3), ) # 3. gc diff --git a/fastdeploy/worker/model_runner_base.py b/fastdeploy/worker/model_runner_base.py index 4bebd02ef..699182576 100644 --- a/fastdeploy/worker/model_runner_base.py +++ b/fastdeploy/worker/model_runner_base.py @@ -45,6 +45,7 @@ class ModelRunnerBase(ABC): self.graph_opt_config = fd_config.graph_opt_config self.quant_config = fd_config.quant_config self.cache_config = fd_config.cache_config + self.scheduler_config = fd_config.scheduler_config # ... 
config self.device = device diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 6befe5fbd..411418977 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -44,6 +44,7 @@ from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue from fastdeploy.inter_communicator import IPCSignal from fastdeploy.model_executor.layers.quantization import parse_quant_config from fastdeploy.platforms import current_platform +from fastdeploy.scheduler import SchedulerConfig from fastdeploy.utils import get_logger from fastdeploy.worker.worker_base import WorkerBase @@ -662,6 +663,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: speculative_config = SpeculativeConfig(args.speculative_config) parallel_config = ParallelConfig(vars(args)) cache_config = CacheConfig(vars(args)) + scheduler_config = SchedulerConfig(vars(args)) parallel_config.tensor_parallel_rank = local_rank % parallel_config.tensor_parallel_size parallel_config.data_parallel_rank = local_rank // parallel_config.tensor_parallel_size # config for EP @@ -758,6 +760,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: graph_opt_config=graph_opt_config, early_stop_config=early_stop_config, cache_config=cache_config, + scheduler_config=scheduler_config, engine_worker_queue_port=args.engine_worker_queue_port, ips=args.ips, moba_attention_config=moba_attention_config, diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index b3bbf1fbf..452a56aa2 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -353,12 +353,12 @@ class XPUModelRunner(ModelRunnerBase): self.graph_opt_level = self.graph_opt_config.graph_opt_level self.use_cudagraph = False self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes - self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32") + self.input_ids = paddle.zeros(self.scheduler_config.max_num_seqs, dtype="int32") # Initialize share inputs - self._init_share_inputs(self.fd_config.parallel_config.max_num_seqs) + self._init_share_inputs(self.fd_config.scheduler_config.max_num_seqs) self.infer_seed_increment = paddle.full( - shape=[self.parallel_config.max_num_seqs, 1], + shape=[self.scheduler_config.max_num_seqs, 1], fill_value=4, dtype="int64", ).cpu() @@ -812,7 +812,7 @@ class XPUModelRunner(ModelRunnerBase): start_time = time.perf_counter() for batch_size in self.sot_warmup_sizes: self._dummy_run( - num_tokens=self.parallel_config.max_num_batched_tokens, + num_tokens=self.scheduler_config.max_num_batched_tokens, batch_size=batch_size, ) logger.info(f"SOT warmup the model with the batch size:{batch_size}") @@ -987,8 +987,8 @@ class XPUModelRunner(ModelRunnerBase): """Execute a forward pass with dummy inputs to profile the memory usage of the model.""" self._dummy_run( - num_tokens=int(self.parallel_config.max_num_batched_tokens), - batch_size=min(self.parallel_config.max_num_seqs, 1), + num_tokens=int(self.scheduler_config.max_num_batched_tokens), + batch_size=min(self.scheduler_config.max_num_seqs, 1), ) def clear_block_table(self) -> None: diff --git a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py index 4143fcfd6..958ade453 100644 --- a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py +++ b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py @@ -23,6 +23,7 @@ 
from fastdeploy.config import ( FDConfig, GraphOptimizationConfig, ParallelConfig, + SchedulerConfig, ) from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import ( @@ -152,14 +153,16 @@ class TestCUDAGrpahSubgraph(unittest.TestCase): # Set FastDeploy config graph_opt_config = GraphOptimizationConfig(args={}) graph_opt_config.use_cudagraph = True - parallel_config = ParallelConfig(args={}) - parallel_config.max_num_seqs = 8 + scheduler_config = SchedulerConfig(args={}) + scheduler_config.max_num_seqs = 8 cache_config = CacheConfig({}) + parallel_config = ParallelConfig(args={}) # Initialize cuda graph capture list - graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs) - graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs) + graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs) + graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs) fd_config = FDConfig( graph_opt_config=graph_opt_config, + scheduler_config=scheduler_config, parallel_config=parallel_config, cache_config=cache_config, test_mode=True, diff --git a/tests/graph_optimization/test_cuda_graph_recapture.py b/tests/graph_optimization/test_cuda_graph_recapture.py index 6b9fb5de5..94674b788 100644 --- a/tests/graph_optimization/test_cuda_graph_recapture.py +++ b/tests/graph_optimization/test_cuda_graph_recapture.py @@ -7,6 +7,7 @@ from fastdeploy.config import ( FDConfig, GraphOptimizationConfig, ParallelConfig, + SchedulerConfig, ) from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import ( @@ -90,11 +91,15 @@ class TestCUDAGrpahRecapture(unittest.TestCase): # Set FastDeploy config graph_opt_config = GraphOptimizationConfig(args={}) graph_opt_config.use_cudagraph = True - parallel_config = ParallelConfig(args={}) + scheduler_config = SchedulerConfig(args={}) cache_config = CacheConfig(args={}) - parallel_config.max_num_seqs = 1 + scheduler_config.max_num_seqs = 1 + parallel_config = ParallelConfig(args={}) fd_config = FDConfig( - graph_opt_config=graph_opt_config, parallel_config=parallel_config, cache_config=cache_config + graph_opt_config=graph_opt_config, + scheduler_config=scheduler_config, + cache_config=cache_config, + parallel_config=parallel_config, ) # Run Test Case1 diff --git a/tests/graph_optimization/test_cuda_graph_spec_decode.py b/tests/graph_optimization/test_cuda_graph_spec_decode.py index 9162d7173..ecd981901 100644 --- a/tests/graph_optimization/test_cuda_graph_spec_decode.py +++ b/tests/graph_optimization/test_cuda_graph_spec_decode.py @@ -23,6 +23,7 @@ from fastdeploy.config import ( FDConfig, GraphOptimizationConfig, ParallelConfig, + SchedulerConfig, ) from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import ( @@ -99,16 +100,18 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase): """Run test case""" graph_opt_config = GraphOptimizationConfig(args={}) graph_opt_config.use_cudagraph = True - parallel_config = ParallelConfig(args={}) - parallel_config.max_num_seqs = 1 + scheduler_config = SchedulerConfig(args={}) + scheduler_config.max_num_seqs = 1 cache_config = CacheConfig({}) + parallel_config = ParallelConfig(args={}) # Initialize cuda graph capture list - graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs) - 
graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs) + graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs) + graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs) fd_config = FDConfig( graph_opt_config=graph_opt_config, - parallel_config=parallel_config, + scheduler_config=scheduler_config, cache_config=cache_config, + parallel_config=parallel_config, test_mode=True, ) diff --git a/tests/graph_optimization/test_graph_opt_backend.py b/tests/graph_optimization/test_graph_opt_backend.py index 3a0c6d051..69059aad9 100644 --- a/tests/graph_optimization/test_graph_opt_backend.py +++ b/tests/graph_optimization/test_graph_opt_backend.py @@ -25,6 +25,7 @@ from fastdeploy.config import ( FDConfig, GraphOptimizationConfig, ParallelConfig, + SchedulerConfig, ) from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import ( @@ -85,15 +86,16 @@ class TestGraphOptBackend(unittest.TestCase): baseline_graph_opt_config.use_cudagraph = False baseline_graph_opt_config.graph_opt_level = 0 - baseline_parallel_config = ParallelConfig(args={}) - baseline_parallel_config.max_num_seqs = self.max_num_seqs + baseline_scheduler_config = SchedulerConfig(args={}) + baseline_scheduler_config.max_num_seqs = self.max_num_seqs baseline_cache_config = CacheConfig({}) - + baseline_parallel_config = ParallelConfig(args={}) self.baseline_fd_config = FDConfig( graph_opt_config=baseline_graph_opt_config, - parallel_config=baseline_parallel_config, + scheduler_config=baseline_scheduler_config, cache_config=baseline_cache_config, + parallel_config=baseline_parallel_config, test_mode=True, ) @@ -129,17 +131,19 @@ class TestGraphOptBackend(unittest.TestCase): graph_opt_config.graph_opt_level = graph_opt_level # Setup parallel config - parallel_config = ParallelConfig(args={}) - parallel_config.max_num_seqs = self.max_num_seqs + scheduler_config = SchedulerConfig(args={}) + scheduler_config.max_num_seqs = self.max_num_seqs # Setup cache config cache_config = CacheConfig({}) + parallel_config = ParallelConfig(args={}) # Create FD config return FDConfig( graph_opt_config=graph_opt_config, - parallel_config=parallel_config, + scheduler_config=scheduler_config, cache_config=cache_config, + parallel_config=parallel_config, test_mode=True, ) diff --git a/tests/graph_optimization/test_static_graph_cuda_graph_split.py b/tests/graph_optimization/test_static_graph_cuda_graph_split.py index faaad4127..6171416af 100644 --- a/tests/graph_optimization/test_static_graph_cuda_graph_split.py +++ b/tests/graph_optimization/test_static_graph_cuda_graph_split.py @@ -29,6 +29,7 @@ from fastdeploy.config import ( FDConfig, GraphOptimizationConfig, ParallelConfig, + SchedulerConfig, ) from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import ( @@ -88,15 +89,17 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase): """Run test case""" # Set FastDeploy config graph_opt_config = GraphOptimizationConfig({"use_cudagraph": True, "graph_opt_level": 1}) - parallel_config = ParallelConfig({"max_num_seqs": 1}) - graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs) - graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs) + scheduler_config = SchedulerConfig({"max_num_seqs": 1}) + graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs) + 
graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs) cache_config = CacheConfig({}) + parallel_config = ParallelConfig(args={}) fd_config = FDConfig( graph_opt_config=graph_opt_config, - parallel_config=parallel_config, + scheduler_config=scheduler_config, cache_config=cache_config, + parallel_config=parallel_config, test_mode=True, ) diff --git a/tests/utils.py b/tests/utils.py index 048410fc9..d62c8f173 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -21,6 +21,7 @@ from fastdeploy.config import ( FDConfig, GraphOptimizationConfig, ParallelConfig, + SchedulerConfig, ) @@ -50,12 +51,17 @@ class FakeModelConfig: def get_default_test_fd_config(): graph_opt_config = GraphOptimizationConfig(args={}) + scheduler_config = SchedulerConfig(args={}) + scheduler_config.max_num_seqs = 1 parallel_config = ParallelConfig(args={}) - parallel_config.max_num_seqs = 1 parallel_config.data_parallel_rank = 1 cache_config = CacheConfig({}) fd_config = FDConfig( - graph_opt_config=graph_opt_config, parallel_config=parallel_config, cache_config=cache_config, test_mode=True + graph_opt_config=graph_opt_config, + parallel_config=parallel_config, + cache_config=cache_config, + scheduler_config=scheduler_config, + test_mode=True, ) fd_config.model_config = FakeModelConfig() return fd_config diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index c68154013..ea41e54ee 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -6,6 +6,7 @@ from fastdeploy.config import ( FDConfig, GraphOptimizationConfig, ParallelConfig, + SchedulerConfig, ) @@ -14,10 +15,12 @@ class TestConfig(unittest.TestCase): parallel_config = ParallelConfig({"tensor_parallel_size": 16, "expert_parallel_size": 1}) graph_opt_config = GraphOptimizationConfig({}) cache_config = CacheConfig({}) + scheduler_config = SchedulerConfig({}) fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, cache_config=cache_config, + scheduler_config=scheduler_config, ips=["1.1.1.1", "0.0.0.0"], test_mode=True, ) @@ -28,10 +31,12 @@ class TestConfig(unittest.TestCase): parallel_config = ParallelConfig({}) graph_opt_config = GraphOptimizationConfig({}) cache_config = CacheConfig({}) + scheduler_config = SchedulerConfig({}) fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, cache_config=cache_config, + scheduler_config=scheduler_config, ips="0.0.0.0", test_mode=True, ) @@ -42,26 +47,29 @@ class TestConfig(unittest.TestCase): graph_opt_config = GraphOptimizationConfig({}) cache_config = CacheConfig({}) cache_config.enable_chunked_prefill = True + scheduler_config = SchedulerConfig({}) fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, cache_config=cache_config, + scheduler_config=scheduler_config, ips="0.0.0.0", test_mode=True, ) if not envs.ENABLE_V1_KVCACHE_SCHEDULER: - assert fd_config.max_num_batched_tokens == 2048 + assert fd_config.scheduler_config.max_num_batched_tokens == 2048 cache_config.enable_chunked_prefill = False fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, cache_config=cache_config, + scheduler_config=scheduler_config, ips="0.0.0.0", test_mode=True, ) if not envs.ENABLE_V1_KVCACHE_SCHEDULER: - assert fd_config.max_num_batched_tokens == 8192 + assert fd_config.scheduler_config.max_num_batched_tokens == 8192 def test_fdconfig_init_cache(self): parallel_config = ParallelConfig({}) @@ -69,10 +77,12 @@ class 
TestConfig(unittest.TestCase):
         cache_config = CacheConfig({})
         cache_config.cache_transfer_protocol = "rdma,ipc"
         cache_config.pd_comm_port = "2334"
+        scheduler_config = SchedulerConfig({})
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             splitwise_role="prefill",
             test_mode=True,
         )
diff --git a/tests/v1/test_prefix_cache.py b/tests/v1/test_prefix_cache.py
index b2ded9018..d671968bf 100644
--- a/tests/v1/test_prefix_cache.py
+++ b/tests/v1/test_prefix_cache.py
@@ -2,7 +2,7 @@ from dataclasses import asdict
 from types import SimpleNamespace

 from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager
-from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig
+from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig, SchedulerConfig
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.request import Request

@@ -18,6 +18,7 @@ def test_normal_case():
     model_cfg.print = print
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
+    scheduler_cfg = SchedulerConfig(args)
     graph_opt_cfg = engine_args.create_graph_optimization_config()
     fd_config = FDConfig(
         model_config=model_cfg,
@@ -25,7 +26,7 @@ def test_normal_case():
         parallel_config=parallel_cfg,
         graph_opt_config=graph_opt_cfg,
         speculative_config=speculative_cfg,
-        max_num_batched_tokens=engine_args.max_num_batched_tokens,
+        scheduler_config=scheduler_cfg,
     )
     cache_manager = PrefixCacheManager(config=fd_config, tensor_parallel_size=8, splitwise_role="mixed")
     req1 = Request.from_dict({"request_id": "req1", "prompt_token_ids": [1] * 3200, "prompt_token_ids_len": 3200})
diff --git a/tests/v1/test_schedule_output.py b/tests/v1/test_schedule_output.py
index d99a4ce76..b934a1434 100644
--- a/tests/v1/test_schedule_output.py
+++ b/tests/v1/test_schedule_output.py
@@ -1,7 +1,7 @@
 from dataclasses import asdict
 from types import SimpleNamespace

-from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig
+from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig, SchedulerConfig
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.request import Request
 from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
@@ -17,6 +17,7 @@ def test_normal_schedule():
     model_cfg.print = print
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
+    scheduler_cfg = SchedulerConfig(args)
     graph_opt_cfg = engine_args.create_graph_optimization_config()
     fd_config = FDConfig(
         model_config=model_cfg,
@@ -24,7 +25,7 @@ def test_normal_schedule():
         parallel_config=parallel_cfg,
         speculative_config=speculative_cfg,
         graph_opt_config=graph_opt_cfg,
-        max_num_batched_tokens=engine_args.max_num_batched_tokens,
+        scheduler_config=scheduler_cfg,
     )
     resource_manager_v1 = ResourceManagerV1(
         max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed"
     )
@@ -80,6 +81,7 @@ def test_preempted_request():
     model_cfg.print = print
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
+    scheduler_cfg = SchedulerConfig(args)
     graph_opt_cfg = engine_args.create_graph_optimization_config()
     fd_config = FDConfig(
         model_config=model_cfg,
@@ -87,7 +89,7 @@ def test_preempted_request():
         parallel_config=parallel_cfg,
         graph_opt_config=graph_opt_cfg,
         speculative_config=speculative_cfg,
-        max_num_batched_tokens=engine_args.max_num_batched_tokens,
+        scheduler_config=scheduler_cfg,
     )
     resource_manager_v1 =
ResourceManagerV1( max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed"
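
For context, a minimal usage sketch of the relocated scheduler settings (illustrative only, not part of the patch). It mirrors the updated tests above: SchedulerConfig is now built from a plain args dict and owns max_num_seqs (default 34) and max_num_batched_tokens (default 2048), which FDConfig consumers read through fd_config.scheduler_config rather than parallel_config or FDConfig attributes. The concrete values below, and the assumption that LocalSchedulerConfig tolerates the extra keys forwarded to it, are illustrative assumptions.

from fastdeploy.config import (
    CacheConfig,
    FDConfig,
    GraphOptimizationConfig,
    ParallelConfig,
    SchedulerConfig,
)

# SchedulerConfig now takes a plain dict of args; keys it recognizes are set as
# attributes, and the full dict is forwarded to the backend-specific config
# (LocalSchedulerConfig for the default "local" backend), which is assumed to
# accept or ignore the extra keys.
scheduler_config = SchedulerConfig({"max_num_seqs": 8, "max_num_batched_tokens": 4096})
assert scheduler_config.name == "local"  # default scheduler backend
assert scheduler_config.max_num_seqs == 8  # would default to 34 if omitted
assert scheduler_config.max_num_batched_tokens == 4096  # would default to 2048 if omitted

# FDConfig wiring as in the updated graph-optimization tests; batching limits
# are now read from scheduler_config instead of parallel_config.
fd_config = FDConfig(
    graph_opt_config=GraphOptimizationConfig(args={}),
    parallel_config=ParallelConfig(args={}),
    cache_config=CacheConfig({}),
    scheduler_config=scheduler_config,
    test_mode=True,
)
print(fd_config.scheduler_config.max_num_seqs, fd_config.scheduler_config.max_num_batched_tokens)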