[FDConfig]Remove max_num_batched_tokens/max_num_seqs in parallel config (#4116)

* remove max_num_batched_tokens in parallel config

* remove max_num_seqs

* update test case

* fix test

* fix

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Author: YuanRisheng
Date: 2025-09-17 10:43:35 +08:00
Committed by: GitHub
Parent: c01a756912
Commit: 2e9e53ff7e
30 changed files with 169 additions and 131 deletions
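For downstream callers, the practical effect of this commit is a relocation: the two batching limits move off FDConfig/ParallelConfig onto SchedulerConfig, and FDConfig now receives a SchedulerConfig instance instead of loose max_num_seqs / max_num_batched_tokens arguments. A minimal sketch of the new wiring, mirroring the updated tests further below (the values here are illustrative, not defaults introduced by this commit):

    from fastdeploy.config import (
        CacheConfig,
        FDConfig,
        GraphOptimizationConfig,
        ParallelConfig,
        SchedulerConfig,
    )

    # The limits now live on SchedulerConfig instead of ParallelConfig/FDConfig.
    scheduler_config = SchedulerConfig(args={})
    scheduler_config.max_num_seqs = 8
    scheduler_config.max_num_batched_tokens = 2048

    fd_config = FDConfig(
        graph_opt_config=GraphOptimizationConfig(args={}),
        parallel_config=ParallelConfig(args={}),
        cache_config=CacheConfig({}),
        scheduler_config=scheduler_config,  # new required wiring
        test_mode=True,
    )

    # Old access paths removed by this commit:
    #   fd_config.max_num_seqs, fd_config.max_num_batched_tokens,
    #   fd_config.parallel_config.max_num_seqs, fd_config.parallel_config.max_num_batched_tokens
    # New access path:
    assert fd_config.scheduler_config.max_num_seqs == 8
    assert fd_config.scheduler_config.max_num_batched_tokens == 2048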

View File

@@ -275,7 +275,6 @@ class ParallelConfig:
         From old wersion worker args
         TODO(gongshaotian): Reclassify
         """
-        self.max_num_seqs: int = 34
         # Set default block num for profile run
         self.total_block_num: int = 2000
         # block size
@@ -297,7 +296,6 @@ class ParallelConfig:
         # Do profile or not
         self.do_profile: bool = False
-        self.max_num_batched_tokens: int = 2048
         # splitwise role
         self.splitwise_role: str = "mixed"
         # guided decoding backend
@@ -1109,8 +1107,6 @@ class FDConfig:
         speculative_config: SpeculativeConfig = None,
         tokenizer: str = None,
         max_model_len: int = 8192,
-        max_num_seqs: int = 8,
-        max_num_batched_tokens: Optional[int] = None,
         ips: str = None,
         use_warmup: bool = False,
         engine_worker_queue_port: str = "8002",
@@ -1143,19 +1139,18 @@ class FDConfig:
         self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config
         # Initialize cuda graph capture list
         if self.graph_opt_config.cudagraph_capture_sizes is None:
-            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)
+            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
         if self.graph_opt_config.cudagraph_only_prefill:
             self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512)
         else:
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.parallel_config.max_num_seqs)
+            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs)
         # TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
         if self.graph_opt_config.graph_opt_level == 2:
             self.graph_opt_config.graph_opt_level = 1
         self.tokenizer = tokenizer
-        self.max_num_batched_tokens = max_num_batched_tokens
         self.ips = ips
         self.tool_parser = tool_parser
@@ -1177,7 +1172,6 @@ class FDConfig:
                 self.node_rank = idx
         self.max_model_len = max_model_len
-        self.max_num_seqs = max_num_seqs
         self.limit_mm_per_prompt = limit_mm_per_prompt
         self.mm_processor_kwargs = mm_processor_kwargs
         self.use_warmup = use_warmup
@@ -1243,22 +1237,22 @@ class FDConfig:
         self.paddle_commit_id = paddle.version.commit
-        if self.max_num_batched_tokens is None:
+        if self.scheduler_config.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
                 if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.scheduler_config.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.cache_config.enable_chunked_prefill:
-                    self.max_num_batched_tokens = 2048
+                    self.scheduler_config.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
-        self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs)
+        self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs)
         self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
         if self.guided_decoding_backend == "auto":
@@ -1272,19 +1266,24 @@ class FDConfig:
         """
         check the legality of config
         """
-        assert self.max_num_seqs <= 256, (
-            "The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}."
+        assert self.scheduler_config.max_num_seqs <= 256, (
+            "The parameter `max_num_seqs` is not allowed to exceed 256, "
+            f"but now it's {self.scheduler_config.max_num_seqs}."
         )
         assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1"
         assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16"
-        assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
-        assert self.max_num_batched_tokens >= self.max_num_seqs, (
-            f"max_num_batched_tokens: {self.max_num_batched_tokens} "
-            f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}"
+        assert (
+            self.scheduler_config.max_num_seqs >= 1
+        ), f"max_num_seqs: {self.scheduler_config.max_num_seqs} should be larger than 1"
+        assert self.scheduler_config.max_num_batched_tokens >= self.scheduler_config.max_num_seqs, (
+            f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
+            f"should be larger than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs}"
         )
-        assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, (
-            f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger"
-            f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
+        assert (
+            self.scheduler_config.max_num_batched_tokens <= self.max_model_len * self.scheduler_config.max_num_seqs
+        ), (
+            f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} should be larger"
+            f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.max_model_len}"
        )
         assert (
             self.max_num_partial_prefills >= 1
@@ -1305,13 +1304,13 @@ class FDConfig:
         if not self.cache_config.enable_chunked_prefill:
             if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                assert self.max_num_batched_tokens >= self.max_model_len, (
-                    f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+                assert self.scheduler_config.max_num_batched_tokens >= self.max_model_len, (
+                    f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
                     f"should be larger than or equal to max_model_len: {self.max_model_len}"
                 )
         else:
-            assert self.max_num_batched_tokens >= self.cache_config.block_size, (
-                f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+            assert self.scheduler_config.max_num_batched_tokens >= self.cache_config.block_size, (
+                f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
                 f"should be larger than or equal to block_size: {self.cache_config.block_size}"
             )
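The constraints themselves do not change; only the attribute paths move. A compact restatement of what the relocated checks require (a sketch for orientation, not the library code):

    # Sketch of the invariants FDConfig.check() enforces after this change.
    def sketch_check(fd_config):
        sched = fd_config.scheduler_config
        assert 1 <= sched.max_num_seqs <= 256
        assert sched.max_num_seqs <= sched.max_num_batched_tokens <= fd_config.max_model_len * sched.max_num_seqs
        if not fd_config.cache_config.enable_chunked_prefill:
            # Without chunked prefill (and without the V1 KV-cache scheduler),
            # a full-length prompt must fit into a single batch.
            assert sched.max_num_batched_tokens >= fd_config.max_model_len
        else:
            assert sched.max_num_batched_tokens >= fd_config.cache_config.block_size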

View File

@@ -943,23 +943,15 @@ class EngineArgs:
         """
         prefix = "scheduler_"
         prefix_len = len(prefix)
-        extra_params = [
-            "max_model_len",
-            "enable_chunked_prefill",
-            "max_num_partial_prefills",
-            "max_long_partial_prefills",
-            "long_prefill_token_threshold",
-        ]
         all = asdict(self)
         params = dict()
         for k, v in all.items():
             if k[:prefix_len] == prefix:
                 params[k[prefix_len:]] = v
-            elif k in extra_params:
+            else:
                 params[k] = v
-        return SchedulerConfig(**params)
+        return SchedulerConfig(params)

     def create_graph_optimization_config(self) -> GraphOptimizationConfig:
         """
@@ -1059,9 +1051,7 @@ class EngineArgs:
             load_config=load_cfg,
             parallel_config=parallel_cfg,
             max_model_len=self.max_model_len,
-            max_num_seqs=self.max_num_seqs,
             speculative_config=speculative_cfg,
-            max_num_batched_tokens=self.max_num_batched_tokens,
             ips=self.ips,
             use_warmup=self.use_warmup,
             engine_worker_queue_port=self.engine_worker_queue_port,
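With the hand-maintained allow-list gone, create_scheduler_config now forwards every engine argument to SchedulerConfig, stripping only the scheduler_ prefix. A small sketch of that key mapping (the field names here are illustrative):

    # Illustrative: how engine-arg keys reach SchedulerConfig after this change.
    prefix = "scheduler_"
    engine_args = {"scheduler_name": "local", "max_num_seqs": 8, "max_model_len": 8192}

    params = {}
    for k, v in engine_args.items():
        # "scheduler_name" -> "name"; non-prefixed keys now pass through unchanged.
        params[k[len(prefix):] if k.startswith(prefix) else k] = v

    assert params == {"name": "local", "max_num_seqs": 8, "max_model_len": 8192}
    # The whole dict is then handed to SchedulerConfig(params), which keeps the keys
    # it recognizes and forwards the rest to the concrete scheduler config.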

View File

@@ -71,7 +71,7 @@ class EngineService:
         if envs.ENABLE_V1_KVCACHE_SCHEDULER:
             self.resource_manager = ResourceManagerV1(
-                cfg.max_num_seqs,
+                cfg.scheduler_config.max_num_seqs,
                 cfg,
                 cfg.parallel_config.tensor_parallel_size,
                 cfg.splitwise_role,
@@ -83,7 +83,7 @@ class EngineService:
             )
         else:
             self.resource_manager = ResourceManager(
-                cfg.max_num_seqs,
+                cfg.scheduler_config.max_num_seqs,
                 cfg,
                 cfg.parallel_config.tensor_parallel_size,
                 cfg.splitwise_role,
@@ -109,7 +109,7 @@ class EngineService:
         self.partial_chunked_tokens = [0] * (self.cfg.max_num_partial_prefills + 1)
         for idx in range(1, self.cfg.max_num_partial_prefills + 1):
             self.partial_chunked_tokens[idx] = (
-                (self.cfg.max_num_batched_tokens // idx)
+                (self.cfg.scheduler_config.max_num_batched_tokens // idx)
                 // self.cfg.cache_config.block_size
                 * self.cfg.cache_config.block_size
             )
@@ -356,7 +356,7 @@ class EngineService:
         requests_chunk = [[] for _ in range(len(requests))]
         chunk_request_num = len(current_request_size)
         while chunk_request_num >= 1:
-            remain_batched_tokens = self.cfg.max_num_batched_tokens
+            remain_batched_tokens = self.cfg.scheduler_config.max_num_batched_tokens
             for idx in range(len(current_request_size)):
                 if current_request_size[idx] <= 0:
                     continue
@@ -496,7 +496,7 @@ class EngineService:
                 available_blocks=self.resource_manager.available_block_num(),
                 block_size=self.cfg.cache_config.block_size,
                 reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
-                max_num_batched_tokens=self.cfg.max_num_batched_tokens,
+                max_num_batched_tokens=self.cfg.scheduler_config.max_num_batched_tokens,
                 batch=num_prefill_batch,
             )

View File

@@ -469,7 +469,7 @@ class LLMEngine:
             ips = ",".join(self.cfg.ips)
         arguments = (
             f" --devices {self.cfg.device_ids} {py_script}"
-            f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
+            f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
             f" --model {self.cfg.model_config.model!s}"
             f" --device_ids {self.cfg.device_ids}"
@@ -482,7 +482,7 @@ class LLMEngine:
             f" --eos_tokens_lens {self.data_processor.eos_token_id_len}"
             f" --pad_token_id {self.data_processor.pad_token_id}"
             f" --engine_pid {self.cfg.engine_worker_queue_port[0]}"
-            f" --max_num_batched_tokens {self.cfg.max_num_batched_tokens}"
+            f" --max_num_batched_tokens {self.cfg.scheduler_config.max_num_batched_tokens}"
             f" --splitwise_role {self.cfg.splitwise_role}"
             f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
             f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"

View File

@@ -289,7 +289,7 @@ class ResourceManagerV1(ResourceManager):
         with self.lock:
             scheduled_reqs: list[Request] = []
             preempted_reqs: list[Request] = []
-            token_budget = self.config.max_num_batched_tokens
+            token_budget = self.config.scheduler_config.max_num_batched_tokens
             # First, schedule the RUNNING requests.
             req_index = 0

View File

@@ -210,7 +210,7 @@ class XGrammarBackend(BackendBase):
     ):
         super().__init__(fd_config=fd_config)
         self.vocab_size = fd_config.model_config.vocab_size
-        self.batch_size = fd_config.parallel_config.max_num_seqs
+        self.batch_size = fd_config.scheduler_config.max_num_seqs
         self.any_whitespace = not fd_config.parallel_config.disable_any_whitespace

View File

@@ -152,7 +152,7 @@ class Attention(nn.Layer):
             self.cache_k_block_means = paddle.zeros(
                 [
-                    fd_config.parallel_config.max_num_seqs,
+                    fd_config.scheduler_config.max_num_seqs,
                     moba_max_seq_length // moba_block_size,
                     self.kv_num_heads,
                     self.head_dim,

View File

@@ -156,7 +156,7 @@ class FlashAttentionBackend(AttentionBackend):
         self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False)
         self.max_partition_size: int = int(os.getenv("FLAGS_max_partition_size", "32768"))
         self.zero_seq_enc_lens_for_decode = paddle.zeros(
-            shape=[fd_config.parallel_config.max_num_seqs, 1], dtype=paddle.int32
+            shape=[fd_config.scheduler_config.max_num_seqs, 1], dtype=paddle.int32
         )

     def get_attntion_meta(self):

View File

@@ -77,7 +77,7 @@ class MobaAttentionBackend(AttentionBackend):
         assert fd_config.moba_attention_config is not None, "moba_attention_config is None"
         self.block_size = fd_config.parallel_config.block_size
         self.max_seq_len = fd_config.parallel_config.max_model_len
-        self.max_num_seqs = fd_config.parallel_config.max_num_seqs
+        self.max_num_seqs = fd_config.scheduler_config.max_num_seqs
         self.kv_num_heads = kv_num_heads
         self.num_heads = num_heads
         self.head_dim = fd_config.model_config.head_dim

View File

@@ -86,7 +86,7 @@ class GCUFlashAttnBackend(AttentionBackend):
         self.attention_metadata: GCUFlashAttnMetadata = None
         self.block_size = fd_config.cache_config.block_size
         self.max_seq_len = fd_config.parallel_config.max_model_len
-        self.max_num_seqs = fd_config.parallel_config.max_num_seqs
+        self.max_num_seqs = fd_config.scheduler_config.max_num_seqs
         self.causal = getattr(fd_config.model_config, "causal", True)

View File

@@ -84,7 +84,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
         self.attention_metadata: GCUMemEfficientAttnMetadata = None
         self.block_size = fd_config.cache_config.block_size
         self.max_seq_len = fd_config.parallel_config.max_model_len
-        self.max_num_seqs = fd_config.parallel_config.max_num_seqs
+        self.max_num_seqs = fd_config.scheduler_config.max_num_seqs
         self.causal = getattr(fd_config.model_config, "causal", True)

View File

@@ -221,7 +221,7 @@ class Sampler(nn.Layer):
         ):
             early_stopper_cls = get_early_stopper_cls_from_stragegy(fd_config.early_stop_config.strategy)
             self.early_stopper = early_stopper_cls()
-            self.early_stopper.initialize(fd_config.parallel_config.max_num_seqs, fd_config.early_stop_config)
+            self.early_stopper.initialize(fd_config.scheduler_config.max_num_seqs, fd_config.early_stop_config)

     def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None):
         """set reasoning parser"""

View File

@@ -607,9 +607,11 @@ class DeepseekV3ForCausalLM(ModelForCasualLM):
             num_embeddings=fd_config.model_config.vocab_size,
             prefix="lm_head",
         )
-        self.position_ids_buffer = paddle.empty([fd_config.parallel_config.max_num_batched_tokens], dtype=paddle.int32)
+        self.position_ids_buffer = paddle.empty(
+            [fd_config.scheduler_config.max_num_batched_tokens], dtype=paddle.int32
+        )
         self.mask_encoder_batch_buffer = paddle.empty(
-            [fd_config.parallel_config.max_num_batched_tokens, 1], dtype=paddle.int32
+            [fd_config.scheduler_config.max_num_batched_tokens, 1], dtype=paddle.int32
         )

     @classmethod

View File

@@ -202,13 +202,12 @@ class SchedulerConfig:
     Creates appropriate config based on scheduler type (local/global).
     """

-    def __init__(self, name="local", **kwargs):
+    def __init__(self, args):
         """
         Initialize scheduler configuration factory.

         Args:
-            name: Scheduler type ("local" for LocalScheduler or "global" for GlobalScheduler)
-            **kwargs: Configuration parameters for the specific scheduler type
+            args: Configuration parameters for the specific scheduler type

         Initializes:
             - Appropriate config object based on scheduler type
@@ -217,17 +216,23 @@ class SchedulerConfig:
         Raises:
             Exception: If invalid scheduler type is specified
         """
-        self.name = name
+        self.name = "local"  # "local" for LocalScheduler or "global" for GlobalScheduler
+        self.max_num_batched_tokens = 2048
+        self.max_num_seqs = 34
         self.config = None
-        if name == "local":
-            self.config = LocalSchedulerConfig(**kwargs)
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)

-        if name == "global":
-            self.config = GlobalSchedulerConfig(**kwargs)
+        if self.name == "local":
+            self.config = LocalSchedulerConfig(**args)

-        if name == "splitwise":
-            self.config = SplitWiseSchedulerConfig(**kwargs)
+        if self.name == "global":
+            self.config = GlobalSchedulerConfig(**args)
+
+        if self.name == "splitwise":
+            self.config = SplitWiseSchedulerConfig(**args)

     def check(self):
         """

View File

@@ -50,8 +50,9 @@ class Proposer(ABC):
         self.speculative_config = self.cfg.speculative_config
         self.cache_config = self.cfg.cache_config
         self.quant_config = self.cfg.quant_config
+        self.scheduler_config = self.cfg.scheduler_config

-        self.max_num_seqs = self.parallel_config.max_num_seqs
+        self.max_num_seqs = self.scheduler_config.max_num_seqs
         self.max_model_len = self.parallel_config.max_model_len
         self.speculative_method = self.speculative_config.method
         self.max_draft_token_num = self.speculative_config.num_speculative_tokens

View File

@@ -89,9 +89,9 @@ class GCUModelRunner(ModelRunnerBase):
         self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
         # Initialize share inputs
-        self._init_share_inputs(self.parallel_config.max_num_seqs)
+        self._init_share_inputs(self.scheduler_config.max_num_seqs)
         self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
+            shape=[self.scheduler_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
         ).cpu()
@@ -689,13 +689,13 @@ class GCUModelRunner(ModelRunnerBase):
         decoder_step_token_num = self.speculative_config.num_speculative_tokens + 1
         group_size = np.ceil(num_heads / self.model_config.kv_num_heads)
-        decode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        decode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             (decoder_step_token_num * group_size) / decoder_block_shape_q
         )
-        encode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        encode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             (self.model_config.max_model_len * group_size) / encoder_block_shape_q
         )
-        kv_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        kv_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             self.model_config.max_model_len / self.fd_config.cache_config.block_size
         )
         self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
@@ -914,7 +914,7 @@ class GCUModelRunner(ModelRunnerBase):
         capture_sizes = self.cudagraph_capture_sizes.copy()
         for batch_size in sorted(capture_sizes, reverse=True):
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
                 in_capturing=True,
                 expected_decode_len=expected_decode_len,
@@ -929,7 +929,7 @@ class GCUModelRunner(ModelRunnerBase):
         start_time = time.perf_counter()
         for batch_size in self.sot_warmup_sizes:
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
             )
             logger.info(f"SOT warmup the model with the batch size:{batch_size}")
@@ -1140,8 +1140,8 @@ class GCUModelRunner(ModelRunnerBase):
         # 2. Dummy run
         self._dummy_run(
-            num_tokens=self.parallel_config.max_num_batched_tokens,
-            batch_size=min(self.parallel_config.max_num_seqs, 3),
+            num_tokens=self.scheduler_config.max_num_batched_tokens,
+            batch_size=min(self.scheduler_config.max_num_seqs, 3),
         )

         # 3. gc

View File

@@ -145,9 +145,9 @@ class GPUModelRunner(ModelRunnerBase):
         self.cudagraph_only_prefill = self.graph_opt_config.cudagraph_only_prefill
         # Initialize share inputs
-        self._init_share_inputs(self.parallel_config.max_num_seqs)
+        self._init_share_inputs(self.scheduler_config.max_num_seqs)
         self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
+            shape=[self.scheduler_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
         ).cpu()
@@ -1208,13 +1208,13 @@ class GPUModelRunner(ModelRunnerBase):
         # decode_max_tile_size must take into account the maximum case, where *1024 can cover 128K.
         decode_max_tile_size = (
             1024
-            * self.parallel_config.max_num_seqs
+            * self.scheduler_config.max_num_seqs
             * np.ceil((decoder_step_token_num * group_size) / decoder_block_shape_q)
         )
-        encode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        encode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             (self.model_config.max_model_len * group_size) / encoder_block_shape_q
         )
-        kv_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        kv_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             self.model_config.max_model_len / self.fd_config.cache_config.block_size
         )
         self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
@@ -1508,7 +1508,7 @@ class GPUModelRunner(ModelRunnerBase):
             for num_tokens in sorted(capture_sizes, reverse=True):
                 self._dummy_run(
                     num_tokens=num_tokens,
-                    batch_size=self.parallel_config.max_num_seqs,
+                    batch_size=self.scheduler_config.max_num_seqs,
                     in_capturing=True,
                     expected_decode_len=expected_decode_len,
                     capture_prefill=True,
@@ -1519,7 +1519,7 @@ class GPUModelRunner(ModelRunnerBase):
         else:
             for batch_size in sorted(capture_sizes, reverse=True):
                 self._dummy_run(
-                    num_tokens=self.parallel_config.max_num_batched_tokens,
+                    num_tokens=self.scheduler_config.max_num_batched_tokens,
                     batch_size=batch_size,
                     in_capturing=True,
                     expected_decode_len=expected_decode_len,
@@ -1536,7 +1536,7 @@ class GPUModelRunner(ModelRunnerBase):
         start_time = time.perf_counter()
         for batch_size in self.sot_warmup_sizes:
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
             )
             logger.info(f"SOT warmup the model with the batch size:{batch_size}")
@@ -1815,8 +1815,8 @@ class GPUModelRunner(ModelRunnerBase):
         # 2. Dummy run
         self._dummy_run(
-            num_tokens=self.parallel_config.max_num_batched_tokens,
-            batch_size=min(self.parallel_config.max_num_seqs, 3),
+            num_tokens=self.scheduler_config.max_num_batched_tokens,
+            batch_size=min(self.scheduler_config.max_num_seqs, 3),
        )

         # 3. gc

View File

@@ -121,9 +121,9 @@ class MetaxModelRunner(ModelRunnerBase):
         self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
         # Initialize share inputs
-        self._init_share_inputs(self.parallel_config.max_num_seqs)
+        self._init_share_inputs(self.scheduler_config.max_num_seqs)
         self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
+            shape=[self.scheduler_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
         ).cpu()
@@ -995,7 +995,7 @@ class MetaxModelRunner(ModelRunnerBase):
         encoder_block_shape_q = 64
         decoder_block_shape_q = 16
         decoder_step_token_num = self.speculative_config.num_speculative_tokens + 1
-        decode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        decode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             (decoder_step_token_num * np.ceil(num_heads / self.model_config.kv_num_heads)) / decoder_block_shape_q
         )
         self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
@@ -1242,7 +1242,7 @@ class MetaxModelRunner(ModelRunnerBase):
         capture_sizes = self.cudagraph_capture_sizes.copy()
         for batch_size in sorted(capture_sizes, reverse=True):
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
                 in_capturing=True,
                 expected_decode_len=expected_decode_len,
@@ -1257,7 +1257,7 @@ class MetaxModelRunner(ModelRunnerBase):
         start_time = time.perf_counter()
         for batch_size in self.sot_warmup_sizes:
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
             )
             logger.info(f"SOT warmup the model with the batch size:{batch_size}")
@@ -1489,8 +1489,8 @@ class MetaxModelRunner(ModelRunnerBase):
         # 2. Dummy run
         self._dummy_run(
-            num_tokens=self.parallel_config.max_num_batched_tokens,
-            batch_size=min(self.parallel_config.max_num_seqs, 3),
+            num_tokens=self.scheduler_config.max_num_batched_tokens,
+            batch_size=min(self.scheduler_config.max_num_seqs, 3),
        )

         # 3. gc

View File

@@ -45,6 +45,7 @@ class ModelRunnerBase(ABC):
         self.graph_opt_config = fd_config.graph_opt_config
         self.quant_config = fd_config.quant_config
         self.cache_config = fd_config.cache_config
+        self.scheduler_config = fd_config.scheduler_config
         # ... config
         self.device = device

View File

@@ -44,6 +44,7 @@ from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
 from fastdeploy.inter_communicator import IPCSignal
 from fastdeploy.model_executor.layers.quantization import parse_quant_config
 from fastdeploy.platforms import current_platform
+from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.utils import get_logger
 from fastdeploy.worker.worker_base import WorkerBase
@@ -662,6 +663,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     speculative_config = SpeculativeConfig(args.speculative_config)
     parallel_config = ParallelConfig(vars(args))
     cache_config = CacheConfig(vars(args))
+    scheduler_config = SchedulerConfig(vars(args))
     parallel_config.tensor_parallel_rank = local_rank % parallel_config.tensor_parallel_size
     parallel_config.data_parallel_rank = local_rank // parallel_config.tensor_parallel_size
     # config for EP
@@ -758,6 +760,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         graph_opt_config=graph_opt_config,
         early_stop_config=early_stop_config,
         cache_config=cache_config,
+        scheduler_config=scheduler_config,
         engine_worker_queue_port=args.engine_worker_queue_port,
         ips=args.ips,
         moba_attention_config=moba_attention_config,

View File

@@ -353,12 +353,12 @@ class XPUModelRunner(ModelRunnerBase):
         self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = False
         self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
-        self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
+        self.input_ids = paddle.zeros(self.scheduler_config.max_num_seqs, dtype="int32")
         # Initialize share inputs
-        self._init_share_inputs(self.fd_config.parallel_config.max_num_seqs)
+        self._init_share_inputs(self.fd_config.scheduler_config.max_num_seqs)
         self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
+            shape=[self.scheduler_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
         ).cpu()
@@ -812,7 +812,7 @@ class XPUModelRunner(ModelRunnerBase):
         start_time = time.perf_counter()
         for batch_size in self.sot_warmup_sizes:
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
             )
             logger.info(f"SOT warmup the model with the batch size:{batch_size}")
@@ -987,8 +987,8 @@ class XPUModelRunner(ModelRunnerBase):
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
         self._dummy_run(
-            num_tokens=int(self.parallel_config.max_num_batched_tokens),
-            batch_size=min(self.parallel_config.max_num_seqs, 1),
+            num_tokens=int(self.scheduler_config.max_num_batched_tokens),
+            batch_size=min(self.scheduler_config.max_num_seqs, 1),
        )

     def clear_block_table(self) -> None:

View File

@@ -23,6 +23,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -152,14 +153,16 @@ class TestCUDAGrpahSubgraph(unittest.TestCase):
         # Set FastDeploy config
         graph_opt_config = GraphOptimizationConfig(args={})
         graph_opt_config.use_cudagraph = True
-        parallel_config = ParallelConfig(args={})
-        parallel_config.max_num_seqs = 8
+        scheduler_config = SchedulerConfig(args={})
+        scheduler_config.max_num_seqs = 8
         cache_config = CacheConfig({})
+        parallel_config = ParallelConfig(args={})
         # Initialize cuda graph capture list
-        graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs)
-        graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs)
+        graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
+        graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
+            scheduler_config=scheduler_config,
             parallel_config=parallel_config,
             cache_config=cache_config,
             test_mode=True,

View File

@@ -7,6 +7,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -90,11 +91,15 @@ class TestCUDAGrpahRecapture(unittest.TestCase):
         # Set FastDeploy config
         graph_opt_config = GraphOptimizationConfig(args={})
         graph_opt_config.use_cudagraph = True
-        parallel_config = ParallelConfig(args={})
+        scheduler_config = SchedulerConfig(args={})
         cache_config = CacheConfig(args={})
-        parallel_config.max_num_seqs = 1
+        scheduler_config.max_num_seqs = 1
+        parallel_config = ParallelConfig(args={})
         fd_config = FDConfig(
-            graph_opt_config=graph_opt_config, parallel_config=parallel_config, cache_config=cache_config
+            graph_opt_config=graph_opt_config,
+            scheduler_config=scheduler_config,
+            cache_config=cache_config,
+            parallel_config=parallel_config,
         )

         # Run Test Case1

View File

@@ -23,6 +23,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -99,16 +100,18 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase):
         """Run test case"""
         graph_opt_config = GraphOptimizationConfig(args={})
         graph_opt_config.use_cudagraph = True
-        parallel_config = ParallelConfig(args={})
-        parallel_config.max_num_seqs = 1
+        scheduler_config = SchedulerConfig(args={})
+        scheduler_config.max_num_seqs = 1
         cache_config = CacheConfig({})
+        parallel_config = ParallelConfig(args={})
         # Initialize cuda graph capture list
-        graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs)
-        graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs)
+        graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
+        graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
-            parallel_config=parallel_config,
+            scheduler_config=scheduler_config,
             cache_config=cache_config,
+            parallel_config=parallel_config,
             test_mode=True,
         )

View File

@@ -25,6 +25,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -85,15 +86,16 @@ class TestGraphOptBackend(unittest.TestCase):
         baseline_graph_opt_config.use_cudagraph = False
         baseline_graph_opt_config.graph_opt_level = 0
-        baseline_parallel_config = ParallelConfig(args={})
-        baseline_parallel_config.max_num_seqs = self.max_num_seqs
+        baseline_scheduler_config = SchedulerConfig(args={})
+        baseline_scheduler_config.max_num_seqs = self.max_num_seqs
         baseline_cache_config = CacheConfig({})
+        baseline_parallel_config = ParallelConfig(args={})
         self.baseline_fd_config = FDConfig(
             graph_opt_config=baseline_graph_opt_config,
-            parallel_config=baseline_parallel_config,
+            scheduler_config=baseline_scheduler_config,
             cache_config=baseline_cache_config,
+            parallel_config=baseline_parallel_config,
             test_mode=True,
         )
@@ -129,17 +131,19 @@ class TestGraphOptBackend(unittest.TestCase):
         graph_opt_config.graph_opt_level = graph_opt_level

         # Setup parallel config
-        parallel_config = ParallelConfig(args={})
-        parallel_config.max_num_seqs = self.max_num_seqs
+        scheduler_config = SchedulerConfig(args={})
+        scheduler_config.max_num_seqs = self.max_num_seqs

         # Setup cache config
         cache_config = CacheConfig({})
+        parallel_config = ParallelConfig(args={})

         # Create FD config
         return FDConfig(
             graph_opt_config=graph_opt_config,
-            parallel_config=parallel_config,
+            scheduler_config=scheduler_config,
             cache_config=cache_config,
+            parallel_config=parallel_config,
             test_mode=True,
         )

View File

@@ -29,6 +29,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -88,15 +89,17 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase):
         """Run test case"""
         # Set FastDeploy config
         graph_opt_config = GraphOptimizationConfig({"use_cudagraph": True, "graph_opt_level": 1})
-        parallel_config = ParallelConfig({"max_num_seqs": 1})
-        graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs)
-        graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs)
+        scheduler_config = SchedulerConfig({"max_num_seqs": 1})
+        graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
+        graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
         cache_config = CacheConfig({})
+        parallel_config = ParallelConfig(args={})
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
-            parallel_config=parallel_config,
+            scheduler_config=scheduler_config,
             cache_config=cache_config,
+            parallel_config=parallel_config,
             test_mode=True,
         )

View File

@@ -21,6 +21,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
@@ -50,12 +51,17 @@ class FakeModelConfig:
 def get_default_test_fd_config():
     graph_opt_config = GraphOptimizationConfig(args={})
+    scheduler_config = SchedulerConfig(args={})
+    scheduler_config.max_num_seqs = 1
     parallel_config = ParallelConfig(args={})
-    parallel_config.max_num_seqs = 1
     parallel_config.data_parallel_rank = 1
     cache_config = CacheConfig({})
     fd_config = FDConfig(
-        graph_opt_config=graph_opt_config, parallel_config=parallel_config, cache_config=cache_config, test_mode=True
+        graph_opt_config=graph_opt_config,
+        parallel_config=parallel_config,
+        cache_config=cache_config,
+        scheduler_config=scheduler_config,
+        test_mode=True,
     )
     fd_config.model_config = FakeModelConfig()
     return fd_config

View File

@@ -6,6 +6,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
@@ -14,10 +15,12 @@ class TestConfig(unittest.TestCase):
         parallel_config = ParallelConfig({"tensor_parallel_size": 16, "expert_parallel_size": 1})
         graph_opt_config = GraphOptimizationConfig({})
         cache_config = CacheConfig({})
+        scheduler_config = SchedulerConfig({})
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             ips=["1.1.1.1", "0.0.0.0"],
             test_mode=True,
         )
@@ -28,10 +31,12 @@ class TestConfig(unittest.TestCase):
         parallel_config = ParallelConfig({})
         graph_opt_config = GraphOptimizationConfig({})
         cache_config = CacheConfig({})
+        scheduler_config = SchedulerConfig({})
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             ips="0.0.0.0",
             test_mode=True,
         )
@@ -42,26 +47,29 @@ class TestConfig(unittest.TestCase):
         graph_opt_config = GraphOptimizationConfig({})
         cache_config = CacheConfig({})
         cache_config.enable_chunked_prefill = True
+        scheduler_config = SchedulerConfig({})
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             ips="0.0.0.0",
             test_mode=True,
         )
         if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-            assert fd_config.max_num_batched_tokens == 2048
+            assert fd_config.scheduler_config.max_num_batched_tokens == 2048

         cache_config.enable_chunked_prefill = False
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             ips="0.0.0.0",
             test_mode=True,
         )
         if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-            assert fd_config.max_num_batched_tokens == 8192
+            assert fd_config.scheduler_config.max_num_batched_tokens == 8192

     def test_fdconfig_init_cache(self):
         parallel_config = ParallelConfig({})
@@ -69,10 +77,12 @@ class TestConfig(unittest.TestCase):
         cache_config = CacheConfig({})
         cache_config.cache_transfer_protocol = "rdma,ipc"
         cache_config.pd_comm_port = "2334"
+        scheduler_config = SchedulerConfig({})
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             splitwise_role="prefill",
             test_mode=True,
         )

View File

@@ -2,7 +2,7 @@ from dataclasses import asdict
 from types import SimpleNamespace

 from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager
-from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig
+from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig, SchedulerConfig
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.request import Request
@@ -18,6 +18,7 @@ def test_normal_case():
     model_cfg.print = print
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
+    scheduler_cfg = SchedulerConfig(args)
     graph_opt_cfg = engine_args.create_graph_optimization_config()
     fd_config = FDConfig(
         model_config=model_cfg,
@@ -25,7 +26,7 @@ def test_normal_case():
         parallel_config=parallel_cfg,
         graph_opt_config=graph_opt_cfg,
         speculative_config=speculative_cfg,
-        max_num_batched_tokens=engine_args.max_num_batched_tokens,
+        scheduler_cfg=scheduler_cfg,
     )
     cache_manager = PrefixCacheManager(config=fd_config, tensor_parallel_size=8, splitwise_role="mixed")
     req1 = Request.from_dict({"request_id": "req1", "prompt_token_ids": [1] * 3200, "prompt_token_ids_len": 3200})

View File

@@ -1,7 +1,7 @@
 from dataclasses import asdict
 from types import SimpleNamespace

-from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig
+from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig, SchedulerConfig
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.request import Request
 from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
@@ -17,6 +17,7 @@ def test_normal_schedule():
     model_cfg.print = print
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
+    scheduler_cfg = SchedulerConfig(args)
     graph_opt_cfg = engine_args.create_graph_optimization_config()
     fd_config = FDConfig(
         model_config=model_cfg,
@@ -24,7 +25,7 @@ def test_normal_schedule():
         parallel_config=parallel_cfg,
         speculative_config=speculative_cfg,
         graph_opt_config=graph_opt_cfg,
-        max_num_batched_tokens=engine_args.max_num_batched_tokens,
+        scheduler_config=scheduler_cfg,
     )
     resource_manager_v1 = ResourceManagerV1(
         max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed"
@@ -80,6 +81,7 @@ def test_preempted_request():
     model_cfg.print = print
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
+    scheduler_cfg = SchedulerConfig(args)
     graph_opt_cfg = engine_args.create_graph_optimization_config()
     fd_config = FDConfig(
         model_config=model_cfg,
@@ -87,7 +89,7 @@ def test_preempted_request():
         parallel_config=parallel_cfg,
         graph_opt_config=graph_opt_cfg,
         speculative_config=speculative_cfg,
-        max_num_batched_tokens=engine_args.max_num_batched_tokens,
+        scheduler_config=scheduler_cfg,
     )
     resource_manager_v1 = ResourceManagerV1(
         max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed"