[FDConfig]Remove max_num_batched_tokens/max_num_seqs in parallel config (#4116)

* remove max_num_batched_tokens in parallel config

* remove max_num_seqs

* update test case

* fix test

* fix

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Authored by YuanRisheng on 2025-09-17 10:43:35 +08:00; committed by GitHub.
parent c01a756912
commit 2e9e53ff7e
30 changed files with 169 additions and 131 deletions
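
The net effect of the diff below is that both limits are now read from the scheduler config instead of from ParallelConfig or FDConfig's own attributes. A minimal standalone mock of the new access pattern (these are not the real FastDeploy classes, and the real FDConfig constructor takes many more arguments; defaults mirror values visible in the diff):

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class SchedulerConfig:
    # Defaults shown here mirror the old FDConfig.__init__ defaults in the diff below.
    max_num_seqs: int = 8
    max_num_batched_tokens: Optional[int] = None


@dataclass
class FDConfigMock:
    scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig)
    max_model_len: int = 8192


cfg = FDConfigMock()
# Old-style reads such as cfg.max_num_seqs, cfg.max_num_batched_tokens or
# cfg.parallel_config.max_num_seqs are removed; go through scheduler_config instead.
print(cfg.scheduler_config.max_num_seqs, cfg.scheduler_config.max_num_batched_tokens)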


@@ -275,7 +275,6 @@ class ParallelConfig:
From old version worker args
TODO(gongshaotian): Reclassify
"""
- self.max_num_seqs: int = 34
# Set default block num for profile run
self.total_block_num: int = 2000
# block size
@@ -297,7 +296,6 @@ class ParallelConfig:
# Do profile or not
self.do_profile: bool = False
- self.max_num_batched_tokens: int = 2048
# splitwise role
self.splitwise_role: str = "mixed"
# guided decoding backend
@@ -1109,8 +1107,6 @@ class FDConfig:
speculative_config: SpeculativeConfig = None,
tokenizer: str = None,
max_model_len: int = 8192,
- max_num_seqs: int = 8,
- max_num_batched_tokens: Optional[int] = None,
ips: str = None,
use_warmup: bool = False,
engine_worker_queue_port: str = "8002",
@@ -1143,19 +1139,18 @@ class FDConfig:
self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config
# Initialize cuda graph capture list
if self.graph_opt_config.cudagraph_capture_sizes is None:
- self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)
+ self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
if self.graph_opt_config.cudagraph_only_prefill:
self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512)
else:
- self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.parallel_config.max_num_seqs)
+ self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs)
# TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
if self.graph_opt_config.graph_opt_level == 2:
self.graph_opt_config.graph_opt_level = 1
self.tokenizer = tokenizer
- self.max_num_batched_tokens = max_num_batched_tokens
self.ips = ips
self.tool_parser = tool_parser
@@ -1177,7 +1172,6 @@ class FDConfig:
self.node_rank = idx
self.max_model_len = max_model_len
- self.max_num_seqs = max_num_seqs
self.limit_mm_per_prompt = limit_mm_per_prompt
self.mm_processor_kwargs = mm_processor_kwargs
self.use_warmup = use_warmup
@@ -1243,22 +1237,22 @@ class FDConfig:
self.paddle_commit_id = paddle.version.commit
- if self.max_num_batched_tokens is None:
+ if self.scheduler_config.max_num_batched_tokens is None:
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
if paddle.is_compiled_with_xpu():
- self.max_num_batched_tokens = self.max_model_len
+ self.scheduler_config.max_num_batched_tokens = self.max_model_len
else:
- self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
+ self.scheduler_config.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
else:
if self.cache_config.enable_chunked_prefill:
- self.max_num_batched_tokens = 2048
+ self.scheduler_config.max_num_batched_tokens = 2048
else:
- self.max_num_batched_tokens = self.max_model_len
+ self.scheduler_config.max_num_batched_tokens = self.max_model_len
if self.long_prefill_token_threshold == 0:
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
- self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs)
+ self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs)
self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
if self.guided_decoding_backend == "auto":
@@ -1272,19 +1266,24 @@ class FDConfig:
"""
check the legality of config
"""
- assert self.max_num_seqs <= 256, (
- "The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}."
+ assert self.scheduler_config.max_num_seqs <= 256, (
+ "The parameter `max_num_seqs` is not allowed to exceed 256, "
+ f"but now it's {self.scheduler_config.max_num_seqs}."
)
assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1"
assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16"
- assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
- assert self.max_num_batched_tokens >= self.max_num_seqs, (
- f"max_num_batched_tokens: {self.max_num_batched_tokens} "
- f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}"
+ assert (
+ self.scheduler_config.max_num_seqs >= 1
+ ), f"max_num_seqs: {self.scheduler_config.max_num_seqs} should be larger than 1"
+ assert self.scheduler_config.max_num_batched_tokens >= self.scheduler_config.max_num_seqs, (
+ f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
+ f"should be larger than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs}"
)
- assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, (
- f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger"
- f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
+ assert (
+ self.scheduler_config.max_num_batched_tokens <= self.max_model_len * self.scheduler_config.max_num_seqs
+ ), (
+ f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} should be larger"
+ f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.max_model_len}"
)
assert (
self.max_num_partial_prefills >= 1
@@ -1305,13 +1304,13 @@ class FDConfig:
if not self.cache_config.enable_chunked_prefill:
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
- assert self.max_num_batched_tokens >= self.max_model_len, (
- f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+ assert self.scheduler_config.max_num_batched_tokens >= self.max_model_len, (
+ f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
)
else:
- assert self.max_num_batched_tokens >= self.cache_config.block_size, (
- f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+ assert self.scheduler_config.max_num_batched_tokens >= self.cache_config.block_size, (
+ f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
f"should be larger than or equal to block_size: {self.cache_config.block_size}"
)
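
Read together, the check() changes keep the same batching invariants but source both limits from scheduler_config. A self-contained sketch of the relationships being asserted (this is not FastDeploy code: the ENABLE_V1_KVCACHE_SCHEDULER gate is omitted, and the block_size and other values in the example call are illustrative only):

from dataclasses import dataclass


@dataclass
class SchedulerLimits:
    max_num_seqs: int
    max_num_batched_tokens: int


def check_limits(limits: SchedulerLimits, max_model_len: int, block_size: int,
                 enable_chunked_prefill: bool) -> None:
    # Per-step sequence cap must lie in [1, 256].
    assert 1 <= limits.max_num_seqs <= 256
    # The token budget must admit at least one token per scheduled sequence ...
    assert limits.max_num_batched_tokens >= limits.max_num_seqs
    # ... and never exceed a full context for every sequence.
    assert limits.max_num_batched_tokens <= max_model_len * limits.max_num_seqs
    if not enable_chunked_prefill:
        # Without chunked prefill, a whole prompt must fit in one batch.
        assert limits.max_num_batched_tokens >= max_model_len
    else:
        # With chunked prefill, each chunk must cover at least one KV-cache block.
        assert limits.max_num_batched_tokens >= block_size


# Example with values seen in the diff above (8 sequences, 8192-token budget),
# chunked prefill enabled, and an assumed block_size of 64.
check_limits(SchedulerLimits(max_num_seqs=8, max_num_batched_tokens=8192),
             max_model_len=8192, block_size=64, enable_chunked_prefill=True)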