Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[FDConfig]Remove max_num_batched_tokens/max_num_seqs in parallel config (#4116)
* remove max_num_batched_tokens in parallel config
* remove max_num_seqs
* update test case
* fix test
* fix

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
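The diff below moves the batching limits off `ParallelConfig`: `max_num_seqs` and `max_num_batched_tokens` are now read from `scheduler_config` wherever `FDConfig` defaults, postprocesses, or validates them. The following minimal, standalone sketch illustrates that access pattern only; the class names `SchedulerConfigSketch`/`FDConfigSketch` and the simplified defaulting rule are illustrative assumptions, not FastDeploy's actual API.

from dataclasses import dataclass
from typing import Optional


@dataclass
class SchedulerConfigSketch:
    # Hypothetical stand-in for the scheduler-config fields touched by this change.
    max_num_seqs: int = 8
    max_num_batched_tokens: Optional[int] = None


@dataclass
class FDConfigSketch:
    scheduler_config: SchedulerConfigSketch
    max_model_len: int = 8192

    def postprocess(self) -> None:
        # Rough analogue of the fallback in the diff: choose a default token
        # budget when max_num_batched_tokens was not set explicitly.
        if self.scheduler_config.max_num_batched_tokens is None:
            self.scheduler_config.max_num_batched_tokens = min(self.max_model_len, 8192)

    def check(self) -> None:
        # Same shape as the validation in the diff, now against scheduler_config.
        assert self.scheduler_config.max_num_seqs <= 256
        assert self.scheduler_config.max_num_batched_tokens >= self.scheduler_config.max_num_seqs


cfg = FDConfigSketch(scheduler_config=SchedulerConfigSketch(max_num_seqs=8))
cfg.postprocess()
cfg.check()
print(cfg.scheduler_config.max_num_batched_tokens)  # 8192 with the defaults above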
@@ -275,7 +275,6 @@ class ParallelConfig:
         From old wersion worker args
         TODO(gongshaotian): Reclassify
         """
-        self.max_num_seqs: int = 34
         # Set default block num for profile run
         self.total_block_num: int = 2000
         # block size
@@ -297,7 +296,6 @@ class ParallelConfig:
         # Do profile or not
         self.do_profile: bool = False

-        self.max_num_batched_tokens: int = 2048
         # splitwise role
         self.splitwise_role: str = "mixed"
         # guided decoding backend
@@ -1109,8 +1107,6 @@ class FDConfig:
         speculative_config: SpeculativeConfig = None,
         tokenizer: str = None,
         max_model_len: int = 8192,
-        max_num_seqs: int = 8,
-        max_num_batched_tokens: Optional[int] = None,
         ips: str = None,
         use_warmup: bool = False,
         engine_worker_queue_port: str = "8002",
@@ -1143,19 +1139,18 @@ class FDConfig:
         self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config
         # Initialize cuda graph capture list
         if self.graph_opt_config.cudagraph_capture_sizes is None:
-            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)
+            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)

         if self.graph_opt_config.cudagraph_only_prefill:
             self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512)
         else:
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.parallel_config.max_num_seqs)
+            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs)

         # TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
         if self.graph_opt_config.graph_opt_level == 2:
             self.graph_opt_config.graph_opt_level = 1

         self.tokenizer = tokenizer
-        self.max_num_batched_tokens = max_num_batched_tokens
         self.ips = ips
         self.tool_parser = tool_parser

@@ -1177,7 +1172,6 @@ class FDConfig:
                 self.node_rank = idx

         self.max_model_len = max_model_len
-        self.max_num_seqs = max_num_seqs
         self.limit_mm_per_prompt = limit_mm_per_prompt
         self.mm_processor_kwargs = mm_processor_kwargs
         self.use_warmup = use_warmup
@@ -1243,22 +1237,22 @@ class FDConfig:

         self.paddle_commit_id = paddle.version.commit

-        if self.max_num_batched_tokens is None:
+        if self.scheduler_config.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
                 if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.scheduler_config.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.cache_config.enable_chunked_prefill:
-                    self.max_num_batched_tokens = 2048
+                    self.scheduler_config.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.max_model_len

         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)

-        self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs)
+        self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs)
         self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)

         if self.guided_decoding_backend == "auto":
@@ -1272,19 +1266,24 @@ class FDConfig:
         """
         check the legality of config
         """
-        assert self.max_num_seqs <= 256, (
-            "The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}."
+        assert self.scheduler_config.max_num_seqs <= 256, (
+            "The parameter `max_num_seqs` is not allowed to exceed 256, "
+            f"but now it's {self.scheduler_config.max_num_seqs}."
         )
         assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1"
         assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16"
-        assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
-        assert self.max_num_batched_tokens >= self.max_num_seqs, (
-            f"max_num_batched_tokens: {self.max_num_batched_tokens} "
-            f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}"
+        assert (
+            self.scheduler_config.max_num_seqs >= 1
+        ), f"max_num_seqs: {self.scheduler_config.max_num_seqs} should be larger than 1"
+        assert self.scheduler_config.max_num_batched_tokens >= self.scheduler_config.max_num_seqs, (
+            f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
+            f"should be larger than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs}"
         )
-        assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, (
-            f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger"
-            f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
+        assert (
+            self.scheduler_config.max_num_batched_tokens <= self.max_model_len * self.scheduler_config.max_num_seqs
+        ), (
+            f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} should be larger"
+            f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.max_model_len}"
         )
         assert (
             self.max_num_partial_prefills >= 1
@@ -1305,13 +1304,13 @@ class FDConfig:

         if not self.cache_config.enable_chunked_prefill:
             if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                assert self.max_num_batched_tokens >= self.max_model_len, (
-                    f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+                assert self.scheduler_config.max_num_batched_tokens >= self.max_model_len, (
+                    f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
                     f"should be larger than or equal to max_model_len: {self.max_model_len}"
                 )
             else:
-                assert self.max_num_batched_tokens >= self.cache_config.block_size, (
-                    f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+                assert self.scheduler_config.max_num_batched_tokens >= self.cache_config.block_size, (
+                    f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
                     f"should be larger than or equal to block_size: {self.cache_config.block_size}"
                 )
