[FDConfig]Remove max_num_batched_tokens/max_num_seqs in parallel config (#4116)
* remove max_num_batched_tokens in parallel config
* remove max_num_seqs
* update test case
* fix test
* fix

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
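In effect, the batching limits (max_num_seqs, max_num_batched_tokens) move off ParallelConfig and the top-level FDConfig attributes and are now read from scheduler_config. A minimal illustrative sketch of the access-path change, using standalone stand-in classes rather than the real FastDeploy ones:

# Illustrative stand-ins only; the real classes live in fastdeploy/config.py
# and fastdeploy/scheduler/config.py.
class ParallelConfig:
    def __init__(self):
        self.tensor_parallel_size = 1  # parallelism-only knobs stay here

class SchedulerConfig:
    def __init__(self, args: dict):
        # batching limits now live on the scheduler config
        self.max_num_seqs = 34
        self.max_num_batched_tokens = 2048
        for key, value in args.items():
            if hasattr(self, key):
                setattr(self, key, value)

class FDConfig:
    def __init__(self, parallel_config, scheduler_config):
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config

cfg = FDConfig(ParallelConfig(), SchedulerConfig({"max_num_seqs": 8}))
# Before this PR: cfg.max_num_seqs / cfg.parallel_config.max_num_seqs
# After this PR:
print(cfg.scheduler_config.max_num_seqs)             # 8
print(cfg.scheduler_config.max_num_batched_tokens)   # 2048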
@@ -275,7 +275,6 @@ class ParallelConfig:
         From old wersion worker args
         TODO(gongshaotian): Reclassify
         """
-        self.max_num_seqs: int = 34
         # Set default block num for profile run
         self.total_block_num: int = 2000
         # block size
@@ -297,7 +296,6 @@ class ParallelConfig:
         # Do profile or not
         self.do_profile: bool = False

-        self.max_num_batched_tokens: int = 2048
         # splitwise role
         self.splitwise_role: str = "mixed"
         # guided decoding backend
@@ -1109,8 +1107,6 @@ class FDConfig:
         speculative_config: SpeculativeConfig = None,
         tokenizer: str = None,
         max_model_len: int = 8192,
-        max_num_seqs: int = 8,
-        max_num_batched_tokens: Optional[int] = None,
         ips: str = None,
         use_warmup: bool = False,
         engine_worker_queue_port: str = "8002",
@@ -1143,19 +1139,18 @@ class FDConfig:
         self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config
         # Initialize cuda graph capture list
         if self.graph_opt_config.cudagraph_capture_sizes is None:
-            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)
+            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)

         if self.graph_opt_config.cudagraph_only_prefill:
             self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512)
         else:
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.parallel_config.max_num_seqs)
+            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs)

         # TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
         if self.graph_opt_config.graph_opt_level == 2:
             self.graph_opt_config.graph_opt_level = 1

         self.tokenizer = tokenizer
-        self.max_num_batched_tokens = max_num_batched_tokens
         self.ips = ips
         self.tool_parser = tool_parser

@@ -1177,7 +1172,6 @@ class FDConfig:
         self.node_rank = idx

         self.max_model_len = max_model_len
-        self.max_num_seqs = max_num_seqs
         self.limit_mm_per_prompt = limit_mm_per_prompt
         self.mm_processor_kwargs = mm_processor_kwargs
         self.use_warmup = use_warmup
@@ -1243,22 +1237,22 @@ class FDConfig:

         self.paddle_commit_id = paddle.version.commit

-        if self.max_num_batched_tokens is None:
+        if self.scheduler_config.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
                 if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.scheduler_config.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.cache_config.enable_chunked_prefill:
-                    self.max_num_batched_tokens = 2048
+                    self.scheduler_config.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.max_model_len

         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)

-        self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs)
+        self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs)
         self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)

         if self.guided_decoding_backend == "auto":
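The defaulting branch above only changes which object the value is written to; the rule itself stays the same. A hedged sketch of that rule as a standalone function (parameter names are illustrative, not the real API):

def default_max_num_batched_tokens(
    max_model_len: int,
    enable_v1_kvcache_scheduler: bool,
    is_xpu: bool,
    enable_chunked_prefill: bool,
) -> int:
    """Mirror of the defaulting branches in FDConfig.__init__ (sketch only)."""
    if enable_v1_kvcache_scheduler:
        # XPU keeps the full context; other targets cap at 8192 to avoid OOM.
        return max_model_len if is_xpu else 8192
    if enable_chunked_prefill:
        return 2048
    return max_model_len

assert default_max_num_batched_tokens(8192, False, False, True) == 2048
assert default_max_num_batched_tokens(131072, True, False, False) == 8192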
@@ -1272,19 +1266,24 @@ class FDConfig:
         """
         check the legality of config
         """
-        assert self.max_num_seqs <= 256, (
-            "The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}."
+        assert self.scheduler_config.max_num_seqs <= 256, (
+            "The parameter `max_num_seqs` is not allowed to exceed 256, "
+            f"but now it's {self.scheduler_config.max_num_seqs}."
         )
         assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1"
         assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16"
-        assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
-        assert self.max_num_batched_tokens >= self.max_num_seqs, (
-            f"max_num_batched_tokens: {self.max_num_batched_tokens} "
-            f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}"
+        assert (
+            self.scheduler_config.max_num_seqs >= 1
+        ), f"max_num_seqs: {self.scheduler_config.max_num_seqs} should be larger than 1"
+        assert self.scheduler_config.max_num_batched_tokens >= self.scheduler_config.max_num_seqs, (
+            f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
+            f"should be larger than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs}"
         )
-        assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, (
-            f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger"
-            f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
+        assert (
+            self.scheduler_config.max_num_batched_tokens <= self.max_model_len * self.scheduler_config.max_num_seqs
+        ), (
+            f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} should be larger"
+            f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.max_model_len}"
         )
         assert (
             self.max_num_partial_prefills >= 1
@@ -1305,13 +1304,13 @@ class FDConfig:

         if not self.cache_config.enable_chunked_prefill:
             if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                assert self.max_num_batched_tokens >= self.max_model_len, (
-                    f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+                assert self.scheduler_config.max_num_batched_tokens >= self.max_model_len, (
+                    f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
                     f"should be larger than or equal to max_model_len: {self.max_model_len}"
                 )
         else:
-            assert self.max_num_batched_tokens >= self.cache_config.block_size, (
-                f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+            assert self.scheduler_config.max_num_batched_tokens >= self.cache_config.block_size, (
+                f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
                 f"should be larger than or equal to block_size: {self.cache_config.block_size}"
             )

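Taken together, the rewritten checks enforce the same ordering as before, now read from scheduler_config: 1 <= max_num_seqs <= 256 and max_num_seqs <= max_num_batched_tokens <= max_num_seqs * max_model_len, plus a lower bound that depends on chunked prefill. A simplified standalone sketch of the core bounds (not the real check() method):

def batching_bounds_ok(max_num_seqs: int, max_num_batched_tokens: int, max_model_len: int) -> bool:
    """Core ordering enforced by FDConfig.check() (simplified sketch;
    the real method also checks block_size / max_model_len lower bounds)."""
    return (
        1 <= max_num_seqs <= 256
        and max_num_seqs <= max_num_batched_tokens <= max_num_seqs * max_model_len
    )

assert batching_bounds_ok(8, 2048, 8192)
assert not batching_bounds_ok(8, 4, 8192)        # token budget below batch size
assert not batching_bounds_ok(300, 8192, 8192)   # more than 256 sequences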
@@ -943,23 +943,15 @@ class EngineArgs:
         """
         prefix = "scheduler_"
         prefix_len = len(prefix)
-        extra_params = [
-            "max_model_len",
-            "enable_chunked_prefill",
-            "max_num_partial_prefills",
-            "max_long_partial_prefills",
-            "long_prefill_token_threshold",
-        ]
-
         all = asdict(self)
         params = dict()
         for k, v in all.items():
             if k[:prefix_len] == prefix:
                 params[k[prefix_len:]] = v
-            elif k in extra_params:
+            else:
                 params[k] = v

-        return SchedulerConfig(**params)
+        return SchedulerConfig(params)

     def create_graph_optimization_config(self) -> GraphOptimizationConfig:
         """
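With the explicit extra_params whitelist gone, every EngineArgs field is forwarded: scheduler_-prefixed fields lose their prefix, everything else passes through unchanged, and SchedulerConfig itself decides which keys it keeps. A standalone sketch of that key-mapping step (FakeEngineArgs and its fields are illustrative only):

from dataclasses import asdict, dataclass

@dataclass
class FakeEngineArgs:  # illustrative subset of EngineArgs
    scheduler_name: str = "local"
    max_num_seqs: int = 8
    max_num_batched_tokens: int = 2048
    max_model_len: int = 8192

prefix = "scheduler_"
prefix_len = len(prefix)
params = {}
for k, v in asdict(FakeEngineArgs()).items():
    if k[:prefix_len] == prefix:
        params[k[prefix_len:]] = v   # scheduler_name -> name
    else:
        params[k] = v                # forwarded as-is; SchedulerConfig filters later

print(params)
# {'name': 'local', 'max_num_seqs': 8, 'max_num_batched_tokens': 2048, 'max_model_len': 8192}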
@@ -1059,9 +1051,7 @@ class EngineArgs:
             load_config=load_cfg,
             parallel_config=parallel_cfg,
             max_model_len=self.max_model_len,
-            max_num_seqs=self.max_num_seqs,
             speculative_config=speculative_cfg,
-            max_num_batched_tokens=self.max_num_batched_tokens,
             ips=self.ips,
             use_warmup=self.use_warmup,
             engine_worker_queue_port=self.engine_worker_queue_port,
@@ -71,7 +71,7 @@ class EngineService:

         if envs.ENABLE_V1_KVCACHE_SCHEDULER:
             self.resource_manager = ResourceManagerV1(
-                cfg.max_num_seqs,
+                cfg.scheduler_config.max_num_seqs,
                 cfg,
                 cfg.parallel_config.tensor_parallel_size,
                 cfg.splitwise_role,
@@ -83,7 +83,7 @@ class EngineService:
             )
         else:
             self.resource_manager = ResourceManager(
-                cfg.max_num_seqs,
+                cfg.scheduler_config.max_num_seqs,
                 cfg,
                 cfg.parallel_config.tensor_parallel_size,
                 cfg.splitwise_role,
@@ -109,7 +109,7 @@ class EngineService:
         self.partial_chunked_tokens = [0] * (self.cfg.max_num_partial_prefills + 1)
         for idx in range(1, self.cfg.max_num_partial_prefills + 1):
             self.partial_chunked_tokens[idx] = (
-                (self.cfg.max_num_batched_tokens // idx)
+                (self.cfg.scheduler_config.max_num_batched_tokens // idx)
                 // self.cfg.cache_config.block_size
                 * self.cfg.cache_config.block_size
             )
@@ -356,7 +356,7 @@ class EngineService:
         requests_chunk = [[] for _ in range(len(requests))]
         chunk_request_num = len(current_request_size)
         while chunk_request_num >= 1:
-            remain_batched_tokens = self.cfg.max_num_batched_tokens
+            remain_batched_tokens = self.cfg.scheduler_config.max_num_batched_tokens
             for idx in range(len(current_request_size)):
                 if current_request_size[idx] <= 0:
                     continue
@@ -496,7 +496,7 @@ class EngineService:
             available_blocks=self.resource_manager.available_block_num(),
             block_size=self.cfg.cache_config.block_size,
             reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
-            max_num_batched_tokens=self.cfg.max_num_batched_tokens,
+            max_num_batched_tokens=self.cfg.scheduler_config.max_num_batched_tokens,
             batch=num_prefill_batch,
         )
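The partial_chunked_tokens hunk above divides the (now scheduler-owned) token budget among concurrent partial prefills and rounds each share down to a whole number of cache blocks. A quick worked example with assumed values:

# Assumed values for illustration: budget 2048 tokens, block size 64, up to 3 partial prefills.
max_num_batched_tokens = 2048
block_size = 64
max_num_partial_prefills = 3

partial_chunked_tokens = [0] * (max_num_partial_prefills + 1)
for idx in range(1, max_num_partial_prefills + 1):
    partial_chunked_tokens[idx] = (max_num_batched_tokens // idx) // block_size * block_size

print(partial_chunked_tokens)  # [0, 2048, 1024, 640]  (2048 // 3 = 682, rounded down to 640)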
@@ -469,7 +469,7 @@ class LLMEngine:
             ips = ",".join(self.cfg.ips)
         arguments = (
             f" --devices {self.cfg.device_ids} {py_script}"
-            f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
+            f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
             f" --model {self.cfg.model_config.model!s}"
             f" --device_ids {self.cfg.device_ids}"
@@ -482,7 +482,7 @@ class LLMEngine:
             f" --eos_tokens_lens {self.data_processor.eos_token_id_len}"
             f" --pad_token_id {self.data_processor.pad_token_id}"
             f" --engine_pid {self.cfg.engine_worker_queue_port[0]}"
-            f" --max_num_batched_tokens {self.cfg.max_num_batched_tokens}"
+            f" --max_num_batched_tokens {self.cfg.scheduler_config.max_num_batched_tokens}"
             f" --splitwise_role {self.cfg.splitwise_role}"
             f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
             f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
@@ -289,7 +289,7 @@ class ResourceManagerV1(ResourceManager):
         with self.lock:
             scheduled_reqs: list[Request] = []
             preempted_reqs: list[Request] = []
-            token_budget = self.config.max_num_batched_tokens
+            token_budget = self.config.scheduler_config.max_num_batched_tokens

             # First, schedule the RUNNING requests.
             req_index = 0
@@ -210,7 +210,7 @@ class XGrammarBackend(BackendBase):
     ):
         super().__init__(fd_config=fd_config)
         self.vocab_size = fd_config.model_config.vocab_size
-        self.batch_size = fd_config.parallel_config.max_num_seqs
+        self.batch_size = fd_config.scheduler_config.max_num_seqs

         self.any_whitespace = not fd_config.parallel_config.disable_any_whitespace

@@ -152,7 +152,7 @@ class Attention(nn.Layer):

             self.cache_k_block_means = paddle.zeros(
                 [
-                    fd_config.parallel_config.max_num_seqs,
+                    fd_config.scheduler_config.max_num_seqs,
                     moba_max_seq_length // moba_block_size,
                     self.kv_num_heads,
                     self.head_dim,
@@ -156,7 +156,7 @@ class FlashAttentionBackend(AttentionBackend):
         self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False)
         self.max_partition_size: int = int(os.getenv("FLAGS_max_partition_size", "32768"))
         self.zero_seq_enc_lens_for_decode = paddle.zeros(
-            shape=[fd_config.parallel_config.max_num_seqs, 1], dtype=paddle.int32
+            shape=[fd_config.scheduler_config.max_num_seqs, 1], dtype=paddle.int32
         )

     def get_attntion_meta(self):
@@ -77,7 +77,7 @@ class MobaAttentionBackend(AttentionBackend):
         assert fd_config.moba_attention_config is not None, "moba_attention_config is None"
         self.block_size = fd_config.parallel_config.block_size
         self.max_seq_len = fd_config.parallel_config.max_model_len
-        self.max_num_seqs = fd_config.parallel_config.max_num_seqs
+        self.max_num_seqs = fd_config.scheduler_config.max_num_seqs
         self.kv_num_heads = kv_num_heads
         self.num_heads = num_heads
         self.head_dim = fd_config.model_config.head_dim
@@ -86,7 +86,7 @@ class GCUFlashAttnBackend(AttentionBackend):
         self.attention_metadata: GCUFlashAttnMetadata = None
         self.block_size = fd_config.cache_config.block_size
         self.max_seq_len = fd_config.parallel_config.max_model_len
-        self.max_num_seqs = fd_config.parallel_config.max_num_seqs
+        self.max_num_seqs = fd_config.scheduler_config.max_num_seqs

         self.causal = getattr(fd_config.model_config, "causal", True)

@@ -84,7 +84,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
         self.attention_metadata: GCUMemEfficientAttnMetadata = None
         self.block_size = fd_config.cache_config.block_size
         self.max_seq_len = fd_config.parallel_config.max_model_len
-        self.max_num_seqs = fd_config.parallel_config.max_num_seqs
+        self.max_num_seqs = fd_config.scheduler_config.max_num_seqs

         self.causal = getattr(fd_config.model_config, "causal", True)

@@ -221,7 +221,7 @@ class Sampler(nn.Layer):
         ):
             early_stopper_cls = get_early_stopper_cls_from_stragegy(fd_config.early_stop_config.strategy)
             self.early_stopper = early_stopper_cls()
-            self.early_stopper.initialize(fd_config.parallel_config.max_num_seqs, fd_config.early_stop_config)
+            self.early_stopper.initialize(fd_config.scheduler_config.max_num_seqs, fd_config.early_stop_config)

     def set_reasoning_parser(self, reasoning_parser: Optional[ReasoningParser] = None):
         """set reasoning parser"""
@@ -607,9 +607,11 @@ class DeepseekV3ForCausalLM(ModelForCasualLM):
             num_embeddings=fd_config.model_config.vocab_size,
             prefix="lm_head",
         )
-        self.position_ids_buffer = paddle.empty([fd_config.parallel_config.max_num_batched_tokens], dtype=paddle.int32)
+        self.position_ids_buffer = paddle.empty(
+            [fd_config.scheduler_config.max_num_batched_tokens], dtype=paddle.int32
+        )
         self.mask_encoder_batch_buffer = paddle.empty(
-            [fd_config.parallel_config.max_num_batched_tokens, 1], dtype=paddle.int32
+            [fd_config.scheduler_config.max_num_batched_tokens, 1], dtype=paddle.int32
         )

     @classmethod
@@ -202,13 +202,12 @@ class SchedulerConfig:
     Creates appropriate config based on scheduler type (local/global).
     """

-    def __init__(self, name="local", **kwargs):
+    def __init__(self, args):
         """
         Initialize scheduler configuration factory.

         Args:
-            name: Scheduler type ("local" for LocalScheduler or "global" for GlobalScheduler)
-            **kwargs: Configuration parameters for the specific scheduler type
+            args: Configuration parameters for the specific scheduler type

         Initializes:
             - Appropriate config object based on scheduler type
@@ -217,17 +216,23 @@ class SchedulerConfig:
         Raises:
             Exception: If invalid scheduler type is specified
         """
-        self.name = name
+        self.name = "local"  # "local" for LocalScheduler or "global" for GlobalScheduler
+        self.max_num_batched_tokens = 2048
+        self.max_num_seqs = 34
+        self.config = None

-        if name == "local":
-            self.config = LocalSchedulerConfig(**kwargs)
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)

-        if name == "global":
-            self.config = GlobalSchedulerConfig(**kwargs)
+        if self.name == "local":
+            self.config = LocalSchedulerConfig(**args)

-        if name == "splitwise":
-            self.config = SplitWiseSchedulerConfig(**kwargs)
+        if self.name == "global":
+            self.config = GlobalSchedulerConfig(**args)
+
+        if self.name == "splitwise":
+            self.config = SplitWiseSchedulerConfig(**args)

     def check(self):
         """
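The factory now takes a plain dict instead of name plus **kwargs: defaults are set first, matching keys in args override them via hasattr/setattr, and the concrete scheduler config is then built from the same dict. A standalone imitation (not the real class) showing the effect:

class SchedulerConfigSketch:
    """Standalone imitation of the refactored SchedulerConfig.__init__ (sketch only)."""
    def __init__(self, args: dict):
        self.name = "local"                  # "local", "global" or "splitwise"
        self.max_num_batched_tokens = 2048
        self.max_num_seqs = 34
        self.config = None
        for key, value in args.items():
            if hasattr(self, key):           # unknown keys are silently ignored
                setattr(self, key, value)
        # The real class then builds LocalSchedulerConfig / GlobalSchedulerConfig /
        # SplitWiseSchedulerConfig from the same args dict.

cfg = SchedulerConfigSketch({"max_num_seqs": 8, "unrelated_flag": True})
print(cfg.name, cfg.max_num_seqs, cfg.max_num_batched_tokens)  # local 8 2048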
@@ -50,8 +50,9 @@ class Proposer(ABC):
         self.speculative_config = self.cfg.speculative_config
         self.cache_config = self.cfg.cache_config
         self.quant_config = self.cfg.quant_config
+        self.scheduler_config = self.cfg.scheduler_config

-        self.max_num_seqs = self.parallel_config.max_num_seqs
+        self.max_num_seqs = self.scheduler_config.max_num_seqs
         self.max_model_len = self.parallel_config.max_model_len
         self.speculative_method = self.speculative_config.method
         self.max_draft_token_num = self.speculative_config.num_speculative_tokens
@@ -89,9 +89,9 @@ class GCUModelRunner(ModelRunnerBase):
         self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes

         # Initialize share inputs
-        self._init_share_inputs(self.parallel_config.max_num_seqs)
+        self._init_share_inputs(self.scheduler_config.max_num_seqs)
         self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
+            shape=[self.scheduler_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
         ).cpu()
@@ -689,13 +689,13 @@ class GCUModelRunner(ModelRunnerBase):
         decoder_step_token_num = self.speculative_config.num_speculative_tokens + 1
         group_size = np.ceil(num_heads / self.model_config.kv_num_heads)

-        decode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        decode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             (decoder_step_token_num * group_size) / decoder_block_shape_q
         )
-        encode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        encode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             (self.model_config.max_model_len * group_size) / encoder_block_shape_q
         )
-        kv_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        kv_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             self.model_config.max_model_len / self.fd_config.cache_config.block_size
         )
         self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
@@ -914,7 +914,7 @@ class GCUModelRunner(ModelRunnerBase):
             capture_sizes = self.cudagraph_capture_sizes.copy()
             for batch_size in sorted(capture_sizes, reverse=True):
                 self._dummy_run(
-                    num_tokens=self.parallel_config.max_num_batched_tokens,
+                    num_tokens=self.scheduler_config.max_num_batched_tokens,
                     batch_size=batch_size,
                     in_capturing=True,
                     expected_decode_len=expected_decode_len,
@@ -929,7 +929,7 @@ class GCUModelRunner(ModelRunnerBase):
         start_time = time.perf_counter()
         for batch_size in self.sot_warmup_sizes:
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
             )
             logger.info(f"SOT warmup the model with the batch size:{batch_size}")
@@ -1140,8 +1140,8 @@ class GCUModelRunner(ModelRunnerBase):

         # 2. Dummy run
         self._dummy_run(
-            num_tokens=self.parallel_config.max_num_batched_tokens,
-            batch_size=min(self.parallel_config.max_num_seqs, 3),
+            num_tokens=self.scheduler_config.max_num_batched_tokens,
+            batch_size=min(self.scheduler_config.max_num_seqs, 3),
         )

         # 3. gc
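The tile-size hunks above only change where max_num_seqs is read from; the formula is untouched. A quick worked example of decode_max_tile_size with assumed values (8 sequences, 8 query heads over 1 KV head, 1 speculative token, decoder block shape 16):

import numpy as np

# Assumed illustrative values, not taken from a real model config.
max_num_seqs = 8                  # now self.scheduler_config.max_num_seqs
num_heads, kv_num_heads = 8, 1
decoder_block_shape_q = 16
decoder_step_token_num = 1 + 1    # num_speculative_tokens + 1

group_size = np.ceil(num_heads / kv_num_heads)            # 8.0
decode_max_tile_size = max_num_seqs * np.ceil(
    (decoder_step_token_num * group_size) / decoder_block_shape_q
)
print(int(decode_max_tile_size))  # 8 * ceil(16 / 16) = 8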
@@ -145,9 +145,9 @@ class GPUModelRunner(ModelRunnerBase):
         self.cudagraph_only_prefill = self.graph_opt_config.cudagraph_only_prefill

         # Initialize share inputs
-        self._init_share_inputs(self.parallel_config.max_num_seqs)
+        self._init_share_inputs(self.scheduler_config.max_num_seqs)
         self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
+            shape=[self.scheduler_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
         ).cpu()
@@ -1208,13 +1208,13 @@ class GPUModelRunner(ModelRunnerBase):
         # decode_max_tile_size must take into account the maximum case, where *1024 can cover 128K.
         decode_max_tile_size = (
             1024
-            * self.parallel_config.max_num_seqs
+            * self.scheduler_config.max_num_seqs
             * np.ceil((decoder_step_token_num * group_size) / decoder_block_shape_q)
         )
-        encode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        encode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             (self.model_config.max_model_len * group_size) / encoder_block_shape_q
         )
-        kv_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        kv_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             self.model_config.max_model_len / self.fd_config.cache_config.block_size
         )
         self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
@@ -1508,7 +1508,7 @@ class GPUModelRunner(ModelRunnerBase):
                 for num_tokens in sorted(capture_sizes, reverse=True):
                     self._dummy_run(
                         num_tokens=num_tokens,
-                        batch_size=self.parallel_config.max_num_seqs,
+                        batch_size=self.scheduler_config.max_num_seqs,
                         in_capturing=True,
                         expected_decode_len=expected_decode_len,
                         capture_prefill=True,
@@ -1519,7 +1519,7 @@ class GPUModelRunner(ModelRunnerBase):
             else:
                 for batch_size in sorted(capture_sizes, reverse=True):
                     self._dummy_run(
-                        num_tokens=self.parallel_config.max_num_batched_tokens,
+                        num_tokens=self.scheduler_config.max_num_batched_tokens,
                         batch_size=batch_size,
                         in_capturing=True,
                         expected_decode_len=expected_decode_len,
@@ -1536,7 +1536,7 @@ class GPUModelRunner(ModelRunnerBase):
         start_time = time.perf_counter()
         for batch_size in self.sot_warmup_sizes:
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
             )
             logger.info(f"SOT warmup the model with the batch size:{batch_size}")
@@ -1815,8 +1815,8 @@ class GPUModelRunner(ModelRunnerBase):

         # 2. Dummy run
         self._dummy_run(
-            num_tokens=self.parallel_config.max_num_batched_tokens,
-            batch_size=min(self.parallel_config.max_num_seqs, 3),
+            num_tokens=self.scheduler_config.max_num_batched_tokens,
+            batch_size=min(self.scheduler_config.max_num_seqs, 3),
         )

         # 3. gc
@@ -121,9 +121,9 @@ class MetaxModelRunner(ModelRunnerBase):
         self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes

         # Initialize share inputs
-        self._init_share_inputs(self.parallel_config.max_num_seqs)
+        self._init_share_inputs(self.scheduler_config.max_num_seqs)
         self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
+            shape=[self.scheduler_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
         ).cpu()
@@ -995,7 +995,7 @@ class MetaxModelRunner(ModelRunnerBase):
         encoder_block_shape_q = 64
         decoder_block_shape_q = 16
         decoder_step_token_num = self.speculative_config.num_speculative_tokens + 1
-        decode_max_tile_size = self.parallel_config.max_num_seqs * np.ceil(
+        decode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil(
             (decoder_step_token_num * np.ceil(num_heads / self.model_config.kv_num_heads)) / decoder_block_shape_q
         )
         self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
@@ -1242,7 +1242,7 @@ class MetaxModelRunner(ModelRunnerBase):
             capture_sizes = self.cudagraph_capture_sizes.copy()
             for batch_size in sorted(capture_sizes, reverse=True):
                 self._dummy_run(
-                    num_tokens=self.parallel_config.max_num_batched_tokens,
+                    num_tokens=self.scheduler_config.max_num_batched_tokens,
                     batch_size=batch_size,
                     in_capturing=True,
                     expected_decode_len=expected_decode_len,
@@ -1257,7 +1257,7 @@ class MetaxModelRunner(ModelRunnerBase):
         start_time = time.perf_counter()
         for batch_size in self.sot_warmup_sizes:
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
             )
             logger.info(f"SOT warmup the model with the batch size:{batch_size}")
@@ -1489,8 +1489,8 @@ class MetaxModelRunner(ModelRunnerBase):

         # 2. Dummy run
         self._dummy_run(
-            num_tokens=self.parallel_config.max_num_batched_tokens,
-            batch_size=min(self.parallel_config.max_num_seqs, 3),
+            num_tokens=self.scheduler_config.max_num_batched_tokens,
+            batch_size=min(self.scheduler_config.max_num_seqs, 3),
         )

         # 3. gc
@@ -45,6 +45,7 @@ class ModelRunnerBase(ABC):
         self.graph_opt_config = fd_config.graph_opt_config
         self.quant_config = fd_config.quant_config
         self.cache_config = fd_config.cache_config
+        self.scheduler_config = fd_config.scheduler_config
         # ... config

         self.device = device
@@ -44,6 +44,7 @@ from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
 from fastdeploy.inter_communicator import IPCSignal
 from fastdeploy.model_executor.layers.quantization import parse_quant_config
 from fastdeploy.platforms import current_platform
+from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.utils import get_logger
 from fastdeploy.worker.worker_base import WorkerBase

@@ -662,6 +663,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     speculative_config = SpeculativeConfig(args.speculative_config)
     parallel_config = ParallelConfig(vars(args))
     cache_config = CacheConfig(vars(args))
+    scheduler_config = SchedulerConfig(vars(args))
     parallel_config.tensor_parallel_rank = local_rank % parallel_config.tensor_parallel_size
     parallel_config.data_parallel_rank = local_rank // parallel_config.tensor_parallel_size
     # config for EP
@@ -758,6 +760,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         graph_opt_config=graph_opt_config,
         early_stop_config=early_stop_config,
         cache_config=cache_config,
+        scheduler_config=scheduler_config,
         engine_worker_queue_port=args.engine_worker_queue_port,
         ips=args.ips,
         moba_attention_config=moba_attention_config,
@@ -353,12 +353,12 @@ class XPUModelRunner(ModelRunnerBase):
         self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = False
         self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
-        self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
+        self.input_ids = paddle.zeros(self.scheduler_config.max_num_seqs, dtype="int32")

         # Initialize share inputs
-        self._init_share_inputs(self.fd_config.parallel_config.max_num_seqs)
+        self._init_share_inputs(self.fd_config.scheduler_config.max_num_seqs)
         self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
+            shape=[self.scheduler_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
         ).cpu()
@@ -812,7 +812,7 @@ class XPUModelRunner(ModelRunnerBase):
         start_time = time.perf_counter()
         for batch_size in self.sot_warmup_sizes:
             self._dummy_run(
-                num_tokens=self.parallel_config.max_num_batched_tokens,
+                num_tokens=self.scheduler_config.max_num_batched_tokens,
                 batch_size=batch_size,
             )
             logger.info(f"SOT warmup the model with the batch size:{batch_size}")
@@ -987,8 +987,8 @@ class XPUModelRunner(ModelRunnerBase):
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""

         self._dummy_run(
-            num_tokens=int(self.parallel_config.max_num_batched_tokens),
-            batch_size=min(self.parallel_config.max_num_seqs, 1),
+            num_tokens=int(self.scheduler_config.max_num_batched_tokens),
+            batch_size=min(self.scheduler_config.max_num_seqs, 1),
         )

     def clear_block_table(self) -> None:
@@ -23,6 +23,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -152,14 +153,16 @@ class TestCUDAGrpahSubgraph(unittest.TestCase):
         # Set FastDeploy config
         graph_opt_config = GraphOptimizationConfig(args={})
         graph_opt_config.use_cudagraph = True
-        parallel_config = ParallelConfig(args={})
-        parallel_config.max_num_seqs = 8
+        scheduler_config = SchedulerConfig(args={})
+        scheduler_config.max_num_seqs = 8
         cache_config = CacheConfig({})
+        parallel_config = ParallelConfig(args={})
         # Initialize cuda graph capture list
-        graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs)
-        graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs)
+        graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
+        graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
+            scheduler_config=scheduler_config,
             parallel_config=parallel_config,
             cache_config=cache_config,
             test_mode=True,
@@ -7,6 +7,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -90,11 +91,15 @@ class TestCUDAGrpahRecapture(unittest.TestCase):
         # Set FastDeploy config
         graph_opt_config = GraphOptimizationConfig(args={})
         graph_opt_config.use_cudagraph = True
-        parallel_config = ParallelConfig(args={})
+        scheduler_config = SchedulerConfig(args={})
         cache_config = CacheConfig(args={})
-        parallel_config.max_num_seqs = 1
+        scheduler_config.max_num_seqs = 1
+        parallel_config = ParallelConfig(args={})
         fd_config = FDConfig(
-            graph_opt_config=graph_opt_config, parallel_config=parallel_config, cache_config=cache_config
+            graph_opt_config=graph_opt_config,
+            scheduler_config=scheduler_config,
+            cache_config=cache_config,
+            parallel_config=parallel_config,
         )

         # Run Test Case1
@@ -23,6 +23,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -99,16 +100,18 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase):
         """Run test case"""
         graph_opt_config = GraphOptimizationConfig(args={})
         graph_opt_config.use_cudagraph = True
-        parallel_config = ParallelConfig(args={})
-        parallel_config.max_num_seqs = 1
+        scheduler_config = SchedulerConfig(args={})
+        scheduler_config.max_num_seqs = 1
         cache_config = CacheConfig({})
+        parallel_config = ParallelConfig(args={})
         # Initialize cuda graph capture list
-        graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs)
-        graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs)
+        graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
+        graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
-            parallel_config=parallel_config,
+            scheduler_config=scheduler_config,
             cache_config=cache_config,
+            parallel_config=parallel_config,
             test_mode=True,
         )

@@ -25,6 +25,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -85,15 +86,16 @@ class TestGraphOptBackend(unittest.TestCase):
         baseline_graph_opt_config.use_cudagraph = False
         baseline_graph_opt_config.graph_opt_level = 0

-        baseline_parallel_config = ParallelConfig(args={})
-        baseline_parallel_config.max_num_seqs = self.max_num_seqs
+        baseline_scheduler_config = SchedulerConfig(args={})
+        baseline_scheduler_config.max_num_seqs = self.max_num_seqs

         baseline_cache_config = CacheConfig({})

+        baseline_parallel_config = ParallelConfig(args={})
         self.baseline_fd_config = FDConfig(
             graph_opt_config=baseline_graph_opt_config,
-            parallel_config=baseline_parallel_config,
+            scheduler_config=baseline_scheduler_config,
             cache_config=baseline_cache_config,
+            parallel_config=baseline_parallel_config,
             test_mode=True,
         )

@@ -129,17 +131,19 @@ class TestGraphOptBackend(unittest.TestCase):
         graph_opt_config.graph_opt_level = graph_opt_level

         # Setup parallel config
-        parallel_config = ParallelConfig(args={})
-        parallel_config.max_num_seqs = self.max_num_seqs
+        scheduler_config = SchedulerConfig(args={})
+        scheduler_config.max_num_seqs = self.max_num_seqs

         # Setup cache config
         cache_config = CacheConfig({})
+        parallel_config = ParallelConfig(args={})

         # Create FD config
         return FDConfig(
             graph_opt_config=graph_opt_config,
-            parallel_config=parallel_config,
+            scheduler_config=scheduler_config,
             cache_config=cache_config,
+            parallel_config=parallel_config,
             test_mode=True,
         )

@@ -29,6 +29,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
@@ -88,15 +89,17 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase):
         """Run test case"""
         # Set FastDeploy config
         graph_opt_config = GraphOptimizationConfig({"use_cudagraph": True, "graph_opt_level": 1})
-        parallel_config = ParallelConfig({"max_num_seqs": 1})
-        graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs)
-        graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs)
+        scheduler_config = SchedulerConfig({"max_num_seqs": 1})
+        graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
+        graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
         cache_config = CacheConfig({})
+        parallel_config = ParallelConfig(args={})

         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
-            parallel_config=parallel_config,
+            scheduler_config=scheduler_config,
             cache_config=cache_config,
+            parallel_config=parallel_config,
             test_mode=True,
         )

@@ -21,6 +21,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )


@@ -50,12 +51,17 @@ class FakeModelConfig:

 def get_default_test_fd_config():
     graph_opt_config = GraphOptimizationConfig(args={})
+    scheduler_config = SchedulerConfig(args={})
+    scheduler_config.max_num_seqs = 1
     parallel_config = ParallelConfig(args={})
-    parallel_config.max_num_seqs = 1
     parallel_config.data_parallel_rank = 1
     cache_config = CacheConfig({})
     fd_config = FDConfig(
-        graph_opt_config=graph_opt_config, parallel_config=parallel_config, cache_config=cache_config, test_mode=True
+        graph_opt_config=graph_opt_config,
+        parallel_config=parallel_config,
+        cache_config=cache_config,
+        scheduler_config=scheduler_config,
+        test_mode=True,
     )
     fd_config.model_config = FakeModelConfig()
     return fd_config
@@ -6,6 +6,7 @@ from fastdeploy.config import (
     FDConfig,
     GraphOptimizationConfig,
     ParallelConfig,
+    SchedulerConfig,
 )


@@ -14,10 +15,12 @@ class TestConfig(unittest.TestCase):
         parallel_config = ParallelConfig({"tensor_parallel_size": 16, "expert_parallel_size": 1})
         graph_opt_config = GraphOptimizationConfig({})
         cache_config = CacheConfig({})
+        scheduler_config = SchedulerConfig({})
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             ips=["1.1.1.1", "0.0.0.0"],
             test_mode=True,
         )
@@ -28,10 +31,12 @@ class TestConfig(unittest.TestCase):
         parallel_config = ParallelConfig({})
         graph_opt_config = GraphOptimizationConfig({})
         cache_config = CacheConfig({})
+        scheduler_config = SchedulerConfig({})
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             ips="0.0.0.0",
             test_mode=True,
         )
@@ -42,26 +47,29 @@ class TestConfig(unittest.TestCase):
         graph_opt_config = GraphOptimizationConfig({})
         cache_config = CacheConfig({})
         cache_config.enable_chunked_prefill = True
+        scheduler_config = SchedulerConfig({})
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             ips="0.0.0.0",
             test_mode=True,
         )
         if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-            assert fd_config.max_num_batched_tokens == 2048
+            assert fd_config.scheduler_config.max_num_batched_tokens == 2048

         cache_config.enable_chunked_prefill = False
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             ips="0.0.0.0",
             test_mode=True,
         )
         if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-            assert fd_config.max_num_batched_tokens == 8192
+            assert fd_config.scheduler_config.max_num_batched_tokens == 8192

     def test_fdconfig_init_cache(self):
         parallel_config = ParallelConfig({})
@@ -69,10 +77,12 @@ class TestConfig(unittest.TestCase):
         cache_config = CacheConfig({})
         cache_config.cache_transfer_protocol = "rdma,ipc"
         cache_config.pd_comm_port = "2334"
+        scheduler_config = SchedulerConfig({})
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
             cache_config=cache_config,
+            scheduler_config=scheduler_config,
             splitwise_role="prefill",
             test_mode=True,
         )
@@ -2,7 +2,7 @@ from dataclasses import asdict
 from types import SimpleNamespace

 from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager
-from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig
+from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig, SchedulerConfig
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.request import Request

@@ -18,6 +18,7 @@ def test_normal_case():
     model_cfg.print = print
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
+    scheduler_cfg = SchedulerConfig(args)
     graph_opt_cfg = engine_args.create_graph_optimization_config()
     fd_config = FDConfig(
         model_config=model_cfg,
@@ -25,7 +26,7 @@ def test_normal_case():
         parallel_config=parallel_cfg,
         graph_opt_config=graph_opt_cfg,
         speculative_config=speculative_cfg,
-        max_num_batched_tokens=engine_args.max_num_batched_tokens,
+        scheduler_cfg=scheduler_cfg,
     )
     cache_manager = PrefixCacheManager(config=fd_config, tensor_parallel_size=8, splitwise_role="mixed")
     req1 = Request.from_dict({"request_id": "req1", "prompt_token_ids": [1] * 3200, "prompt_token_ids_len": 3200})
@@ -1,7 +1,7 @@
 from dataclasses import asdict
 from types import SimpleNamespace

-from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig
+from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig, SchedulerConfig
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.request import Request
 from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
@@ -17,6 +17,7 @@ def test_normal_schedule():
     model_cfg.print = print
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
+    scheduler_cfg = SchedulerConfig(args)
     graph_opt_cfg = engine_args.create_graph_optimization_config()
     fd_config = FDConfig(
         model_config=model_cfg,
@@ -24,7 +25,7 @@ def test_normal_schedule():
         parallel_config=parallel_cfg,
         speculative_config=speculative_cfg,
         graph_opt_config=graph_opt_cfg,
-        max_num_batched_tokens=engine_args.max_num_batched_tokens,
+        scheduler_config=scheduler_cfg,
     )
     resource_manager_v1 = ResourceManagerV1(
         max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed"
@@ -80,6 +81,7 @@ def test_preempted_request():
     model_cfg.print = print
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
+    scheduler_cfg = SchedulerConfig(args)
     graph_opt_cfg = engine_args.create_graph_optimization_config()
     fd_config = FDConfig(
         model_config=model_cfg,
@@ -87,7 +89,7 @@ def test_preempted_request():
         parallel_config=parallel_cfg,
         graph_opt_config=graph_opt_cfg,
         speculative_config=speculative_cfg,
-        max_num_batched_tokens=engine_args.max_num_batched_tokens,
+        scheduler_config=scheduler_cfg,
     )
     resource_manager_v1 = ResourceManagerV1(
         max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed"