mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[PD Disaggregation] remove splitwise deployment on single node and refine the code (#4891)
* remove splitwise deployment on single node and refine the code * up * up * up * add test * up
This commit is contained in:
@@ -296,11 +296,6 @@ class EngineArgs:
    Port for splitwise communication.
    """

    innode_prefill_ports: Optional[List[int]] = None
    """
    Ports for innode dispatch request.
    """

    rdma_comm_ports: Optional[List[int]] = None
    """
    Ports for rdma communication.
@@ -500,8 +495,33 @@ class EngineArgs:
        if self.max_logprobs == -1 and not envs.ENABLE_V1_KVCACHE_SCHEDULER:
            raise NotImplementedError("Only ENABLE_V1_KVCACHE_SCHEDULER=1 support max_logprobs=-1")

        if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
        if self.splitwise_role != "mixed":
            if self.scheduler_name == "local" and self.router is None:
                raise ValueError(
                    f"When using {self.splitwise_role} role and the {self.scheduler_name} "
                    f"scheduler, please provide --router argument."
                )

            if "rdma" in self.cache_transfer_protocol:
                if self.rdma_comm_ports is None:
                    raise ValueError(
                        "Please set --rdma_comm_ports argument when using " "rdma cache transfer protocol."
                    )
                if len(self.rdma_comm_ports) != self.tensor_parallel_size:
                    raise ValueError("The number of rdma comm ports must be equal to tensor parallel size.")

            if envs.ENABLE_V1_KVCACHE_SCHEDULER == 1:
                if "ipc" in self.cache_transfer_protocol:
                    # FIXME: support ipc cache transfer protocol
                    raise NotImplementedError(
                        "only support rdma cache transfer protocol " "when using ENABLE_V1_KVCACHE_SCHEDULER."
                    )
                # FIXME: fix this bug
                if self.splitwise_role == "prefill" and self.num_gpu_blocks_override is None:
                    raise NotImplementedError(
                        "please set num_gpu_blocks_override for prefill " "instance using ENABLE_V1_KVCACHE_SCHEDULER."
                    )

        if not current_platform.is_cuda() and not current_platform.is_xpu():
            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
        if self.guided_decoding_backend != "off":
@@ -931,13 +951,6 @@ class EngineArgs:
                'mixed'. (prefill, decode, mixed)",
        )

        splitwise_group.add_argument(
            "--innode-prefill-ports",
            type=lambda s: s.split(",") if s else None,
            default=EngineArgs.innode_prefill_ports,
            help="port for innode prefill, only used in single machine splitwise deployment",
        )

        splitwise_group.add_argument(
            "--cache-transfer-protocol",
            type=str,
@@ -1233,7 +1246,6 @@ class EngineArgs:
            limit_mm_per_prompt=self.limit_mm_per_prompt,
            mm_processor_kwargs=self.mm_processor_kwargs,
            tool_parser=self.tool_call_parser,
            innode_prefill_ports=self.innode_prefill_ports,
            max_num_partial_prefills=self.max_num_partial_prefills,
            max_long_partial_prefills=self.max_long_partial_prefills,
            long_prefill_token_threshold=self.long_prefill_token_threshold,
Reference in New Issue
Block a user