[Feature] [PD Disaggregation] simplify configuration for pd-disaggregated deployment, and refactor post-init and usage for all ports (#5415)

* [feat] simplify configuration for pd-disaggregated deployment, and refactor post-init and usage for all ports

* [fix] fix some bugs

* [fix] fix rdma port for cache manager/messager

* [fix] temporarily cancel port availability check to see if it can pass ci test

* [feat] simplify args for multi api server

* [fix] fix dp

* [fix] fix port for xpu

* [fix] add tests for ports post processing & fix ci

* [test] fix test_multi_api_server

* [fix] fix rdma_comm_ports args for multi_api_server

* [fix] fix test_common_engine

* [fix] fix test_cache_transfer_manager

* [chore] automatically set FD_ENABLE_MULTI_API_SERVER

* [fix] prevent api server from creating engine_args twice

* [fix] fix test_run_batch

* [fix] fix test_metrics

* [fix] fix splitwise connector init

* [test] add test_rdma_transfer and test_expert_service

* [fix] fix code syntax

* [fix] fix test_rdma_transfer and build wheel with rdma script
This commit is contained in:
Yonghua Li
2025-12-17 15:50:42 +08:00
committed by GitHub
parent cdc0004894
commit 0c8c6369ed
34 changed files with 1323 additions and 409 deletions

View File

@@ -603,6 +603,19 @@ def get_random_port():
continue
def parse_ports(ports):
    """
    Normalize a port specification into a list of integers.

    Args:
        ports: None, a single int, a comma-separated string such as
            "8000,8001", or a list/tuple of values accepted by int().
    Returns:
        None if ports is None, otherwise a list of int ports.
    Raises:
        TypeError: ports is of an unsupported type.
        ValueError: an item cannot be converted to int.
    """
    if ports is None:
        return None
    if isinstance(ports, int):
        return [ports]
    if isinstance(ports, str):
        # Skip blank segments so inputs like "8000, 8001," or "" do not
        # fail on int(""); int() itself tolerates surrounding whitespace.
        return [int(p) for p in ports.split(",") if p.strip()]
    if isinstance(ports, (list, tuple)):
        return [int(p) for p in ports]
    raise TypeError(f"Cannot parse ports into List[int]: {ports}")
def is_port_available(host, port):
"""
Check the port is available
@@ -621,6 +634,57 @@ def is_port_available(host, port):
return True
def find_free_ports(
    port_range: tuple[int, int] = (8000, 65535),
    num_ports: int = 1,
    host: str = "0.0.0.0",
) -> list[int]:
    """
    Collect free TCP ports from a range, probing from a random starting point.

    The range is scanned once, wrapping around from the end back to the
    start, and each candidate is checked with is_port_available().

    Args:
        port_range: inclusive (start, end) bounds, e.g. (20000, 30000).
        num_ports: how many free ports to return.
        host: host passed to the availability check, default "0.0.0.0".
    Returns:
        A list of exactly num_ports available ports, in scan order.
    Raises:
        ValueError: the range is invalid, num_ports <= 0, or num_ports
            exceeds the size of the range.
        RuntimeError: the range holds fewer free ports than requested.
    """
    low, high = port_range
    if not (low >= 0 and high <= 65535 and low <= high):
        raise ValueError(f"Invalid port range: {port_range}")
    if num_ports <= 0:
        raise ValueError("num_ports must be a positive integer")

    span = high - low + 1
    if num_ports > span:
        raise ValueError("num_ports is larger than range size")

    # Walk the range as a ring beginning at a random offset, so repeated
    # calls do not always probe the same leading ports first.
    shift = random.randint(0, span - 1)
    candidates = (low + (shift + step) % span for step in range(span))

    found: list[int] = []
    for candidate in candidates:
        if not is_port_available(host, candidate):
            continue
        found.append(candidate)
        if len(found) >= num_ports:
            break

    if len(found) < num_ports:
        raise RuntimeError(f"Only found {len(found)} free ports in {port_range}, requested {num_ports}.")
    return found
def singleton(cls):
"""
Singleton decorator for a class.