[BugFix] fix multinode deployment (#2977)

ltd0924 authored 2025-07-24 15:04:04 +08:00 · committed by GitHub
parent 3792345c3a · commit f935d6f862
9 changed files with 71 additions and 81 deletions


@@ -6,7 +6,6 @@
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
-#dist_init_ip
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,7 +26,6 @@ from fastdeploy.utils import (
     ceil_div,
     check_unified_ckpt,
     get_host_ip,
-    get_random_port,
     is_port_available,
     llm_logger,
 )
@@ -644,9 +642,7 @@ class Config:
         max_model_len: int = 8192,
         max_num_seqs: int = 8,
         max_num_batched_tokens: Optional[int] = None,
-        dist_init_ip: str = None,
-        nnodes: int = 1,
-        node_rank: int = 0,
+        ips: str = None,
         speculative_config: Optional[Dict[str, Any]] = None,
         graph_optimization_config: Optional[Dict[str, Any]] = None,
         use_warmup: bool = False,
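With this hunk, the constructor takes a single `ips` argument in place of the old `dist_init_ip`/`nnodes`/`node_rank` trio; node count and rank are derived from it later in `__init__`. A hedged sketch of the new call shape (the model path, addresses, and keyword names other than `ips` are illustrative placeholders, not taken from this PR; other required Config arguments are omitted):

    # Illustrative only: "./my_model" and the addresses are made-up values.
    config = Config(
        model_name_or_path="./my_model",      # hypothetical placeholder argument
        max_model_len=8192,
        max_num_seqs=8,
        ips="192.168.1.10,192.168.1.11",      # first entry becomes the master IP
    )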
@@ -701,15 +697,25 @@ class Config:
         self.tokenizer = tokenizer
         self.max_num_batched_tokens = max_num_batched_tokens
         self.tensor_parallel_size = tensor_parallel_size
-        self.dist_init_ip = dist_init_ip
-        self.nnode = nnodes
-        self.node_rank = node_rank
-        if self.dist_init_ip is None:
+        self.ips = ips
+        if self.ips is None:
             self.master_ip = "0.0.0.0"
+        elif isinstance(self.ips, list):
+            self.master_ip = self.ips[0]
         else:
-            self.master_ip = self.dist_init_ip
-            self.dist_init_addr = f"{self.dist_init_ip}:{get_random_port()}"
+            self.ips = self.ips.split(",")
+            self.master_ip = self.ips[0]
+
+        if self.ips is None:
+            self.nnode = 1
+            self.node_rank = 0
+        else:
+            self.nnode = len(self.ips)
+            for idx, ip in enumerate(self.ips):
+                if ip == self.master_ip:
+                    self.node_rank = idx
         self.max_model_len = max_model_len
         self.max_num_seqs = max_num_seqs
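The replacement block derives `master_ip`, `nnode`, and `node_rank` from `ips` alone. A self-contained sketch of the same decision flow as a standalone function (a paraphrase for illustration, not the FastDeploy API):

    def parse_ips(ips):
        # None -> single-node defaults, mirroring the branch above.
        if ips is None:
            return "0.0.0.0", 1, 0
        # A comma-separated string is split; a list is used as-is.
        if not isinstance(ips, list):
            ips = ips.split(",")
        master_ip = ips[0]
        nnode = len(ips)
        node_rank = 0
        for idx, ip in enumerate(ips):
            if ip == master_ip:  # same comparison as the diff: against master_ip
                node_rank = idx
        return master_ip, nnode, node_rank

    assert parse_ips(None) == ("0.0.0.0", 1, 0)
    assert parse_ips("10.0.0.1,10.0.0.2") == ("10.0.0.1", 2, 0)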
@@ -773,14 +779,11 @@ class Config:
             self.device_ids.split(",").__len__() == self.worker_num_per_node
         ), f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}"
         assert (
             self.worker_num_per_node % self.tensor_parallel_size == 0
         ), f"worker_num_per_node: {self.worker_num_per_node} should be divisible by tensor_parallel_size: {self.tensor_parallel_size}"
         self.local_device_ids = self.device_ids.split(",")[: self.tensor_parallel_size]
         self.host_ip = get_host_ip()
-        if self.dist_init_ip is None or self.host_ip == self.master_ip:
+        if self.ips is None or self.host_ip == self.master_ip:
             self.is_master = True
         else:
             self.is_master = False
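After this hunk, master election is a single comparison of the local host IP with `ips[0]`. A rough standard-library stand-in (assumption: `socket.gethostbyname(socket.gethostname())` approximates FastDeploy's `get_host_ip`, which may resolve the address differently):

    import socket

    def is_master_node(ips, master_ip):
        # Single-node deployments (ips is None) are always the master.
        if ips is None:
            return True
        host_ip = socket.gethostbyname(socket.gethostname())  # approximation
        return host_ip == master_ip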
@@ -817,9 +820,6 @@ class Config:
         assert is_port_available(
             "0.0.0.0", self.engine_worker_queue_port
         ), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use."
-        assert (
-            self.max_chips_per_node >= self.tensor_parallel_size > 0
-        ), f"tensor_parallel_size: {self.tensor_parallel_size} should be between 1 and {self.max_chips_per_node}"
         assert self.nnode >= 1, f"nnode: {self.nnode} should be no less than 1"
         assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be no less than 16"
         assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be no less than 1"
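The surviving checks still guard the worker-queue port with `is_port_available`. A minimal stand-in showing the usual bind-test semantics (an assumption about the helper's behavior, not its actual implementation):

    import socket

    def port_is_free(host: str, port: int) -> bool:
        # If bind() succeeds, no other process currently holds the port.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            try:
                s.bind((host, port))
                return True
            except OSError:
                return False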