diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index ba415978b..cdd9e81d9 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -124,9 +124,19 @@ class EngineArgs:
     Ratio of tokens to process in a block.
     """
 
-    pod_ips: Optional[List[str]] = None
+    dist_init_ip: Optional[str] = None
     """
-    List of IP addresses for nodes in the cluster.
+    IP address of the master node in a multinode deployment.
+    """
+
+    nnodes: int = 1
+    """
+    Total number of nodes in a multinode deployment.
+    """
+
+    node_rank: int = 0
+    """
+    Rank of the current node in a multinode deployment.
     """
 
     swap_space: float = None
@@ -485,11 +495,25 @@ class EngineArgs:
         # Cluster system parameters group
         system_group = parser.add_argument_group("System Configuration")
         system_group.add_argument(
-            "--pod-ips",
-            type=lambda s: s.split(",") if s else None,
-            default=EngineArgs.pod_ips,
+            "--dist-init-ip",
+            default=EngineArgs.dist_init_ip,
             help=
-            "List of IP addresses for nodes in the cluster (comma-separated).")
+            "IP address of the master node.")
+
+        system_group.add_argument(
+            "--nnodes",
+            type=int,
+            default=EngineArgs.nnodes,
+            help=
+            "Total number of nodes in the deployment.")
+
+        system_group.add_argument(
+            "--node-rank",
+            type=int,
+            default=EngineArgs.node_rank,
+            help=
+            "Rank of the current node (range [0, nnodes)).")
+
 
         # Performance tuning parameters group
@@ -789,7 +813,9 @@ class EngineArgs:
             max_num_seqs=self.max_num_seqs,
             speculative_config=speculative_cfg,
             max_num_batched_tokens=self.max_num_batched_tokens,
-            pod_ips=self.pod_ips,
+            dist_init_ip=self.dist_init_ip,
+            nnodes=self.nnodes,
+            node_rank=self.node_rank,
             use_warmup=self.use_warmup,
             engine_worker_queue_port=self.engine_worker_queue_port,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
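
A minimal sketch of how the three new fields replace the old `pod_ips` list (the model path below is hypothetical; the distributed fields are the ones defined in the dataclass above):

```python
from fastdeploy.engine.args_utils import EngineArgs

# Hypothetical two-node deployment; all other fields keep their defaults.
args = EngineArgs(
    model="./my-model",       # hypothetical model path
    dist_init_ip="10.0.0.1",  # IP of the master node; None means single-node
    nnodes=2,                 # total number of nodes
    node_rank=1,              # this node's rank, in [0, nnodes)
)
# Previously the same deployment was described as:
#   EngineArgs(pod_ips=["10.0.0.1", "10.0.0.2"])
```
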
diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py
index 2d49aa0ce..02df10328 100644
--- a/fastdeploy/engine/config.py
+++ b/fastdeploy/engine/config.py
@@ -24,7 +24,7 @@ from fastdeploy import envs
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip,
-                              is_port_available, llm_logger)
+                              get_random_port, is_port_available, llm_logger)
 
 TaskOption = Literal["generate"]
 
@@ -642,7 +642,9 @@ class Config:
         max_model_len: int = 8192,
         max_num_seqs: int = 8,
         max_num_batched_tokens: Optional[int] = None,
-        pod_ips: Optional[List[str]] = None,
+        dist_init_ip: Optional[str] = None,
+        nnodes: int = 1,
+        node_rank: int = 0,
         speculative_config: Optional[Dict[str, Any]] = None,
         graph_optimization_config: Optional[Dict[str, Any]] = None,
         use_warmup: bool = False,
@@ -675,7 +677,9 @@ class Config:
             max_model_len (int): Maximum model length. Default is 8192.
             max_num_seqs (int): Maximum number of sequences. Default is 8.
             max_num_batched_tokens (Optional[int]): Maximum number of batched tokens. Default is None.
-            pod_ips (Optional[List[str]]): List of POD IPs. Default is None.
+            dist_init_ip (Optional[str]): IP address of the master node. Default is None.
+            nnodes (int): Number of nodes in the deployment. Default is 1.
+            node_rank (int): Rank of the current node, in [0, nnodes). Default is 0.
             mm_processor_kwargs (Optional[Dict[str, Any]]): Additional arguments for multi-modal processor. Default is None.
             speculative_config (Optional[Dict[str, Any]]): Speculative execution configuration. Default is None.
             graph_optimization_config (Optional[Dict[str, Any]]): Graph optimizaion backend execution configuration. Default is None.
@@ -699,7 +700,16 @@ class Config:
         self.tokenizer = tokenizer
         self.max_num_batched_tokens = max_num_batched_tokens
         self.tensor_parallel_size = tensor_parallel_size
-        self.pod_ips = pod_ips
+        self.dist_init_ip = dist_init_ip
+
+        self.nnode = nnodes
+        self.node_rank = node_rank
+        if self.dist_init_ip is None:
+            self.master_ip = "0.0.0.0"
+        else:
+            self.master_ip = self.dist_init_ip
+            self.dist_init_addr = f"{self.dist_init_ip}:{get_random_port()}"
+
         self.max_model_len = max_model_len
         self.max_num_seqs = max_num_seqs
         self.limit_mm_per_prompt = limit_mm_per_prompt
@@ -716,14 +726,8 @@ class Config:
         self.graph_optimization_config = graph_optimization_config
         self.guided_decoding_backend = guided_decoding_backend
         self.disable_any_whitespace = disable_any_whitespace
-        self.is_master = True
         self._str_to_list("innode_prefill_ports", int)
-        self._str_to_list("pod_ips", str)
 
-        if self.pod_ips is None:
-            self.nnode = 1
-        else:
-            self.nnode = len(self.pod_ips)
 
         assert self.splitwise_role in ["mixed", "prefill", "decode"]
 
@@ -778,9 +782,9 @@ class Config:
 
         self.host_ip = get_host_ip()
 
-        if self.pod_ips is None:
-            self.pod_ips = ["0.0.0.0"]
-        elif self.host_ip != self.pod_ips[0]:
+        if self.dist_init_ip is None or self.host_ip == self.master_ip:
+            self.is_master = True
+        else:
             self.is_master = False
 
         import paddle
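
The master-resolution branches above collapse to a small truth table; this standalone sketch (function name and sample IPs invented for illustration) mirrors the new `Config` logic:

```python
def resolve_master(dist_init_ip, host_ip):
    """Mirror of the Config branches: no dist_init_ip means this process is the master."""
    master_ip = "0.0.0.0" if dist_init_ip is None else dist_init_ip
    is_master = dist_init_ip is None or host_ip == master_ip
    return master_ip, is_master

assert resolve_master(None, "10.0.0.2") == ("0.0.0.0", True)          # single node
assert resolve_master("10.0.0.1", "10.0.0.1") == ("10.0.0.1", True)   # on the master
assert resolve_master("10.0.0.1", "10.0.0.2") == ("10.0.0.1", False)  # on a worker node
```
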
f" --ips {self.cfg.ips}" + pd_cmd = pd_cmd + ( + f" --master {self.cfg.dist_init_addr}" + f" --nnodes {str(self.cfg.nnode)}" + f" --rank {str(self.cfg.node_rank)}" + ) pd_cmd = pd_cmd + arguments + f" 2>{log_dir}/launch_worker.log" llm_logger.info("Launch worker service command: {}".format(pd_cmd)) p = subprocess.Popen( @@ -1158,7 +1161,7 @@ class LLMEngine(object): cache_config=self.cfg.cache_config, tensor_parallel_size=self.cfg.tensor_parallel_size, device_ids=device_ids, - pod_ip=self.cfg.pod_ips[0], + pod_ip=self.cfg.master_ip, engine_worker_queue_port=self.cfg.engine_worker_queue_port, pid_suffix=self.ipc_signal_suffix) def check_health(self, time_interval_threashold=30): @@ -1245,8 +1248,9 @@ class LLMEngine(object): """ start queue service for engine worker communication """ - address = (self.cfg.pod_ips[0], self.cfg.engine_worker_queue_port) - if self.cfg.host_ip == self.cfg.pod_ips[0] or self.cfg.pod_ips[0] == "0.0.0.0": + address = (self.cfg.master_ip, self.cfg.engine_worker_queue_port) + if self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0": + llm_logger.info(f"Starting engine worker queue server service at {address}") self.engine_worker_queue_server = EngineWorkerQueue( address=address, is_server=True, @@ -1256,7 +1260,7 @@ class LLMEngine(object): if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != 'mixed': self.cache_task_queue = EngineCacheQueue( - address=(self.cfg.pod_ips[0], self.cfg.cache_config.cache_queue_port), + address=(self.cfg.master_ip, self.cfg.cache_config.cache_queue_port), authkey=b'cache_queue_service', is_server=True, num_client=self.cfg.tensor_parallel_size, @@ -1270,4 +1274,6 @@ class LLMEngine(object): is_server=False, num_client=self.cfg.tensor_parallel_size, client_id=0, - local_data_parallel_id=0) + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, + local_data_parallel_id= min(self.cfg.worker_num_per_node * self.cfg.node_rank, + self.cfg.parallel_config.data_parallel_size - 1)) diff --git a/fastdeploy/engine/expert_service.py b/fastdeploy/engine/expert_service.py index 26d2d364c..66607da82 100644 --- a/fastdeploy/engine/expert_service.py +++ b/fastdeploy/engine/expert_service.py @@ -49,8 +49,8 @@ class ExpertService(object): cfg (Config): Config object containing all the configuration parameters. 
""" self.cfg = cfg - start_pos = local_data_parallel_id * self.cfg.tensor_parallel_size - end_pos = (local_data_parallel_id + 1) * self.cfg.tensor_parallel_size + start_pos = (local_data_parallel_id * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node + end_pos = ((local_data_parallel_id + 1) * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[ start_pos:end_pos] self.cfg.local_device_ids = self.cfg.device_ids.split( @@ -65,7 +65,7 @@ class ExpertService(object): self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id - address = (cfg.pod_ips[0], cfg.engine_worker_queue_port) + address = (cfg.master_ip, cfg.engine_worker_queue_port) self.engine_worker_queue = EngineWorkerQueue( address=address, is_server=False, @@ -118,7 +118,7 @@ class ExpertService(object): cache_config=self.cfg.cache_config, tensor_parallel_size=self.cfg.tensor_parallel_size, device_ids=self.cfg.local_device_ids, - pod_ip=self.cfg.pod_ips[0], + pod_ip=self.cfg.master_ip, engine_worker_queue_port=self.cfg.engine_worker_queue_port, pid_suffix=f"{local_data_parallel_id}_{ipc_signal_suffix}" ) diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 08230aa15..5601a7e4c 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -85,7 +85,7 @@ class LLM: self.mutex = threading.Lock() self.req_output = dict() - self.master_node_ip = self.llm_engine.cfg.pod_ips[0] + self.master_node_ip = self.llm_engine.cfg.master_ip self._receive_output_thread = threading.Thread( target=self._receive_output, daemon=True) self._receive_output_thread.start() diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 3a2ee5e72..5061d60cf 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -122,8 +122,8 @@ async def lifespan(app: FastAPI): args.mm_processor_kwargs, args.enable_mm, args.reasoning_parser) app.state.dynamic_load_weight = args.dynamic_load_weight - chat_handler = OpenAIServingChat(engine_client, pid, args.pod_ips) - completion_handler = OpenAIServingCompletion(engine_client, pid, args.pod_ips) + chat_handler = OpenAIServingChat(engine_client, pid, args.dist_init_ip) + completion_handler = OpenAIServingCompletion(engine_client, pid, args.dist_init_ip) engine_client.create_zmq_client(model=pid, mode=zmq.PUSH) engine_client.pid = pid app.state.engine_client = engine_client diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 90d9ffe78..d55545428 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -40,16 +40,16 @@ class OpenAIServingChat: OpenAI-style chat completions serving """ - def __init__(self, engine_client, pid, pod_ips): + def __init__(self, engine_client, pid, dist_init_ip): self.engine_client = engine_client self.pid = pid - self.pod_ips = pod_ips + self.master_ip = dist_init_ip self.host_ip = get_host_ip() def _check_master(self): - if self.pod_ips is None: + if self.master_ip is None: return True - if self.host_ip == self.pod_ips[0]: + if self.host_ip == self.master_ip: return True return False diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index eb2e23b49..acefc3d17 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ 
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index eb2e23b49..acefc3d17 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -45,16 +45,16 @@ from fastdeploy.engine.request import RequestOutput
 
 
 class OpenAIServingCompletion:
-    def __init__(self, engine_client, pid, pod_ips):
+    def __init__(self, engine_client, pid, dist_init_ip):
         self.engine_client = engine_client
         self.pid = pid
-        self.pod_ips = pod_ips
+        self.master_ip = dist_init_ip
         self.host_ip = get_host_ip()
 
     def _check_master(self):
-        if self.pod_ips is None:
+        if self.master_ip is None:
             return True
-        if self.host_ip == self.pod_ips[0]:
+        if self.host_ip == self.master_ip:
             return True
         return False
diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py
index 0316779b4..79ee65b77 100644
--- a/fastdeploy/utils.py
+++ b/fastdeploy/utils.py
@@ -27,7 +27,8 @@ from datetime import datetime
 from logging.handlers import BaseRotatingHandler
 from pathlib import Path
 from typing import Literal, TypeVar, Union
-
+import random
+import socket
 import requests
 import yaml
 from aistudio_sdk.snapshot_download import snapshot_download
@@ -421,6 +422,20 @@ def get_host_ip():
     return ip
 
 
+def get_random_port():
+    """Pick a random ephemeral port that is free right now (it may be taken again before use)."""
+    while True:
+        port = random.randint(49152, 65535)
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            try:
+                s.bind(("0.0.0.0", port))
+                return port
+            except OSError:
+                continue
+
+
 def is_port_available(host, port):
     """
     Check the port is available
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
index 70e596359..18c1b4302 100644
--- a/fastdeploy/worker/gpu_worker.py
+++ b/fastdeploy/worker/gpu_worker.py
@@ -23,6 +23,7 @@ import pynvml
 
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
+from fastdeploy.platforms import current_platform
 from fastdeploy.utils import get_logger
 from fastdeploy.worker.gpu_model_runner import GPUModelRunner
 from fastdeploy.worker.output import ModelRunnerOutput
@@ -50,11 +51,12 @@ class GpuWorker(WorkerBase):
         """
         Initialize device and construct model runner
         """
+        self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
         if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda(
         ):
             # Set evironment variable
             self.device_ids = self.parallel_config.device_ids.split(",")
-            self.device = f"gpu:{self.local_rank}"
+            self.device = f"gpu:{self.local_rank % self.max_chips_per_node}"
             paddle.device.set_device(self.device)
             paddle.set_default_dtype(self.parallel_config.dtype)
 
@@ -72,7 +74,7 @@ class GpuWorker(WorkerBase):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             fd_config=self.fd_config,
             device=self.device,
-            device_id=self.device_ids[self.local_rank],
+            device_id=self.device_ids[self.local_rank % self.max_chips_per_node],
             rank=self.rank,
             local_rank=self.local_rank)
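
Since `local_rank` is now global across nodes, device selection wraps it by the per-node chip count. A small worked example (sizes assumed, not from the PR):

```python
# Assumed: 2 nodes x 8 GPUs, so global ranks 8-15 live on the second node.
max_chips_per_node = 8                     # 16 on Iluvatar, per the diff
device_ids = [str(i) for i in range(8)]    # per-node visible devices

local_rank = 12                            # a rank hosted on node 1
print(f"gpu:{local_rank % max_chips_per_node}")      # gpu:4
print(device_ids[local_rank % max_chips_per_node])   # 4
```
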
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 8775c5de2..99504008c 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -136,9 +136,9 @@ class PaddleDisWorkerProc():
             model_weights_status:
         """
         # init worker_ready_signal
-        max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
+        self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
         array_size = min(
-            max_chips_per_node, self.parallel_config.tensor_parallel_size *
+            self.max_chips_per_node, self.parallel_config.tensor_parallel_size *
             self.parallel_config.expert_parallel_size)
         workers_ready = np.zeros(shape=[array_size], dtype=np.int32)
         self.worker_ready_signal = IPCSignal(
@@ -148,10 +148,10 @@ class PaddleDisWorkerProc():
             suffix=self.parallel_config.engine_pid,
             create=False)
         self.worker_ready_signal.value[self.local_rank %
-                                       max_chips_per_node] = 1
+                                       self.max_chips_per_node] = 1
 
         # init worker_healthy_live_signal
-        workers_alive = np.zeros(shape=[self.ranks], dtype=np.int32)
+        workers_alive = np.zeros(shape=[array_size], dtype=np.int32)
         self.worker_healthy_live_signal = IPCSignal(
             name="worker_healthy_live_signal",
             array=workers_alive,
@@ -205,7 +205,7 @@ class PaddleDisWorkerProc():
         Tmp loop function for ep utill DP is supported
         """
         while True:
-            self.worker_healthy_live_signal.value[self.local_rank] = int(
+            self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(
                 time.time())
             if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks(
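
The IPC signal arrays follow the same per-node convention: both `worker_ready_signal` and `worker_healthy_live_signal` keep one slot per local chip, indexed by `local_rank % max_chips_per_node`. A sketch with assumed sizes:

```python
import numpy as np

# Assumed: tensor_parallel_size=16, expert_parallel_size=1, 8 chips per node.
max_chips_per_node = 8
array_size = min(max_chips_per_node, 16 * 1)     # 8 slots, one per local worker
workers_alive = np.zeros(shape=[array_size], dtype=np.int32)

local_rank = 12                                  # global rank hosted on node 1
workers_alive[local_rank % max_chips_per_node] = 1   # marks local slot 4
```
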