[LLM] support multi node deploy (#2708)

* [LLM] support multi node deploy

* Update engine.py

* fix bugs

* fix

* [LLM] support multi node deploy

* [LLM] support multi node deploy

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Author: ltd0924
Date: 2025-07-06 10:33:51 +08:00
Committed by: GitHub
Parent: 04a8e1ef2b
Commit: 68b4755587
13 changed files with 157 additions and 87 deletions


@@ -98,30 +98,7 @@ class LLMEngine(object):
                                                 cfg.mm_processor_kwargs,
                                                 cfg.enable_mm)
        address = ('0.0.0.0', self.cfg.engine_worker_queue_port)
        self.engine_worker_queue_server = EngineWorkerQueue(
            address=address,
            is_server=True,
            num_client=self.cfg.tensor_parallel_size,
            local_data_parallel_size=self.cfg.parallel_config.
            data_parallel_size)
        self.engine_worker_queue = EngineWorkerQueue(
            address=address,
            is_server=False,
            num_client=self.cfg.tensor_parallel_size,
            client_id=0,
            local_data_parallel_id=0)
        if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != 'mixed':
            self.cache_task_queue = EngineCacheQueue(
                address=('127.0.0.1', self.cfg.cache_config.cache_queue_port),
                authkey=b'cache_queue_service',
                is_server=True,
                num_client=self.cfg.tensor_parallel_size,
                client_id=-1,
                local_data_parallel_size=self.cfg.parallel_config.
                data_parallel_size)
        self.start_queue_service()
        self.resource_manager = ResourceManager(cfg.max_num_seqs, cfg,
                                                cfg.tensor_parallel_size,
@@ -198,9 +175,12 @@ class LLMEngine(object):
or self.cfg.splitwise_role != "mixed"):
device_ids = self.cfg.device_ids.split(",")
self.cache_manager_processes = self.resource_manager.cache_manager.launch_cache_manager(
self.cfg.cache_config, self.cfg.tensor_parallel_size,
device_ids, self.cfg.engine_worker_queue_port,
self.ipc_signal_suffix)
cache_config=self.cfg.cache_config,
tensor_parallel_size=self.cfg.tensor_parallel_size,
device_ids=device_ids,
pod_ip=self.cfg.pod_ips[0],
engine_worker_queue_port=self.cfg.engine_worker_queue_port,
pid_suffix=self.ipc_signal_suffix)
self.worker_proc = self._start_worker_service()
console_logger.info("Waiting for worker processes to be ready...")
@@ -850,10 +830,7 @@ class LLMEngine(object):
Initialize shared memory to indicate engine status
"""
# worker_ready_signal
array_size = min(
8, self.cfg.tensor_parallel_size *
self.cfg.parallel_config.data_parallel_size)
worker_ready_signal_data = np.zeros(shape=[array_size], dtype=np.int32)
worker_ready_signal_data = np.zeros(shape=[self.cfg.worker_num_per_node], dtype=np.int32)
self.worker_ready_signal = IPCSignal(name="worker_ready_signal",
array=worker_ready_signal_data,
dtype=np.int32,
@@ -889,7 +866,7 @@ class LLMEngine(object):
create=True)
# worker_live_signal: lets the engine detect whether each worker process is alive; records the time of each step
worker_healthy_live_recorded_time_array = np.zeros(shape=[array_size],
worker_healthy_live_recorded_time_array = np.zeros(shape=[self.cfg.worker_num_per_node],
dtype=np.int32)
self.worker_healthy_live_signal = IPCSignal(
name="worker_healthy_live_signal",
@@ -899,7 +876,7 @@ class LLMEngine(object):
create=True)
if self.do_profile:
get_profile_block_num = np.zeros([array_size], dtype=np.int32)
get_profile_block_num = np.zeros([self.cfg.worker_num_per_node], dtype=np.int32)
self.get_profile_block_num_signal = IPCSignal(
name="get_profile_block_num",
array=get_profile_block_num,
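
With these changes the per-engine IPC signal arrays (worker_ready_signal, worker_healthy_live_signal, get_profile_block_num) are sized by self.cfg.worker_num_per_node instead of min(8, tensor_parallel_size * data_parallel_size), so each node only tracks the worker processes it actually hosts. The diff does not show how worker_num_per_node is computed; a minimal sketch of one plausible derivation, assuming workers are split evenly across nnode nodes (the helper name and signature below are hypothetical):

def worker_num_per_node(tensor_parallel_size: int,
                        data_parallel_size: int,
                        nnode: int) -> int:
    # Hypothetical helper, not part of this diff: total workers divided
    # evenly across the nodes in the deployment.
    total_workers = tensor_parallel_size * data_parallel_size
    assert total_workers % nnode == 0, "workers must split evenly across nodes"
    return total_workers // nnode
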
@@ -1028,6 +1005,7 @@ class LLMEngine(object):
arguments = (
f" --nnodes {str(self.cfg.nnode)}"
f" --ips {','.join(self.cfg.pod_ips)}"
f" --devices {self.cfg.device_ids} {py_script}"
f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
@@ -1035,6 +1013,7 @@ class LLMEngine(object):
f" --device_ids {self.cfg.device_ids}"
f" --tensor_parallel_size {self.cfg.tensor_parallel_size}"
f" --engine_worker_queue_port {str(self.cfg.engine_worker_queue_port)}"
f" --pod_ip {self.cfg.pod_ips[0]}"
f" --total_block_num {self.cfg.cache_config.total_block_num}"
f" --block_size {self.cfg.cache_config.block_size}"
f" --enc_dec_block_num {self.cfg.cache_config.enc_dec_block_num}"
@@ -1171,10 +1150,12 @@ class LLMEngine(object):
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
device_ids = self.cfg.device_ids.split(",")
self.cache_manager_processes = self.resource_manager.cache_manager.launch_cache_manager(
self.cfg.cache_config, self.cfg.tensor_parallel_size,
device_ids, self.cfg.engine_worker_queue_port,
self.ipc_signal_suffix)
cache_config=self.cfg.cache_config,
tensor_parallel_size=self.cfg.tensor_parallel_size,
device_ids=device_ids,
pod_ip=self.cfg.pod_ips[0],
engine_worker_queue_port=self.cfg.engine_worker_queue_port,
pid_suffix=self.ipc_signal_suffix)
def check_health(self, time_interval_threashold=30):
"""
Check the health of the model server by checking whether all workers are alive.
@@ -1254,3 +1235,34 @@ class LLMEngine(object):
except Exception:
pass
return True

    def start_queue_service(self):
        """
        start queue service for engine worker communication
        """
        address = (self.cfg.pod_ips[0], self.cfg.engine_worker_queue_port)

        if self.cfg.host_ip == self.cfg.pod_ips[0] or self.cfg.pod_ips[0] == "0.0.0.0":
            self.engine_worker_queue_server = EngineWorkerQueue(
                address=address,
                is_server=True,
                num_client=self.cfg.tensor_parallel_size,
                local_data_parallel_size=self.cfg.parallel_config.
                data_parallel_size)

            if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != 'mixed':
                self.cache_task_queue = EngineCacheQueue(
                    address=(self.cfg.pod_ips[0], self.cfg.cache_config.cache_queue_port),
                    authkey=b'cache_queue_service',
                    is_server=True,
                    num_client=self.cfg.tensor_parallel_size,
                    client_id=-1,
                    local_data_parallel_size=self.cfg.parallel_config.
                    data_parallel_size)

        self.engine_worker_queue = EngineWorkerQueue(
            address=address,
            is_server=False,
            num_client=self.cfg.tensor_parallel_size,
            client_id=0,
            local_data_parallel_id=0)
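
The new start_queue_service method encodes the head-node gating that makes multi-node deployment work: only the pod whose host_ip matches pod_ips[0] (or a wildcard 0.0.0.0 address) starts the EngineWorkerQueue server and, when prefix caching or a non-mixed splitwise role is enabled, the EngineCacheQueue; every pod, head or not, then attaches a client to pod_ips[0]. A minimal standalone sketch of the same pattern, built on Python's multiprocessing.managers rather than on the EngineWorkerQueue implementation (queue name, port and authkey are placeholders):

from multiprocessing.managers import BaseManager
import queue

_task_queue: queue.Queue = queue.Queue()

def _get_tasks() -> queue.Queue:
    return _task_queue

class _ServerManager(BaseManager):
    pass

class _ClientManager(BaseManager):
    pass

# The server side exposes a real queue; clients only register the typeid.
_ServerManager.register("get_tasks", callable=_get_tasks)
_ClientManager.register("get_tasks")

def start_queue_service(host_ip, pod_ips, port):
    address = (pod_ips[0], port)
    server = None
    if host_ip == pod_ips[0] or pod_ips[0] == "0.0.0.0":
        # Only the head pod hosts the queue server process.
        server = _ServerManager(address=address, authkey=b"engine_worker_queue")
        server.start()
    # Every pod, head included, connects a client to the head pod's address.
    client = _ClientManager(address=address, authkey=b"engine_worker_queue")
    client.connect()
    return server, client

Keeping the server on a single well-known pod means every node shares one queue address, so scaling out only adds clients.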