Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 08:37:06 +08:00
[LLM] support multi node deploy (#2708)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* [LLM] support multi node deploy
* Update engine.py
* fix bugs
* fix
* [LLM] support multi node deploy
* [LLM] support multi node deploy

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
@@ -98,30 +98,7 @@ class LLMEngine(object):
                                              cfg.mm_processor_kwargs,
                                              cfg.enable_mm)
 
-        address = ('0.0.0.0', self.cfg.engine_worker_queue_port)
-        self.engine_worker_queue_server = EngineWorkerQueue(
-            address=address,
-            is_server=True,
-            num_client=self.cfg.tensor_parallel_size,
-            local_data_parallel_size=self.cfg.parallel_config.
-            data_parallel_size)
-
-        self.engine_worker_queue = EngineWorkerQueue(
-            address=address,
-            is_server=False,
-            num_client=self.cfg.tensor_parallel_size,
-            client_id=0,
-            local_data_parallel_id=0)
-
-        if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != 'mixed':
-            self.cache_task_queue = EngineCacheQueue(
-                address=('127.0.0.1', self.cfg.cache_config.cache_queue_port),
-                authkey=b'cache_queue_service',
-                is_server=True,
-                num_client=self.cfg.tensor_parallel_size,
-                client_id=-1,
-                local_data_parallel_size=self.cfg.parallel_config.
-                data_parallel_size)
+        self.start_queue_service()
 
         self.resource_manager = ResourceManager(cfg.max_num_seqs, cfg,
                                                 cfg.tensor_parallel_size,
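Note: the inline queue setup removed above was hard-wired to '0.0.0.0' and '127.0.0.1'; it is replaced by a single call to start_queue_service(), whose body appears in the last hunk of this diff. A minimal, self-contained sketch of the role decision it makes, with hypothetical names (not FastDeploy code):

# Sketch only: decide whether this host should run the queue *server*.
# Mirrors the predicate in start_queue_service(): the first entry of
# pod_ips is treated as the head node of the deployment.
def is_head_node(host_ip: str, pod_ips: list) -> bool:
    return host_ip == pod_ips[0] or pod_ips[0] == "0.0.0.0"

# On a two-node deployment only node 0 hosts the server-side queues.
print(is_head_node("10.0.0.1", ["10.0.0.1", "10.0.0.2"]))  # True
print(is_head_node("10.0.0.2", ["10.0.0.1", "10.0.0.2"]))  # False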
@@ -198,9 +175,12 @@ class LLMEngine(object):
                 or self.cfg.splitwise_role != "mixed"):
             device_ids = self.cfg.device_ids.split(",")
             self.cache_manager_processes = self.resource_manager.cache_manager.launch_cache_manager(
-                self.cfg.cache_config, self.cfg.tensor_parallel_size,
-                device_ids, self.cfg.engine_worker_queue_port,
-                self.ipc_signal_suffix)
+                cache_config=self.cfg.cache_config,
+                tensor_parallel_size=self.cfg.tensor_parallel_size,
+                device_ids=device_ids,
+                pod_ip=self.cfg.pod_ips[0],
+                engine_worker_queue_port=self.cfg.engine_worker_queue_port,
+                pid_suffix=self.ipc_signal_suffix)
 
         self.worker_proc = self._start_worker_service()
         console_logger.info("Waitting worker processes ready...")
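The cache manager launch now names its arguments and passes the head node's IP. The keyword names below are read off the call site in this hunk; the stub itself is only a hypothetical stand-in to show the call shape, not FastDeploy's actual method definition:

# Hypothetical stand-in, not the real launch_cache_manager definition.
def launch_cache_manager(cache_config, tensor_parallel_size, device_ids,
                         pod_ip, engine_worker_queue_port, pid_suffix):
    # In the engine this would spawn one cache-manager process per device and
    # point it at the head node's engine worker queue.
    print(f"cache manager -> {pod_ip}:{engine_worker_queue_port}, devices={device_ids}")

launch_cache_manager(cache_config={}, tensor_parallel_size=2,
                     device_ids=["0", "1"], pod_ip="10.0.0.1",
                     engine_worker_queue_port=8002, pid_suffix="0")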
@@ -850,10 +830,7 @@ class LLMEngine(object):
         Initialize shared memory to indicate engine status
         """
         # worker_ready_signatensor_parallel_size
-        array_size = min(
-            8, self.cfg.tensor_parallel_size *
-            self.cfg.parallel_config.data_parallel_size)
-        worker_ready_signal_data = np.zeros(shape=[array_size], dtype=np.int32)
+        worker_ready_signal_data = np.zeros(shape=[self.cfg.worker_num_per_node], dtype=np.int32)
         self.worker_ready_signal = IPCSignal(name="worker_ready_signal",
                                              array=worker_ready_signal_data,
                                              dtype=np.int32,
@@ -889,7 +866,7 @@ class LLMEngine(object):
                                              create=True)
 
         # worker_live_signal: used by the engine to detect whether each worker process is alive; records the time of each step
-        worker_healthy_live_recorded_time_array = np.zeros(shape=[array_size],
+        worker_healthy_live_recorded_time_array = np.zeros(shape=[self.cfg.worker_num_per_node],
                                                            dtype=np.int32)
         self.worker_healthy_live_signal = IPCSignal(
             name="worker_healthy_live_signal",
@@ -899,7 +876,7 @@ class LLMEngine(object):
             create=True)
 
         if self.do_profile:
-            get_profile_block_num = np.zeros([array_size], dtype=np.int32)
+            get_profile_block_num = np.zeros([self.cfg.worker_num_per_node], dtype=np.int32)
             self.get_profile_block_num_signal = IPCSignal(
                 name="get_profile_block_num",
                 array=get_profile_block_num,
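The three hunks above resize every shared signal array (worker_ready_signal, worker_healthy_live_signal, get_profile_block_num) from the old global min(8, tensor_parallel_size * data_parallel_size) to cfg.worker_num_per_node, since each node now only tracks its local worker processes. IPCSignal's internals are not shown in this diff; the sketch below uses only the standard library to illustrate the per-node shared int32 array idea:

# Rough analogy for a per-node readiness array in shared memory; not IPCSignal.
import numpy as np
from multiprocessing import shared_memory

worker_num_per_node = 8  # illustrative value; in the engine it comes from cfg
shm = shared_memory.SharedMemory(create=True, size=worker_num_per_node * 4,
                                 name="worker_ready_signal_demo")
worker_ready = np.ndarray((worker_num_per_node,), dtype=np.int32, buffer=shm.buf)
worker_ready[:] = 0           # engine side: initialize all slots to "not ready"
worker_ready[3] = 1           # a worker with local rank 3 would flip its own slot
print(int(worker_ready.sum()), "of", worker_num_per_node, "local workers ready")
del worker_ready              # drop the view so the shared block can be closed
shm.close()
shm.unlink()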
@@ -1028,6 +1005,7 @@ class LLMEngine(object):
 
         arguments = (
             f" --nnodes {str(self.cfg.nnode)}"
+            f" --ips {','.join(self.cfg.pod_ips)}"
             f" --devices {self.cfg.device_ids} {py_script}"
             f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
@@ -1035,6 +1013,7 @@ class LLMEngine(object):
             f" --device_ids {self.cfg.device_ids}"
             f" --tensor_parallel_size {self.cfg.tensor_parallel_size}"
             f" --engine_worker_queue_port {str(self.cfg.engine_worker_queue_port)}"
+            f" --pod_ip {self.cfg.pod_ips[0]}"
             f" --total_block_num {self.cfg.cache_config.total_block_num}"
             f" --block_size {self.cfg.cache_config.block_size}"
             f" --enc_dec_block_num {self.cfg.cache_config.enc_dec_block_num}"
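Together with the previous hunk, the worker launch command now carries the node topology (--nnodes, --ips, --pod_ip). Assembled with illustrative values (the flag names come from the diff; the values, the worker.py script name, and the variable names are made up):

# Illustrative only: how the new multi-node flags compose into the launch string.
nnode = 2
pod_ips = ["10.0.0.1", "10.0.0.2"]
device_ids = "0,1,2,3"
engine_worker_queue_port = 8002

arguments = (
    f" --nnodes {nnode}"
    f" --ips {','.join(pod_ips)}"
    f" --devices {device_ids} worker.py"
    f" --pod_ip {pod_ips[0]}"
    f" --engine_worker_queue_port {engine_worker_queue_port}"
)
print(arguments)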
@@ -1171,10 +1150,12 @@ class LLMEngine(object):
         if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
             device_ids = self.cfg.device_ids.split(",")
             self.cache_manager_processes = self.resource_manager.cache_manager.launch_cache_manager(
-                self.cfg.cache_config, self.cfg.tensor_parallel_size,
-                device_ids, self.cfg.engine_worker_queue_port,
-                self.ipc_signal_suffix)
-
+                cache_config=self.cfg.cache_config,
+                tensor_parallel_size=self.cfg.tensor_parallel_size,
+                device_ids=device_ids,
+                pod_ip=self.cfg.pod_ips[0],
+                engine_worker_queue_port=self.cfg.engine_worker_queue_port,
+                pid_suffix=self.ipc_signal_suffix)
     def check_health(self, time_interval_threashold=30):
         """
         Check the health of the model server by checking whether all workers are alive.
@@ -1254,3 +1235,34 @@ class LLMEngine(object):
         except Exception:
             pass
         return True
+
+    def start_queue_service(self):
+        """
+        start queue service for engine worker communication
+        """
+        address = (self.cfg.pod_ips[0], self.cfg.engine_worker_queue_port)
+        if self.cfg.host_ip == self.cfg.pod_ips[0] or self.cfg.pod_ips[0] == "0.0.0.0":
+            self.engine_worker_queue_server = EngineWorkerQueue(
+                address=address,
+                is_server=True,
+                num_client=self.cfg.tensor_parallel_size,
+                local_data_parallel_size=self.cfg.parallel_config.
+                data_parallel_size)
+
+            if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != 'mixed':
+                self.cache_task_queue = EngineCacheQueue(
+                    address=(self.cfg.pod_ips[0], self.cfg.cache_config.cache_queue_port),
+                    authkey=b'cache_queue_service',
+                    is_server=True,
+                    num_client=self.cfg.tensor_parallel_size,
+                    client_id=-1,
+                    local_data_parallel_size=self.cfg.parallel_config.
+                    data_parallel_size)
+
+
+        self.engine_worker_queue = EngineWorkerQueue(
+            address=address,
+            is_server=False,
+            num_client=self.cfg.tensor_parallel_size,
+            client_id=0,
+            local_data_parallel_id=0)
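start_queue_service() gives the head node (pod_ips[0]) the server side of EngineWorkerQueue and, when prefix caching or a splitwise role is enabled, EngineCacheQueue, while every node, the head included, attaches a client to the same address. A self-contained analogy using the standard library's remote manager pattern (not EngineWorkerQueue's actual implementation):

# Analogy for the server/client split in start_queue_service(); the real
# EngineWorkerQueue/EngineCacheQueue classes in FastDeploy may differ.
import queue
from multiprocessing.managers import BaseManager

class QueueManager(BaseManager):
    pass

def serve_on_head_node(port: int, authkey: bytes = b"engine_worker_queue"):
    # Only the node whose host_ip matches pod_ips[0] runs this; it blocks and
    # serves the shared task queue to every node in the deployment.
    tasks = queue.Queue()
    QueueManager.register("get_tasks", callable=lambda: tasks)
    server = QueueManager(address=("0.0.0.0", port), authkey=authkey).get_server()
    server.serve_forever()

def attach_from_any_node(head_ip: str, port: int, authkey: bytes = b"engine_worker_queue"):
    # Every node (head included) attaches a client to the head node's address,
    # mirroring the is_server=False EngineWorkerQueue created for all ranks.
    QueueManager.register("get_tasks")
    client = QueueManager(address=(head_ip, port), authkey=authkey)
    client.connect()
    return client.get_tasks()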