[LLM] Update Multinode Deployment (#2830)

* [LLM] fix multinode bugs

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] fix ci bugs

* Update fastdeploy/engine/args_utils.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* [LLM] update random port

* [LLM] update random port

* [LLM] fix ci bugs

* fix ci bugs

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Author: ltd0924
Date: 2025-07-16 23:42:54 +08:00
Committed by: GitHub
Parent: d245d1ca6c
Commit: 9c25dcca0b
11 changed files with 108 additions and 56 deletions

View File

@@ -124,9 +124,19 @@ class EngineArgs:
     Ratio of tokens to process in a block.
     """
-    pod_ips: Optional[List[str]] = None
+    dist_init_ip: Optional[str] = None
     """
-    List of IP addresses for nodes in the cluster.
+    The master node ip of multinode deployment
+    """
+    nnodes: int = 1
+    """
+    The number of nodes in multinode deployment
+    """
+    node_rank: int = 0
+    """
+    The rank of the current node in multinode deployment
     """
     swap_space: float = None
@@ -485,11 +495,25 @@ class EngineArgs:
         # Cluster system parameters group
         system_group = parser.add_argument_group("System Configuration")
         system_group.add_argument(
-            "--pod-ips",
-            type=lambda s: s.split(",") if s else None,
-            default=EngineArgs.pod_ips,
+            "--dist-init-ip",
+            default=EngineArgs.dist_init_ip,
             help=
-            "List of IP addresses for nodes in the cluster (comma-separated).")
+            "IP addresses of master node.")
+        system_group.add_argument(
+            "--nnodes",
+            type=int,
+            default=EngineArgs.nnodes,
+            help=
+            "The number of all nodes.")
+        system_group.add_argument(
+            "--node-rank",
+            type=int,
+            default=EngineArgs.node_rank,
+            help=
+            "node rank id (range [0, nnodes)).")

         # Performance tuning parameters group
@@ -789,7 +813,9 @@ class EngineArgs:
             max_num_seqs=self.max_num_seqs,
             speculative_config=speculative_cfg,
             max_num_batched_tokens=self.max_num_batched_tokens,
-            pod_ips=self.pod_ips,
+            dist_init_ip=self.dist_init_ip,
+            nnodes=self.nnodes,
+            node_rank=self.node_rank,
             use_warmup=self.use_warmup,
             engine_worker_queue_port=self.engine_worker_queue_port,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
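
Note: a minimal sketch of how the three new flags replace the old comma-separated --pod-ips list. The argparse setup below only mirrors the arguments added in this hunk; the IP address and node counts are example values, not anything prescribed by the commit.

import argparse

parser = argparse.ArgumentParser()
system_group = parser.add_argument_group("System Configuration")
system_group.add_argument("--dist-init-ip", default=None,
                          help="IP address of the master node.")
system_group.add_argument("--nnodes", type=int, default=1,
                          help="Total number of nodes.")
system_group.add_argument("--node-rank", type=int, default=0,
                          help="Rank of this node, in [0, nnodes).")

# What the master (rank 0) and a second node (rank 1) of a two-node
# deployment would pass instead of the old --pod-ips ip0,ip1:
master = parser.parse_args(["--dist-init-ip", "192.168.0.1", "--nnodes", "2", "--node-rank", "0"])
worker = parser.parse_args(["--dist-init-ip", "192.168.0.1", "--nnodes", "2", "--node-rank", "1"])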

View File

@@ -6,7 +6,7 @@
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
-#
+#dist_init_ip
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,7 +24,7 @@ from fastdeploy import envs
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip,
-                              is_port_available, llm_logger)
+                              is_port_available, get_random_port, llm_logger)

 TaskOption = Literal["generate"]
@@ -642,7 +642,9 @@ class Config:
                  max_model_len: int = 8192,
                  max_num_seqs: int = 8,
                  max_num_batched_tokens: Optional[int] = None,
-                 pod_ips: Optional[List[str]] = None,
+                 dist_init_ip: str = None,
+                 nnodes: int = 1,
+                 node_rank: int = 0,
                  speculative_config: Optional[Dict[str, Any]] = None,
                  graph_optimization_config: Optional[Dict[str, Any]] = None,
                  use_warmup: bool = False,
@@ -675,7 +677,6 @@ class Config:
             max_model_len (int): Maximum model length. Default is 8192.
             max_num_seqs (int): Maximum number of sequences. Default is 8.
             max_num_batched_tokens (Optional[int]): Maximum number of batched tokens. Default is None.
-            pod_ips (Optional[List[str]]): List of POD IPs. Default is None.
             mm_processor_kwargs (Optional[Dict[str, Any]]): Additional arguments for multi-modal processor. Default is None.
             speculative_config (Optional[Dict[str, Any]]): Speculative execution configuration. Default is None.
             graph_optimization_config (Optional[Dict[str, Any]]): Graph optimizaion backend execution configuration. Default is None.
@@ -699,7 +700,16 @@ class Config:
         self.tokenizer = tokenizer
         self.max_num_batched_tokens = max_num_batched_tokens
         self.tensor_parallel_size = tensor_parallel_size
-        self.pod_ips = pod_ips
+        self.dist_init_ip = dist_init_ip
+        self.nnode = nnodes
+        self.node_rank = node_rank
+        if self.dist_init_ip is None:
+            self.master_ip = "0.0.0.0"
+        else:
+            self.master_ip = self.dist_init_ip
+            self.dist_init_addr = f"{self.dist_init_ip}:{get_random_port()}"
+
         self.max_model_len = max_model_len
         self.max_num_seqs = max_num_seqs
         self.limit_mm_per_prompt = limit_mm_per_prompt
@@ -716,14 +726,8 @@ class Config:
         self.graph_optimization_config = graph_optimization_config
         self.guided_decoding_backend = guided_decoding_backend
         self.disable_any_whitespace = disable_any_whitespace
-        self.is_master = True
         self._str_to_list("innode_prefill_ports", int)
-        self._str_to_list("pod_ips", str)
-
-        if self.pod_ips is None:
-            self.nnode = 1
-        else:
-            self.nnode = len(self.pod_ips)

         assert self.splitwise_role in ["mixed", "prefill", "decode"]
@@ -778,9 +782,9 @@
         self.host_ip = get_host_ip()

-        if self.pod_ips is None:
-            self.pod_ips = ["0.0.0.0"]
-        elif self.host_ip != self.pod_ips[0]:
+        if self.dist_init_ip is None or self.host_ip == self.master_ip:
+            self.is_master = True
+        else:
             self.is_master = False

         import paddle
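
Note: a condensed sketch of the master-resolution logic these hunks introduce. resolve_master is a hypothetical helper written only for illustration; in the actual change the same branching lives inline in Config.__init__ and in the host-IP check above. The IPs are example values.

from typing import Optional, Tuple

def resolve_master(dist_init_ip: Optional[str], host_ip: str) -> Tuple[str, bool]:
    # No --dist-init-ip: single-node default, bind to 0.0.0.0 and act as master.
    master_ip = "0.0.0.0" if dist_init_ip is None else dist_init_ip
    is_master = dist_init_ip is None or host_ip == master_ip
    return master_ip, is_master

print(resolve_master(None, "192.168.0.2"))           # ('0.0.0.0', True)
print(resolve_master("192.168.0.1", "192.168.0.1"))  # ('192.168.0.1', True)  master node
print(resolve_master("192.168.0.1", "192.168.0.2"))  # ('192.168.0.1', False) worker node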

View File

@@ -174,7 +174,7 @@ class LLMEngine(object):
             cache_config=self.cfg.cache_config,
             tensor_parallel_size=self.cfg.tensor_parallel_size,
             device_ids=device_ids,
-            pod_ip=self.cfg.pod_ips[0],
+            pod_ip=self.cfg.master_ip,
             engine_worker_queue_port=self.cfg.engine_worker_queue_port,
             pid_suffix=self.ipc_signal_suffix)
@@ -239,11 +239,12 @@ class LLMEngine(object):
         if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1:
             self.dp_processed = []
-            for i in range(1, self.cfg.parallel_config.data_parallel_size):
+            for i in range(1, self.cfg.parallel_config.data_parallel_size // self.cfg.nnode):
                 time.sleep(1)
                 self.dp_processed.append(
                     multiprocessing.Process(target=start_expert_service,
-                                            args=(self.cfg, i,
+                                            args=(self.cfg,
+                                                  i + self.cfg.node_rank * self.cfg.worker_num_per_node,
                                                   self.ipc_signal_suffix)))
                 llm_logger.info(f"Engine is initialized successfully with {self.cfg.tensor_parallel_size}" \
                     + " data parallel id {}".format(i))
@@ -1007,8 +1008,6 @@ class LLMEngine(object):
         )

         arguments = (
-            f" --nnodes {str(self.cfg.nnode)}"
-            f" --ips {','.join(self.cfg.pod_ips)}"
             f" --devices {self.cfg.device_ids} {py_script}"
             f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
@@ -1016,7 +1015,7 @@ class LLMEngine(object):
             f" --device_ids {self.cfg.device_ids}"
             f" --tensor_parallel_size {self.cfg.tensor_parallel_size}"
             f" --engine_worker_queue_port {str(self.cfg.engine_worker_queue_port)}"
-            f" --pod_ip {self.cfg.pod_ips[0]}"
+            f" --pod_ip {self.cfg.master_ip}"
             f" --total_block_num {self.cfg.cache_config.total_block_num}"
             f" --block_size {self.cfg.cache_config.block_size}"
             f" --enc_dec_block_num {self.cfg.cache_config.enc_dec_block_num}"
@@ -1057,7 +1056,11 @@ class LLMEngine(object):
             if value:
                 arguments = arguments + f" --{worker_flag}"
         if self.cfg.nnode > 1:
-            pd_cmd = pd_cmd + f" --ips {self.cfg.ips}"
+            pd_cmd = pd_cmd + (
+                f" --master {self.cfg.dist_init_addr}"
+                f" --nnodes {str(self.cfg.nnode)}"
+                f" --rank {str(self.cfg.node_rank)}"
+            )
         pd_cmd = pd_cmd + arguments + f" 2>{log_dir}/launch_worker.log"
         llm_logger.info("Launch worker service command: {}".format(pd_cmd))
         p = subprocess.Popen(
@@ -1158,7 +1161,7 @@ class LLMEngine(object):
             cache_config=self.cfg.cache_config,
             tensor_parallel_size=self.cfg.tensor_parallel_size,
             device_ids=device_ids,
-            pod_ip=self.cfg.pod_ips[0],
+            pod_ip=self.cfg.master_ip,
             engine_worker_queue_port=self.cfg.engine_worker_queue_port,
             pid_suffix=self.ipc_signal_suffix)

     def check_health(self, time_interval_threashold=30):
@@ -1245,8 +1248,9 @@ class LLMEngine(object):
         """
         start queue service for engine worker communication
         """
-        address = (self.cfg.pod_ips[0], self.cfg.engine_worker_queue_port)
-        if self.cfg.host_ip == self.cfg.pod_ips[0] or self.cfg.pod_ips[0] == "0.0.0.0":
+        address = (self.cfg.master_ip, self.cfg.engine_worker_queue_port)
+        if self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0":
+            llm_logger.info(f"Starting engine worker queue server service at {address}")
             self.engine_worker_queue_server = EngineWorkerQueue(
                 address=address,
                 is_server=True,
@@ -1256,7 +1260,7 @@ class LLMEngine(object):
         if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != 'mixed':
             self.cache_task_queue = EngineCacheQueue(
-                address=(self.cfg.pod_ips[0], self.cfg.cache_config.cache_queue_port),
+                address=(self.cfg.master_ip, self.cfg.cache_config.cache_queue_port),
                 authkey=b'cache_queue_service',
                 is_server=True,
                 num_client=self.cfg.tensor_parallel_size,
@@ -1270,4 +1274,6 @@ class LLMEngine(object):
             is_server=False,
             num_client=self.cfg.tensor_parallel_size,
             client_id=0,
-            local_data_parallel_id=0)
+            local_data_parallel_size=self.cfg.parallel_config.data_parallel_size,
+            local_data_parallel_id=min(self.cfg.worker_num_per_node * self.cfg.node_rank,
+                                       self.cfg.parallel_config.data_parallel_size - 1))
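
Note: a worked example of the per-node data-parallel id mapping used in the expert-service loop above, with assumed sizes (data_parallel_size=16, nnodes=2, worker_num_per_node=8); these numbers are illustrative only.

data_parallel_size, nnodes, worker_num_per_node = 16, 2, 8

for node_rank in range(nnodes):
    # Each node spawns expert services for local indices 1..dp_size//nnodes - 1
    # and offsets them into a global id by node_rank * worker_num_per_node.
    local_ids = range(1, data_parallel_size // nnodes)
    global_ids = [i + node_rank * worker_num_per_node for i in local_ids]
    print(node_rank, global_ids)
# 0 [1, 2, 3, 4, 5, 6, 7]
# 1 [9, 10, 11, 12, 13, 14, 15]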

View File

@@ -49,8 +49,8 @@ class ExpertService(object):
             cfg (Config): Config object containing all the configuration parameters.
         """
         self.cfg = cfg
-        start_pos = local_data_parallel_id * self.cfg.tensor_parallel_size
-        end_pos = (local_data_parallel_id + 1) * self.cfg.tensor_parallel_size
+        start_pos = (local_data_parallel_id * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node
+        end_pos = ((local_data_parallel_id + 1) * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node
         self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[
             start_pos:end_pos]
         self.cfg.local_device_ids = self.cfg.device_ids.split(
@@ -65,7 +65,7 @@ class ExpertService(object):
         self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id

-        address = (cfg.pod_ips[0], cfg.engine_worker_queue_port)
+        address = (cfg.master_ip, cfg.engine_worker_queue_port)
         self.engine_worker_queue = EngineWorkerQueue(
             address=address,
             is_server=False,
@@ -118,7 +118,7 @@ class ExpertService(object):
             cache_config=self.cfg.cache_config,
             tensor_parallel_size=self.cfg.tensor_parallel_size,
             device_ids=self.cfg.local_device_ids,
-            pod_ip=self.cfg.pod_ips[0],
+            pod_ip=self.cfg.master_ip,
             engine_worker_queue_port=self.cfg.engine_worker_queue_port,
             pid_suffix=f"{local_data_parallel_id}_{ipc_signal_suffix}"
         )

View File

@@ -85,7 +85,7 @@ class LLM:
         self.mutex = threading.Lock()
         self.req_output = dict()
-        self.master_node_ip = self.llm_engine.cfg.pod_ips[0]
+        self.master_node_ip = self.llm_engine.cfg.master_ip
         self._receive_output_thread = threading.Thread(
             target=self._receive_output, daemon=True)
         self._receive_output_thread.start()

View File

@@ -122,8 +122,8 @@ async def lifespan(app: FastAPI):
                                   args.mm_processor_kwargs, args.enable_mm,
                                   args.reasoning_parser)
     app.state.dynamic_load_weight = args.dynamic_load_weight
-    chat_handler = OpenAIServingChat(engine_client, pid, args.pod_ips)
-    completion_handler = OpenAIServingCompletion(engine_client, pid, args.pod_ips)
+    chat_handler = OpenAIServingChat(engine_client, pid, args.dist_init_ip)
+    completion_handler = OpenAIServingCompletion(engine_client, pid, args.dist_init_ip)
     engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
     engine_client.pid = pid
     app.state.engine_client = engine_client

View File

@@ -40,16 +40,16 @@ class OpenAIServingChat:
     OpenAI-style chat completions serving
     """

-    def __init__(self, engine_client, pid, pod_ips):
+    def __init__(self, engine_client, pid, dist_init_ip):
         self.engine_client = engine_client
         self.pid = pid
-        self.pod_ips = pod_ips
+        self.master_ip = dist_init_ip
         self.host_ip = get_host_ip()

     def _check_master(self):
-        if self.pod_ips is None:
+        if self.master_ip is None:
             return True
-        if self.host_ip == self.pod_ips[0]:
+        if self.host_ip == self.master_ip:
             return True
         return False

View File

@@ -45,16 +45,16 @@ from fastdeploy.engine.request import RequestOutput
 class OpenAIServingCompletion:

-    def __init__(self, engine_client, pid, pod_ips):
+    def __init__(self, engine_client, pid, dist_init_ip):
         self.engine_client = engine_client
         self.pid = pid
-        self.pod_ips = pod_ips
+        self.master_ip = dist_init_ip
         self.host_ip = get_host_ip()

     def _check_master(self):
-        if self.pod_ips is None:
+        if self.master_ip is None:
             return True
-        if self.host_ip == self.pod_ips[0]:
+        if self.host_ip == self.master_ip:
             return True
         return False

View File

@@ -27,7 +27,8 @@ from datetime import datetime
 from logging.handlers import BaseRotatingHandler
 from pathlib import Path
 from typing import Literal, TypeVar, Union
+import random
+import socket

 import requests
 import yaml
 from aistudio_sdk.snapshot_download import snapshot_download
@@ -421,6 +422,19 @@ def get_host_ip():
     return ip


+def get_random_port():
+    while True:
+        port = random.randint(49152, 65535)
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            try:
+                s.bind(("0.0.0.0", port))
+                return port
+            except OSError:
+                continue
+
+
 def is_port_available(host, port):
     """
     Check the port is available
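
Note: the new get_random_port helper is what Config uses to build dist_init_addr (see the config.py hunk above). A brief usage sketch; the IP below is an example value.

from fastdeploy.utils import get_random_port

dist_init_ip = "192.168.0.1"  # example master IP
dist_init_addr = f"{dist_init_ip}:{get_random_port()}"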

View File

@@ -23,6 +23,7 @@ import pynvml
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
+from fastdeploy.platforms import current_platform
 from fastdeploy.utils import get_logger
 from fastdeploy.worker.gpu_model_runner import GPUModelRunner
 from fastdeploy.worker.output import ModelRunnerOutput
@@ -50,11 +51,12 @@ class GpuWorker(WorkerBase):
         """
         Initialize device and construct model runner
         """
+        self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
         if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda(
         ):
             # Set evironment variable
             self.device_ids = self.parallel_config.device_ids.split(",")
-            self.device = f"gpu:{self.local_rank}"
+            self.device = f"gpu:{self.local_rank % self.max_chips_per_node}"
             paddle.device.set_device(self.device)
             paddle.set_default_dtype(self.parallel_config.dtype)
@@ -72,7 +74,7 @@ class GpuWorker(WorkerBase):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             fd_config=self.fd_config,
             device=self.device,
-            device_id=self.device_ids[self.local_rank],
+            device_id=self.device_ids[self.local_rank % self.max_chips_per_node],
             rank=self.rank,
             local_rank=self.local_rank)
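
Note: a small worked example of the device mapping above. On a multinode run the worker's local_rank can exceed the GPU count of a single machine, so it is folded back with the modulo; the node and GPU counts here are assumed for illustration.

max_chips_per_node = 8  # 16 on Iluvatar hardware, per the hunk above
for local_rank in (0, 7, 8, 15):
    print(local_rank, f"gpu:{local_rank % max_chips_per_node}")
# 0 gpu:0
# 7 gpu:7
# 8 gpu:0
# 15 gpu:7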

View File

@@ -136,9 +136,9 @@ class PaddleDisWorkerProc():
             model_weights_status:
         """
         # init worker_ready_signal
-        max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
+        self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
         array_size = min(
-            max_chips_per_node, self.parallel_config.tensor_parallel_size *
+            self.max_chips_per_node, self.parallel_config.tensor_parallel_size *
             self.parallel_config.expert_parallel_size)
         workers_ready = np.zeros(shape=[array_size], dtype=np.int32)
         self.worker_ready_signal = IPCSignal(
@@ -148,10 +148,10 @@ class PaddleDisWorkerProc():
             suffix=self.parallel_config.engine_pid,
             create=False)
         self.worker_ready_signal.value[self.local_rank %
-                                       max_chips_per_node] = 1
+                                       self.max_chips_per_node] = 1

         # init worker_healthy_live_signal
-        workers_alive = np.zeros(shape=[self.ranks], dtype=np.int32)
+        workers_alive = np.zeros(shape=[array_size], dtype=np.int32)
         self.worker_healthy_live_signal = IPCSignal(
             name="worker_healthy_live_signal",
             array=workers_alive,
@@ -205,7 +205,7 @@ class PaddleDisWorkerProc():
         Tmp loop function for ep utill DP is supported
         """
         while True:
-            self.worker_healthy_live_signal.value[self.local_rank] = int(
+            self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(
                 time.time())
             if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks(