Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 00:33:03 +08:00
[LLM] Update Multinode Deployment (#2830)
* [LLM] fix multinode bugs
* [LLM] update multinode deployment
* [LLM] update multinode deployment
* [LLM] update multinode deployment
* [LLM] update multinode deployment
* [LLM] update multinode deployment
* [LLM] fix ci bugs
* Update fastdeploy/engine/args_utils.py
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* [LLM] update random port
* [LLM] update random port
* [LLM] fix ci bugs
* fix ci bugs
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
@@ -124,9 +124,19 @@ class EngineArgs:
     Ratio of tokens to process in a block.
     """
 
-    pod_ips: Optional[List[str]] = None
+    dist_init_ip: Optional[str] = None
     """
-    List of IP addresses for nodes in the cluster.
+    The master node ip of multinode deployment
     """
+
+    nnodes: int = 1
+    """
+    The number of nodes in multinode deployment
+    """
+
+    node_rank: int = 0
+    """
+    The rank of the current node in multinode deployment
+    """
 
     swap_space: float = None
@@ -485,11 +495,25 @@ class EngineArgs:
         # Cluster system parameters group
         system_group = parser.add_argument_group("System Configuration")
         system_group.add_argument(
-            "--pod-ips",
-            type=lambda s: s.split(",") if s else None,
-            default=EngineArgs.pod_ips,
+            "--dist-init-ip",
+            default=EngineArgs.dist_init_ip,
             help=
-            "List of IP addresses for nodes in the cluster (comma-separated).")
+            "IP addresses of master node.")
+
+        system_group.add_argument(
+            "--nnodes",
+            type=int,
+            default=EngineArgs.nnodes,
+            help=
+            "The number of all nodes.")
+
+        system_group.add_argument(
+            "--node-rank",
+            type=int,
+            default=EngineArgs.node_rank,
+            help=
+            "node rank id (range [0, nnodes)).")
 
         # Performance tuning parameters group
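
To make the new knobs concrete, here is a hedged sketch of a two-node configuration expressed through the updated dataclass. The model path and IP are placeholders, and passing `model` this way is an assumption about the other EngineArgs fields; only the three fields introduced by this commit are shown beyond it.

from fastdeploy.engine.args_utils import EngineArgs

# Node 0 (master) of a hypothetical two-node cluster.
master = EngineArgs(
    model="/models/demo",          # placeholder path, assumed field
    dist_init_ip="192.168.1.10",   # reachable IP of the master node
    nnodes=2,
    node_rank=0,
)

# The second node reuses the master IP and only changes its rank.
worker = EngineArgs(
    model="/models/demo",
    dist_init_ip="192.168.1.10",
    nnodes=2,
    node_rank=1,
)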
@@ -789,7 +813,9 @@ class EngineArgs:
             max_num_seqs=self.max_num_seqs,
             speculative_config=speculative_cfg,
             max_num_batched_tokens=self.max_num_batched_tokens,
-            pod_ips=self.pod_ips,
+            dist_init_ip=self.dist_init_ip,
+            nnodes=self.nnodes,
+            node_rank=self.node_rank,
             use_warmup=self.use_warmup,
             engine_worker_queue_port=self.engine_worker_queue_port,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
@@ -6,7 +6,7 @@
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+#dist_init_ip
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,7 +24,7 @@ from fastdeploy import envs
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip,
-                              is_port_available, llm_logger)
+                              is_port_available, get_random_port, llm_logger)
 
 TaskOption = Literal["generate"]
 
@@ -642,7 +642,9 @@ class Config:
                  max_model_len: int = 8192,
                  max_num_seqs: int = 8,
                  max_num_batched_tokens: Optional[int] = None,
-                 pod_ips: Optional[List[str]] = None,
+                 dist_init_ip: str = None,
+                 nnodes: int = 1,
+                 node_rank: int = 0,
                  speculative_config: Optional[Dict[str, Any]] = None,
                  graph_optimization_config: Optional[Dict[str, Any]] = None,
                  use_warmup: bool = False,
@@ -675,7 +677,6 @@ class Config:
             max_model_len (int): Maximum model length. Default is 8192.
             max_num_seqs (int): Maximum number of sequences. Default is 8.
             max_num_batched_tokens (Optional[int]): Maximum number of batched tokens. Default is None.
-            pod_ips (Optional[List[str]]): List of POD IPs. Default is None.
             mm_processor_kwargs (Optional[Dict[str, Any]]): Additional arguments for multi-modal processor. Default is None.
             speculative_config (Optional[Dict[str, Any]]): Speculative execution configuration. Default is None.
             graph_optimization_config (Optional[Dict[str, Any]]): Graph optimizaion backend execution configuration. Default is None.
@@ -699,7 +700,16 @@ class Config:
         self.tokenizer = tokenizer
         self.max_num_batched_tokens = max_num_batched_tokens
         self.tensor_parallel_size = tensor_parallel_size
-        self.pod_ips = pod_ips
+        self.dist_init_ip = dist_init_ip
+
+        self.nnode = nnodes
+        self.node_rank = node_rank
+        if self.dist_init_ip is None:
+            self.master_ip = "0.0.0.0"
+        else:
+            self.master_ip = self.dist_init_ip
+            self.dist_init_addr = f"{self.dist_init_ip}:{get_random_port()}"
+
         self.max_model_len = max_model_len
         self.max_num_seqs = max_num_seqs
         self.limit_mm_per_prompt = limit_mm_per_prompt
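
In effect, a single-node run (no dist_init_ip) binds everything to 0.0.0.0, while a multinode run rendezvouses at the master IP plus a freshly probed port. A minimal standalone sketch of that resolution, with an illustrative port:

def resolve_master(dist_init_ip, get_random_port=lambda: 53117):
    # Sketch of the constructor branch above; the default port is illustrative.
    if dist_init_ip is None:
        return "0.0.0.0", None            # single node: no rendezvous address
    return dist_init_ip, f"{dist_init_ip}:{get_random_port()}"

print(resolve_master(None))               # ('0.0.0.0', None)
print(resolve_master("192.168.1.10"))     # ('192.168.1.10', '192.168.1.10:53117')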
@@ -716,14 +726,8 @@ class Config:
         self.graph_optimization_config = graph_optimization_config
         self.guided_decoding_backend = guided_decoding_backend
         self.disable_any_whitespace = disable_any_whitespace
-        self.is_master = True
         self._str_to_list("innode_prefill_ports", int)
-        self._str_to_list("pod_ips", str)
-
-        if self.pod_ips is None:
-            self.nnode = 1
-        else:
-            self.nnode = len(self.pod_ips)
 
         assert self.splitwise_role in ["mixed", "prefill", "decode"]
 
@@ -778,9 +782,9 @@ class Config:
 
         self.host_ip = get_host_ip()
 
-        if self.pod_ips is None:
-            self.pod_ips = ["0.0.0.0"]
-        elif self.host_ip != self.pod_ips[0]:
+        if self.dist_init_ip is None or self.host_ip == self.master_ip:
+            self.is_master = True
+        else:
             self.is_master = False
 
         import paddle
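
The master test therefore reduces to a single predicate. A sketch with illustrative addresses:

def compute_is_master(dist_init_ip, host_ip, master_ip):
    # Mirrors the rewritten branch above (sketch, not the actual method).
    return dist_init_ip is None or host_ip == master_ip

assert compute_is_master(None, "10.0.0.5", "0.0.0.0")                      # single node
assert compute_is_master("192.168.1.10", "192.168.1.10", "192.168.1.10")   # master host
assert not compute_is_master("192.168.1.10", "192.168.1.11", "192.168.1.10")  # worker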
@@ -174,7 +174,7 @@ class LLMEngine(object):
             cache_config=self.cfg.cache_config,
             tensor_parallel_size=self.cfg.tensor_parallel_size,
             device_ids=device_ids,
-            pod_ip=self.cfg.pod_ips[0],
+            pod_ip=self.cfg.master_ip,
             engine_worker_queue_port=self.cfg.engine_worker_queue_port,
             pid_suffix=self.ipc_signal_suffix)
 
@@ -239,11 +239,12 @@ class LLMEngine(object):
 
         if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1:
             self.dp_processed = []
-            for i in range(1, self.cfg.parallel_config.data_parallel_size):
+            for i in range(1, self.cfg.parallel_config.data_parallel_size // self.cfg.nnode):
                 time.sleep(1)
                 self.dp_processed.append(
                     multiprocessing.Process(target=start_expert_service,
-                                            args=(self.cfg, i,
+                                            args=(self.cfg,
+                                                  i + self.cfg.node_rank * self.cfg.worker_num_per_node,
                                                   self.ipc_signal_suffix)))
                 llm_logger.info(f"Engine is initialized successfully with {self.cfg.tensor_parallel_size}" \
                     + " data parallel id {}".format(i))
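
Each node now forks only its local share of expert services and shifts the data-parallel id by node_rank * worker_num_per_node. A sketch of the resulting ids, under the assumption (for illustration only) that worker_num_per_node equals data_parallel_size // nnode:

data_parallel_size, nnode = 8, 2
worker_num_per_node = data_parallel_size // nnode  # assumption for this sketch

for node_rank in range(nnode):
    global_ids = [i + node_rank * worker_num_per_node
                  for i in range(1, data_parallel_size // nnode)]
    print(node_rank, global_ids)  # node 0 -> [1, 2, 3]; node 1 -> [5, 6, 7]

Ids 0 and 4 are absent here because, per the queue wiring later in this commit, each node's main engine process appears to claim the first data-parallel slot on that node.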
@@ -1007,8 +1008,6 @@ class LLMEngine(object):
         )
 
         arguments = (
-            f" --nnodes {str(self.cfg.nnode)}"
-            f" --ips {','.join(self.cfg.pod_ips)}"
             f" --devices {self.cfg.device_ids} {py_script}"
             f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
@@ -1016,7 +1015,7 @@ class LLMEngine(object):
             f" --device_ids {self.cfg.device_ids}"
             f" --tensor_parallel_size {self.cfg.tensor_parallel_size}"
             f" --engine_worker_queue_port {str(self.cfg.engine_worker_queue_port)}"
-            f" --pod_ip {self.cfg.pod_ips[0]}"
+            f" --pod_ip {self.cfg.master_ip}"
             f" --total_block_num {self.cfg.cache_config.total_block_num}"
             f" --block_size {self.cfg.cache_config.block_size}"
             f" --enc_dec_block_num {self.cfg.cache_config.enc_dec_block_num}"
@@ -1057,7 +1056,11 @@ class LLMEngine(object):
             if value:
                 arguments = arguments + f" --{worker_flag}"
         if self.cfg.nnode > 1:
-            pd_cmd = pd_cmd + f" --ips {self.cfg.ips}"
+            pd_cmd = pd_cmd + (
+                f" --master {self.cfg.dist_init_addr}"
+                f" --nnodes {str(self.cfg.nnode)}"
+                f" --rank {str(self.cfg.node_rank)}"
+            )
         pd_cmd = pd_cmd + arguments + f" 2>{log_dir}/launch_worker.log"
         llm_logger.info("Launch worker service command: {}".format(pd_cmd))
         p = subprocess.Popen(
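
For a multinode run the launcher thus appends an explicit rendezvous (master address, node count, node rank) instead of the old --ips list. Composing the fragment with illustrative values:

dist_init_addr = "192.168.1.10:53117"   # master ip + a get_random_port() result
nnode, node_rank = 2, 1
fragment = (f" --master {dist_init_addr}"
            f" --nnodes {nnode}"
            f" --rank {node_rank}")
print(fragment)  # " --master 192.168.1.10:53117 --nnodes 2 --rank 1"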
@@ -1158,7 +1161,7 @@ class LLMEngine(object):
             cache_config=self.cfg.cache_config,
             tensor_parallel_size=self.cfg.tensor_parallel_size,
             device_ids=device_ids,
-            pod_ip=self.cfg.pod_ips[0],
+            pod_ip=self.cfg.master_ip,
             engine_worker_queue_port=self.cfg.engine_worker_queue_port,
             pid_suffix=self.ipc_signal_suffix)
 
     def check_health(self, time_interval_threashold=30):
@@ -1245,8 +1248,9 @@ class LLMEngine(object):
         """
         start queue service for engine worker communication
         """
-        address = (self.cfg.pod_ips[0], self.cfg.engine_worker_queue_port)
-        if self.cfg.host_ip == self.cfg.pod_ips[0] or self.cfg.pod_ips[0] == "0.0.0.0":
+        address = (self.cfg.master_ip, self.cfg.engine_worker_queue_port)
+        if self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0":
+            llm_logger.info(f"Starting engine worker queue server service at {address}")
             self.engine_worker_queue_server = EngineWorkerQueue(
                 address=address,
                 is_server=True,
@@ -1256,7 +1260,7 @@ class LLMEngine(object):
 
             if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != 'mixed':
                 self.cache_task_queue = EngineCacheQueue(
-                    address=(self.cfg.pod_ips[0], self.cfg.cache_config.cache_queue_port),
+                    address=(self.cfg.master_ip, self.cfg.cache_config.cache_queue_port),
                     authkey=b'cache_queue_service',
                     is_server=True,
                     num_client=self.cfg.tensor_parallel_size,
@@ -1270,4 +1274,6 @@ class LLMEngine(object):
             is_server=False,
             num_client=self.cfg.tensor_parallel_size,
             client_id=0,
-            local_data_parallel_id=0)
+            local_data_parallel_size=self.cfg.parallel_config.data_parallel_size,
+            local_data_parallel_id=min(self.cfg.worker_num_per_node * self.cfg.node_rank,
+                                       self.cfg.parallel_config.data_parallel_size - 1))
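
The client-side queue id is now derived from the node rank and clamped so it never exceeds the data-parallel world size. With illustrative sizes:

worker_num_per_node, data_parallel_size = 4, 8
for node_rank in range(3):
    dp_id = min(worker_num_per_node * node_rank, data_parallel_size - 1)
    print(node_rank, dp_id)  # 0 -> 0, 1 -> 4, 2 -> 7 (clamped)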
@@ -49,8 +49,8 @@ class ExpertService(object):
             cfg (Config): Config object containing all the configuration parameters.
         """
         self.cfg = cfg
-        start_pos = local_data_parallel_id * self.cfg.tensor_parallel_size
-        end_pos = (local_data_parallel_id + 1) * self.cfg.tensor_parallel_size
+        start_pos = (local_data_parallel_id * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node
+        end_pos = ((local_data_parallel_id + 1) * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node
         self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[
             start_pos:end_pos]
         self.cfg.local_device_ids = self.cfg.device_ids.split(
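
Taking the slice bounds modulo worker_num_per_node converts a global data-parallel id into node-local port offsets. With illustrative sizes (tensor_parallel_size=2, eight workers per node), ids hosted on a second node map back to the front of that node's port list:

tensor_parallel_size, worker_num_per_node = 2, 8  # illustrative sizes
for dp_id in (4, 5, 6):  # global ids hosted on the second node
    start = (dp_id * tensor_parallel_size) % worker_num_per_node
    end = ((dp_id + 1) * tensor_parallel_size) % worker_num_per_node
    print(dp_id, (start, end))  # 4 -> (0, 2), 5 -> (2, 4), 6 -> (4, 6)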
@@ -65,7 +65,7 @@ class ExpertService(object):
 
         self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id
 
-        address = (cfg.pod_ips[0], cfg.engine_worker_queue_port)
+        address = (cfg.master_ip, cfg.engine_worker_queue_port)
         self.engine_worker_queue = EngineWorkerQueue(
             address=address,
             is_server=False,
@@ -118,7 +118,7 @@ class ExpertService(object):
             cache_config=self.cfg.cache_config,
             tensor_parallel_size=self.cfg.tensor_parallel_size,
             device_ids=self.cfg.local_device_ids,
-            pod_ip=self.cfg.pod_ips[0],
+            pod_ip=self.cfg.master_ip,
             engine_worker_queue_port=self.cfg.engine_worker_queue_port,
             pid_suffix=f"{local_data_parallel_id}_{ipc_signal_suffix}"
         )
@@ -85,7 +85,7 @@ class LLM:
 
         self.mutex = threading.Lock()
         self.req_output = dict()
-        self.master_node_ip = self.llm_engine.cfg.pod_ips[0]
+        self.master_node_ip = self.llm_engine.cfg.master_ip
         self._receive_output_thread = threading.Thread(
             target=self._receive_output, daemon=True)
         self._receive_output_thread.start()
@@ -122,8 +122,8 @@ async def lifespan(app: FastAPI):
                                  args.mm_processor_kwargs, args.enable_mm,
                                  args.reasoning_parser)
     app.state.dynamic_load_weight = args.dynamic_load_weight
-    chat_handler = OpenAIServingChat(engine_client, pid, args.pod_ips)
-    completion_handler = OpenAIServingCompletion(engine_client, pid, args.pod_ips)
+    chat_handler = OpenAIServingChat(engine_client, pid, args.dist_init_ip)
+    completion_handler = OpenAIServingCompletion(engine_client, pid, args.dist_init_ip)
     engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
     engine_client.pid = pid
     app.state.engine_client = engine_client
|
@@ -40,16 +40,16 @@ class OpenAIServingChat:
|
|||||||
OpenAI-style chat completions serving
|
OpenAI-style chat completions serving
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, engine_client, pid, pod_ips):
|
def __init__(self, engine_client, pid, dist_init_ip):
|
||||||
self.engine_client = engine_client
|
self.engine_client = engine_client
|
||||||
self.pid = pid
|
self.pid = pid
|
||||||
self.pod_ips = pod_ips
|
self.master_ip = dist_init_ip
|
||||||
self.host_ip = get_host_ip()
|
self.host_ip = get_host_ip()
|
||||||
|
|
||||||
def _check_master(self):
|
def _check_master(self):
|
||||||
if self.pod_ips is None:
|
if self.master_ip is None:
|
||||||
return True
|
return True
|
||||||
if self.host_ip == self.pod_ips[0]:
|
if self.host_ip == self.master_ip:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
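
Both serving handlers now gate requests on a single master IP instead of the head of a pod list; condensed, the check amounts to:

def check_master(master_ip, host_ip):
    # Condensed sketch of _check_master() above.
    if master_ip is None:        # single-node deployment: every host serves
        return True
    return host_ip == master_ip  # only the master node accepts API requests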
@@ -45,16 +45,16 @@ from fastdeploy.engine.request import RequestOutput
 
 
 class OpenAIServingCompletion:
-    def __init__(self, engine_client, pid, pod_ips):
+    def __init__(self, engine_client, pid, dist_init_ip):
         self.engine_client = engine_client
         self.pid = pid
-        self.pod_ips = pod_ips
+        self.master_ip = dist_init_ip
         self.host_ip = get_host_ip()
 
     def _check_master(self):
-        if self.pod_ips is None:
+        if self.master_ip is None:
             return True
-        if self.host_ip == self.pod_ips[0]:
+        if self.host_ip == self.master_ip:
             return True
         return False
 
|
@@ -27,7 +27,8 @@ from datetime import datetime
|
|||||||
from logging.handlers import BaseRotatingHandler
|
from logging.handlers import BaseRotatingHandler
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Literal, TypeVar, Union
|
from typing import Literal, TypeVar, Union
|
||||||
|
import random
|
||||||
|
import socket
|
||||||
import requests
|
import requests
|
||||||
import yaml
|
import yaml
|
||||||
from aistudio_sdk.snapshot_download import snapshot_download
|
from aistudio_sdk.snapshot_download import snapshot_download
|
||||||
@@ -421,6 +422,19 @@ def get_host_ip():
     return ip
 
 
+def get_random_port():
+    while True:
+        port = random.randint(49152, 65535)
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            try:
+                s.bind(("0.0.0.0", port))
+                return port
+            except OSError:
+                continue
+
+
 def is_port_available(host, port):
     """
     Check the port is available
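
The new helper probes the IANA dynamic range until a bind succeeds. A typical use is reserving a rendezvous port on the master node (address illustrative):

from fastdeploy.utils import get_random_port

port = get_random_port()                  # e.g. 53117, from the dynamic range
dist_init_addr = f"192.168.1.10:{port}"   # illustrative master address

Note that the probe socket is closed as soon as the bind succeeds, so the port is only probably free; callers are expected to consume the address immediately, which keeps the check-then-use race window small.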
@@ -23,6 +23,7 @@ import pynvml
 
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
+from fastdeploy.platforms import current_platform
 from fastdeploy.utils import get_logger
 from fastdeploy.worker.gpu_model_runner import GPUModelRunner
 from fastdeploy.worker.output import ModelRunnerOutput
@@ -50,11 +51,12 @@ class GpuWorker(WorkerBase):
         """
         Initialize device and construct model runner
         """
+        self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
         if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda(
         ):
             # Set evironment variable
             self.device_ids = self.parallel_config.device_ids.split(",")
-            self.device = f"gpu:{self.local_rank}"
+            self.device = f"gpu:{self.local_rank % self.max_chips_per_node}"
             paddle.device.set_device(self.device)
             paddle.set_default_dtype(self.parallel_config.dtype)
 
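
Since local_rank can now exceed the per-node device count, the worker folds it back onto a physical device index. For example, on 8-GPU nodes (16 for Iluvatar, per the diff):

max_chips_per_node = 8
for local_rank in (0, 7, 8, 11, 15):
    print(local_rank, f"gpu:{local_rank % max_chips_per_node}")
# ranks 8..15 (the second node) reuse device indices 0..7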
@@ -72,7 +74,7 @@ class GpuWorker(WorkerBase):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             fd_config=self.fd_config,
             device=self.device,
-            device_id=self.device_ids[self.local_rank],
+            device_id=self.device_ids[self.local_rank % self.max_chips_per_node],
             rank=self.rank,
             local_rank=self.local_rank)
 
@@ -136,9 +136,9 @@ class PaddleDisWorkerProc():
             model_weights_status:
         """
         # init worker_ready_signal
-        max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
+        self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
         array_size = min(
-            max_chips_per_node, self.parallel_config.tensor_parallel_size *
+            self.max_chips_per_node, self.parallel_config.tensor_parallel_size *
             self.parallel_config.expert_parallel_size)
         workers_ready = np.zeros(shape=[array_size], dtype=np.int32)
         self.worker_ready_signal = IPCSignal(
self.worker_ready_signal = IPCSignal(
|
self.worker_ready_signal = IPCSignal(
|
||||||
@@ -148,10 +148,10 @@ class PaddleDisWorkerProc():
|
|||||||
suffix=self.parallel_config.engine_pid,
|
suffix=self.parallel_config.engine_pid,
|
||||||
create=False)
|
create=False)
|
||||||
self.worker_ready_signal.value[self.local_rank %
|
self.worker_ready_signal.value[self.local_rank %
|
||||||
max_chips_per_node] = 1
|
self.max_chips_per_node] = 1
|
||||||
|
|
||||||
# init worker_healthy_live_signal
|
# init worker_healthy_live_signal
|
||||||
workers_alive = np.zeros(shape=[self.ranks], dtype=np.int32)
|
workers_alive = np.zeros(shape=[array_size], dtype=np.int32)
|
||||||
self.worker_healthy_live_signal = IPCSignal(
|
self.worker_healthy_live_signal = IPCSignal(
|
||||||
name="worker_healthy_live_signal",
|
name="worker_healthy_live_signal",
|
||||||
array=workers_alive,
|
array=workers_alive,
|
||||||
@@ -205,7 +205,7 @@ class PaddleDisWorkerProc():
|
|||||||
Tmp loop function for ep utill DP is supported
|
Tmp loop function for ep utill DP is supported
|
||||||
"""
|
"""
|
||||||
while True:
|
while True:
|
||||||
self.worker_healthy_live_signal.value[self.local_rank] = int(
|
self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(
|
||||||
time.time())
|
time.time())
|
||||||
|
|
||||||
if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks(
|
if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks(
|
||||||
|