Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
[NewFeature] Support DP multi API server && fix some bugs in mixed EP && merge develop (#3598)
* [Feature] update ep
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix queue ports idx
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* Update engine.py
* fix ci
* fix some bug in mixed ep
* add server fix and op fix
* rm some log
* fix code style
* ltd fix
* fix
* fix
* fix some bug
* fix bug
* fix bug
* fix style
* Update config.py
* Update splitwise_connector.py
* Update cache_messager.py
* Update __init__.py
* merge and fix
* Update engine.py
* Update common_engine.py
* Update run_ci_xpu.sh
* Update ernie_processor.py
* Update ernie_processor.py

---------

Co-authored-by: ltd0924 <ltd0924@sina.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
@@ -95,7 +95,7 @@ PRETRAINED_INIT_CONFIGURATION = {
     "start_layer_index": 0,
     "moe_num_shared_experts": 0,
     "moe_layer_start_index": 0,
-    "num_max_dispatch_tokens_per_rank": 256,
+    "num_max_dispatch_tokens_per_rank": 128,
     "moe_use_aux_free": False,
     "vocab_size": -1,
     "hidden_dropout_prob": 0.0,
@@ -278,7 +278,7 @@ class ParallelConfig:
         # block size
         self.block_size: int = 64
         # Engine worker queue port
-        self.engine_worker_queue_port: int = 9923
+        self.engine_worker_queue_port: str = "9923"
         # Max model len
         self.max_model_len: int = 3072  # max_seq_len
         # cuda visible devices
@@ -307,7 +307,11 @@ class ParallelConfig:
         for key, value in args.items():
             if hasattr(self, key):
                 setattr(self, key, value)
 
+        if isinstance(self.engine_worker_queue_port, str):
+            self.engine_worker_queue_port = [int(port) for port in self.engine_worker_queue_port.split(",")]
+            logger.info(f"engine_worker_queue_port: {self.engine_worker_queue_port}")
+        elif isinstance(self.engine_worker_queue_port, int):
+            self.engine_worker_queue_port = [self.engine_worker_queue_port]
         # currently, the expert parallel size is equal data parallel size
         if self.enable_expert_parallel:
             self.expert_parallel_size = self.data_parallel_size * self.tensor_parallel_size
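For illustration, a minimal standalone sketch of the port normalization the new ParallelConfig branch performs (the helper name below is hypothetical; FastDeploy does this inline in __init__): a comma-separated string of per-rank engine worker queue ports becomes a list of ints, and a single int is wrapped into a one-element list.

```python
# Standalone sketch of the queue-port normalization added above (illustrative only).
from typing import List, Union

def normalize_queue_ports(port: Union[str, int]) -> List[int]:
    if isinstance(port, str):
        # e.g. "9923,9924,9925,9926" -> [9923, 9924, 9925, 9926]
        return [int(p) for p in port.split(",")]
    # a bare int becomes a single-element list
    return [port]

assert normalize_queue_ports("9923,9924") == [9923, 9924]
assert normalize_queue_ports(8002) == [8002]
```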
@@ -1038,7 +1042,7 @@ class FDConfig:
         max_num_batched_tokens: Optional[int] = None,
         ips: str = None,
         use_warmup: bool = False,
-        engine_worker_queue_port: int = 8002,
+        engine_worker_queue_port: str = "8002",
         limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
         splitwise_role: str = "mixed",
@@ -1082,11 +1086,10 @@ class FDConfig:
 
         if self.ips is None:
             self.master_ip = "0.0.0.0"
-        elif isinstance(self.ips, list):
-            self.master_ip = self.ips[0]
-        else:
+        elif isinstance(self.ips, str):
             self.ips = self.ips.split(",")
-            self.master_ip = self.ips[0]
+
+        self.host_ip = get_host_ip()
 
         if self.ips is None:
             self.nnode = 1
@@ -1095,7 +1098,7 @@ class FDConfig:
             self.nnode = len(self.ips)
 
             for idx, ip in enumerate(self.ips):
-                if ip == self.master_ip:
+                if ip == self.host_ip:
                     self.node_rank = idx
 
         self.max_model_len = max_model_len
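A minimal sketch of how the reworked multi-node bookkeeping behaves after these two hunks (the helper below is hypothetical; FDConfig does this inline): a comma-separated ips string is split into a list, nnode is its length, and node_rank is the index whose entry matches the local host IP rather than the master IP.

```python
# Illustrative sketch of the ips / nnode / node_rank bookkeeping (not FastDeploy code).
from typing import List, Optional, Tuple

def resolve_node_rank(ips: Optional[str], host_ip: str) -> Tuple[List[str], int, int]:
    if ips is None:
        return [], 1, 0  # single-node deployment
    ip_list = ips.split(",")
    nnode = len(ip_list)
    node_rank = 0
    for idx, ip in enumerate(ip_list):
        if ip == host_ip:  # compare against the local host IP, not master_ip
            node_rank = idx
    return ip_list, nnode, node_rank

# Node 10.0.0.2 in a two-node deployment gets node_rank 1.
assert resolve_node_rank("10.0.0.1,10.0.0.2", "10.0.0.2") == (["10.0.0.1", "10.0.0.2"], 2, 1)
```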
@@ -1111,7 +1114,11 @@ class FDConfig:
         self.reasoning_parser = reasoning_parser
         self.guided_decoding_backend = guided_decoding_backend
         self.disable_any_whitespace = disable_any_whitespace
+        self.engine_worker_queue_port = engine_worker_queue_port
         self._str_to_list("innode_prefill_ports", int)
+        if isinstance(engine_worker_queue_port, int):
+            self.engine_worker_queue_port = str(engine_worker_queue_port)
+        self._str_to_list("engine_worker_queue_port", str)
 
         if envs.FD_FOR_TORCH_MODEL_FORMAT:
             self.model_config.model_format = "torch"
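For illustration, a standalone sketch of what the new FDConfig handling does to engine_worker_queue_port for a few inputs (hypothetical helper; the real code mutates self.engine_worker_queue_port via _str_to_list): an int default is first stringified, then the comma-separated string is split into a list of per-rank port strings.

```python
# Standalone sketch of the FDConfig engine_worker_queue_port normalization (illustrative only).
from typing import List, Union

def normalize_fdconfig_port(engine_worker_queue_port: Union[int, str]) -> List[str]:
    if isinstance(engine_worker_queue_port, int):
        engine_worker_queue_port = str(engine_worker_queue_port)
    # mirrors self._str_to_list("engine_worker_queue_port", str)
    return [str(p) for p in engine_worker_queue_port.split(",")]

assert normalize_fdconfig_port(8002) == ["8002"]
assert normalize_fdconfig_port("8002,8003") == ["8002", "8003"]
```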
@@ -1129,10 +1136,11 @@ class FDConfig:
             self.worker_num_per_node = self.max_chips_per_node
             nnode = ceil_div(num_ranks, self.worker_num_per_node)
-            assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
+
+            # assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
         else:
             self.worker_num_per_node = num_ranks
 
-        self.engine_worker_queue_port = engine_worker_queue_port
         self.device_ids = ",".join([str(i) for i in range(self.worker_num_per_node)])
         self.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.device_ids)
         if current_platform.is_xpu():
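A small sketch of the device-ID bookkeeping shown above (the helper name is hypothetical): device IDs default to 0..worker_num_per_node-1 joined with commas, and an explicit CUDA_VISIBLE_DEVICES environment variable overrides that default.

```python
import os

# Illustrative sketch: default device_ids come from the per-node worker count,
# but CUDA_VISIBLE_DEVICES takes precedence when it is set.
def resolve_device_ids(worker_num_per_node: int) -> str:
    default_ids = ",".join(str(i) for i in range(worker_num_per_node))
    return os.getenv("CUDA_VISIBLE_DEVICES", default_ids)

print(resolve_device_ids(8))  # "0,1,2,3,4,5,6,7" unless CUDA_VISIBLE_DEVICES overrides it
```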
@@ -1155,15 +1163,12 @@ class FDConfig:
 
         self.local_device_ids = self.device_ids.split(",")[: self.parallel_config.tensor_parallel_size]
 
-        self.host_ip = get_host_ip()
-
-        if self.ips is None or self.host_ip == self.master_ip:
-            self.is_master = True
-        else:
-            self.is_master = False
+        if self.parallel_config.tensor_parallel_size <= self.worker_num_per_node:
+            self.is_master = True
+            self.master_ip = "0.0.0.0"
+        else:
+            self.is_master = False
+            self.master_ip = self.ips[0]
 
         self.paddle_commit_id = paddle.version.commit
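An illustrative sketch of the new master-selection rule (hypothetical helper; FDConfig sets these attributes inline): when the tensor-parallel group fits on one node, the node treats itself as master and binds to 0.0.0.0; otherwise the first entry of ips is taken as the master address.

```python
from typing import List, Optional, Tuple

# Sketch of the master-selection rule introduced above (illustrative only).
def select_master(tensor_parallel_size: int, worker_num_per_node: int,
                  ips: Optional[List[str]]) -> Tuple[bool, str]:
    if tensor_parallel_size <= worker_num_per_node:
        return True, "0.0.0.0"  # TP fits on a single node
    return False, ips[0]        # cross-node TP: first configured IP is the master

assert select_master(8, 8, None) == (True, "0.0.0.0")
assert select_master(16, 8, ["10.0.0.1", "10.0.0.2"]) == (False, "10.0.0.1")
```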
@@ -1345,10 +1350,12 @@ class FDConfig:
     def _str_to_list(self, attr_name, default_type):
         if hasattr(self, attr_name):
             val = getattr(self, attr_name)
+            if val is None:
+                return
             if type(val) is str:
                 setattr(self, attr_name, [default_type(i) for i in val.split(",")])
             else:
-                setattr(self, attr_name, val)
+                setattr(self, attr_name, [default_type(i) for i in val])
 
     def __str__(self) -> str:
         return json.dumps(self.__dict__, indent=4)
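A self-contained sketch of the updated _str_to_list semantics, rewritten as a free function for illustration: None now short-circuits, a comma-separated string is split and cast, and any other iterable has its elements cast to the target type instead of being stored unchanged.

```python
# Free-function sketch of the updated _str_to_list behavior (illustrative only).
def str_to_list(val, default_type):
    if val is None:
        return None  # new: None is left untouched
    if isinstance(val, str):
        return [default_type(i) for i in val.split(",")]
    return [default_type(i) for i in val]  # new: cast each element of a list/tuple

assert str_to_list(None, int) is None
assert str_to_list("8002,8003", int) == [8002, 8003]
assert str_to_list([8002, 8003], str) == ["8002", "8003"]
```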