[NewFeature] Support DP multi API server && Fix some bugs in mixed EP && merge develop (#3598)

* [Feature] update ep

* fix ci (×7)

* fix queue ports idx

* fix ci (×8)

* Update engine.py

* fix ci

* fix some bugs in mixed EP

* add server fix and op fix

* rm some log

* fix code style

* ltd fix

* fix (×2)

* fix some bugs

* fix bug

* fix bug

* fix style

* Update config.py

* Update splitwise_connector.py

* Update cache_messager.py

* Update __init__.py

* merge and fix

* Update engine.py

* Update common_engine.py

* Update run_ci_xpu.sh

* Update ernie_processor.py (×2)

---------

Co-authored-by: ltd0924 <ltd0924@sina.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
Author: gaoziyuan
Date: 2025-08-26 19:59:02 +08:00
Committed by: GitHub
Parent: cbce94a00e
Commit: 82e64b13e1
24 changed files with 1244 additions and 1200 deletions


@@ -95,7 +95,7 @@ PRETRAINED_INIT_CONFIGURATION = {
     "start_layer_index": 0,
     "moe_num_shared_experts": 0,
     "moe_layer_start_index": 0,
-    "num_max_dispatch_tokens_per_rank": 256,
+    "num_max_dispatch_tokens_per_rank": 128,
     "moe_use_aux_free": False,
     "vocab_size": -1,
     "hidden_dropout_prob": 0.0,
@@ -278,7 +278,7 @@ class ParallelConfig:
         # block size
         self.block_size: int = 64
         # Engine worker queue port
-        self.engine_worker_queue_port: int = 9923
+        self.engine_worker_queue_port: str = "9923"
         # Max model len
         self.max_model_len: int = 3072  # max_seq_len
         # cuda visible devices
@@ -307,7 +307,11 @@ class ParallelConfig:
         for key, value in args.items():
             if hasattr(self, key):
                 setattr(self, key, value)
+        if isinstance(self.engine_worker_queue_port, str):
+            self.engine_worker_queue_port = [int(port) for port in self.engine_worker_queue_port.split(",")]
+            logger.info(f"engine_worker_queue_port: {self.engine_worker_queue_port}")
+        elif isinstance(self.engine_worker_queue_port, int):
+            self.engine_worker_queue_port = [self.engine_worker_queue_port]
         # currently, the expert parallel size is equal data parallel size
         if self.enable_expert_parallel:
             self.expert_parallel_size = self.data_parallel_size * self.tensor_parallel_size
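
With multiple API servers, each data-parallel rank gets its own engine worker queue port, so `ParallelConfig` now accepts a comma-separated string and normalizes it into a list of ints. A minimal standalone sketch of the same normalization (the function name is illustrative, not FastDeploy API):

```python
# Sketch of the normalization added above: accept "9923,9924" (str),
# 9923 (int), or an existing list, and return list[int].
def normalize_queue_ports(port):
    if isinstance(port, str):
        return [int(p) for p in port.split(",")]
    if isinstance(port, int):
        return [port]
    return [int(p) for p in port]

assert normalize_queue_ports("9923,9924") == [9923, 9924]  # one port per DP rank
assert normalize_queue_ports(9923) == [9923]               # legacy single-port form
```
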
@@ -1038,7 +1042,7 @@ class FDConfig:
         max_num_batched_tokens: Optional[int] = None,
         ips: str = None,
         use_warmup: bool = False,
-        engine_worker_queue_port: int = 8002,
+        engine_worker_queue_port: str = "8002",
         limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
         splitwise_role: str = "mixed",
@@ -1082,11 +1086,10 @@ class FDConfig:
         if self.ips is None:
             self.master_ip = "0.0.0.0"
         elif isinstance(self.ips, list):
             self.master_ip = self.ips[0]
-        else:
+        elif isinstance(self.ips, str):
             self.ips = self.ips.split(",")
             self.master_ip = self.ips[0]
+        self.host_ip = get_host_ip()
         if self.ips is None:
             self.nnode = 1
@@ -1095,7 +1098,7 @@
             self.nnode = len(self.ips)
             for idx, ip in enumerate(self.ips):
-                if ip == self.master_ip:
+                if ip == self.host_ip:
                     self.node_rank = idx
         self.max_model_len = max_model_len
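
Because several API servers can now sit behind one master IP, the node rank is resolved by comparing each configured IP against the local host IP rather than the master IP. A standalone sketch of that lookup; `get_host_ip` here is a stand-in for FastDeploy's helper of the same name:

```python
import socket

def get_host_ip():
    # Stand-in: resolve this machine's primary IP address.
    return socket.gethostbyname(socket.gethostname())

ips = "192.168.0.1,192.168.0.2".split(",")  # mirrors self.ips = self.ips.split(",")
host_ip = get_host_ip()
node_rank = next((idx for idx, ip in enumerate(ips) if ip == host_ip), None)
# On the machine whose address is 192.168.0.2, node_rank would be 1.
```
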
@@ -1111,7 +1114,11 @@
         self.reasoning_parser = reasoning_parser
         self.guided_decoding_backend = guided_decoding_backend
         self.disable_any_whitespace = disable_any_whitespace
+        self.engine_worker_queue_port = engine_worker_queue_port
         self._str_to_list("innode_prefill_ports", int)
+        if isinstance(engine_worker_queue_port, int):
+            self.engine_worker_queue_port = str(engine_worker_queue_port)
+        self._str_to_list("engine_worker_queue_port", str)
         if envs.FD_FOR_TORCH_MODEL_FORMAT:
             self.model_config.model_format = "torch"
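
Taken together, `FDConfig` now coerces an int port to a string and lets `_str_to_list` split it, so an int, a single port string, or a comma-separated string all normalize to a list of port strings. An illustrative recap of that pipeline, condensed into one hypothetical function:

```python
# Hypothetical condensation of the FDConfig port handling above.
def coerce_ports(engine_worker_queue_port):
    if isinstance(engine_worker_queue_port, int):
        engine_worker_queue_port = str(engine_worker_queue_port)
    # _str_to_list(..., str) splits a comma-separated string into list[str]
    return [str(p) for p in engine_worker_queue_port.split(",")]

assert coerce_ports(8002) == ["8002"]
assert coerce_ports("8002,8003") == ["8002", "8003"]
```
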
@@ -1129,10 +1136,11 @@
             self.worker_num_per_node = self.max_chips_per_node
             nnode = ceil_div(num_ranks, self.worker_num_per_node)
-            assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
+            # assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
         else:
             self.worker_num_per_node = num_ranks
-        self.engine_worker_queue_port = engine_worker_queue_port
         self.device_ids = ",".join([str(i) for i in range(self.worker_num_per_node)])
         self.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.device_ids)
         if current_platform.is_xpu():
@@ -1155,15 +1163,12 @@
         self.local_device_ids = self.device_ids.split(",")[: self.parallel_config.tensor_parallel_size]
-        self.host_ip = get_host_ip()
-        if self.ips is None or self.host_ip == self.master_ip:
-            self.is_master = True
-        else:
-            self.is_master = False
+        if self.parallel_config.tensor_parallel_size <= self.worker_num_per_node:
+            self.is_master = True
+            self.master_ip = "0.0.0.0"
+        else:
+            self.is_master = False
+            self.master_ip = self.ips[0]
         self.paddle_commit_id = paddle.version.commit
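
The master flag no longer hinges on an IP comparison: a node is master when its tensor-parallel group fits on a single node. A sketch under the assumption that `worker_num_per_node` is the device count available locally:

```python
# Sketch of the is_master decision above (names mirror the diff; standalone).
def resolve_master(tensor_parallel_size, worker_num_per_node, ips):
    if tensor_parallel_size <= worker_num_per_node:
        return True, "0.0.0.0"   # TP group fits on this node; act as master
    return False, ips[0]         # multi-node TP; first configured IP is master

assert resolve_master(8, 8, ["10.0.0.1", "10.0.0.2"]) == (True, "0.0.0.0")
assert resolve_master(16, 8, ["10.0.0.1", "10.0.0.2"]) == (False, "10.0.0.1")
```
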
@@ -1345,10 +1350,12 @@ class FDConfig:
     def _str_to_list(self, attr_name, default_type):
         if hasattr(self, attr_name):
             val = getattr(self, attr_name)
+            if val is None:
+                return
             if type(val) is str:
                 setattr(self, attr_name, [default_type(i) for i in val.split(",")])
             else:
-                setattr(self, attr_name, val)
+                setattr(self, attr_name, [default_type(i) for i in val])
 
     def __str__(self) -> str:
         return json.dumps(self.__dict__, indent=4)
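
`_str_to_list` now short-circuits on `None` and coerces every element when handed a list, rather than storing the list untouched. A standalone rendering of its contract, covering the three input shapes it handles:

```python
# Illustrative free function mirroring _str_to_list (not the class method).
def str_to_list(val, default_type):
    if val is None:
        return None                                        # leave unset values alone
    if type(val) is str:
        return [default_type(i) for i in val.split(",")]   # "a,b" -> [a, b]
    return [default_type(i) for i in val]                  # coerce list elements too

assert str_to_list(None, int) is None
assert str_to_list("8002,8003", int) == [8002, 8003]
assert str_to_list(["8002", "8003"], int) == [8002, 8003]  # previously returned as-is
```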