[NewFeature]Support dp multi api server && Fix some bug in mixed ep && merge develop (#3598)

* [Feature] update ep

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix queue ports idx

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* Update engine.py

* fix ci

* fix some bug in mixed ep

* add server fix and op fix

* rm some log

* fix code style

* ltd fix

* fix

* fix

* fix some bug

* fix bug

* fix bug

* fix style

* Update config.py

* Update splitwise_connector.py

* Update cache_messager.py

* Update __init__.py

* merge and fix

* Update engine.py

* Update common_engine.py

* Update run_ci_xpu.sh

* Update ernie_processor.py

* Update ernie_processor.py

---------

Co-authored-by: ltd0924 <ltd0924@sina.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
This commit is contained in:
gaoziyuan
2025-08-26 19:59:02 +08:00
committed by GitHub
parent cbce94a00e
commit 82e64b13e1
24 changed files with 1244 additions and 1200 deletions

View File

@@ -35,23 +35,22 @@ class SplitwiseConnector:
SplitwiseConnector class for managing and scheduling Splitwise tasks.
"""
def __init__(self, cfg, scheduler, worker_queue, resource_manager):
def __init__(self, cfg, worker_queue, resource_manager):
"""
Initialize the SplitwiseConnector instance.
Parameters:
cfg (dict): Configuration information.
scheduler (object): Scheduler object.
worker_queue (object): Worker queue object.
resource_manager (object): Resource manager object.
"""
self.cfg = cfg
self.scheduler = scheduler
self.engine_worker_queue = worker_queue
self.resource_manager = resource_manager
self.connect_innode_instances = {}
self.temp_cache_info = dict()
self.current_request_ids = dict()
self.idx = self.cfg.parallel_config.local_data_parallel_id
if self.cfg.cache_config.pd_comm_port is not None:
self.zmq_ctx = zmq.Context()
@@ -85,18 +84,20 @@ class SplitwiseConnector:
"""
while True:
try:
socks = dict(self.poller.poll(100))
if not socks:
continue
if hasattr(self, "poller"):
socks = dict(self.poller.poll(100))
if not socks:
continue
else:
logger.debug(f"receive {socks}")
frames = self.router_socket.recv_multipart()
logger.debug(f"frames: {frames}")
message = frames[-1]
self.io_executor.submit(self._process_message, message)
time.sleep(0.001)
else:
logger.debug(f"receive {socks}")
frames = self.router_socket.recv_multipart()
logger.debug(f"frames: {frames}")
message = frames[-1]
self.io_executor.submit(self._process_message, message)
time.sleep(0.001)
time.sleep(5)
except Exception as e:
logger.error(f"Receiver error: {e}, {str(traceback.format_exc())}")
time.sleep(1)
@@ -183,7 +184,7 @@ class SplitwiseConnector:
def dispatch_innode_splitwise_tasks(self, tasks, current_id):
"""
Dispatch splitwise tasks to the scheduler.
Dispatch splitwise tasks .
Parameters:
tasks (list): List of tasks.
@@ -203,7 +204,7 @@ class SplitwiseConnector:
"cache_info": {
"ipc": {
"ip": "0.0.0.0",
"port": self.cfg.engine_worker_queue_port,
"port": self.cfg.engine_worker_queue_port[self.idx],
"current_id": current_id,
},
},
@@ -286,7 +287,7 @@ class SplitwiseConnector:
if port not in self.connect_innode_instances:
self.create_connection(port)
for task in tasks:
task.disaggregate_info["cache_info"]["ipc"]["port"] = self.cfg.engine_worker_queue_port
task.disaggregate_info["cache_info"]["ipc"]["port"] = self.cfg.engine_worker_queue_port[self.idx]
self.connect_innode_instances[port].put_disaggregated_tasks(("decode", tasks))
for task in tasks:
task.disaggregate_info["cache_info"]["ipc"]["port"] = port