mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-06 00:57:33 +08:00
[BUGFIX] fix ep mixed bug (#3513)
* Update expert_service.py * Update engine.py * Update engine.py * Update engine.py * Update expert_service.py * Update engine.py
This commit is contained in:
@@ -124,8 +124,9 @@ class LLMEngine:
|
||||
cfg.max_num_seqs, cfg, cfg.tensor_parallel_size, cfg.splitwise_role
|
||||
)
|
||||
|
||||
os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.cfg.engine_worker_queue_port)
|
||||
|
||||
os.environ["INFERENCE_MSG_QUEUE_ID"] = str(
|
||||
self.cfg.engine_worker_queue_port + self.cfg.worker_num_per_node * self.cfg.node_rank
|
||||
)
|
||||
self.split_connector = SplitwiseConnector(cfg, self.scheduler, self.engine_worker_queue, self.resource_manager)
|
||||
|
||||
self.token_processor = TokenProcessor(
|
||||
|
@@ -59,7 +59,7 @@ class ExpertService:
|
||||
self.cfg.disaggregate_info = None
|
||||
|
||||
self.scheduler = cfg.scheduler_config.scheduler()
|
||||
|
||||
if cfg.scheduler_config.name == "splitwise":
|
||||
self.scheduler.reset_nodeid(f"{self.scheduler.infer.nodeid}_{local_data_parallel_id!s}")
|
||||
|
||||
self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id
|
||||
@@ -143,7 +143,7 @@ class ExpertService:
|
||||
self.token_processor.run()
|
||||
|
||||
self.cfg.init_cache_info()
|
||||
|
||||
if self.cfg.scheduler_config.name == "splitwise":
|
||||
role = self.cfg.splitwise_role
|
||||
host_ip = self.cfg.host_ip
|
||||
disaggregate = self.cfg.disaggregate_info
|
||||
@@ -363,6 +363,10 @@ def start_expert_service(cfg, local_data_parallel_id, ipc_signal_suffix):
|
||||
expert_service = ExpertService(cfg, local_data_parallel_id)
|
||||
try:
|
||||
expert_service.start(ipc_signal_suffix, local_data_parallel_id)
|
||||
if cfg.splitwise_role != "mixed":
|
||||
expert_service.split_connector.start_receiver()
|
||||
else:
|
||||
while True:
|
||||
time.sleep(100)
|
||||
except Exception as e:
|
||||
llm_logger.exception(f"Expert service failed to start: {e}")
|
||||
|
Reference in New Issue
Block a user