[Bug Fix] fix bug for PD EP (#4823)

* fix bug for PD EP

* fix

* optimize perf for engine worker queue

* fix bug

* fix internode ll two stage

* fix for ci

* fix bug
This commit is contained in:
chenjian
2025-11-10 15:33:29 +08:00
committed by GitHub
parent 112623e33e
commit 78895e2c7d
11 changed files with 109 additions and 44 deletions

View File

@@ -709,7 +709,9 @@ class LLMEngine:
for i in range(self.cfg.parallel_config.data_parallel_size):
request_queues_for_dp_ipc.append(multiprocessing.Queue())
self.engine.scheduler.start(
self.cfg.node_rank * self.cfg.worker_num_per_node, request_queues_for_dp_ipc, result_queue_for_dp_ipc
self.cfg.node_rank * self.cfg.worker_num_per_node % self.cfg.worker_num_per_node,
request_queues_for_dp_ipc,
result_queue_for_dp_ipc,
)
if not envs.FD_ENABLE_MULTI_API_SERVER:
@@ -721,10 +723,14 @@ class LLMEngine:
1,
self.cfg.parallel_config.data_parallel_size // self.cfg.nnode,
):
address = (
self.cfg.master_ip,
int(self.cfg.parallel_config.engine_worker_queue_port[i]),
)
if not envs.FD_ENGINE_TASK_QUEUE_WITH_SHM:
address = (
self.cfg.master_ip,
int(self.cfg.parallel_config.engine_worker_queue_port[i]),
)
else:
address = f"/dev/shm/fd_task_queue_{self.cfg.parallel_config.engine_worker_queue_port[i]}.sock"
llm_logger.info(f"dp start queue service {address}")
self.dp_engine_worker_queue_server.append(
EngineWorkerQueue(