[PD Disaggregation] support DP via v1 router and decouple DP and EP (#5197)

* [fix] support DP via v1 router and decouple DP and EP

* [fix] fix scripts

* [fix] reset model path

* [fix] DP uses get_output_ep; fix router port type; update scripts

* [merge] merge with latest code

* [chore] remove some debug log

* [fix] fix code style check

* [fix] fix test_multi_api_server for log_dir name

* [chore] reduce logs

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Yonghua Li
2025-12-04 15:38:43 +08:00
committed by GitHub
parent 5cd17fd662
commit f4119d51b4
15 changed files with 394 additions and 146 deletions

View File

@@ -496,6 +496,8 @@ class EngineWorkerQueue:
self.tasks.append(tasks)
self.lock.release()
llm_logger.debug(f"put_tasks: tasks={tasks}")
def get_tasks(self) -> Tuple[List[Any], bool]:
"""
Retrieve tasks from the shared queue and update read status.
@@ -512,6 +514,7 @@ class EngineWorkerQueue:
if all_client_read:
self.tasks[:] = list()
self.lock.release()
llm_logger.debug(f"get_tasks: tasks={tasks}")
return tasks, all_client_read
def num_tasks(self) -> int:
@@ -600,8 +603,7 @@ class EngineWorkerQueue:
self.cache_infos.extend(cache_info)
llm_logger.debug(
f"put cache_infos to engine worker queue: {self.cache_infos}, "
f"local_data_parallel_id:{self.local_data_parallel_id}"
f"put_cache_info: cache_info={cache_info}, local_data_parallel_id={self.local_data_parallel_id}"
)
self.lock_info.release()

View File

@@ -214,6 +214,9 @@ class ZmqServerBase(ABC):
except zmq.Again:
time.sleep(0.001)
continue
except zmq.error.ZMQError as e:
llm_logger.error(f"recv_result_handle get zmq error: {e}")
break
except Exception as e:
llm_logger.error(f"recv_result_handle get unknown exception: {e}")
continue