[Feature] Support pd ep deployment with yiyan adapter (#4029)

* [Feature] Support mixed deployment with yiyan adapter in release2.2

* fix metrics

* add unit test

* add unit test

* add unit test

* Support pd ep deployment with yiyan adapter

* Support pd ep deployment with yiyan adapter

* refactor cache messager

* support scheduler v1 in PD

* support pd v1 + chunk prefill

* support pd v1 + chunk prefill

* add eplb

* support eplb

* support eplb

* support eplb

* support v1

* fix

* fix

* fix bug

* remove eplb support

* support prefix cache in P

* fix bug

* fix bug

* support one-step stop in V1

* fix bug

* fix ci

* fix ci

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Author: chenjian
Date: 2025-09-22 16:41:38 +08:00
Committed by: GitHub
Parent: 9845f0d010
Commit: 918ccdb123
22 changed files with 1838 additions and 343 deletions


@@ -16,6 +16,7 @@
import argparse
import json
import os
import time
from typing import Tuple
@@ -259,6 +260,7 @@ class PaddleDisWorkerProc:
"""Main event loop for Paddle Distributed Workers.
TODO(gongshaotian): support remote calling of functions that control worker.
"""
# Currently, only support single node
self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8)
req_ids = []
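
For reference, the nnode value in this hunk is a ceiling division of the tensor-parallel world size by the eight GPUs assumed per node. A minimal sketch of the same arithmetic, with illustrative names not taken from the patch:

def compute_nnode(tensor_parallel_size: int, gpus_per_node: int = 8) -> int:
    # Ceiling division: tp=8 -> 1 node, tp=9 -> 2 nodes, tp=16 -> 2 nodes.
    return int((tensor_parallel_size + gpus_per_node - 1) // gpus_per_node)

assert compute_nnode(8) == 1
assert compute_nnode(9) == 2
assert compute_nnode(16) == 2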
@@ -643,6 +645,12 @@ def parse_args():
help="Flag to specify dtype of lm_head as FP32",
)
parser.add_argument(
"--cache-transfer-protocol",
type=str,
default="ipc",
help="support protocol list, comma separated, default is ipc",
)
parser.add_argument(
"--runner",
type=str,
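
The new --cache-transfer-protocol flag accepts a comma-separated protocol list and defaults to ipc. A self-contained sketch of declaring the flag and splitting its value into individual protocols; the parse_protocols helper below is illustrative and not part of the patch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--cache-transfer-protocol",
    type=str,
    default="ipc",
    help="support protocol list, comma separated, default is ipc",
)

def parse_protocols(value: str) -> list:
    # "ipc,rdma" -> ["ipc", "rdma"]; blank entries are dropped.
    return [p.strip() for p in value.split(",") if p.strip()]

args = parser.parse_args(["--cache-transfer-protocol", "ipc,rdma"])
print(parse_protocols(args.cache_transfer_protocol))  # ['ipc', 'rdma']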
@@ -762,8 +770,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     ):
         logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not support speculative decoding now.")
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-    if args.splitwise_role != "mixed":
-        logger.info(f"Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported {args.splitwise_role} now.")
+    if args.splitwise_role != "mixed" and args.cache_transfer_protocol != "rdma":
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
     if not current_platform.is_cuda():
         logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
@@ -772,6 +779,9 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported guided_decoding.")
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if envs.ENABLE_V1_KVCACHE_SCHEDULER and args.splitwise_role == "prefill":
os.environ["PREFILL_NODE_ONE_STEP_STOP_V1"] = "1"
fd_config = FDConfig(
model_config=model_config,
parallel_config=parallel_config,
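
When the V1 scheduler stays enabled and this worker is the prefill side of a PD deployment, the patch sets PREFILL_NODE_ONE_STEP_STOP_V1=1 in the environment. A hedged sketch of that toggle in isolation; only the environment variable name comes from the patch, while the helper name and the interpretation in the comment are assumptions:

import os

def maybe_enable_one_step_stop(v1_scheduler_enabled: bool, splitwise_role: str) -> None:
    # Presumably signals downstream components that, on a prefill-only node,
    # a request stops after its single prefill step (inferred from the name).
    if v1_scheduler_enabled and splitwise_role == "prefill":
        os.environ["PREFILL_NODE_ONE_STEP_STOP_V1"] = "1"

maybe_enable_one_step_stop(True, "prefill")
print(os.environ.get("PREFILL_NODE_ONE_STEP_STOP_V1"))  # "1"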