[Feature] Support pd ep deployment with yiyan adapter (#4029)

* [Feature] Support mixed deployment with yiyan adapter in release2.2

* fix metrics

* add unit test

* add unit test

* add unit test

* Support pd ep deployment with yiyan adapter

* Support pd ep deployment with yiyan adapter

* refactor cache messager

* support scheduler v1 in PD

* support pd v1 + chunk prefill

* support pd v1 + chunk prefill

* add eplb

* support eplb

* support eplb

* support eplb

* support v1

* fix

* fix

* fix bug

* remove eplb support

* support prefix cache in P

* fix bug

* fix bug

* support one-step stop in V1

* fix bug

* fix ci

* fix ci

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Author: chenjian
Date: 2025-09-22 16:41:38 +08:00
Committed by: GitHub
Parent: 9845f0d010
Commit: 918ccdb123
22 changed files with 1838 additions and 343 deletions


@@ -16,6 +16,7 @@
import argparse
import json
import os
import time
from typing import Tuple
@@ -259,6 +260,7 @@ class PaddleDisWorkerProc:
"""Main event loop for Paddle Distributed Workers.
TODO(gongshaotian): support remote calling of functions that control worker.
"""
# Currently, only support single node
self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8)
req_ids = []
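
For reference, the nnode value in this hunk is a ceiling division of the tensor-parallel world size by the eight GPUs assumed per node. A minimal sketch of the same arithmetic, with illustrative names not taken from the patch:

def compute_nnode(tensor_parallel_size: int, gpus_per_node: int = 8) -> int:
    # Ceiling division: tp=8 -> 1 node, tp=9 -> 2 nodes, tp=16 -> 2 nodes.
    return int((tensor_parallel_size + gpus_per_node - 1) // gpus_per_node)

assert compute_nnode(8) == 1
assert compute_nnode(9) == 2
assert compute_nnode(16) == 2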
@@ -643,6 +645,12 @@ def parse_args():
help="Flag to specify dtype of lm_head as FP32",
)
parser.add_argument(
"--cache-transfer-protocol",
type=str,
default="ipc",
help="support protocol list, comma separated, default is ipc",
)
parser.add_argument(
"--runner",
type=str,
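
The new --cache-transfer-protocol flag accepts a comma-separated protocol list and defaults to ipc. A self-contained sketch of declaring the flag and splitting its value into individual protocols; the parse_protocols helper below is illustrative and not part of the patch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--cache-transfer-protocol",
    type=str,
    default="ipc",
    help="support protocol list, comma separated, default is ipc",
)

def parse_protocols(value: str) -> list:
    # "ipc,rdma" -> ["ipc", "rdma"]; blank entries are dropped.
    return [p.strip() for p in value.split(",") if p.strip()]

args = parser.parse_args(["--cache-transfer-protocol", "ipc,rdma"])
print(parse_protocols(args.cache_transfer_protocol))  # ['ipc', 'rdma']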
@@ -762,8 +770,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     ):
         logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not support speculative decoding now.")
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-    if args.splitwise_role != "mixed":
-        logger.info(f"Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported {args.splitwise_role} now.")
+    if args.splitwise_role != "mixed" and args.cache_transfer_protocol != "rdma":
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
     if not current_platform.is_cuda():
         logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
@@ -772,6 +779,9 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported guided_decoding.")
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if envs.ENABLE_V1_KVCACHE_SCHEDULER and args.splitwise_role == "prefill":
os.environ["PREFILL_NODE_ONE_STEP_STOP_V1"] = "1"
fd_config = FDConfig(
model_config=model_config,
parallel_config=parallel_config,
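
When the V1 scheduler stays enabled and this worker is the prefill side of a PD deployment, the patch sets PREFILL_NODE_ONE_STEP_STOP_V1=1 in the environment. A hedged sketch of that toggle in isolation; only the environment variable name comes from the patch, while the helper name and the interpretation in the comment are assumptions:

import os

def maybe_enable_one_step_stop(v1_scheduler_enabled: bool, splitwise_role: str) -> None:
    # Presumably signals downstream components that, on a prefill-only node,
    # a request stops after its single prefill step (inferred from the name).
    if v1_scheduler_enabled and splitwise_role == "prefill":
        os.environ["PREFILL_NODE_ONE_STEP_STOP_V1"] = "1"

maybe_enable_one_step_stop(True, "prefill")
print(os.environ.get("PREFILL_NODE_ONE_STEP_STOP_V1"))  # "1"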