mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
[Feature] Support pd ep deployment with yiyan adapter (#4029)
* [Feature] Support mixed deployment with yiyan adapter in release2.2 * fix metrics * add unit test * add unit test * add unit test * Support pd ep deployment with yiyan adapter * Support pd ep deployment with yiyan adapter * refactor cache messager * support scheduler v1 in PD * suppport pd v1 + chunk prefill * suppport pd v1 + chunk prefill * add eplb * support eplb * support eplb * support eplb * support v1 * fix * fix * fix bug * remove eplb support * support prefix cache in P * fix bug * fix bug * support one stop in V1 * fix bug * fix ci * fix ci * fix * fix * fix * fix * fix --------- Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
@@ -16,6 +16,7 @@
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import Tuple
|
||||
|
||||
@@ -259,6 +260,7 @@ class PaddleDisWorkerProc:
|
||||
"""Main event loop for Paddle Distributed Workers.
|
||||
TODO(gongshaotian): support remote calling of functions that control worker.
|
||||
"""
|
||||
|
||||
# Currently, only support single node
|
||||
self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8)
|
||||
req_ids = []
|
||||
@@ -643,6 +645,12 @@ def parse_args():
|
||||
help="Flag to specify dtype of lm_head as FP32",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--cache-transfer-protocol",
|
||||
type=str,
|
||||
default="ipc",
|
||||
help="support protocol list, comma separated, default is ipc",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--runner",
|
||||
type=str,
|
||||
@@ -762,8 +770,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
|
||||
):
|
||||
logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not support speculative decoding now.")
|
||||
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
|
||||
if args.splitwise_role != "mixed":
|
||||
logger.info(f"Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported {args.splitwise_role} now.")
|
||||
if args.splitwise_role != "mixed" and args.cache_transfer_protocol != "rdma":
|
||||
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
|
||||
if not current_platform.is_cuda():
|
||||
logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
|
||||
@@ -772,6 +779,9 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
|
||||
logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported guided_decoding.")
|
||||
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
|
||||
|
||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER and args.splitwise_role == "prefill":
|
||||
os.environ["PREFILL_NODE_ONE_STEP_STOP_V1"] = "1"
|
||||
|
||||
fd_config = FDConfig(
|
||||
model_config=model_config,
|
||||
parallel_config=parallel_config,
|
||||
|
Reference in New Issue
Block a user