[Feature] [PD] add simple router and refine splitwise deployment (#4709)

* add simple router and refine splitwise deployment

* fix
This commit is contained in:
Juncai
2025-11-06 14:56:02 +08:00
committed by GitHub
parent 831266da7a
commit 08ca0f6aea
39 changed files with 2397 additions and 171 deletions

View File

@@ -34,6 +34,7 @@ from fastdeploy.config import (
ParallelConfig,
PlasAttentionConfig,
PoolerConfig,
RouterConfig,
RunnerOption,
SpeculativeConfig,
StructuredOutputsConfig,
@@ -74,6 +75,10 @@ class EngineArgs:
"""
The name or path of the model to be used.
"""
port: Optional[str] = None
"""
Port for the API server.
"""
served_model_name: Optional[str] = None
"""
The name of the model being served.
@@ -445,6 +450,11 @@ class EngineArgs:
- To enable custom logits processors, add your dotted paths to module and class names to the list.
"""
router: Optional[str] = None
"""
URL of the router server, such as `0.0.0.0:30000`.
"""
def __post_init__(self):
"""
Post-initialization processing to set default tokenizer if not provided.
@@ -859,21 +869,6 @@ class EngineArgs:
help="Flag to enable prefix caching.",
)
perf_group.add_argument(
"--splitwise-role",
type=str,
default=EngineArgs.splitwise_role,
help="Role of splitwise. Default is \
'mixed'. (prefill, decode, mixed)",
)
perf_group.add_argument(
"--innode-prefill-ports",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.innode_prefill_ports,
help="port for innode prefill",
)
perf_group.add_argument(
"--enable-chunked-prefill",
action="store_true",
@@ -903,27 +898,53 @@ class EngineArgs:
help=("For chunked prefill, the threshold number of" " tokens for a prompt to be considered long."),
)
perf_group.add_argument(
# Splitwise deployment parameters group
splitwise_group = parser.add_argument_group("Splitwise Deployment")
splitwise_group.add_argument(
"--splitwise-role",
type=str,
default=EngineArgs.splitwise_role,
help="Role of splitwise. Default is \
'mixed'. (prefill, decode, mixed)",
)
splitwise_group.add_argument(
"--innode-prefill-ports",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.innode_prefill_ports,
help="port for innode prefill, only used in single machine splitwise deployment",
)
splitwise_group.add_argument(
"--cache-transfer-protocol",
type=str,
default=EngineArgs.cache_transfer_protocol,
help="support protocol list, comma separated, default is ipc",
help="support protocol list (ipc or rdma), comma separated, default is ipc",
)
perf_group.add_argument(
splitwise_group.add_argument(
"--pd-comm-port",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.pd_comm_port,
help="port for splitwise communication.",
)
perf_group.add_argument(
splitwise_group.add_argument(
"--rdma-comm-ports",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.rdma_comm_ports,
help="ports for rdma communication.",
)
# Router parameters group
router_group = parser.add_argument_group("Router")
router_group.add_argument(
"--router",
type=str,
default=EngineArgs.router,
help="url for router server.",
)
# Scheduler parameters group
scheduler_group = parser.add_argument_group("Scheduler")
scheduler_group.add_argument(
@@ -1044,7 +1065,11 @@ class EngineArgs:
"""
Create an instance of EngineArgs from command line arguments.
"""
return cls(**{field.name: getattr(args, field.name) for field in dataclass_fields(cls)})
args_dict = {}
for field in dataclass_fields(cls):
if hasattr(args, field.name):
args_dict[field.name] = getattr(args, field.name)
return cls(**args_dict)
def create_speculative_config(self) -> SpeculativeConfig:
""" """
@@ -1063,6 +1088,7 @@ class EngineArgs:
prefix_len = len(prefix)
all = asdict(self)
all.pop("port") # port and scheduler_port are not the same
params = dict()
for k, v in all.items():
if k[:prefix_len] == prefix:
@@ -1151,6 +1177,7 @@ class EngineArgs:
scheduler_cfg = self.create_scheduler_config()
graph_opt_cfg = self.create_graph_optimization_config()
plas_attention_config = self.create_plas_attention_config()
router_config = RouterConfig(all_dict)
early_stop_cfg = self.create_early_stop_config()
early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
@@ -1170,6 +1197,7 @@ class EngineArgs:
speculative_config=speculative_cfg,
eplb_config=eplb_cfg,
structured_outputs_config=structured_outputs_config,
router_config=router_config,
ips=self.ips,
use_warmup=self.use_warmup,
limit_mm_per_prompt=self.limit_mm_per_prompt,