mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Feature] [PD] add simple router and refine splitwise deployment (#4709)
* add simple router and refine splitwise deployment * fix
This commit is contained in:
@@ -34,6 +34,7 @@ from fastdeploy.config import (
|
||||
ParallelConfig,
|
||||
PlasAttentionConfig,
|
||||
PoolerConfig,
|
||||
RouterConfig,
|
||||
RunnerOption,
|
||||
SpeculativeConfig,
|
||||
StructuredOutputsConfig,
|
||||
@@ -74,6 +75,10 @@ class EngineArgs:
|
||||
"""
|
||||
The name or path of the model to be used.
|
||||
"""
|
||||
port: Optional[str] = None
|
||||
"""
|
||||
Port for api server.
|
||||
"""
|
||||
served_model_name: Optional[str] = None
|
||||
"""
|
||||
The name of the model being served.
|
||||
@@ -445,6 +450,11 @@ class EngineArgs:
|
||||
- To enable custom logits processors, add your dotted paths to module and class names to the list.
|
||||
"""
|
||||
|
||||
router: Optional[str] = None
|
||||
"""
|
||||
Url for router server, such as `0.0.0.0:30000`.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
"""
|
||||
Post-initialization processing to set default tokenizer if not provided.
|
||||
@@ -859,21 +869,6 @@ class EngineArgs:
|
||||
help="Flag to enable prefix caching.",
|
||||
)
|
||||
|
||||
perf_group.add_argument(
|
||||
"--splitwise-role",
|
||||
type=str,
|
||||
default=EngineArgs.splitwise_role,
|
||||
help="Role of splitwise. Default is \
|
||||
'mixed'. (prefill, decode, mixed)",
|
||||
)
|
||||
|
||||
perf_group.add_argument(
|
||||
"--innode-prefill-ports",
|
||||
type=lambda s: s.split(",") if s else None,
|
||||
default=EngineArgs.innode_prefill_ports,
|
||||
help="port for innode prefill",
|
||||
)
|
||||
|
||||
perf_group.add_argument(
|
||||
"--enable-chunked-prefill",
|
||||
action="store_true",
|
||||
@@ -903,27 +898,53 @@ class EngineArgs:
|
||||
help=("For chunked prefill, the threshold number of" " tokens for a prompt to be considered long."),
|
||||
)
|
||||
|
||||
perf_group.add_argument(
|
||||
# Splitwise deployment parameters group
|
||||
splitwise_group = parser.add_argument_group("Splitwise Deployment")
|
||||
splitwise_group.add_argument(
|
||||
"--splitwise-role",
|
||||
type=str,
|
||||
default=EngineArgs.splitwise_role,
|
||||
help="Role of splitwise. Default is \
|
||||
'mixed'. (prefill, decode, mixed)",
|
||||
)
|
||||
|
||||
splitwise_group.add_argument(
|
||||
"--innode-prefill-ports",
|
||||
type=lambda s: s.split(",") if s else None,
|
||||
default=EngineArgs.innode_prefill_ports,
|
||||
help="port for innode prefill, only used in single machine splitwise deployment",
|
||||
)
|
||||
|
||||
splitwise_group.add_argument(
|
||||
"--cache-transfer-protocol",
|
||||
type=str,
|
||||
default=EngineArgs.cache_transfer_protocol,
|
||||
help="support protocol list, comma separated, default is ipc",
|
||||
help="support protocol list (ipc or rdma), comma separated, default is ipc",
|
||||
)
|
||||
|
||||
perf_group.add_argument(
|
||||
splitwise_group.add_argument(
|
||||
"--pd-comm-port",
|
||||
type=lambda s: s.split(",") if s else None,
|
||||
default=EngineArgs.pd_comm_port,
|
||||
help="port for splitwise communication.",
|
||||
)
|
||||
|
||||
perf_group.add_argument(
|
||||
splitwise_group.add_argument(
|
||||
"--rdma-comm-ports",
|
||||
type=lambda s: s.split(",") if s else None,
|
||||
default=EngineArgs.rdma_comm_ports,
|
||||
help="ports for rdma communication.",
|
||||
)
|
||||
|
||||
# Router parameters group
|
||||
router_group = parser.add_argument_group("Router")
|
||||
router_group.add_argument(
|
||||
"--router",
|
||||
type=str,
|
||||
default=EngineArgs.router,
|
||||
help="url for router server.",
|
||||
)
|
||||
|
||||
# Scheduler parameters group
|
||||
scheduler_group = parser.add_argument_group("Scheduler")
|
||||
scheduler_group.add_argument(
|
||||
@@ -1044,7 +1065,11 @@ class EngineArgs:
|
||||
"""
|
||||
Create an instance of EngineArgs from command line arguments.
|
||||
"""
|
||||
return cls(**{field.name: getattr(args, field.name) for field in dataclass_fields(cls)})
|
||||
args_dict = {}
|
||||
for field in dataclass_fields(cls):
|
||||
if hasattr(args, field.name):
|
||||
args_dict[field.name] = getattr(args, field.name)
|
||||
return cls(**args_dict)
|
||||
|
||||
def create_speculative_config(self) -> SpeculativeConfig:
|
||||
""" """
|
||||
@@ -1063,6 +1088,7 @@ class EngineArgs:
|
||||
prefix_len = len(prefix)
|
||||
|
||||
all = asdict(self)
|
||||
all.pop("port") # port and scheduler_port are not the same
|
||||
params = dict()
|
||||
for k, v in all.items():
|
||||
if k[:prefix_len] == prefix:
|
||||
@@ -1151,6 +1177,7 @@ class EngineArgs:
|
||||
scheduler_cfg = self.create_scheduler_config()
|
||||
graph_opt_cfg = self.create_graph_optimization_config()
|
||||
plas_attention_config = self.create_plas_attention_config()
|
||||
router_config = RouterConfig(all_dict)
|
||||
|
||||
early_stop_cfg = self.create_early_stop_config()
|
||||
early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
|
||||
@@ -1170,6 +1197,7 @@ class EngineArgs:
|
||||
speculative_config=speculative_cfg,
|
||||
eplb_config=eplb_cfg,
|
||||
structured_outputs_config=structured_outputs_config,
|
||||
router_config=router_config,
|
||||
ips=self.ips,
|
||||
use_warmup=self.use_warmup,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
|
||||
Reference in New Issue
Block a user