mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Feature] support eplb in api_server (#4782)
* support eplb in api_server * update code * add eplb test case * update eplb * support tp+dp eplb * update test case * update code * update code * fix bug * update copilot review * update test case name
This commit is contained in:
@@ -467,6 +467,16 @@ class EngineArgs:
|
||||
Url for router server, such as `0.0.0.0:30000`.
|
||||
"""
|
||||
|
||||
enable_eplb: bool = False
|
||||
"""
|
||||
Flag to enable eplb
|
||||
"""
|
||||
|
||||
eplb_config: Optional[Dict[str, Any]] = None
|
||||
"""
|
||||
Configuration for eplb.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
"""
|
||||
Post-initialization processing to set default tokenizer if not provided.
|
||||
@@ -850,6 +860,18 @@ class EngineArgs:
|
||||
default=EngineArgs.enable_expert_parallel,
|
||||
help="Enable expert parallelism.",
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--enable-eplb",
|
||||
action="store_true",
|
||||
default=EngineArgs.enable_eplb,
|
||||
help="Enable eplb.",
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--eplb-config",
|
||||
type=json.loads,
|
||||
default=EngineArgs.eplb_config,
|
||||
help="Config of eplb.",
|
||||
)
|
||||
|
||||
# Load group
|
||||
load_group = parser.add_argument_group("Load Configuration")
|
||||
@@ -1126,7 +1148,7 @@ class EngineArgs:
|
||||
|
||||
def create_scheduler_config(self) -> SchedulerConfig:
|
||||
"""
|
||||
Create and retuan a SchedulerConfig object based on the current settings.
|
||||
Create and return a SchedulerConfig object based on the current settings.
|
||||
"""
|
||||
prefix = "scheduler_"
|
||||
prefix_len = len(prefix)
|
||||
@@ -1173,13 +1195,22 @@ class EngineArgs:
|
||||
early_stop_args[k] = v
|
||||
return EarlyStopConfig(early_stop_args)
|
||||
|
||||
def create_eplb_config(self) -> EPLBConfig:
|
||||
"""
|
||||
Create and return an EPLBConfig object based on the current settings.
The argument dict starts from every field of this instance (via asdict),
is overlaid with the entries of eplb_config when that is not None, and
finally has "enable_eplb" set from self.enable_eplb.
|
||||
"""
|
||||
eplb_args = asdict(self)
|
||||
if self.eplb_config is not None:
|
||||
for k, v in self.eplb_config.items():
|
||||
eplb_args[k] = v
|
||||
# Assigned after the overlay, so the enable_eplb field wins over any
# same-named key supplied inside eplb_config.
eplb_args["enable_eplb"] = self.enable_eplb
|
||||
return EPLBConfig(eplb_args)
|
||||
|
||||
def create_engine_config(self, port_availability_check=True) -> FDConfig:
|
||||
"""
|
||||
Create and return a Config object based on the current settings.
|
||||
"""
|
||||
all_dict = asdict(self)
|
||||
eplb_cfg = EPLBConfig()
|
||||
all_dict["enable_redundant_experts"] = eplb_cfg.enable_redundant_experts
|
||||
model_cfg = ModelConfig(all_dict)
|
||||
|
||||
# XPU currently disable prefix cache for VL model
|
||||
@@ -1221,6 +1252,7 @@ class EngineArgs:
|
||||
scheduler_cfg = self.create_scheduler_config()
|
||||
graph_opt_cfg = self.create_graph_optimization_config()
|
||||
plas_attention_config = self.create_plas_attention_config()
|
||||
eplb_cfg = self.create_eplb_config()
|
||||
router_config = RouterConfig(all_dict)
|
||||
|
||||
early_stop_cfg = self.create_early_stop_config()
|
||||
|
||||
@@ -833,6 +833,7 @@ class AsyncLLMEngine:
|
||||
f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
|
||||
f" --logprobs_mode {self.cfg.model_config.logprobs_mode}"
|
||||
f" --max_logprobs {self.cfg.model_config.max_logprobs}"
|
||||
f" --eplb_config '{self.cfg.eplb_config.to_json_string()}'"
|
||||
)
|
||||
|
||||
worker_store_true_flag = {
|
||||
|
||||
@@ -34,6 +34,7 @@ from opentelemetry import trace
|
||||
from fastdeploy.engine.request import Request, RequestOutput, RequestType
|
||||
from fastdeploy.engine.resource_manager import ResourceManager
|
||||
from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
|
||||
from fastdeploy.eplb.utils import init_eplb_signals
|
||||
from fastdeploy.input.preprocess import InputPreprocessor
|
||||
from fastdeploy.inter_communicator import (
|
||||
EngineCacheQueue,
|
||||
@@ -142,6 +143,12 @@ class EngineService:
|
||||
)
|
||||
self._init_worker_monitor_signals()
|
||||
|
||||
if self.cfg.eplb_config.enable_eplb:
|
||||
current_suffix = int(
|
||||
self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]
|
||||
)
|
||||
init_eplb_signals(cfg, current_suffix)
|
||||
|
||||
self._finalizer = weakref.finalize(self, self._exit_sub_services)
|
||||
|
||||
def start(self):
|
||||
|
||||
@@ -566,6 +566,7 @@ class LLMEngine:
|
||||
f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
|
||||
f" --logprobs_mode {self.cfg.model_config.logprobs_mode}"
|
||||
f" --max_logprobs {self.cfg.model_config.max_logprobs}"
|
||||
f" --eplb_config '{self.cfg.eplb_config.to_json_string()}'"
|
||||
)
|
||||
if self.cfg.structured_outputs_config.logits_processors is not None:
|
||||
arguments += f" --logits-processors {' '.join(self.cfg.structured_outputs_config.logits_processors)}"
|
||||
|
||||
Reference in New Issue
Block a user