[Feature] support eplb in api_server (#4782)

* support eplb in api_server * update code * add eplb test case * update eplb * support tp+dp eplb * update test case * update code * update code * fix bug * update copilot review * update test case name
2025-12-24 13:28:13 +08:00 · 2025-11-24 20:22:29 +08:00
parent d5bd64336a
commit 8e4e3ff510
25 changed files with 2102 additions and 421 deletions
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -467,6 +467,16 @@ class EngineArgs:
    Url for router server, such as `0.0.0.0:30000`.
    """

+    enable_eplb: bool = False
+    """
+    Flag to enable eplb
+    """
+
+    eplb_config: Optional[Dict[str, Any]] = None
+    """
+    Configuration for eplb.
+    """
+
    def __post_init__(self):
        """
        Post-initialization processing to set default tokenizer if not provided.
@@ -850,6 +860,18 @@ class EngineArgs:
            default=EngineArgs.enable_expert_parallel,
            help="Enable expert parallelism.",
        )
+        parallel_group.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            default=EngineArgs.enable_eplb,
+            help="Enable eplb.",
+        )
+        parallel_group.add_argument(
+            "--eplb-config",
+            type=json.loads,
+            default=EngineArgs.eplb_config,
+            help="Config of eplb.",
+        )

        # Load group
        load_group = parser.add_argument_group("Load Configuration")
@@ -1126,7 +1148,7 @@ class EngineArgs:

    def create_scheduler_config(self) -> SchedulerConfig:
        """
-        Create and retuan a SchedulerConfig object based on the current settings.
+        Create and return a SchedulerConfig object based on the current settings.
        """
        prefix = "scheduler_"
        prefix_len = len(prefix)
@@ -1173,13 +1195,22 @@ class EngineArgs:
                early_stop_args[k] = v
        return EarlyStopConfig(early_stop_args)

+    def create_eplb_config(self) -> EPLBConfig:
+        """
+        Create and return an EPLBConfig object based on the current settings.
+        """
+        eplb_args = asdict(self)
+        if self.eplb_config is not None:
+            for k, v in self.eplb_config.items():
+                eplb_args[k] = v
+        eplb_args["enable_eplb"] = self.enable_eplb
+        return EPLBConfig(eplb_args)
+
    def create_engine_config(self, port_availability_check=True) -> FDConfig:
        """
        Create and return a Config object based on the current settings.
        """
        all_dict = asdict(self)
-        eplb_cfg = EPLBConfig()
-        all_dict["enable_redundant_experts"] = eplb_cfg.enable_redundant_experts
        model_cfg = ModelConfig(all_dict)

        # XPU currently disable prefix cache for VL model
@@ -1221,6 +1252,7 @@ class EngineArgs:
        scheduler_cfg = self.create_scheduler_config()
        graph_opt_cfg = self.create_graph_optimization_config()
        plas_attention_config = self.create_plas_attention_config()
+        eplb_cfg = self.create_eplb_config()
        router_config = RouterConfig(all_dict)

        early_stop_cfg = self.create_early_stop_config()