[New][RL] Support Rollout Routing Replay (#5405)

* [RL] Support Rollout Routing Replay * add routing indices cache * fix config bug and moe forward bug * R3 Support GLM * support eb4.5 * fix merge bug * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * add routing replay ci * support glm topk * support orther top_k * fix ci bug * pre-commit * only support chatcmpl * Revert "Revert "[RL] Support Rollout Routing Replay (#5321)" (#5402)" This reverts commit c45e064f3d. * Fix XPU and NPU bug --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-12-24 13:28:13 +08:00 · 2025-12-05 22:06:26 +08:00
parent c45e064f3d
commit b2908b8e82
26 changed files with 608 additions and 24 deletions
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -35,6 +35,7 @@ from fastdeploy.config import (
    PlasAttentionConfig,
    PoolerConfig,
    RouterConfig,
+    RoutingReplayConfig,
    RunnerOption,
    SpeculativeConfig,
    StructuredOutputsConfig,
@@ -491,6 +492,11 @@ class EngineArgs:
    Configuration for eplb.
    """

+    routing_replay_config: Optional[Dict[str, Any]] = None
+    """
+    Configuration dict for rollout routing replay (R3).
+    """
+
    def __post_init__(self):
        """
        Post-initialization processing to set default tokenizer if not provided.
@@ -882,6 +888,12 @@ class EngineArgs:
            default=EngineArgs.eplb_config,
            help="Config of eplb.",
        )
+        parallel_group.add_argument(
+            "--routing-replay-config",
+            type=json.loads,
+            default=EngineArgs.routing_replay_config,
+            help="Flag of rollout routing replay(r3).",
+        )
        parallel_group.add_argument(
            "--enable-chunked-moe",
            action="store_true",
@@ -1235,6 +1247,14 @@ class EngineArgs:
        eplb_args["enable_eplb"] = self.enable_eplb
        return EPLBConfig(eplb_args)

+    def create_routing_repaly_config(self) -> RoutingReplayConfig:
+        """Build RoutingReplayConfig from asdict(self), with `routing_replay_config` entries taking precedence."""
+        routing_replay_args = asdict(self)
+        if self.routing_replay_config is not None:
+            for k, v in self.routing_replay_config.items():
+                routing_replay_args[k] = v
+        return RoutingReplayConfig(routing_replay_args)
+
    def create_engine_config(self, port_availability_check=True) -> FDConfig:
        """
        Create and return a Config object based on the current settings.
@@ -1278,6 +1298,7 @@ class EngineArgs:
        graph_opt_cfg = self.create_graph_optimization_config()
        plas_attention_config = self.create_plas_attention_config()
        eplb_cfg = self.create_eplb_config()
+        routing_replay_config = self.create_routing_repaly_config()
        router_config = RouterConfig(all_dict)

        early_stop_cfg = self.create_early_stop_config()
@@ -1310,4 +1331,5 @@ class EngineArgs:
            graph_opt_config=graph_opt_cfg,
            plas_attention_config=plas_attention_config,
            early_stop_config=early_stop_cfg,
+            routing_replay_config=routing_replay_config,
        )