[RL] Support Rollout Routing Replay (#5321)

* [RL] Support Rollout Routing Replay * add routing indices cache * fix config bug and moe forward bug * R3 Support GLM * support eb4.5 * fix merge bug * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * add routing replay ci * support glm topk * support other top_k * fix ci bug * pre-commit * only support chatcmpl --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-12-24 13:28:13 +08:00 · 2025-12-05 20:01:33 +08:00
parent 8545b705ed
commit 96d2d4877b
24 changed files with 592 additions and 24 deletions
--- a/tests/distributed/chunked_moe.py
+++ b/tests/distributed/chunked_moe.py
@@ -90,7 +90,7 @@ class MockAttentionBackend:


 class MockQuantMethod:
-    def apply(self, layer, x, gate):
+    def apply(self, layer, x, gate, topk_ids_hookfunc=None):
        return x


@@ -129,6 +129,7 @@ class TestChunkedMoE(unittest.TestCase):
        model_runner.speculative_decoding = False
        model_runner._init_share_inputs(mock_fd_config.scheduler_config.max_num_seqs)
        model_runner.share_inputs["caches"] = None
+        model_runner.routing_replay_manager = None

        if dist.get_rank() == 0:
            model_runner.share_inputs["ids_remove_padding"] = paddle.ones([10])
@@ -148,6 +149,7 @@ class TestChunkedMoE(unittest.TestCase):

        fused_moe.fd_config = mock_fd_config
        fused_moe.quant_method = MockQuantMethod()
+        fused_moe.enable_routing_replay = None
        return fused_moe

    def run_model_runner(self):