[RL] Support Rollout Routing Replay (#5321)

* [RL] Support Rollout Routing Replay

* add routing indices cache

* fix config bug and MoE forward bug

* R3 (Rollout Routing Replay): support GLM

* support eb4.5

* fix merge bug

* Apply suggestions from @Copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* add routing replay ci

* support GLM top_k

* support other top_k

* fix ci bug

* pre-commit

* only support chat completions (chatcmpl)

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yuanle Liu <yuanlehome@163.com>
Author: RAM
Date: 2025-12-05 20:01:33 +08:00
Committed by: GitHub
Parent: 8545b705ed
Commit: 96d2d4877b
24 changed files with 592 additions and 24 deletions
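
The diffs below all make the same mechanical change: threading the new RoutingReplayConfig through existing FDConfig constructions. Before reading them, here is a minimal sketch of the R3 idea itself, assuming hypothetical names (RoutingReplayCache, record, replay) that only illustrate the "routing indices cache" bullet above; the actual FastDeploy implementation is not shown in this excerpt.

import paddle


class RoutingReplayCache:
    """Caches per-layer top-k expert indices chosen during rollout so a later
    training forward pass can reuse (replay) the exact same routing."""

    def __init__(self):
        self._indices = {}  # layer_id -> cached top-k expert indices

    def record(self, layer_id, topk_indices):
        # Rollout phase: remember which experts each token was routed to.
        self._indices[layer_id] = topk_indices

    def replay(self, layer_id):
        # Training phase: return the cached indices instead of re-routing,
        # keeping the rollout and training MoE forwards consistent.
        return self._indices.get(layer_id)


cache = RoutingReplayCache()
gate_logits = paddle.rand([4, 8])  # 4 tokens, 8 experts
topk_vals, topk_idx = paddle.topk(gate_logits, k=2, axis=-1)
cache.record(layer_id=0, topk_indices=topk_idx)

# Later, during the training forward, reuse the rollout's routing decision:
replayed_idx = cache.replay(layer_id=0)
assert paddle.all(replayed_idx == topk_idx)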


@@ -31,6 +31,7 @@ from fastdeploy.config import (
LoadConfig,
ModelConfig,
ParallelConfig,
RoutingReplayConfig,
)
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.quantization.block_wise_fp8 import (
@@ -476,6 +477,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
graph_opt_config=GraphOptimizationConfig({}),
load_config=LoadConfig({}),
ips=",".join(["0"] * nnodes),
routing_replay_config=RoutingReplayConfig({}),
)
self.fd_config.parallel_config.tp_group = None
self.fd_config.parallel_config.tensor_parallel_rank = tp_rank
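
The same one-line addition recurs in the two hunks that follow: each test wrapper's FDConfig now receives a routing_replay_config. A hedged sketch of the pattern in isolation, mirroring the diff above (the config is built from an empty args dict here; its actual fields are not visible in this excerpt):

from fastdeploy.config import RoutingReplayConfig

# Defaults only; RoutingReplayConfig's real fields are not shown in this diff.
routing_replay_config = RoutingReplayConfig({})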


@@ -13,6 +13,7 @@ from fastdeploy.config import (
LoadConfig,
ModelConfig,
ParallelConfig,
RoutingReplayConfig,
)
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.quantization.w4a8 import W4A8Config
@@ -59,6 +60,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
graph_opt_config=GraphOptimizationConfig({}),
load_config=LoadConfig({}),
ips=",".join(["0"] * nnodes),
routing_replay_config=RoutingReplayConfig({}),
)
self.fd_config.parallel_config.tp_group = None
self.fd_config.parallel_config.tensor_parallel_rank = tp_rank


@@ -13,6 +13,7 @@ from fastdeploy.config import (
LoadConfig,
ModelConfig,
ParallelConfig,
RoutingReplayConfig,
)
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.quantization.w4afp8 import W4AFP8Config
@@ -65,6 +66,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
graph_opt_config=GraphOptimizationConfig({}),
load_config=LoadConfig({}),
ips=",".join(["0"] * nnodes),
routing_replay_config=RoutingReplayConfig({}),
)
self.fd_config.parallel_config.tp_group = None
self.fd_config.parallel_config.tensor_parallel_rank = tp_rank