Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[RL] Support Rollout Routing Replay (#5321)
* [RL] Support Rollout Routing Replay
* add routing indices cache
* fix config bug and moe forward bug
* R3 Support GLM
* support eb4.5
* fix merge bug
* Apply suggestion from @Copilot
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Apply suggestion from @Copilot
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Apply suggestion from @Copilot
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Apply suggestion from @Copilot
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* add routing replay ci
* support glm topk
* support other top_k
* fix ci bug
* pre-commit
* only support chatcmpl
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yuanle Liu <yuanlehome@163.com>
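For readers new to R3 (Rollout Routing Replay): the idea is to cache the MoE expert-routing (top-k) indices chosen while generating rollouts so that the subsequent training forward pass can replay them, keeping expert selection consistent between the two passes. Below is a hedged, illustrative sketch of that cache idea; none of these names are FastDeploy's actual API.

# Illustrative sketch only; class and method names are hypothetical,
# not FastDeploy's real implementation.
from typing import Dict, List

class RoutingIndicesCache:
    """Cache per-layer MoE top-k expert indices produced during rollout
    so the training forward pass can replay the exact same routing."""

    def __init__(self) -> None:
        # layer_id -> per-token lists of selected expert ids
        self._indices: Dict[int, List[List[int]]] = {}

    def record(self, layer_id: int, topk_ids: List[List[int]]) -> None:
        # Written during rollout, when the router picks experts.
        self._indices[layer_id] = topk_ids

    def replay(self, layer_id: int) -> List[List[int]]:
        # Read during the training forward pass instead of re-routing.
        return self._indices[layer_id]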
@@ -31,6 +31,7 @@ from fastdeploy.config import (
     LoadConfig,
     ModelConfig,
     ParallelConfig,
+    RoutingReplayConfig,
 )
 from fastdeploy.model_executor.layers.moe.moe import FusedMoE
 from fastdeploy.model_executor.layers.quantization.block_wise_fp8 import (
@@ -476,6 +477,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
             graph_opt_config=GraphOptimizationConfig({}),
             load_config=LoadConfig({}),
             ips=",".join(["0"] * nnodes),
+            routing_replay_config=RoutingReplayConfig({}),
         )
         self.fd_config.parallel_config.tp_group = None
         self.fd_config.parallel_config.tensor_parallel_rank = tp_rank
@@ -13,6 +13,7 @@ from fastdeploy.config import (
     LoadConfig,
     ModelConfig,
     ParallelConfig,
+    RoutingReplayConfig,
 )
 from fastdeploy.model_executor.layers.moe.moe import FusedMoE
 from fastdeploy.model_executor.layers.quantization.w4a8 import W4A8Config
@@ -59,6 +60,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
             graph_opt_config=GraphOptimizationConfig({}),
             load_config=LoadConfig({}),
             ips=",".join(["0"] * nnodes),
+            routing_replay_config=RoutingReplayConfig({}),
         )
         self.fd_config.parallel_config.tp_group = None
         self.fd_config.parallel_config.tensor_parallel_rank = tp_rank
@@ -13,6 +13,7 @@ from fastdeploy.config import (
     LoadConfig,
     ModelConfig,
     ParallelConfig,
+    RoutingReplayConfig,
 )
 from fastdeploy.model_executor.layers.moe.moe import FusedMoE
 from fastdeploy.model_executor.layers.quantization.w4afp8 import W4AFP8Config
@@ -65,6 +66,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
             graph_opt_config=GraphOptimizationConfig({}),
             load_config=LoadConfig({}),
             ips=",".join(["0"] * nnodes),
+            routing_replay_config=RoutingReplayConfig({}),
         )
         self.fd_config.parallel_config.tp_group = None
         self.fd_config.parallel_config.tensor_parallel_rank = tp_rank
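Taken together, each of the three files receives the same two additions: RoutingReplayConfig joins the fastdeploy.config import block, and routing_replay_config=RoutingReplayConfig({}) is passed when the wrapper builds its config. A minimal sketch of the resulting pattern, assuming fastdeploy is installed; FDConfig as the callee and the GraphOptimizationConfig import path are assumptions (the hunks show the kwargs but not the constructor's name), and the other required config arguments from the surrounding test code are elided.

# Sketch of the construction pattern added by the hunks above; not a
# verbatim test file. FDConfig and the GraphOptimizationConfig import
# path are assumptions based on the context lines.
from fastdeploy.config import (
    FDConfig,
    GraphOptimizationConfig,
    LoadConfig,
    RoutingReplayConfig,
)

nnodes, tp_rank = 1, 0  # hypothetical single-node, rank-0 test setup

fd_config = FDConfig(
    # ... model/parallel/quantization kwargs from the real tests omitted ...
    graph_opt_config=GraphOptimizationConfig({}),
    load_config=LoadConfig({}),
    ips=",".join(["0"] * nnodes),
    routing_replay_config=RoutingReplayConfig({}),  # the one new kwarg
)
fd_config.parallel_config.tp_group = None  # as in the context lines
fd_config.parallel_config.tensor_parallel_rank = tp_rank

The same two-line change lands identically in all three MoE test wrappers: the block-wise FP8, W4A8, and W4AFP8 quantization variants.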