Revert "[RL] Support Rollout Routing Replay (#5321)" (#5402)

This reverts commit 96d2d4877b.
2025-12-24 13:28:13 +08:00 · 2025-12-05 20:19:39 +08:00
parent 94c57e4175
commit c45e064f3d
24 changed files with 24 additions and 592 deletions
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -45,9 +45,6 @@ from fastdeploy.model_executor.layers.attention.append_attn_backend import (
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
    AttentionBackend,
 )
-from fastdeploy.model_executor.layers.moe.routing_indices_cache import (
-    RoutingReplayManager,
-)
 from fastdeploy.model_executor.layers.rotary_embedding import get_rope, get_rope_3d
 from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
@@ -205,11 +202,6 @@ class GPUModelRunner(ModelRunnerBase):
        os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.engine_worker_queue_port)
        logger.info(f"queue id is {str(self.parallel_config.engine_worker_queue_port)}")

-        # Rollout routing replay config
-        self.routing_replay_manager = None
-        if self.fd_config.routing_replay_config.enable_routing_replay:
-            self.routing_replay_manager = RoutingReplayManager(fd_config=self.fd_config)
-
        self.zmq_client = None
        self.async_output_queue = None
        if envs.FD_USE_GET_SAVE_OUTPUT_V1:
@@ -656,7 +648,6 @@ class GPUModelRunner(ModelRunnerBase):
                self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = 0
                self.share_inputs["prompt_lens"][idx : idx + 1] = len(input_ids)
                self.share_inputs["is_block_step"][idx : idx + 1] = False
-                self.share_inputs["is_chunk_step"][idx : idx + 1] = prefill_end_index < len(input_ids)
                self.share_inputs["step_idx"][idx : idx + 1] = (
                    len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
                )
@@ -665,12 +656,6 @@ class GPUModelRunner(ModelRunnerBase):
                if request.sampling_params is not None and request.sampling_params.prompt_logprobs is not None:
                    self.prompt_logprobs_reqs[request.request_id] = request
                has_prefill_task = True
-
-                # Routing Replay
-                if self.fd_config.routing_replay_config.enable_routing_replay:
-                    if prefill_start_index == 0:
-                        self.routing_replay_manager.register_request(batch_id=idx, request_id=request.request_id)
-
                if (
                    self.fd_config.scheduler_config.splitwise_role == "decode"
                ):  # In PD, we continue to decode after P generate first token
@@ -1167,7 +1152,6 @@ class GPUModelRunner(ModelRunnerBase):
        self.share_inputs["bad_tokens_len"] = paddle.full([max_num_seqs], 1, dtype="int64")
        self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
        self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool")
-        self.share_inputs["is_chunk_step"] = paddle.full([max_num_seqs], False, dtype="bool").cpu()
        self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], 0, dtype="int32")
        self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32")
        self.share_inputs["step_lens"] = paddle.full([1], 0, dtype="int32")
@@ -1438,9 +1422,6 @@ class GPUModelRunner(ModelRunnerBase):
        Initialize forward meta, attention meta data and update some config.
        """
        # Initialize forward meta
-        routing_replay_table = None
-        if self.routing_replay_manager is not None:
-            routing_replay_table = self.routing_replay_manager.get_routing_table()
        self.forward_meta = ForwardMeta(
            ids_remove_padding=self.share_inputs["ids_remove_padding"],
            rotary_embs=self.share_inputs["rope_emb"],
@@ -1467,7 +1448,6 @@ class GPUModelRunner(ModelRunnerBase):
            kv_batch_ids=self.share_inputs["kv_batch_ids"],
            kv_tile_ids_per_batch=self.share_inputs["kv_tile_ids_per_batch"],
            kv_num_blocks_x_cpu=self.share_inputs["kv_num_blocks_x_cpu"],
-            routing_replay_table=routing_replay_table,
        )

        dist_status = self.collect_distributed_status()
@@ -1956,9 +1936,6 @@ class GPUModelRunner(ModelRunnerBase):
            if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
                break

-        if self.fd_config.routing_replay_config.enable_routing_replay:
-            self.routing_replay_manager.clear_routing_table()
-
    def _update_chunked_prefill(self, tasks):
        """
        Update chunked prefill related parameters
@@ -2457,15 +2434,6 @@ class GPUModelRunner(ModelRunnerBase):
                    self.speculative_config.num_speculative_tokens,
                )

-        # Routing replay
-        if self.fd_config.routing_replay_config.enable_routing_replay:
-            if (
-                not self.exist_prefill()
-                and not self.exist_decode()
-                and self.share_inputs["is_block_step"].sum() == 0
-                and self.share_inputs["is_chunk_step"].sum() == 0
-            ):
-                self.routing_replay_manager.put_table_to_store()
            return None

    def _pool(self, hidden_states: paddle.Tensor, num_running_requests: int) -> Optional[ModelRunnerOutput]: