Revert "[RL] Support Rollout Routing Replay (#5321)" (#5402)

This reverts commit 96d2d4877b.
This commit is contained in:
Jiang-Jia-Jun
2025-12-05 20:19:39 +08:00
committed by GitHub
parent 94c57e4175
commit c45e064f3d
24 changed files with 24 additions and 592 deletions

View File

@@ -45,9 +45,6 @@ from fastdeploy.model_executor.layers.attention.append_attn_backend import (
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend,
)
from fastdeploy.model_executor.layers.moe.routing_indices_cache import (
RoutingReplayManager,
)
from fastdeploy.model_executor.layers.rotary_embedding import get_rope, get_rope_3d
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
@@ -205,11 +202,6 @@ class GPUModelRunner(ModelRunnerBase):
os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.engine_worker_queue_port)
logger.info(f"queue id is {str(self.parallel_config.engine_worker_queue_port)}")
# Rollout routing replay config
self.routing_replay_manager = None
if self.fd_config.routing_replay_config.enable_routing_replay:
self.routing_replay_manager = RoutingReplayManager(fd_config=self.fd_config)
self.zmq_client = None
self.async_output_queue = None
if envs.FD_USE_GET_SAVE_OUTPUT_V1:
@@ -656,7 +648,6 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = 0
self.share_inputs["prompt_lens"][idx : idx + 1] = len(input_ids)
self.share_inputs["is_block_step"][idx : idx + 1] = False
self.share_inputs["is_chunk_step"][idx : idx + 1] = prefill_end_index < len(input_ids)
self.share_inputs["step_idx"][idx : idx + 1] = (
len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
)
@@ -665,12 +656,6 @@ class GPUModelRunner(ModelRunnerBase):
if request.sampling_params is not None and request.sampling_params.prompt_logprobs is not None:
self.prompt_logprobs_reqs[request.request_id] = request
has_prefill_task = True
# Routing Replay
if self.fd_config.routing_replay_config.enable_routing_replay:
if prefill_start_index == 0:
self.routing_replay_manager.register_request(batch_id=idx, request_id=request.request_id)
if (
self.fd_config.scheduler_config.splitwise_role == "decode"
): # In PD, we continue to decode after P generate first token
@@ -1167,7 +1152,6 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["bad_tokens_len"] = paddle.full([max_num_seqs], 1, dtype="int64")
self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool")
self.share_inputs["is_chunk_step"] = paddle.full([max_num_seqs], False, dtype="bool").cpu()
self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], 0, dtype="int32")
self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32")
self.share_inputs["step_lens"] = paddle.full([1], 0, dtype="int32")
@@ -1438,9 +1422,6 @@ class GPUModelRunner(ModelRunnerBase):
Initialize forward meta, attention meta data and update some config.
"""
# Initialize forward meta
routing_replay_table = None
if self.routing_replay_manager is not None:
routing_replay_table = self.routing_replay_manager.get_routing_table()
self.forward_meta = ForwardMeta(
ids_remove_padding=self.share_inputs["ids_remove_padding"],
rotary_embs=self.share_inputs["rope_emb"],
@@ -1467,7 +1448,6 @@ class GPUModelRunner(ModelRunnerBase):
kv_batch_ids=self.share_inputs["kv_batch_ids"],
kv_tile_ids_per_batch=self.share_inputs["kv_tile_ids_per_batch"],
kv_num_blocks_x_cpu=self.share_inputs["kv_num_blocks_x_cpu"],
routing_replay_table=routing_replay_table,
)
dist_status = self.collect_distributed_status()
@@ -1956,9 +1936,6 @@ class GPUModelRunner(ModelRunnerBase):
if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
break
if self.fd_config.routing_replay_config.enable_routing_replay:
self.routing_replay_manager.clear_routing_table()
def _update_chunked_prefill(self, tasks):
"""
Update chunked prefill related parameters
@@ -2457,15 +2434,6 @@ class GPUModelRunner(ModelRunnerBase):
self.speculative_config.num_speculative_tokens,
)
# Routing replay
if self.fd_config.routing_replay_config.enable_routing_replay:
if (
not self.exist_prefill()
and not self.exist_decode()
and self.share_inputs["is_block_step"].sum() == 0
and self.share_inputs["is_chunk_step"].sum() == 0
):
self.routing_replay_manager.put_table_to_store()
return None
def _pool(self, hidden_states: paddle.Tensor, num_running_requests: int) -> Optional[ModelRunnerOutput]: