From 69fa741763daa05e9d111c8c1450f9c388e69a1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com> Date: Thu, 6 Nov 2025 11:06:28 +0800 Subject: [PATCH] remove seq_lens_this_time (#4821) --- fastdeploy/model_executor/pre_and_post_process.py | 11 +++-------- fastdeploy/worker/gpu_model_runner.py | 9 +-------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 521d60455..d5ef810e4 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -91,12 +91,7 @@ else: from fastdeploy.output.pooler import PoolerOutput, PoolingSequenceGroupOutput from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData -from fastdeploy.worker.output import ( - LogprobsTensors, - ModelOutputData, - ModelRunnerOutput, - SamplerOutput, -) +from fastdeploy.worker.output import LogprobsTensors, ModelOutputData, SamplerOutput DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1" @@ -208,7 +203,7 @@ def pre_process( """ token_num = paddle.sum(seq_lens_this_time) - if current_platform.is_cuda() and not speculative_decoding: + if (current_platform.is_cuda() or current_platform.is_iluvatar()) and not speculative_decoding: # Note(ZKK): This case's code is very simple! ids_remove_padding, batch_id_per_token, cu_seqlens_q, cu_seqlens_k = get_padding_offset( input_ids, token_num, seq_lens_this_time @@ -322,7 +317,7 @@ def post_process_normal( async_output_queue: queue.Queue = None, think_end_id: int = -1, line_break_id: int = -1, -) -> ModelRunnerOutput: +): """Post-processing steps after completing a single token generation.""" if think_end_id > 0: limit_thinking_content_length( diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 84d23c465..5178e2b8c 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -2050,7 +2050,7 @@ class GPUModelRunner(ModelRunnerBase): self, model_forward_batch: Optional[List[Request]] = None, num_running_requests: int = None, - ) -> Optional[ModelRunnerOutput]: + ) -> None: """ The Entrance of model execute. Args: @@ -2140,10 +2140,6 @@ class GPUModelRunner(ModelRunnerBase): async_output_queue=self.async_output_queue, ) - self.seq_lens_this_time_buffer[:num_running_requests].copy_( - self.share_inputs["seq_lens_this_time"][:num_running_requests], False - ) - return None else: hidden_states = rebuild_padding( @@ -2305,9 +2301,6 @@ class GPUModelRunner(ModelRunnerBase): self.speculative_config.num_speculative_tokens, ) - self.seq_lens_this_time_buffer[:num_running_requests].copy_( - self.share_inputs["seq_lens_this_time"][:num_running_requests], False - ) return None def _pool(self, hidden_states: paddle.Tensor, num_running_requests: int) -> Optional[ModelRunnerOutput]: