FastDeploy/fastdeploy/model_executor/pre_and_post_process.py

"""
# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Dict, Optional

import paddle

from fastdeploy.model_executor.ops.gpu import (get_padding_offset, save_output,
                                               save_output_dynamic,
                                               set_stop_value_multi_ends,
                                               set_stop_value_multi_seqs,
                                               speculate_get_padding_offset,
                                               step_paddle, update_inputs)
from fastdeploy.worker.output import ModelOutputData


def pre_process(max_len: int, input_ids: paddle.Tensor,
                seq_lens_this_time: int, use_speculate_method: bool,
                draft_tokens: Optional[paddle.Tensor],
                seq_lens_encoder: Optional[paddle.Tensor]):
    """
    Preprocessing before embedding.
    Args:
        max_len:
        input_ids:
        seq_lens_this_time:
        use_speculate_method:
        draft_tokens:
        seq_lens_encoder:
    Return:
        ids_remove_padding:
        cum_offsets:
        padding_offset:
        cu_seqlens_q:
        cu_seqlens_k:
    """
    # Remove padding
    cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
    token_num = paddle.sum(seq_lens_this_time)
    if use_speculate_method:
        (
            ids_remove_padding,
            cum_offsets,
            padding_offset,
            cu_seqlens_q,
            cu_seqlens_k,
        ) = speculate_get_padding_offset(
            input_ids,
            draft_tokens,
            cum_offsets_now,
            token_num,
            seq_lens_this_time,
            seq_lens_encoder,
        )
    else:
        (
            ids_remove_padding,
            cum_offsets,
            padding_offset,
            cu_seqlens_q,
            cu_seqlens_k,
        ) = get_padding_offset(input_ids, cum_offsets_now, token_num,
                               seq_lens_this_time)
    return (
        ids_remove_padding,
        cum_offsets,
        padding_offset,
        cu_seqlens_q,
        cu_seqlens_k,
    )


def post_process(tokens: paddle.Tensor, model_output: ModelOutputData) -> None:
    """ Post-processing steps after completing a single token generation. """
    # 1. Set stop value
    paddle.assign(
        paddle.where(
            model_output.stop_flags,
            model_output.step_idx,
            model_output.step_idx + 1,
        ),
        model_output.step_idx,
    )
    length_cond = paddle.greater_equal(model_output.step_idx,
                                       model_output.max_dec_len)
    paddle.assign(
        paddle.logical_or(model_output.stop_flags, length_cond),
        model_output.stop_flags,
    )

    if model_output.use_stop_seqs:
        set_stop_value_multi_seqs(
            tokens,
            model_output.pre_ids,
            model_output.step_idx,
            model_output.stop_flags,
            model_output.seq_lens_this_time,
            model_output.stop_seqs,
            model_output.stop_seqs_len,
            model_output.eos_token_id,
        )
    else:
        set_stop_value_multi_ends(
            tokens,
            model_output.stop_flags,
            model_output.seq_lens_this_time,
            model_output.eos_token_id,
            model_output.next_tokens,
            False,
        )  # multi ends

    # 2. Update the input buffer of the model
    with paddle.framework._no_check_dy2st_diff():
        update_inputs(
            model_output.stop_flags,
            model_output.not_need_stop,
            model_output.seq_lens_this_time,
            model_output.seq_lens_encoder,
            model_output.seq_lens_decoder,
            model_output.input_ids,
            model_output.stop_nums,
            tokens,
            model_output.is_block_step,
        )
    # 3. Transmit the model's output and stop generation signal via message queue.
    #    In the future, we will abandon this approach.
    if model_output.output_via_mq:
        if model_output.msg_queue_id is None:
            save_output(
                tokens,
                model_output.not_need_stop,
                model_output.mp_rank,
                model_output.use_ep,
            )
        else:
            save_output_dynamic(
                tokens,
                model_output.not_need_stop,
                model_output.mp_rank,
                model_output.msg_queue_id,
                model_output.gpt.use_ep,
            )


def step_cuda(share_inputs: Dict[str, paddle.Tensor], block_size: int,
              enc_dec_block_num: int) -> None:
    """
    TODO(gongshaotian): normalization name
    """
    step_paddle(
        share_inputs["stop_flags"],
        share_inputs["seq_lens_this_time"],
        share_inputs["step_seq_lens_encoder"],
        share_inputs["seq_lens_encoder"],
        share_inputs["seq_lens_decoder"],
        share_inputs["block_tables"],
        share_inputs["encoder_block_lens"],
        share_inputs["is_block_step"],
        share_inputs["step_block_list"],
        share_inputs["step_lens"],
        share_inputs["recover_block_list"],
        share_inputs["recover_lens"],
        share_inputs["need_block_list"],
        share_inputs["need_block_len"],
        share_inputs["used_list_len"],
        share_inputs["free_list"],
        share_inputs["free_list_len"],
        share_inputs["input_ids"],
        share_inputs["pre_ids"],
        share_inputs["step_idx"],
        share_inputs["next_tokens"],
        share_inputs["first_token_ids"],
        block_size,
        enc_dec_block_num,
    )