Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-09-27 12:52:29 +08:00)
[LLM] First commit the llm deployment code
fastdeploy/model_executor/pre_and_post_process.py (new file, 187 lines)
@@ -0,0 +1,187 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from typing import Dict, Optional

import paddle

from fastdeploy.model_executor.ops.gpu import (get_padding_offset, save_output,
                                               save_output_dynamic,
                                               set_stop_value_multi_ends,
                                               set_stop_value_multi_seqs,
                                               speculate_get_padding_offset,
                                               step_paddle, update_inputs)
from fastdeploy.worker.output import ModelOutputData

def pre_process(max_len: int, input_ids: paddle.Tensor,
                seq_lens_this_time: paddle.Tensor, use_speculate_method: bool,
                draft_tokens: Optional[paddle.Tensor],
                seq_lens_encoder: Optional[paddle.Tensor]):
    """
    Preprocessing before embedding: strip padding from the batched input ids.

    Args:
        max_len: maximum (padded) sequence length in the batch.
        input_ids: padded input token ids of shape [batch_size, max_len].
        seq_lens_this_time: number of tokens each request contributes this step.
        use_speculate_method: whether speculative decoding is enabled.
        draft_tokens: draft tokens proposed by the speculative method, if any.
        seq_lens_encoder: per-request encoder (prefill) sequence lengths.

    Returns:
        ids_remove_padding: flattened token ids with padding removed.
        cum_offsets: cumulative padding offset for each sequence.
        padding_offset: per-token offset mapping back to the padded layout.
        cu_seqlens_q: cumulative query sequence lengths for attention.
        cu_seqlens_k: cumulative key sequence lengths for attention.
    """
    # Remove padding
    cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
    token_num = paddle.sum(seq_lens_this_time)
    if use_speculate_method:
        (
            ids_remove_padding,
            cum_offsets,
            padding_offset,
            cu_seqlens_q,
            cu_seqlens_k,
        ) = speculate_get_padding_offset(
            input_ids,
            draft_tokens,
            cum_offsets_now,
            token_num,
            seq_lens_this_time,
            seq_lens_encoder,
        )
    else:
        (
            ids_remove_padding,
            cum_offsets,
            padding_offset,
            cu_seqlens_q,
            cu_seqlens_k,
        ) = get_padding_offset(input_ids, cum_offsets_now, token_num,
                               seq_lens_this_time)
    return (
        ids_remove_padding,
        cum_offsets,
        padding_offset,
        cu_seqlens_q,
        cu_seqlens_k,
    )
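

# A minimal usage sketch (illustrative only; `share_inputs` and its keys are
# assumptions, not part of this module). Padding is stripped once per step,
# and the resulting cu_seqlens tensors feed variable-length attention:
#
#     (ids_remove_padding, cum_offsets, padding_offset,
#      cu_seqlens_q, cu_seqlens_k) = pre_process(
#          max_len,
#          share_inputs["input_ids"],
#          share_inputs["seq_lens_this_time"],
#          use_speculate_method=False,
#          draft_tokens=None,
#          seq_lens_encoder=share_inputs["seq_lens_encoder"],
#      )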


def post_process(tokens: paddle.Tensor, model_output: ModelOutputData) -> None:
    """Post-processing steps after completing a single token generation."""
    # 1. Set the stop value: advance step_idx only for sequences that are
    #    still generating, then stop any sequence that reached max_dec_len.
    paddle.assign(
        paddle.where(
            model_output.stop_flags,
            model_output.step_idx,
            model_output.step_idx + 1,
        ),
        model_output.step_idx,
    )
    length_cond = paddle.greater_equal(model_output.step_idx,
                                       model_output.max_dec_len)
    paddle.assign(
        paddle.logical_or(model_output.stop_flags, length_cond),
        model_output.stop_flags,
    )
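    # Worked example (illustrative values, not taken from a real run): with
    # step_idx = [3, 7], stop_flags = [False, True], max_dec_len = [8, 8],
    # the masked update gives step_idx = [4, 7]; both are below 8, so
    # length_cond = [False, False] and stop_flags remains [False, True].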

    if model_output.use_stop_seqs:
        set_stop_value_multi_seqs(
            tokens,
            model_output.pre_ids,
            model_output.step_idx,
            model_output.stop_flags,
            model_output.seq_lens_this_time,
            model_output.stop_seqs,
            model_output.stop_seqs_len,
            model_output.eos_token_id,
        )
    else:
        set_stop_value_multi_ends(
            tokens,
            model_output.stop_flags,
            model_output.seq_lens_this_time,
            model_output.eos_token_id,
            model_output.next_tokens,
            False,
        )  # multi ends

    # 2. Update the input buffers of the model in place so the next decoding
    #    step sees the freshly sampled tokens and refreshed sequence lengths.
    with paddle.framework._no_check_dy2st_diff():
        update_inputs(
            model_output.stop_flags,
            model_output.not_need_stop,
            model_output.seq_lens_this_time,
            model_output.seq_lens_encoder,
            model_output.seq_lens_decoder,
            model_output.input_ids,
            model_output.stop_nums,
            tokens,
            model_output.is_block_step,
        )
    # 3. Transmit the model's output and the stop-generation signal via a
    #    message queue. This approach will be retired in the future.
    if model_output.output_via_mq:
        if model_output.msg_queue_id is None:
            save_output(
                tokens,
                model_output.not_need_stop,
                model_output.mp_rank,
                model_output.use_ep,
            )
        else:
            save_output_dynamic(
                tokens,
                model_output.not_need_stop,
                model_output.mp_rank,
                model_output.msg_queue_id,
                model_output.use_ep,
            )
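
# Where post_process sits in the decode loop (a sketch; `model`, `sampler`,
# and the surrounding loop are assumptions, not part of this module):
#
#     logits = model(ids_remove_padding, ...)
#     sampled_tokens = sampler(logits)             # one token per sequence
#     post_process(sampled_tokens, model_output)   # stop flags + buffer update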


def step_cuda(share_inputs: Dict[str, paddle.Tensor], block_size: int,
              enc_dec_block_num: int) -> None:
    """
    Block-table bookkeeping between decoding steps (see step_paddle).

    TODO(gongshaotian): normalize the naming
    """
    step_paddle(
        share_inputs["stop_flags"],
        share_inputs["seq_lens_this_time"],
        share_inputs["step_seq_lens_encoder"],
        share_inputs["seq_lens_encoder"],
        share_inputs["seq_lens_decoder"],
        share_inputs["block_tables"],
        share_inputs["encoder_block_lens"],
        share_inputs["is_block_step"],
        share_inputs["step_block_list"],
        share_inputs["step_lens"],
        share_inputs["recover_block_list"],
        share_inputs["recover_lens"],
        share_inputs["need_block_list"],
        share_inputs["need_block_len"],
        share_inputs["used_list_len"],
        share_inputs["free_list"],
        share_inputs["free_list_len"],
        share_inputs["input_ids"],
        share_inputs["pre_ids"],
        share_inputs["step_idx"],
        share_inputs["next_tokens"],
        share_inputs["first_token_ids"],
        block_size,
        enc_dec_block_num,
    )
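
# A minimal usage sketch (illustrative; the `share_inputs` dict must already be
# populated by the worker, and the `block_size` / `enc_dec_block_num` values
# below are placeholder assumptions, typically taken from the cache config):
#
#     step_cuda(share_inputs, block_size=64, enc_dec_block_num=2)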