# FastDeploy/fastdeploy/worker/output.py
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from dataclasses import dataclass
from typing import Optional
import paddle
@dataclass
class ModelOutputData:
    """Per-step output data produced by ``execute_model``.

    NOTE: the original used bare string literals between fields as
    "attribute docstrings"; those are dead statements (only the first
    string in a class body becomes ``__doc__``), so they are replaced
    here with ``#`` comments. Field names, order, and annotations are
    unchanged, keeping the generated ``__init__`` signature identical.
    """

    # Tokens generated in the previous step.
    next_tokens: paddle.Tensor
    # Flags indicating whether decoding should stop.
    stop_flags: paddle.Tensor
    # Index of the current decoding step.
    step_idx: int
    # Maximum decoding length.
    max_dec_len: int
    # Previous ids used for decoding.
    pre_ids: paddle.Tensor
    # Sequence lengths for this step.
    seq_lens_this_time: paddle.Tensor
    # Eos token ID.
    eos_token_id: paddle.Tensor
    # Indicates if stopping conditions should be ignored.
    not_need_stop: bool
    # Sequence lengths of the encoder.
    seq_lens_encoder: paddle.Tensor
    # Sequence lengths of the decoder.
    seq_lens_decoder: paddle.Tensor
    # Indicates if this is a blocking step.
    is_block_step: bool
    # The ID of the message queue.
    msg_queue_id: int
    # The model parallel rank.
    mp_rank: int
    # Use EP (expert) parallel.
    use_ep: bool
    # Input ids.
    input_ids: paddle.Tensor
    # Stop nums for every sequence.
    stop_nums: paddle.Tensor
    # For speculative decoding: full hidden states before lm_head.
    full_hidden_states: paddle.Tensor
    # Draft tokens for every sequence.
    draft_tokens: paddle.Tensor
    # Draft token num for every sequence.
    actual_draft_token_num: paddle.Tensor
    # Accepted tokens in current step.
    accept_tokens: paddle.Tensor
    # The number of accepted tokens in current step.
    accept_num: paddle.Tensor
@dataclass
class ModelRunnerOutput:
    """[WIP] ModelRunnerOutput is serialized and sent to the scheduler process.

    NOTE: the original bare-string field docs were dead statements; they
    are replaced with ``#`` comments. Fields, order, and annotations are
    unchanged, so the generated ``__init__`` signature is identical.
    """

    # Request ids, shape [num_reqs].
    req_ids: list[str]
    # Mapping req_id -> index.
    req_id_to_index: dict[str, int]
    # Sampled token ids, shape [num_reqs, num_generated_tokens].
    sampled_token_ids: list[list[int]]
    # Speculative token ids, shape [num_reqs, num_spec_tokens]; may be None.
    spec_token_ids: Optional[list[list[int]]]