mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 00:33:03 +08:00
Sync v2.0 version of code to github repo
This commit is contained in:
@@ -20,75 +20,141 @@ from typing import Optional
|
||||
import paddle
|
||||
|
||||
|
||||
@dataclass
class PreProcessOutputData:
    """Container for pre-processing results.

    NOTE(review): no fields are defined in this chunk — presumably a
    placeholder to be filled in, or fields live outside this view;
    confirm before relying on it.
    """
@dataclass
class ModelOutputData:
    """Per-step decoding state and results produced by execute_model."""

    # Tokens generated in the previous step
    next_tokens: paddle.Tensor
    # Flags indicating whether decoding should stop
    stop_flags: paddle.Tensor
    # Index of the current decoding step
    step_idx: int
    # Maximum decoding length
    max_dec_len: int
    # Previous ids used for decoding
    pre_ids: paddle.Tensor
    # Sequence lengths for this step
    seq_lens_this_time: paddle.Tensor
    # Lengths of the stop sequences
    stop_seqs_len: paddle.Tensor
    # Eos token ID
    eos_token_id: paddle.Tensor
    # Indicates if stopping conditions should be ignored
    not_need_stop: bool
    # Sequence lengths of the encoder
    seq_lens_encoder: paddle.Tensor
    # Sequence lengths of the decoder
    seq_lens_decoder: paddle.Tensor
    # Indicates if this is a blocking step
    is_block_step: bool
    # Use message queue output
    output_via_mq: bool
    # The ID of the message queue
    msg_queue_id: int
    # The model parallel rank
    mp_rank: int
    # Use EP (expert) parallel
    use_ep: bool
    # Input ids
    input_ids: paddle.Tensor
    # Stop nums for every sequence
    stop_nums: paddle.Tensor
    # For speculative decoding: full hidden states before lm_head
    full_hidden_states: paddle.Tensor
    # Draft tokens for every sequence (speculative decoding)
    draft_tokens: paddle.Tensor
    # Draft token num for every sequence
    actual_draft_token_num: paddle.Tensor
    # Accepted tokens in the current step
    accept_tokens: paddle.Tensor
    # Number of accepted tokens in the current step
    accept_num: paddle.Tensor
||||
|
||||
@dataclass
class ModelRunnerOutput:
    """[WIP] ModelRunnerOutput is serialized and sent to the scheduler process."""

    # [num_reqs]
    req_ids: list[str]
    # req_id -> index
    req_id_to_index: dict[str, int]
    # [num_reqs, num_generated_tokens]
    sampled_token_ids: list[list[int]]
    # [num_reqs, num_spec_tokens]; None when speculative decoding is off
    spec_token_ids: Optional[list[list[int]]]

    # TODO(gongshaotian): supplement other outputs info
Reference in New Issue
Block a user