[LLM] First commit the llm deployment code

2025-10-05 00:33:03 +08:00 · 2025-06-09 19:20:15 +08:00
parent 980c0a1d2c
commit 684703fd72
11814 changed files with 127294 additions and 1293102 deletions
--- a/fastdeploy/worker/output.py
+++ b/fastdeploy/worker/output.py
@@ -0,0 +1,94 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from dataclasses import dataclass
+from typing import Optional
+
+import paddle
+
+
+@dataclass
+class PreProcessOutputData:
+    """Empty dataclass with no fields; presumably a placeholder for pre-process output (TODO confirm)."""
+
+
+@dataclass
+class ModelOutputData:
+    """Model output fields: token/length tensors, decoding counters, and output-routing flags."""
+    # Tokens generated in the previous step
+    next_tokens: paddle.Tensor
+
+    # Flags indicating whether decoding should stop
+    stop_flags: paddle.Tensor
+
+    # Index of the current decoding step
+    step_idx: int
+
+    # Maximum decoding length
+    max_dec_len: int
+
+    # Previous ids used for decoding
+    pre_ids: paddle.Tensor
+
+    # Sequence lengths for this step
+    seq_lens_this_time: paddle.Tensor
+
+    # Lengths of the stop sequences
+    stop_seqs_len: paddle.Tensor
+
+    # Indicates if stopping conditions should be ignored (NOTE(review): confirm polarity with users)
+    not_need_stop: bool
+
+    # Sequence lengths of the encoder
+    seq_lens_encoder: paddle.Tensor
+
+    # Sequence lengths of the decoder
+    seq_lens_decoder: paddle.Tensor
+
+    # Indicates if this is a blocking step
+    is_block_step: bool
+
+    # Whether to send output via the message queue
+    output_via_mq: bool
+
+    # The ID of the message queue
+    msg_queue_id: int
+
+    # The model parallel rank
+    mp_rank: int
+
+    # Whether EP parallelism is used (EP presumably means expert parallel; TODO confirm)
+    use_ep: bool
+
+
+@dataclass
+class ModelRunnerOutput:
+    """
+    [WIP] ModelRunnerOutput is serialized and sent to the scheduler process.
+    """
+    # Request ids; shape [num_reqs]
+    req_ids: list[str]
+
+    # Maps req_id -> index (presumably the position in req_ids; TODO confirm)
+    req_id_to_index: dict[str, int]
+
+    # Sampled token ids; shape [num_reqs, num_generated_tokens]
+    sampled_token_ids: list[list[int]]
+
+    # Optional spec token ids; shape [num_reqs, num_spec_tokens]
+    spec_token_ids: Optional[list[list[int]]]
+
+    # TODO(gongshaotian): supplement other output info