[LLM] First commit the llm deployment code

2025-10-04 08:16:42 +08:00 · 2025-06-09 19:20:15 +08:00
parent 980c0a1d2c
commit 684703fd72
11814 changed files with 127294 additions and 1293102 deletions
--- a/fastdeploy/engine/request.py
+++ b/fastdeploy/engine/request.py
@@ -0,0 +1,353 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from __future__ import annotations
+import time
+import numpy
+from dataclasses import dataclass, asdict, fields
+from typing import TYPE_CHECKING, Optional, Union, Any
+from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.utils import data_processor_logger
+
+from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.utils import data_processor_logger
+
+
+@dataclass
+class Request:
+    """A class representing an inference request to the LLM engine.
+    
+    Attributes:
+        request_id: Unique identifier for the request
+        prompt: Input prompt text or list of prompts
+        prompt_token_ids: Token IDs of the input prompt
+        prompt_token_ids_len: Length of prompt token IDs
+        messages: List of message dictionaries (for chat models)
+        history: Conversation history (for chat models)
+        system: System message (for chat models)
+        sampling_params: Parameters controlling text generation
+        eos_token_ids: List of end-of-sequence token IDs
+        arrival_time: Timestamp when request was received
+        preprocess_start_time: Timestamp when preprocessing started
+        preprocess_end_time: Timestamp when preprocessing completed
+        multimodal_inputs: Dictionary of multimodal inputs (images, audio etc.)
+        raw_request: Flag indicating if this is a raw request
+    """
+    def __init__(
+        self,
+        request_id: str,
+        prompt: Optional[Union[str, list[str]]],
+        prompt_token_ids: Optional[list[int]],
+        prompt_token_ids_len: Optional[int],
+        messages: Optional[list[list[dict[str, Any]]]],
+        history: Optional[list[list[str]]],
+        system: Optional[Union[str, list[str]]],
+        sampling_params: SamplingParams,
+        eos_token_ids: Optional[list[int]],
+        arrival_time: float,
+        preprocess_start_time: Optional[float] = None,
+        preprocess_end_time: Optional[float] = None,
+        multimodal_inputs: Optional[dict] = None,
+        raw_request: bool = True
+    ) -> None:
+        self.request_id = request_id
+        self.prompt = prompt
+        self.prompt_token_ids = prompt_token_ids
+        self.prompt_token_ids_len = prompt_token_ids_len
+        self.messages = messages
+        self.system = system
+        self.sampling_params = sampling_params
+        self.history = history
+        self.eos_token_ids = eos_token_ids
+
+        self.arrival_time = arrival_time
+        self.preprocess_start_time = preprocess_start_time
+        self.preprocess_end_time = preprocess_end_time
+        self.raw_request = raw_request
+
+
+        # Multi-modal related
+        self.multimodal_inputs = multimodal_inputs
+
+    @classmethod
+    def from_dict(cls, d: dict):
+        """Create a Request instance from a dictionary.
+        
+        Args:
+            d: Dictionary containing request parameters
+            
+        Returns:
+            Request: A new Request instance initialized with values from the dictionary
+        """
+        data_processor_logger.debug(f"{d}")
+        sampling_params = SamplingParams.from_dict(d)
+        return cls(
+            request_id=d["request_id"],
+            prompt=d.get("prompt"),
+            prompt_token_ids=d.get("prompt_token_ids"),
+            prompt_token_ids_len=d.get("prompt_token_ids_len"),
+            messages=d.get("messages"),
+            system=d.get("system"),
+            history=d.get("history"),
+            sampling_params=sampling_params,
+            eos_token_ids=d.get("eos_token_ids"),
+            arrival_time=d.get("arrival_time", time.time()),
+            preprocess_start_time=d.get("preprocess_start_time"),
+            preprocess_end_time=d.get("preprocess_end_time"),
+            multimodal_inputs=d.get("multimodal_inputs"),
+            raw_request=d.get("raw_request", True)
+        )
+
+    def to_dict(self) -> dict:
+        """Convert the Request object into a serializable dictionary.
+        
+        Returns:
+            dict: A dictionary containing all request attributes and sampling parameters
+        """
+        data = {
+            "request_id": self.request_id,
+            "prompt": self.prompt,
+            "prompt_token_ids": self.prompt_token_ids,
+            "prompt_token_ids_len": self.prompt_token_ids_len,
+            "messages": self.messages,
+            "system": self.system,
+            "history": self.history,
+            "eos_token_ids": self.eos_token_ids,
+            "arrival_time": self.arrival_time,
+            "preprocess_start_time": self.preprocess_start_time,
+            "preprocess_end_time": self.preprocess_end_time,
+            "multimodal_inputs": self.multimodal_inputs,
+            "raw_request": self.raw_request
+        }
+        data.update(asdict(self.sampling_params))
+        return data
+
+    def get(self, key: str, default_value=None):
+        """Get an attribute value from either the Request or its sampling parameters.
+        
+        Args:
+            key: Attribute name to retrieve
+            default_value: Default value to return if attribute not found
+            
+        Returns:
+            The attribute value if found, otherwise default_value
+        """
+        if hasattr(self, key):
+            return getattr(self, key)
+        elif hasattr(self.sampling_params, key):
+            return getattr(self.sampling_params, key)
+        else:
+            return default_value
+
+    def set(self, key, value):
+        """Set an attribute value on either the Request or its sampling parameters.
+        
+        Args:
+            key: Attribute name to set
+            value: Value to assign to the attribute
+        """
+        if hasattr(self.sampling_params, key):
+            setattr(self.sampling_params, key, value)
+        else:
+            setattr(self, key, value)
+
+    def __repr__(self) -> str:
+        return (f"Request(request_id={self.request_id}, "
+                f"prompt={self.prompt!r}, "
+                f"prompt_token_ids={self.prompt_token_ids}, "
+                f"sampling_params={self.sampling_params})")
+
+
+@dataclass
+class CompletionOutput:
+    """The output data of one completion output of a request.
+
+    Args:
+        index: The index of the output in the request.
+        text: The generated output text.
+        token_ids: The token IDs of the generated output text.
+    """
+
+    index: int
+    token_ids: list[int]
+    text: Optional[str] = None
+    reasoning_content: Optional[str] = None
+
+    @classmethod
+    def from_dict(cls, req_dict: dict[str, Any]) -> 'CompletionOutput':
+        """Create instance from dict arguments"""
+        return cls(**{
+            field.name: req_dict[field.name] if field.name in req_dict else field.default
+            for field in fields(cls)
+        })
+
+    def __repr__(self) -> str:
+        return (f"CompletionOutput(index={self.index}, "
+                f"text={self.text!r}, "
+                f"token_ids={self.token_ids}, "
+                f"reasoning_content={self.reasoning_content!r}")
+
+
+@dataclass
+class RequestMetrics:
+    """Metrics associated with a request.
+
+    Attributes:
+        arrival_time: The time when the request arrived.
+        inference_start_time: The time when the inference started.
+        first_token_time: The time when the first token was generated.
+        time_in_queue: The time the request spent in the queue.
+        model_forward_time: The time spent in the model forward pass when this
+                            request was in the batch.
+        model_execute_time: The time spent in the model execute function. This
+                            will include model forward, block/sync across
+                            workers, cpu-gpu sync time and sampling time.
+        request_start_time: Time to accept the request
+
+    """
+    arrival_time: float
+    inference_start_time: Optional[float] = None
+    first_token_time: Optional[float] = None
+    time_in_queue: Optional[float] = None
+    preprocess_cost_time: Optional[float] = None
+    model_forward_time: Optional[float] = None
+    model_execute_time: Optional[float] = None
+    request_start_time: Optional[float] = None
+
+    @classmethod
+    def from_dict(cls, req_dict: dict[str, Any]) -> 'RequestMetrics':
+        """Create instance from dict arguments"""
+        return cls(**{
+            field.name: req_dict[field.name] if field.name in req_dict else field.default
+            for field in fields(cls)
+        })
+
+
+class RequestOutput:
+    """The output data of a completion request to the LLM.
+
+    Args:
+        request_id: The unique ID of the request.
+        prompt: The prompt string of the request.
+                For encoder/decoder models, this is the
+                decoder input prompt.
+        prompt_token_ids: The token IDs of the prompt.
+                          For encoder/decoder models, this is the
+                          decoder input prompt token ids.
+        prompt_logprobs: The log probabilities to return per prompt token.
+        outputs: The output sequences of the request.
+        finished: Whether the whole request is finished.
+        metrics: Metrics associated with the request.
+        lora_request: The LoRA request that was used to generate the output.
+        encoder_prompt: The encoder prompt string of the request.
+                        None if decoder-only.
+        encoder_prompt_token_ids: The token IDs of the encoder prompt.
+                                  None if decoder-only.
+        num_cached_tokens: The number of tokens with prefix cache hit.
+    """
+
+    def __init__(
+        self,
+        request_id: str,
+        prompt: Optional[str] = None,
+        prompt_token_ids: Optional[list[int]] = None,
+        outputs: CompletionOutput = None,
+        finished: bool = False,
+        metrics: Optional[RequestMetrics] = None,
+        num_cached_tokens: Optional[int] = 0,
+        error_code: Optional[int] = 200,
+        error_msg: Optional[str] = None,
+    ) -> None:
+        self.request_id = request_id
+        self.prompt = prompt
+        self.prompt_token_ids = prompt_token_ids
+        self.outputs = outputs
+        self.finished = finished
+        self.metrics = metrics
+        self.num_cached_tokens = num_cached_tokens
+        self.error_code = error_code
+        self.error_msg = error_msg
+
+    def add(self, next_output: "RequestOutput") -> None:
+        """Merge another RequestOutput into this one.
+        
+        Args:
+            next_output: The RequestOutput to merge into this one
+            
+        Updates:
+            - Combines output sequences
+            - Updates finish status
+            - Calculates timing metrics
+        """
+        self.prompt = next_output.prompt
+        self.prompt_token_ids = next_output.prompt_token_ids
+        self.finished |= next_output.finished
+        self.outputs.index = next_output.outputs.index
+        self.outputs.token_ids.extend(next_output.outputs.token_ids)
+        if next_output.metrics.arrival_time is not None and self.metrics.inference_start_time is not None:
+            self.metrics.model_forward_time = next_output.metrics.arrival_time - \
+                self.metrics.inference_start_time
+        if next_output.metrics.arrival_time is not None and self.metrics.arrival_time is not None:
+            self.metrics.model_execute_time = next_output.metrics.arrival_time - \
+                self.metrics.arrival_time
+
+    def __repr__(self) -> str:
+        return (f"RequestOutput(request_id={self.request_id}, "
+                f"prompt={self.prompt!r}, "
+                f"prompt_token_ids={self.prompt_token_ids}, "
+                f"outputs={self.outputs}, "
+                f"metrics={self.metrics}, "
+                f"num_cached_tokens={self.num_cached_tokens})")
+
+    @classmethod
+    def from_dict(cls, d: dict):
+        """Create a RequestOutput instance from a dictionary.
+        
+        Args:
+            d: Dictionary containing request output parameters
+            
+        Returns:
+            RequestOutput: A new RequestOutput instance initialized with values from the dictionary
+        """
+        completion_output = CompletionOutput.from_dict(d.pop("outputs"))
+        metrics = RequestMetrics.from_dict(d.pop("metrics"))
+        return RequestOutput(**d, outputs=completion_output, metrics=metrics)
+
+    def to_dict(self):
+        """Convert the RequestOutput object into a serializable dictionary.
+        
+        Returns:
+            dict: A dictionary containing all request output attributes,
+                  with token IDs converted to lists if necessary
+        """
+        if self.prompt_token_ids is None:
+            self.prompt_token_ids = []
+
+        if type(self.prompt_token_ids) is numpy.ndarray:
+            self.prompt_token_ids = self.prompt_token_ids.tolist()
+
+        return {
+            "request_id": self.request_id,
+            "prompt": self.prompt,
+            "prompt_token_ids": self.prompt_token_ids,
+            "outputs": None if self.outputs is None else asdict(self.outputs),
+            "finished": self.finished,
+            "metrics": None if self.metrics is None else asdict(self.metrics),
+            "num_cached_tokens": self.num_cached_tokens,
+            "error_code": self.error_code,
+            "error_msg": self.error_msg,
+        }
+