"""
|
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
|
|
from __future__ import annotations

import time
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union

from fastapi import UploadFile
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
                      ValidationInfo, field_validator, model_validator)
from typing_extensions import TypeAlias

# from openai.types.chat import ChatCompletionMessageParam
from fastdeploy.entrypoints.chat_utils import ChatCompletionMessageParam, parse_chat_messages
from fastdeploy.engine.sampling_params import SamplingParams


class ErrorResponse(BaseModel):
    """
    Standard error response format following OpenAI API specification.

    Attributes:
        object (str): Always "error"
        message (str): Human-readable error message
        code (int): HTTP status code
    """
    object: str = "error"
    message: str
    code: int


class PromptTokenUsageInfo(BaseModel):
    """
    Token usage information specific to prompt processing.

    Attributes:
        cached_tokens (Optional[int]): Number of tokens served from cache
    """
    cached_tokens: Optional[int] = None


class UsageInfo(BaseModel):
    """
    Token usage statistics for API requests.

    Attributes:
        prompt_tokens (int): Number of tokens in the prompt
        total_tokens (int): Total tokens used (prompt + completion)
        completion_tokens (Optional[int]): Tokens generated in completion
        prompt_tokens_details (Optional[PromptTokenUsageInfo]): Detailed prompt token info
    """
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0
    prompt_tokens_details: Optional[PromptTokenUsageInfo] = None


class ChatMessage(BaseModel):
    """
    Single message in a chat conversation.

    Attributes:
        role (str): Role of the message sender (system/user/assistant)
        content (str): Text content of the message
        reasoning_content (Optional[str]): Additional reasoning/explanation
    """
    role: str
    content: str
    reasoning_content: Optional[str] = None


class ChatCompletionResponseChoice(BaseModel):
    """
    Single choice in a chat completion response.

    Attributes:
        index (int): Choice index
        message (ChatMessage): Generated chat message
        finish_reason (Optional[Literal["stop", "length"]]): Reason for stopping generation
    """
    index: int
    message: ChatMessage
    finish_reason: Optional[Literal["stop", "length"]]


class ChatCompletionResponse(BaseModel):
    """
    Standard chat completion response format.

    Attributes:
        id (str): Unique request identifier
        object (str): Always "chat.completion"
        created (int): Unix timestamp of creation
        model (str): Model name used
        choices (List[ChatCompletionResponseChoice]): Generated response choices
        usage (UsageInfo): Token usage statistics
    """
    id: str
    object: str = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[ChatCompletionResponseChoice]
    usage: UsageInfo


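# A minimal construction sketch (not from the source; shown only to illustrate how the
# response models above nest). The id, model, and token counts are illustrative values,
# and serialization assumes pydantic v2, which the imports above suggest.
# >>> resp = ChatCompletionResponse(
# ...     id="chatcmpl-123",
# ...     model="default",
# ...     choices=[ChatCompletionResponseChoice(
# ...         index=0,
# ...         message=ChatMessage(role="assistant", content="Hello!"),
# ...         finish_reason="stop",
# ...     )],
# ...     usage=UsageInfo(prompt_tokens=5, completion_tokens=2, total_tokens=7),
# ... )
# >>> resp.model_dump_json()  # JSON body returned to the client

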
class DeltaMessage(BaseModel):
    """
    Incremental message update for streaming responses.

    Attributes:
        role (Optional[str]): Role of the message sender
        content (Optional[str]): Partial message content
        token_ids (Optional[List[int]]): Token IDs for the delta content
        reasoning_content (Optional[str]): Partial reasoning content
    """
    role: Optional[str] = None
    content: Optional[str] = None
    token_ids: Optional[List[int]] = None
    reasoning_content: Optional[str] = None


class ChatCompletionResponseStreamChoice(BaseModel):
    """
    Streaming choice in a chat completion response.

    Attributes:
        index (int): Choice index
        delta (DeltaMessage): Incremental message update
        finish_reason (Optional[Literal["stop", "length"]]): Reason for stopping
        arrival_time (Optional[float]): Timestamp when chunk was generated
    """
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]] = None
    arrival_time: Optional[float] = None


class ChatCompletionStreamResponse(BaseModel):
    """
    Streaming chat completion response format.

    Attributes:
        id (str): Unique request identifier
        object (str): Always "chat.completion.chunk"
        created (int): Unix timestamp of creation
        model (str): Model name used
        choices (List[ChatCompletionResponseStreamChoice]): Streaming choices
        usage (Optional[UsageInfo]): Token usage (if enabled in stream options)
    """
    id: str
    object: str = "chat.completion.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[ChatCompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = None


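# A minimal streaming-chunk sketch (not from the source). The `data: ...` framing shown is
# one common server-sent-events convention, not necessarily what the serving layer emits;
# serialization again assumes pydantic v2.
# >>> chunk = ChatCompletionStreamResponse(
# ...     id="chatcmpl-123",
# ...     model="default",
# ...     choices=[ChatCompletionResponseStreamChoice(
# ...         index=0,
# ...         delta=DeltaMessage(role="assistant", content="Hel"),
# ...     )],
# ... )
# >>> f"data: {chunk.model_dump_json()}\n\n"

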
class CompletionResponseChoice(BaseModel):
    """
    Single choice in a text completion response.

    Attributes:
        index (int): Choice index
        text (str): Generated text
        token_ids (Optional[List[int]]): Token IDs for generated text
        arrival_time (Optional[float]): Timestamp when generated
        logprobs (Optional[int]): Log probabilities
        reasoning_content (Optional[str]): Additional reasoning
        finish_reason (Optional[Literal["stop", "length"]]): Reason for stopping
    """
    index: int
    text: str
    token_ids: Optional[List[int]] = None
    arrival_time: Optional[float] = None
    logprobs: Optional[int] = None
    reasoning_content: Optional[str] = None
    finish_reason: Optional[Literal["stop", "length"]]


class CompletionResponse(BaseModel):
    """
    Standard text completion response format.

    Attributes:
        id (str): Unique request identifier
        object (str): Always "text_completion"
        created (int): Unix timestamp of creation
        model (str): Model name used
        choices (List[CompletionResponseChoice]): Generated response choices
        usage (UsageInfo): Token usage statistics
    """
    id: str
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseChoice]
    usage: UsageInfo


class CompletionResponseStreamChoice(BaseModel):
    """
    Streaming choice in a text completion response.

    Attributes:
        index (int): Choice index
        text (str): Partial generated text
        arrival_time (Optional[float]): Timestamp when chunk was generated
        token_ids (Optional[List[int]]): Token IDs for partial text
        logprobs (Optional[float]): Log probabilities
        reasoning_content (Optional[str]): Partial reasoning
        finish_reason (Optional[Literal["stop", "length"]]): Reason for stopping
    """
    index: int
    text: str
    arrival_time: Optional[float] = None
    token_ids: Optional[List[int]] = None
    logprobs: Optional[float] = None
    reasoning_content: Optional[str] = None
    finish_reason: Optional[Literal["stop", "length"]] = None


class CompletionStreamResponse(BaseModel):
    """
    Streaming text completion response format.

    Attributes:
        id (str): Unique request identifier
        object (str): Always "text_completion"
        created (int): Unix timestamp of creation
        model (str): Model name used
        choices (List[CompletionResponseStreamChoice]): Streaming choices
        usage (Optional[UsageInfo]): Token usage (if enabled in stream options)
    """
    id: str
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = None


class StreamOptions(BaseModel):
    """
    Configuration options for streaming responses.

    Attributes:
        include_usage (Optional[bool]): Whether to include usage stats
        continuous_usage_stats (Optional[bool]): Whether to send incremental usage
    """
    include_usage: Optional[bool] = True
    continuous_usage_stats: Optional[bool] = False


class CompletionRequest(BaseModel):
    """
    Text completion request parameters following OpenAI API specification.

    Attributes:
        model (Optional[str]): Model name (default: "default")
        prompt (Union[List[int], List[List[int]], str, List[str]]): Input prompt(s)
        best_of (Optional[int]): Number of samples to generate
        echo (Optional[bool]): Whether to echo the prompt
        frequency_penalty (Optional[float]): Penalize repeated tokens
        logprobs (Optional[int]): Number of logprobs to return
        max_tokens (Optional[int]): Maximum tokens to generate (default: 16)
        n (int): Number of completions (default: 1)
        presence_penalty (Optional[float]): Penalize new tokens
        seed (Optional[int]): Random seed
        stop (Optional[Union[str, List[str]]]): Stop sequences
        stream (Optional[bool]): Whether to stream response
        stream_options (Optional[StreamOptions]): Streaming configuration
        suffix (Optional[dict]): Suffix to append
        temperature (Optional[float]): Sampling temperature
        top_p (Optional[float]): Nucleus sampling probability
        user (Optional[str]): User identifier
        repetition_penalty (Optional[float]): Repetition penalty factor
        stop_token_ids (Optional[List[int]]): Token IDs to stop generation
    """
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/completions/create
    model: Optional[str] = "default"
    prompt: Union[List[int], List[List[int]], str, List[str]]
    best_of: Optional[int] = None
    echo: Optional[bool] = False
    frequency_penalty: Optional[float] = 0.0
    logprobs: Optional[int] = None
    max_tokens: Optional[int] = 16
    n: int = 1
    presence_penalty: Optional[float] = 0.0
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
    stream: Optional[bool] = False
    stream_options: Optional[StreamOptions] = None
    suffix: Optional[dict] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    user: Optional[str] = None

    # doc: begin-completion-sampling-params
    repetition_penalty: Optional[float] = None
    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
    # doc: end-completion-sampling-params

    def to_dict_for_infer(self, request_id=None, prompt=None):
        """
        Convert the request parameters into a dictionary.

        Returns:
            dict: request parameters in dict format
        """
        req_dict = {}
        if request_id is not None:
            req_dict['request_id'] = request_id
        # Copy every non-None request field.
        for key, value in self.dict().items():
            if value is not None:
                req_dict[key] = value
        # Extra engine-specific parameters may be passed through `suffix`.
        if self.suffix is not None:
            for key, value in self.suffix.items():
                req_dict[key] = value
        if prompt is not None:
            req_dict['prompt'] = prompt
            # A prompt given as token IDs is forwarded as `prompt_token_ids` instead.
            if isinstance(prompt[0], int):
                req_dict["prompt_token_ids"] = prompt
                del req_dict["prompt"]

        return req_dict

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        """
        Validate stream options.
        """
        if data.get("stream_options") and not data.get("stream"):
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data


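# A minimal usage sketch for CompletionRequest (not from the source; the request_id and the
# pre-tokenized prompt below are illustrative values).
# >>> req = CompletionRequest(prompt=[1, 2, 3], max_tokens=32)
# >>> d = req.to_dict_for_infer(request_id="cmpl-1", prompt=[1, 2, 3])
# >>> d["prompt_token_ids"]   # token-id prompts travel as prompt_token_ids, not prompt
# [1, 2, 3]
# Supplying stream_options while stream is False is rejected by validate_stream_options
# (pydantic surfaces the ValueError as a ValidationError):
# >>> CompletionRequest(prompt="hi", stream_options=StreamOptions())  # raises

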
class ChatCompletionRequest(BaseModel):
    """
    Chat completion request parameters following OpenAI API specification.

    Attributes:
        messages (Union[List[ChatCompletionMessageParam], List[int]]): Conversation history
        model (Optional[str]): Model name (default: "default")
        frequency_penalty (Optional[float]): Penalize repeated tokens
        max_tokens (Optional[int]): Deprecated - max tokens to generate
        max_completion_tokens (Optional[int]): Max tokens in completion
        n (Optional[int]): Number of completions (default: 1)
        presence_penalty (Optional[float]): Penalize new tokens
        seed (Optional[int]): Random seed
        stop (Optional[Union[str, List[str]]]): Stop sequences
        stream (Optional[bool]): Whether to stream response
        stream_options (Optional[StreamOptions]): Streaming configuration
        temperature (Optional[float]): Sampling temperature
        top_p (Optional[float]): Nucleus sampling probability
        user (Optional[str]): User identifier
        metadata (Optional[dict]): Additional metadata
        repetition_penalty (Optional[float]): Repetition penalty factor
        stop_token_ids (Optional[List[int]]): Token IDs to stop generation
    """
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/chat/create
    messages: Union[List[ChatCompletionMessageParam], List[int]]
    model: Optional[str] = "default"
    frequency_penalty: Optional[float] = 0.0
    # remove max_tokens when the field is removed from the OpenAI API
    max_tokens: Optional[int] = Field(
        default=None,
        deprecated='max_tokens is deprecated in favor of the max_completion_tokens field')
    max_completion_tokens: Optional[int] = None
    n: Optional[int] = 1
    presence_penalty: Optional[float] = 0.0
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
    stream: Optional[bool] = False
    stream_options: Optional[StreamOptions] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    user: Optional[str] = None
    metadata: Optional[dict] = None

    # doc: begin-chat-completion-sampling-params
    repetition_penalty: Optional[float] = None
    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
    # doc: end-chat-completion-sampling-params

    def to_dict_for_infer(self, request_id=None):
        """
        Convert the request parameters into a dictionary.

        Returns:
            dict: request parameters in dict format
        """
        req_dict = {}
        if request_id is not None:
            req_dict['request_id'] = request_id

        # Extra engine-specific parameters may be passed through `metadata`.
        if self.metadata is not None:
            for key, value in self.metadata.items():
                req_dict[key] = value

        # Copy every non-None request field.
        for key, value in self.dict().items():
            if value is not None:
                req_dict[key] = value
        # A pre-tokenized conversation is forwarded as `prompt_token_ids` instead of `messages`.
        if isinstance(self.messages[0], int):
            req_dict["prompt_token_ids"] = self.messages
            del req_dict["messages"]
        # For non-raw requests, the first message's content is used directly as the prompt.
        if "raw_request" in req_dict and not req_dict["raw_request"]:
            req_dict["prompt"] = req_dict["messages"][0]["content"]
            del req_dict["messages"]

        return req_dict

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        """
        Validate stream options.
        """
        if data.get("stream_options") and not data.get("stream"):
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data


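# A minimal end-to-end sketch for ChatCompletionRequest (not from the source; the message
# content and request_id are illustrative, and the exact dict layout depends on how pydantic
# serializes ChatCompletionMessageParam entries).
# >>> req = ChatCompletionRequest(messages=[{"role": "user", "content": "Hello"}])
# >>> d = req.to_dict_for_infer(request_id="chatcmpl-1")
# >>> d["messages"][0]["content"]
# 'Hello'
# A pre-tokenized conversation is forwarded as prompt_token_ids instead of messages:
# >>> ChatCompletionRequest(messages=[1, 2, 3]).to_dict_for_infer()["prompt_token_ids"]
# [1, 2, 3]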