"""
|
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
|
|
from __future__ import annotations

import time
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union

from fastapi import UploadFile
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
                      ValidationInfo, field_validator, model_validator)
from typing_extensions import TypeAlias

# from openai.types.chat import ChatCompletionMessageParam
from fastdeploy.entrypoints.chat_utils import ChatCompletionMessageParam, parse_chat_messages
from fastdeploy.engine.sampling_params import SamplingParams


class ErrorResponse(BaseModel):
    """
    Standard error response format following OpenAI API specification.

    Attributes:
        object (str): Always "error"
        message (str): Human-readable error message
        code (int): HTTP status code
    """
    object: str = "error"
    message: str
    code: int


class PromptTokenUsageInfo(BaseModel):
    """
    Token usage information specific to prompt processing.

    Attributes:
        cached_tokens (Optional[int]): Number of tokens served from cache
    """
    cached_tokens: Optional[int] = None


class UsageInfo(BaseModel):
    """
    Token usage statistics for API requests.

    Attributes:
        prompt_tokens (int): Number of tokens in the prompt
        total_tokens (int): Total tokens used (prompt + completion)
        completion_tokens (Optional[int]): Tokens generated in completion
        prompt_tokens_details (Optional[PromptTokenUsageInfo]): Detailed prompt token info
    """
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0
    prompt_tokens_details: Optional[PromptTokenUsageInfo] = None


class ChatMessage(BaseModel):
    """
    Single message in a chat conversation.

    Attributes:
        role (str): Role of the message sender (system/user/assistant)
        content (str): Text content of the message
        reasoning_content (Optional[str]): Additional reasoning/explanation
    """
    role: str
    content: str
    reasoning_content: Optional[str] = None


class ChatCompletionResponseChoice(BaseModel):
    """
    Single choice in a chat completion response.

    Attributes:
        index (int): Choice index
        message (ChatMessage): Generated chat message
        finish_reason (Optional[Literal["stop", "length"]]): Reason for stopping generation
    """
    index: int
    message: ChatMessage
    finish_reason: Optional[Literal["stop", "length"]]


class ChatCompletionResponse(BaseModel):
    """
    Standard chat completion response format.

    Attributes:
        id (str): Unique request identifier
        object (str): Always "chat.completion"
        created (int): Unix timestamp of creation
        model (str): Model name used
        choices (List[ChatCompletionResponseChoice]): Generated response choices
        usage (UsageInfo): Token usage statistics
    """
    id: str
    object: str = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[ChatCompletionResponseChoice]
    usage: UsageInfo


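# A minimal construction sketch (not from the source; shown only to illustrate how the
# response models above nest). The id, model, and token counts are illustrative values,
# and serialization assumes pydantic v2, which the imports above suggest.
# >>> resp = ChatCompletionResponse(
# ...     id="chatcmpl-123",
# ...     model="default",
# ...     choices=[ChatCompletionResponseChoice(
# ...         index=0,
# ...         message=ChatMessage(role="assistant", content="Hello!"),
# ...         finish_reason="stop",
# ...     )],
# ...     usage=UsageInfo(prompt_tokens=5, completion_tokens=2, total_tokens=7),
# ... )
# >>> resp.model_dump_json()  # JSON body returned to the client

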
class DeltaMessage(BaseModel):
    """
    Incremental message update for streaming responses.

    Attributes:
        role (Optional[str]): Role of the message sender
        content (Optional[str]): Partial message content
        token_ids (Optional[List[int]]): Token IDs for the delta content
        reasoning_content (Optional[str]): Partial reasoning content
    """
    role: Optional[str] = None
    content: Optional[str] = None
    token_ids: Optional[List[int]] = None
    reasoning_content: Optional[str] = None


class ChatCompletionResponseStreamChoice(BaseModel):
    """
    Streaming choice in a chat completion response.

    Attributes:
        index (int): Choice index
        delta (DeltaMessage): Incremental message update
        finish_reason (Optional[Literal["stop", "length"]]): Reason for stopping
        arrival_time (Optional[float]): Timestamp when chunk was generated
    """
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]] = None
    arrival_time: Optional[float] = None


class ChatCompletionStreamResponse(BaseModel):
    """
    Streaming chat completion response format.

    Attributes:
        id (str): Unique request identifier
        object (str): Always "chat.completion.chunk"
        created (int): Unix timestamp of creation
        model (str): Model name used
        choices (List[ChatCompletionResponseStreamChoice]): Streaming choices
        usage (Optional[UsageInfo]): Token usage (if enabled in stream options)
    """
    id: str
    object: str = "chat.completion.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[ChatCompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = None


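# A minimal streaming-chunk sketch (not from the source). The `data: ...` framing shown is
# one common server-sent-events convention, not necessarily what the serving layer emits;
# serialization again assumes pydantic v2.
# >>> chunk = ChatCompletionStreamResponse(
# ...     id="chatcmpl-123",
# ...     model="default",
# ...     choices=[ChatCompletionResponseStreamChoice(
# ...         index=0,
# ...         delta=DeltaMessage(role="assistant", content="Hel"),
# ...     )],
# ... )
# >>> f"data: {chunk.model_dump_json()}\n\n"

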
class CompletionResponseChoice(BaseModel):
    """
    Single choice in a text completion response.

    Attributes:
        index (int): Choice index
        text (str): Generated text
        token_ids (Optional[List[int]]): Token IDs for generated text
        arrival_time (Optional[float]): Timestamp when generated
        logprobs (Optional[int]): Log probabilities
        reasoning_content (Optional[str]): Additional reasoning
        finish_reason (Optional[Literal["stop", "length"]]): Reason for stopping
    """
    index: int
    text: str
    token_ids: Optional[List[int]] = None
    arrival_time: Optional[float] = None
    logprobs: Optional[int] = None
    reasoning_content: Optional[str] = None
    finish_reason: Optional[Literal["stop", "length"]]


class CompletionResponse(BaseModel):
    """
    Standard text completion response format.

    Attributes:
        id (str): Unique request identifier
        object (str): Always "text_completion"
        created (int): Unix timestamp of creation
        model (str): Model name used
        choices (List[CompletionResponseChoice]): Generated response choices
        usage (UsageInfo): Token usage statistics
    """
    id: str
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseChoice]
    usage: UsageInfo


class CompletionResponseStreamChoice(BaseModel):
    """
    Streaming choice in a text completion response.

    Attributes:
        index (int): Choice index
        text (str): Partial generated text
        arrival_time (Optional[float]): Timestamp when chunk was generated
        token_ids (Optional[List[int]]): Token IDs for partial text
        logprobs (Optional[float]): Log probabilities
        reasoning_content (Optional[str]): Partial reasoning
        finish_reason (Optional[Literal["stop", "length"]]): Reason for stopping
    """
    index: int
    text: str
    arrival_time: Optional[float] = None
    token_ids: Optional[List[int]] = None
    logprobs: Optional[float] = None
    reasoning_content: Optional[str] = None
    finish_reason: Optional[Literal["stop", "length"]] = None


class CompletionStreamResponse(BaseModel):
    """
    Streaming text completion response format.

    Attributes:
        id (str): Unique request identifier
        object (str): Always "text_completion"
        created (int): Unix timestamp of creation
        model (str): Model name used
        choices (List[CompletionResponseStreamChoice]): Streaming choices
        usage (Optional[UsageInfo]): Token usage (if enabled in stream options)
    """
    id: str
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = None


class StreamOptions(BaseModel):
    """
    Configuration options for streaming responses.

    Attributes:
        include_usage (Optional[bool]): Whether to include usage stats
        continuous_usage_stats (Optional[bool]): Whether to send incremental usage
    """
    include_usage: Optional[bool] = True
    continuous_usage_stats: Optional[bool] = False


class CompletionRequest(BaseModel):
    """
    Text completion request parameters following OpenAI API specification.

    Attributes:
        model (Optional[str]): Model name (default: "default")
        prompt (Union[List[int], List[List[int]], str, List[str]]): Input prompt(s)
        best_of (Optional[int]): Number of samples to generate
        echo (Optional[bool]): Whether to echo the prompt
        frequency_penalty (Optional[float]): Penalize repeated tokens
        logprobs (Optional[int]): Number of logprobs to return
        max_tokens (Optional[int]): Maximum tokens to generate (default: 16)
        n (int): Number of completions (default: 1)
        presence_penalty (Optional[float]): Penalize new tokens
        seed (Optional[int]): Random seed
        stop (Optional[Union[str, List[str]]]): Stop sequences
        stream (Optional[bool]): Whether to stream response
        stream_options (Optional[StreamOptions]): Streaming configuration
        suffix (Optional[dict]): Suffix to append
        temperature (Optional[float]): Sampling temperature
        top_p (Optional[float]): Nucleus sampling probability
        user (Optional[str]): User identifier
        repetition_penalty (Optional[float]): Repetition penalty factor
        stop_token_ids (Optional[List[int]]): Token IDs to stop generation
    """
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/completions/create
    model: Optional[str] = "default"
    prompt: Union[List[int], List[List[int]], str, List[str]]
    best_of: Optional[int] = None
    echo: Optional[bool] = False
    frequency_penalty: Optional[float] = 0.0
    logprobs: Optional[int] = None
    max_tokens: Optional[int] = 16
    n: int = 1
    presence_penalty: Optional[float] = 0.0
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
    stream: Optional[bool] = False
    stream_options: Optional[StreamOptions] = None
    suffix: Optional[dict] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    user: Optional[str] = None

    # doc: begin-completion-sampling-params
    repetition_penalty: Optional[float] = None
    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
    # doc: end-completion-sampling-params

    def to_dict_for_infer(self, request_id=None, prompt=None):
        """
        Convert the request parameters into a dictionary.

        Returns:
            dict: request parameters in dict format
        """
        req_dict = {}
        if request_id is not None:
            req_dict['request_id'] = request_id
        # Copy every non-None request field.
        for key, value in self.dict().items():
            if value is not None:
                req_dict[key] = value
        # Extra engine-specific parameters may be passed through `suffix`.
        if self.suffix is not None:
            for key, value in self.suffix.items():
                req_dict[key] = value
        if prompt is not None:
            req_dict['prompt'] = prompt
            # A prompt given as token IDs is forwarded as `prompt_token_ids` instead.
            if isinstance(prompt[0], int):
                req_dict["prompt_token_ids"] = prompt
                del req_dict["prompt"]

        return req_dict

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        """
        Validate stream options.
        """
        if data.get("stream_options") and not data.get("stream"):
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data


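# A minimal usage sketch for CompletionRequest (not from the source; the request_id and the
# pre-tokenized prompt below are illustrative values).
# >>> req = CompletionRequest(prompt=[1, 2, 3], max_tokens=32)
# >>> d = req.to_dict_for_infer(request_id="cmpl-1", prompt=[1, 2, 3])
# >>> d["prompt_token_ids"]   # token-id prompts travel as prompt_token_ids, not prompt
# [1, 2, 3]
# Supplying stream_options while stream is False is rejected by validate_stream_options
# (pydantic surfaces the ValueError as a ValidationError):
# >>> CompletionRequest(prompt="hi", stream_options=StreamOptions())  # raises

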
class ChatCompletionRequest(BaseModel):
    """
    Chat completion request parameters following OpenAI API specification.

    Attributes:
        messages (Union[List[ChatCompletionMessageParam], List[int]]): Conversation history
        model (Optional[str]): Model name (default: "default")
        frequency_penalty (Optional[float]): Penalize repeated tokens
        max_tokens (Optional[int]): Deprecated - max tokens to generate
        max_completion_tokens (Optional[int]): Max tokens in completion
        n (Optional[int]): Number of completions (default: 1)
        presence_penalty (Optional[float]): Penalize new tokens
        seed (Optional[int]): Random seed
        stop (Optional[Union[str, List[str]]]): Stop sequences
        stream (Optional[bool]): Whether to stream response
        stream_options (Optional[StreamOptions]): Streaming configuration
        temperature (Optional[float]): Sampling temperature
        top_p (Optional[float]): Nucleus sampling probability
        user (Optional[str]): User identifier
        metadata (Optional[dict]): Additional metadata
        repetition_penalty (Optional[float]): Repetition penalty factor
        stop_token_ids (Optional[List[int]]): Token IDs to stop generation
    """
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/chat/create
    messages: Union[List[ChatCompletionMessageParam], List[int]]
    model: Optional[str] = "default"
    frequency_penalty: Optional[float] = 0.0
    # remove max_tokens when the field is removed from the OpenAI API
    max_tokens: Optional[int] = Field(
        default=None,
        deprecated='max_tokens is deprecated in favor of the max_completion_tokens field')
    max_completion_tokens: Optional[int] = None
    n: Optional[int] = 1
    presence_penalty: Optional[float] = 0.0
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
    stream: Optional[bool] = False
    stream_options: Optional[StreamOptions] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    user: Optional[str] = None
    metadata: Optional[dict] = None

    # doc: begin-chat-completion-sampling-params
    repetition_penalty: Optional[float] = None
    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
    # doc: end-chat-completion-sampling-params

    def to_dict_for_infer(self, request_id=None):
        """
        Convert the request parameters into a dictionary.

        Returns:
            dict: request parameters in dict format
        """
        req_dict = {}
        if request_id is not None:
            req_dict['request_id'] = request_id

        # Extra engine-specific parameters may be passed through `metadata`.
        if self.metadata is not None:
            for key, value in self.metadata.items():
                req_dict[key] = value

        # Copy every non-None request field.
        for key, value in self.dict().items():
            if value is not None:
                req_dict[key] = value
        # A pre-tokenized conversation is forwarded as `prompt_token_ids` instead of `messages`.
        if isinstance(self.messages[0], int):
            req_dict["prompt_token_ids"] = self.messages
            del req_dict["messages"]
        # For non-raw requests, the first message's content is used directly as the prompt.
        if "raw_request" in req_dict and not req_dict["raw_request"]:
            req_dict["prompt"] = req_dict["messages"][0]["content"]
            del req_dict["messages"]

        return req_dict

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        """
        Validate stream options.
        """
        if data.get("stream_options") and not data.get("stream"):
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data


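# A minimal end-to-end sketch for ChatCompletionRequest (not from the source; the message
# content and request_id are illustrative, and the exact dict layout depends on how pydantic
# serializes ChatCompletionMessageParam entries).
# >>> req = ChatCompletionRequest(messages=[{"role": "user", "content": "Hello"}])
# >>> d = req.to_dict_for_infer(request_id="chatcmpl-1")
# >>> d["messages"][0]["content"]
# 'Hello'
# A pre-tokenized conversation is forwarded as prompt_token_ids instead of messages:
# >>> ChatCompletionRequest(messages=[1, 2, 3]).to_dict_for_infer()["prompt_token_ids"]
# [1, 2, 3]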