diff --git a/docs/features/reasoning_output.md b/docs/features/reasoning_output.md index 6ea2e928e..f98262d62 100644 --- a/docs/features/reasoning_output.md +++ b/docs/features/reasoning_output.md @@ -43,7 +43,7 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \ {"type": "text", "text": "Which era does the cultural relic in the picture belong to"} ]} ], - "enable_thinking": true, + "chat_template_kwargs":{"enable_thinking": true}, "reasoning_max_tokens": 1024 }' ``` @@ -70,7 +70,7 @@ chat_response = client.chat.completions.create( model="vl", stream=True, extra_body={ - "enable_thinking": True, + "chat_template_kwargs":{"enable_thinking": True}, "reasoning_max_tokens": 1024 } ) diff --git a/docs/get_started/ernie-4.5-vl.md b/docs/get_started/ernie-4.5-vl.md index 14719daca..71b0626ae 100644 --- a/docs/get_started/ernie-4.5-vl.md +++ b/docs/get_started/ernie-4.5-vl.md @@ -113,7 +113,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "From which era does the artifact in the image originate?"} ]} ], - "enable_thinking": false + "chat_template_kwargs":{"enable_thinking": false} }' ``` diff --git a/docs/get_started/quick_start_vl.md b/docs/get_started/quick_start_vl.md index 1e1a37425..83b1b97d7 100644 --- a/docs/get_started/quick_start_vl.md +++ b/docs/get_started/quick_start_vl.md @@ -74,7 +74,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "What era does this artifact belong to?"} ]} ], - "enable_thinking": false + "chat_template_kwargs":{"enable_thinking": false} }' ``` diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md index 536637110..761e79720 100644 --- a/docs/online_serving/README.md +++ b/docs/online_serving/README.md @@ -21,9 +21,10 @@ python -m fastdeploy.entrypoints.openai.api_server \ For more usage methods of the command line during service deployment, refer to [Parameter Descriptions](../parameters.md). -## Sending User Requests +## Chat Completion API +FastDeploy provides a Chat Completion API that is compatible with the OpenAI protocol, allowing user requests to be sent directly using OpenAI's request method. -The FastDeploy interface is compatible with the OpenAI protocol, allowing user requests to be sent directly using OpenAI's request method. +### Sending User Requests Here is an example of sending a user request using the curl command: @@ -73,53 +74,169 @@ print('\n') For a description of the OpenAI protocol, refer to the document [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). -## Parameter Differences -### Request Parameter Differences -The differences in request parameters between FastDeploy and the OpenAI protocol are as follows. Other request parameters will be ignored: +### Compatible OpenAI Parameters +```python +messages: Union[List[Any], List[int]] +# List of input messages, which can be text messages (`List[Any]`, typically `List[dict]`) or token ID lists (`List[int]`). -- `prompt` (supported only in the `v1/completions` interface) -- `messages` (supported only in the `v1/chat/completions` interface) -- `logprobs`: Optional[bool] = False (supported only in the `v1/chat/completions` interface) -- `top_logprobs`: Optional[int] = None (supported only in the `v1/chat/completions` interface. 
An integer between 0 and 20,logprobs must be set to true if this parameter is used)
-- `frequency_penalty`: Optional[float] = 0.0
-- `max_tokens`: Optional[int] = 16
-- `presence_penalty`: Optional[float] = 0.0
-- `stream`: Optional[bool] = False
-- `stream_options`: Optional[StreamOptions] = None
-- `temperature`: Optional[float] = None
-- `top_p`: Optional[float] = None
-- `extra_body`: Optional[dict] = None (supported only in `v1/chat/completions` for configuring additional parameters, e.g., `extra_body={"enable_thinking": True}`)
-  - `min_tokens`: Optional[int] = 1 (minimum number of tokens generated)
-  - `reasoning_max_tokens`: Optional[int] = None (maximum number of tokens for reasoning content, defaults to the same as `max_tokens`)
-  - `enable_thinking`: Optional[bool] = True (whether to enable reasoning for models that support deep thinking)
-  - `repetition_penalty`: Optional[float] = None (coefficient for directly penalizing repeated token generation (>1 penalizes repetition, <1 encourages repetition))
-  - `return_token_ids`: Optional[bool] = False: (whether to return token ids as a list)
-  - `include_stop_str_in_output`: Optional[bool] = False: (whether to include the stop strings in output text. Defaults to False.)

+tools: Optional[List[ChatCompletionToolsParam]] = None
+# List of tool call configurations, used for enabling function calling (Function Calling) or tool usage (e.g., the ReAct framework).

-> Note: For multimodal models, since the reasoning chain is enabled by default, resulting in overly long outputs, `max_tokens` can be set to the model's maximum output length or the default value can be used.

+model: Optional[str] = "default"
+# Specifies the model name or version to use, defaulting to `"default"` (which may point to the base model).

-### Return Field Differences

+frequency_penalty: Optional[float] = None
+# Frequency penalty coefficient, penalizing tokens in proportion to how often they have already appeared (`>0` suppresses repetition, `<0` encourages repetition, default `None` disables).

-The additional return fields added by FastDeploy are as follows:

+logprobs: Optional[bool] = False
+# Whether to return the log probabilities of each generated token, used for debugging or analysis.

-- `arrival_time`: Returns the cumulative time taken for all tokens
-- `reasoning_content`: The returned result of the reasoning chain
-- `prompt_token_ids`: The token id list of the prompt
-- `completion_token_ids`: The token id list of the completion

+top_logprobs: Optional[int] = 0
+# Returns the top `top_logprobs` tokens and their log probabilities for each generated position (default `0` means no return).
+
+max_tokens: Optional[int] = Field(
+    default=None,
+    deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
+)
+# Deprecated: Maximum number of tokens to generate (recommended to use `max_completion_tokens` instead).
+
+max_completion_tokens: Optional[int] = None
+# Maximum number of tokens to generate (recommended alternative to `max_tokens`), no default limit (restricted by the model's context window).
+
+presence_penalty: Optional[float] = None
+# Presence penalty coefficient, penalizing tokens that have already appeared, which encourages the model to move on to new topics (`>0` encourages new topics, `<0` encourages staying on existing content, default `None` disables).
+
+stream: Optional[bool] = False
+# Whether to enable streaming output (return results token by token), default `False` (returns complete results at once).
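+# Illustrative sketch only (not part of the schema): with the OpenAI Python client shown
+# earlier in this document, streaming corresponds to
+#   client.chat.completions.create(model="default", messages=[...], stream=True)
+# after which the response object is iterated chunk by chunk.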
+
+stream_options: Optional[StreamOptions] = None
+# Additional configurations for streaming output (such as chunk size, timeout, etc.), refer to the specific definition of `StreamOptions`.
+
+temperature: Optional[float] = None
+# Temperature coefficient, controlling generation randomness (`0.0` for deterministic generation, `>1.0` for more randomness, default `None` uses the model default).
+
+top_p: Optional[float] = None
+# Nucleus sampling threshold, keeping only the smallest set of highest-probability tokens whose cumulative probability reaches `top_p` (default `None` disables).
+
+response_format: Optional[AnyResponseFormat] = None
+# Specifies the output format (such as JSON, XML, etc.), requires passing a predefined format configuration object.
+
+user: Optional[str] = None
+# User identifier, used for tracking or distinguishing requests from different users (default `None` does not pass).
+
+metadata: Optional[dict] = None
+# Additional metadata, used for passing custom information (such as request ID, debug markers, etc.).
+
+```
+
+### Additional Parameters Added by FastDeploy
+
+> Note:
+When sending requests using curl, the following parameters can be used directly;
+When sending requests using openai.Client, these parameters need to be placed in the `extra_body` parameter, e.g. `extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`.
+
+The following sampling parameters are supported.
+```python
+top_k: Optional[int] = None
+# Limits consideration to the top K highest-probability tokens at each generation step, used to control randomness (default None means no limit).
+
+min_p: Optional[float] = None
+# Minimum probability threshold: tokens whose probability, relative to that of the most likely token, falls below min_p are filtered out (default None means disabled).
+
+min_tokens: Optional[int] = None
+# Forces a minimum number of tokens to be generated, avoiding premature truncation (default None means no limit).
+
+include_stop_str_in_output: Optional[bool] = False
+# Whether to include the stop string content in the output (default False, meaning output is truncated when a stop string is encountered).
+
+bad_words: Optional[List[str]] = None
+# List of forbidden words (e.g., sensitive words) that the model should avoid generating (default None means no restriction).
+
+repetition_penalty: Optional[float] = None
+# Repetition penalty coefficient, reducing the probability of repeating already generated tokens (`>1.0` suppresses repetition, `<1.0` encourages repetition, default None means disabled).
+```
+
+The following extra parameters are supported:
+```python
+chat_template_kwargs: Optional[dict] = None
+# Additional parameters passed to the chat template, used for customizing dialogue formats (default None).
+
+reasoning_max_tokens: Optional[int] = None
+# Maximum number of tokens to generate during reasoning (e.g., CoT, chain of thought) (default None means using global max_tokens).
+
+structural_tag: Optional[str] = None
+# Structural tag, used to mark specific structures of generated content (such as JSON, XML, etc., default None).
+
+guided_json: Optional[Union[str, dict, BaseModel]] = None
+# Guides the generation of content conforming to JSON structure, can be a JSON string, dictionary, or Pydantic model (default None).
+
+guided_regex: Optional[str] = None
+# Guides the generation of content conforming to regular expression rules (default None means no restriction).
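+# Illustrative, hypothetical pattern: guided_regex=r"\d{4}-\d{2}-\d{2}" would constrain
+# the output to a date-shaped string.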
+
+guided_choice: Optional[List[str]] = None
+# Guides the generation of content selected from a specified candidate list (default None means no restriction).
+
+guided_grammar: Optional[str] = None
+# Guides the generation of content conforming to grammar rules (such as BNF) (default None means no restriction).
+
+return_token_ids: Optional[bool] = None
+# Whether to return the token IDs of the generation results instead of text (default None means return text).
+
+prompt_token_ids: Optional[List[int]] = None
+# Directly passes the token ID list of the prompt, skipping the text encoding step (default None means using text input).
+
+max_streaming_response_tokens: Optional[int] = None
+# Maximum number of tokens returned at a time during streaming output (default None means no limit).
+
+disable_chat_template: Optional[bool] = False
+# Whether to disable chat template rendering, using raw input directly (default False means the template is enabled).
+```
+
+### Differences in Return Fields
+
+Additional return fields added by FastDeploy:
+
+- `arrival_time`: Cumulative time consumed for all tokens
+- `reasoning_content`: Return results of the chain of thought
+- `prompt_token_ids`: List of token IDs for the input sequence
+- `completion_token_ids`: List of token IDs for the output sequence
 
 Overview of return parameters:
 
 ```python
+
+ChatCompletionResponse:
+    id: str
+    object: str = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseChoice]
+    usage: UsageInfo
+ChatCompletionResponseChoice:
+    index: int
+    message: ChatMessage
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]]
+ChatMessage:
+    role: str
+    content: str
+    reasoning_content: Optional[str] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
+
+# Fields returned for streaming responses
 ChatCompletionStreamResponse:
     id: str
     object: str = "chat.completion.chunk"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = None
 ChatCompletionResponseStreamChoice:
     index: int
     delta: DeltaMessage
-    finish_reason: Optional[Literal["stop", "length"]] = None
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
     arrival_time: Optional[float] = None
 DeltaMessage:
     role: Optional[str] = None
@@ -128,3 +245,156 @@ DeltaMessage:
     completion_token_ids: Optional[List[int]] = None
     reasoning_content: Optional[str] = None
 ```
+
+## Completion API
+The Completion API is mainly used for continuation scenarios: it suits users who have already constructed their own context and want the model to output only the continuation; no additional `prompt` concatenation is applied during inference, as the sketch below illustrates.
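+
+A minimal sketch of the contrast with the Chat Completion API above (assumptions: an OpenAI-compatible FastDeploy server is already running on port 8188, as in the curl example below; client usage mirrors the earlier examples): the Chat API renders `messages` through the model's chat template, while the Completion API sends `prompt` to the model verbatim.
+
+```python
+# Sketch only; both calls assume a FastDeploy server at 0.0.0.0:8188.
+import openai
+
+client = openai.Client(base_url="http://0.0.0.0:8188/v1", api_key="null")
+
+# /v1/chat/completions: the chat template is applied to `messages` before inference.
+chat = client.chat.completions.create(
+    model="default",
+    messages=[{"role": "user", "content": "Write one sentence about the sea."}],
+)
+
+# /v1/completions: the prompt below is continued as-is, with no template concatenation.
+completion = client.completions.create(
+    model="default",
+    prompt="The sea at dawn is",
+)
+```
+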
+### Sending User Requests
+
+Here is an example of sending a user request using the curl command:
+
+```bash
+curl -X POST "http://0.0.0.0:8188/v1/completions" \
+-H "Content-Type: application/json" \
+-d '{
+  "prompt": "以下是一篇关于深圳文心公园的500字游记和赏析:"
+}'
+```
+
+Here is an example of sending a user request using a Python script:
+
+```python
+import openai
+host = "0.0.0.0"
+port = "8170"
+client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
+
+response = client.completions.create(
+    model="default",
+    prompt="以下是一篇关于深圳文心公园的500字游记和赏析:",
+    stream=False,
+)
+print(response.choices[0].text)
+```
+
+For an explanation of the OpenAI protocol, refer to the [OpenAI Completion API](https://platform.openai.com/docs/api-reference/completions/create).
+
+### Compatible OpenAI Parameters
+```python
+model: Optional[str] = "default"
+# Specifies the model name or version to use, defaulting to `"default"` (which may point to the base model).
+
+prompt: Union[List[int], List[List[int]], str, List[str]]
+# Input prompt, supporting multiple formats:
+# - `str`: Plain text prompt (e.g., `"Hello, how are you?"`).
+# - `List[str]`: Multiple text segments (e.g., `["User:", "Hello!", "Assistant:", "Hi!"]`).
+# - `List[int]`: Directly passes a list of token IDs (e.g., `[123, 456]`).
+# - `List[List[int]]`: List of multiple token ID lists (e.g., `[[123], [456, 789]]`).
+
+best_of: Optional[int] = None
+# Generates `best_of` candidate results and returns the highest-scoring one (requires `n=1`).
+
+frequency_penalty: Optional[float] = None
+# Frequency penalty coefficient, penalizing tokens in proportion to how often they have already appeared (`>0` suppresses repetition, `<0` encourages repetition).
+
+logprobs: Optional[int] = None
+# Returns the log probabilities of each generated token, can specify the number of candidates to return.
+
+max_tokens: Optional[int] = None
+# Maximum number of tokens to generate in the completion; the prompt length plus `max_tokens` is bounded by the model's context window (no explicit default limit).
+
+presence_penalty: Optional[float] = None
+# Presence penalty coefficient, penalizing tokens that have already appeared, which encourages the model to move on to new topics (`>0` encourages new topics, `<0` encourages staying on existing content).
+```
+
+### Additional Parameters Added by FastDeploy
+
+> Note:
+When sending requests using curl, the following parameters can be used directly;
+When sending requests using openai.Client, these parameters need to be placed in the `extra_body` parameter, e.g. `extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`.
+
+The following sampling parameters are supported.
+```python
+top_k: Optional[int] = None
+# Limits consideration to the top K highest-probability tokens at each generation step, used to control randomness (default None means no limit).
+
+min_p: Optional[float] = None
+# Minimum probability threshold: tokens whose probability, relative to that of the most likely token, falls below min_p are filtered out (default None means disabled).
+
+min_tokens: Optional[int] = None
+# Forces a minimum number of tokens to be generated, avoiding premature truncation (default None means no limit).
+
+include_stop_str_in_output: Optional[bool] = False
+# Whether to include the stop string content in the output (default False, meaning output is truncated when a stop string is encountered).
+
+bad_words: Optional[List[str]] = None
+# List of forbidden words (e.g., sensitive words) that the model should avoid generating (default None means no restriction).
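+# Illustrative, hypothetical values: bad_words=["foo", "bar"] keeps both strings out of the output.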
+ +repetition_penalty: Optional[float] = None +# Repetition penalty coefficient, reducing the probability of repeating already generated tokens (`>1.0` suppresses repetition, `<1.0` encourages repetition, default None means disabled). +``` + +The following extra parameters are supported: +```python +guided_json: Optional[Union[str, dict, BaseModel]] = None +# Guides the generation of content conforming to JSON structure, can be a JSON string, dictionary, or Pydantic model (default None). + +guided_regex: Optional[str] = None +# Guides the generation of content conforming to regular expression rules (default None means no restriction). + +guided_choice: Optional[List[str]] = None +# Guides the generation of content selected from a specified candidate list (default None means no restriction). + +guided_grammar: Optional[str] = None +# Guides the generation of content conforming to grammar rules (such as BNF) (default None means no restriction). + +return_token_ids: Optional[bool] = None +# Whether to return the token IDs of the generation results instead of text (default None means return text). + +prompt_token_ids: Optional[List[int]] = None +# Directly passes the token ID list of the prompt, skipping the text encoding step (default None means using text input). + +max_streaming_response_tokens: Optional[int] = None +# Maximum number of tokens returned at a time during streaming output (default None means no limit). +``` + +### Overview of Return Parameters + +```python + +CompletionResponse: + id: str + object: str = "text_completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: List[CompletionResponseChoice] + usage: UsageInfo +CompletionResponseChoice: + index: int + text: str + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None + arrival_time: Optional[float] = None + logprobs: Optional[int] = None + reasoning_content: Optional[str] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls"]] + +# Fields returned for streaming responses +CompletionStreamResponse: + id: str + object: str = "text_completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: List[CompletionResponseStreamChoice] + usage: Optional[UsageInfo] = None +CompletionResponseStreamChoice: + index: int + text: str + arrival_time: float = None + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None + logprobs: Optional[float] = None + reasoning_content: Optional[str] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None + +``` diff --git a/docs/zh/features/reasoning_output.md b/docs/zh/features/reasoning_output.md index f41ba77dd..cd32e4c6c 100644 --- a/docs/zh/features/reasoning_output.md +++ b/docs/zh/features/reasoning_output.md @@ -43,7 +43,7 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \ {"type": "text", "text": "图中的文物属于哪个年代"} ]} ], - "enable_thinking": true, + "chat_template_kwargs":{"enable_thinking": true}, "reasoning_max_tokens": 1024 }' @@ -71,7 +71,7 @@ chat_response = client.chat.completions.create( model="vl", stream=True, extra_body={ - "enable_thinking": True, + "chat_template_kwargs":{"enable_thinking": True}, "reasoning_max_tokens": 1024 } ) diff --git a/docs/zh/get_started/ernie-4.5-vl.md b/docs/zh/get_started/ernie-4.5-vl.md index 5bec9ca20..3922c899f 100644 --- a/docs/zh/get_started/ernie-4.5-vl.md +++ b/docs/zh/get_started/ernie-4.5-vl.md @@ -110,7 +110,7 @@ curl -X POST 
"http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "图中的文物属于哪个年代"} ]} ], - "enable_thinking": false + "chat_template_kwargs":{"enable_thinking": false} }' ``` diff --git a/docs/zh/get_started/quick_start_vl.md b/docs/zh/get_started/quick_start_vl.md index c9fe26a51..0f4c88cc1 100644 --- a/docs/zh/get_started/quick_start_vl.md +++ b/docs/zh/get_started/quick_start_vl.md @@ -73,7 +73,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "图中的文物属于哪个年代"} ]} ], - "enable_thinking": false + "chat_template_kwargs":{"enable_thinking": false} }' ``` diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md index 1c90dc2f3..a68eedbdb 100644 --- a/docs/zh/online_serving/README.md +++ b/docs/zh/online_serving/README.md @@ -21,9 +21,10 @@ python -m fastdeploy.entrypoints.openai.api_server \ 服务部署时的命令行更多使用方式参考[参数说明](../parameters.md)。 -## 发送用户请求 +## Chat Completion API +FastDeploy 接口兼容 OpenAI 的 Chat Completion API,用户可以通过 OpenAI 协议发送用户请求。 -FastDeploy 接口兼容 OpenAI 协议,可以直接使用 OpenAI 的请求方式发送用户请求。 +### 发送用户请求 使用 curl 命令发送用户请求示例如下: @@ -71,32 +72,124 @@ for chunk in response: print('\n') ``` -关于 OpenAI 协议的说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create)。 +关于 OpenAI 协议的说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create)。 -## 参数差异 -### 请求参数差异 -FastDeploy 与 OpenAI 协议的请求参数差异如下,其余请求参数会被忽略: -- `prompt` (仅支持 `v1/completions` 接口) -- `messages` (仅支持 `v1/chat/completions` 接口) -- `logprobs`: Optional[bool] = False (仅支持 `v1/chat/completions` 接口) -- `top_logprobs`: Optional[int] = None (仅支持 `v1/chat/completions` 接口。如果使用这个参数必须设置logprobs为True,取值大于等于0小于20) -- `frequency_penalty`: Optional[float] = 0.0 -- `max_tokens`: Optional[int] = 16 -- `presence_penalty`: Optional[float] = 0.0 -- `stream`: Optional[bool] = False -- `stream_options`: Optional[StreamOptions] = None -- `temperature`: Optional[float] = None -- `top_p`: Optional[float] = None -- `extra_body`: Optional[dict] = None (仅在 v1/chat/compeltions 中支持,用于配置额外参数, 如 `extra_body={"enable_thinking": True}`) - - `min_tokens`: Optional[int] = 1 最小生成的Token个数 - - `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数,默认与max_tokens一致 - - `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考 - - `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数(>1时惩罚重复,<1时鼓励重复) - - `return_token_ids`: Optional[bool] = False: 是否返回 token id 列表 - - `include_stop_str_in_output`: Optional[bool] = False: 是否返回结束符 - - `top_k`: Optional[bool] = None: TopK-TopP采样参数,参考[采样说明](../features/sampling.md) +### 兼容OpenAI 参数 +```python +messages: Union[List[Any], List[int]] +# 输入消息列表,可以是文本消息(`List[Any]`,通常为 `List[dict]`)或 token ID 列表(`List[int]`)。 -> 注: 若为多模态模型 由于思考链默认打开导致输出过长,max tokens 可以设置为模型最长输出,或使用默认值。 +tools: Optional[List[ChatCompletionToolsParam]] = None +# 工具调用配置列表,用于启用函数调用(Function Calling)或工具使用(如 ReAct 框架)。 + +model: Optional[str] = "default" +# 指定使用的模型名称或版本,默认值为 `"default"`(可能指向基础模型)。 + +frequency_penalty: Optional[float] = None +# 频率惩罚系数,降低重复生成相同 token 的概率(`>1.0` 抑制重复,`<1.0` 鼓励重复,默认 `None` 禁用)。 + +logprobs: Optional[bool] = False +# 是否返回每个生成 token 的对数概率(log probabilities),用于调试或分析。 + +top_logprobs: Optional[int] = 0 +# 返回每个生成位置概率最高的 `top_logprobs` 个 token 及其对数概率(默认 `0` 表示不返回)。 + +max_tokens: Optional[int] = Field( + default=None, + deprecated="max_tokens is deprecated in favor of the max_completion_tokens field", +) +# 已弃用:生成的最大 token 数(建议改用 `max_completion_tokens`)。 + +max_completion_tokens: Optional[int] = None +# 生成的最大 
token 数(推荐替代 `max_tokens`),默认无限制(受模型上下文窗口限制)。 + +presence_penalty: Optional[float] = None +# 存在惩罚系数,降低新主题(未出现过的话题)的生成概率(`>1.0` 抑制新话题,`<1.0` 鼓励新话题,默认 `None` 禁用)。 + +stream: Optional[bool] = False +# 是否启用流式输出(逐 token 返回结果),默认 `False`(一次性返回完整结果)。 + +stream_options: Optional[StreamOptions] = None +# 流式输出的额外配置(如分块大小、超时等),需参考 `StreamOptions` 的具体定义。 + +temperature: Optional[float] = None +# 温度系数,控制生成随机性(`0.0` 确定性生成,`>1.0` 更随机,默认 `None` 使用模型默认值)。 + +top_p: Optional[float] = None +# 核采样(nucleus sampling)阈值,只保留概率累计超过 `top_p` 的 token(默认 `None` 禁用)。 + +response_format: Optional[AnyResponseFormat] = None +# 指定输出格式(如 JSON、XML 等),需传入预定义的格式配置对象。 + +user: Optional[str] = None +# 用户标识符,用于跟踪或区分不同用户的请求(默认 `None` 不传递)。 + +metadata: Optional[dict] = None +# 附加元数据,用于传递自定义信息(如请求 ID、调试标记等)。 + +``` + +### FastDeploy 增加额外参数 + +> 注: +使用 curl 命令发送请求时, 可以直接使用以下参数; +使用openai.Client 发送请求时,需要使用将以下参数放入 `extra_body` 参数中, 如:`extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`。 + +额外采样参数的支持如下: +```python +top_k: Optional[int] = None +# 限制每一步生成时只考虑概率最高的 K 个 token,用于控制随机性(默认 None 表示不限制)。 + +min_p: Optional[float] = None +# 核采样(nucleus sampling)阈值,只保留概率累计超过 min_p 的 token(默认 None 表示禁用)。 + +min_tokens: Optional[int] = None +# 强制生成的最小 token 数,避免过早截断(默认 None 表示不限制)。 + +include_stop_str_in_output: Optional[bool] = False +# 是否在输出中包含停止符(stop string)的内容(默认 False,即遇到停止符时截断输出)。 + +bad_words: Optional[List[str]] = None +# 禁止生成的词汇列表(例如敏感词),模型会避免输出这些词(默认 None 表示不限制)。 + +repetition_penalty: Optional[float] = None +# 重复惩罚系数,降低已生成 token 的重复概率(>1.0 抑制重复,<1.0 鼓励重复,默认 None 表示禁用)。 +``` +其他参数的支持如下: +```python +chat_template_kwargs: Optional[dict] = None +# 传递给聊天模板(chat template)的额外参数,用于自定义对话格式(默认 None)。 + +reasoning_max_tokens: Optional[int] = None +# 推理(如 CoT, 思维链)过程中生成的最大 token 数(默认 None 表示使用全局 max_tokens)。 + +structural_tag: Optional[str] = None +# 结构化标签,用于标记生成内容的特定结构(如 JSON、XML 等,默认 None)。 + +guided_json: Optional[Union[str, dict, BaseModel]] = None +# 引导生成符合 JSON 结构的内容,可以是 JSON 字符串、字典或 Pydantic 模型(默认 None)。 + +guided_regex: Optional[str] = None +# 引导生成符合正则表达式规则的内容(默认 None 表示不限制)。 + +guided_choice: Optional[List[str]] = None +# 引导生成内容从指定的候选列表中选择(默认 None 表示不限制)。 + +guided_grammar: Optional[str] = None +# 引导生成符合语法规则(如 BNF)的内容(默认 None 表示不限制)。 + +return_token_ids: Optional[bool] = None +# 是否返回生成结果的 token ID 而非文本(默认 None 表示返回文本)。 + +prompt_token_ids: Optional[List[int]] = None +# 直接传入 prompt 的 token ID 列表,跳过文本编码步骤(默认 None 表示使用文本输入)。 + +max_streaming_response_tokens: Optional[int] = None +# 流式输出时每次返回的最大 token 数(默认 None 表示不限制)。 + +disable_chat_template: Optional[bool] = False +# 是否禁用聊天模板渲染,直接使用原始输入(默认 False 表示启用模板)。 +``` ### 返回字段差异 @@ -110,16 +203,39 @@ FastDeploy 增加的返回字段如下: 返回参数总览: ```python + +ChatCompletionResponse: + id: str + object: str = "chat.completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: List[ChatCompletionResponseChoice] + usage: UsageInfo +ChatCompletionResponseChoice: + index: int + message: ChatMessage + logprobs: Optional[LogProbs] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]] +ChatMessage: + role: str + content: str + reasoning_content: Optional[str] = None + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None + +# 返回流式响应的字段 ChatCompletionStreamResponse: id: str object: str = "chat.completion.chunk" created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[ChatCompletionResponseStreamChoice] + usage: Optional[UsageInfo] = 
None ChatCompletionResponseStreamChoice: index: int delta: DeltaMessage - finish_reason: Optional[Literal["stop", "length"]] = None + logprobs: Optional[LogProbs] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None arrival_time: Optional[float] = None DeltaMessage: role: Optional[str] = None @@ -128,3 +244,155 @@ DeltaMessage: completion_token_ids: Optional[List[int]] = None reasoning_content: Optional[str] = None ``` + +## Completion API +Completion API 接口主要用于续聊场景, 适应于用户自定义好上下文输入, 并希望模型仅输出续写内容的场景; 推理过程不会增加其他 `prompt`拼接。: + +### 发送用户请求 + +使用 curl 命令发送用户请求示例如下: + +```bash +curl -X POST "http://0.0.0.0:8188/v1/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "prompt": "以下是一篇关于深圳文心公园的500字游记和赏析:" +}' +``` + +使用 Python 脚本发送用户请求示例如下: + +```python +import openai +host = "0.0.0.0" +port = "8170" +client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null") + +response = client.completions.create( + model="default", + prompt="以下是一篇关于深圳文心公园的500字游记和赏析:", + stream=False, +) +print(response.choices[0].text) +``` + +关于 OpenAI 协议的说明可参考文档 [OpenAI Completion API](https://platform.openai.com/docs/api-reference/completions/create)。 + +### 兼容OpenAI 参数 +```python +model: Optional[str] = "default" +# 指定使用的模型名称或版本,默认值为 `"default"`(可能指向基础模型)。 + +prompt: Union[List[int], List[List[int]], str, List[str]] +# 输入提示,支持多种格式: +# - `str`: 纯文本提示(如 `"Hello, how are you?"`)。 +# - `List[str]`: 多段文本(如 `["User:", "Hello!", "Assistant:", "Hi!"]`)。 +# - `List[int]`: 直接传入 token ID 列表(如 `[123, 456]`)。 +# - `List[List[int]]`: 多段 token ID 列表(如 `[[123], [456, 789]]`)。 + +best_of: Optional[int] = None +# 生成 `best_of` 个候选结果,然后返回其中评分最高的一个(需配合 `n=1` 使用)。 + +frequency_penalty: Optional[float] = None +# 频率惩罚系数,降低重复生成相同 token 的概率(`>1.0` 抑制重复,`<1.0` 鼓励重复)。 + +logprobs: Optional[int] = None +# 返回每个生成 token 的对数概率(log probabilities),可指定返回的候选数量。 + +max_tokens: Optional[int] = None +# 生成的最大 token 数(包括输入和输出),默认无限制(受模型上下文窗口限制)。 + +presence_penalty: Optional[float] = None +# 存在惩罚系数,降低新主题(未出现过的话题)的生成概率(`>1.0` 抑制新话题,`<1.0` 鼓励新话题)。 +``` + +### FastDeploy 增加额外参数 + +> 注: +使用 curl 命令发送请求时, 可以直接使用以下参数; +使用openai.Client 发送请求时,需要使用将以下参数放入 `extra_body` 参数中, 如:`extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`。 + +额外采样参数的支持如下: +```python +top_k: Optional[int] = None +# 限制每一步生成时只考虑概率最高的 K 个 token,用于控制随机性(默认 None 表示不限制)。 + +min_p: Optional[float] = None +# 核采样(nucleus sampling)阈值,只保留概率累计超过 min_p 的 token(默认 None 表示禁用)。 + +min_tokens: Optional[int] = None +# 强制生成的最小 token 数,避免过早截断(默认 None 表示不限制)。 + +include_stop_str_in_output: Optional[bool] = False +# 是否在输出中包含停止符(stop string)的内容(默认 False,即遇到停止符时截断输出)。 + +bad_words: Optional[List[str]] = None +# 禁止生成的词汇列表(例如敏感词),模型会避免输出这些词(默认 None 表示不限制)。 + +repetition_penalty: Optional[float] = None +# 重复惩罚系数,降低已生成 token 的重复概率(>1.0 抑制重复,<1.0 鼓励重复,默认 None 表示禁用)。 +``` +其他参数的支持如下: +```python +guided_json: Optional[Union[str, dict, BaseModel]] = None +# 引导生成符合 JSON 结构的内容,可以是 JSON 字符串、字典或 Pydantic 模型(默认 None)。 + +guided_regex: Optional[str] = None +# 引导生成符合正则表达式规则的内容(默认 None 表示不限制)。 + +guided_choice: Optional[List[str]] = None +# 引导生成内容从指定的候选列表中选择(默认 None 表示不限制)。 + +guided_grammar: Optional[str] = None +# 引导生成符合语法规则(如 BNF)的内容(默认 None 表示不限制)。 + +return_token_ids: Optional[bool] = None +# 是否返回生成结果的 token ID 而非文本(默认 None 表示返回文本)。 + +prompt_token_ids: Optional[List[int]] = None +# 直接传入 prompt 的 token ID 列表,跳过文本编码步骤(默认 None 表示使用文本输入)。 + +max_streaming_response_tokens: Optional[int] = None +# 流式输出时每次返回的最大 token 数(默认 None 表示不限制)。 +``` + 
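+下面给出一个示意性示例(假设服务已按上文方式启动在 8170 端口;参数取值仅作演示),展示如何通过 `extra_body` 传入上述额外参数:
+
+```python
+# 示意:通过 extra_body 传入 FastDeploy 额外参数(top_k、min_tokens、return_token_ids 取值仅为演示)
+import openai
+
+client = openai.Client(base_url="http://0.0.0.0:8170/v1", api_key="null")
+
+response = client.completions.create(
+    model="default",
+    prompt="以下是一篇关于深圳文心公园的500字游记和赏析:",
+    extra_body={"top_k": 20, "min_tokens": 10, "return_token_ids": True},
+    stream=False,
+)
+print(response.choices[0].text)
+```
+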
+### 返回参数总览
+
+```python
+
+CompletionResponse:
+    id: str
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseChoice]
+    usage: UsageInfo
+CompletionResponseChoice:
+    index: int
+    text: str
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
+    arrival_time: Optional[float] = None
+    logprobs: Optional[int] = None
+    reasoning_content: Optional[str] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
+
+# 返回流式响应的字段
+CompletionStreamResponse:
+    id: str
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = None
+CompletionResponseStreamChoice:
+    index: int
+    text: str
+    arrival_time: float = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
+    logprobs: Optional[float] = None
+    reasoning_content: Optional[str] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
+
+```
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 482399b48..f4fd099f7 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -343,28 +343,29 @@ class CompletionRequest(BaseModel):
     suffix: Optional[dict] = None
     temperature: Optional[float] = None
     top_p: Optional[float] = None
-    top_k: Optional[int] = None
-    min_p: Optional[float] = None
-    include_stop_str_in_output: Optional[bool] = False
     user: Optional[str] = None
+
+    # doc: begin-completion-sampling-params
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    repetition_penalty: Optional[float] = None
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
     min_tokens: Optional[int] = None
-    return_token_ids: Optional[bool] = None
-    max_streaming_response_tokens: Optional[int] = None
-    prompt_token_ids: Optional[List[int]] = None
+    include_stop_str_in_output: Optional[bool] = False
     bad_words: Optional[List[str]] = None
+    # doc: end-completion-sampling-params
+
+    # doc: begin-completion-extra-params
     response_format: Optional[AnyResponseFormat] = None
     guided_json: Optional[Union[str, dict, BaseModel]] = None
     guided_regex: Optional[str] = None
     guided_choice: Optional[list[str]] = None
     guided_grammar: Optional[str] = None
-    # doc: begin-completion-sampling-params
-    repetition_penalty: Optional[float] = None
-    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
-
-    # doc: end-completion-sampling-params
+    max_streaming_response_tokens: Optional[int] = None
+    return_token_ids: Optional[bool] = None
+    prompt_token_ids: Optional[List[int]] = None
+    # doc: end-completion-extra-params
 
     def to_dict_for_infer(self, request_id=None, prompt=None):
         """
@@ -477,33 +478,34 @@ class ChatCompletionRequest(BaseModel):
     stream_options: Optional[StreamOptions] = None
     temperature: Optional[float] = None
     top_p: Optional[float] = None
-    top_k: Optional[int] = None
-    min_p: Optional[float] = None
     user: Optional[str] = None
     metadata: Optional[dict] = None
+    response_format: Optional[AnyResponseFormat] = None
 
-    return_token_ids: Optional[bool] = None
-    prompt_token_ids: Optional[List[int]] = None
-    disable_chat_template: Optional[bool] = False
+    # doc: begin-chat-completion-sampling-params
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
     min_tokens: Optional[int] = None
-    enable_thinking: Optional[bool] = None
-    reasoning_max_tokens: Optional[int] = None
-    max_streaming_response_tokens: Optional[int] = None
     include_stop_str_in_output: Optional[bool] = False
     bad_words: Optional[List[str]] = None
+    repetition_penalty: Optional[float] = None
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
+    # doc: end-chat-completion-sampling-params
 
-    response_format: Optional[AnyResponseFormat] = None
+    # doc: begin-chat-completion-extra-params
+    chat_template_kwargs: Optional[dict] = None
+    reasoning_max_tokens: Optional[int] = None
+    structural_tag: Optional[str] = None
     guided_json: Optional[Union[str, dict, BaseModel]] = None
     guided_regex: Optional[str] = None
     guided_choice: Optional[list[str]] = None
     guided_grammar: Optional[str] = None
-    structural_tag: Optional[str] = None
-    # doc: begin-chat-completion-sampling-params
-    repetition_penalty: Optional[float] = None
-    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
-
-    # doc: end-chat-completion-sampling-params
+    return_token_ids: Optional[bool] = None
+    prompt_token_ids: Optional[List[int]] = None
+    max_streaming_response_tokens: Optional[int] = None
+    disable_chat_template: Optional[bool] = False
+    # doc: end-chat-completion-extra-params
 
     def to_dict_for_infer(self, request_id=None):
         """
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 8b2141a4b..5ebcc98fc 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -129,11 +129,11 @@ class OpenAIServingChat:
             if request.max_streaming_response_tokens is not None
             else (request.metadata or {}).get("max_streaming_response_tokens", 1)
         )  # dierctly passed & passed in metadata
-        enable_thinking = (
-            request.enable_thinking
-            if request.enable_thinking is not None
-            else (request.metadata or {}).get("enable_thinking")
-        )
+
+        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
+        if enable_thinking is None:
+            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
+
         include_stop_str_in_output = request.include_stop_str_in_output
         stream_options = request.stream_options
@@ -330,11 +330,10 @@ class OpenAIServingChat:
         """
         created_time = int(time.time())
         final_res = None
-        enable_thinking = (
-            request.enable_thinking
-            if request.enable_thinking is not None
-            else (request.metadata or {}).get("enable_thinking")
-        )
+        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
+        if enable_thinking is None:
+            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
+
         include_stop_str_in_output = request.include_stop_str_in_output
 
         try:
diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
index f6aa2b424..fb31a655f 100644
--- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
+++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -493,7 +493,7 @@ def test_chat_with_thinking(openai_client, capsys):
         temperature=1,
         stream=False,
         max_tokens=10,
-        extra_body={"enable_thinking": True},
+        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
     )
     assert response.choices[0].message.reasoning_content is not None
 
@@ -504,7 +504,7 @@
         temperature=1,
         stream=False,
         max_tokens=10,
-        extra_body={"enable_thinking": False},
+        extra_body={"chat_template_kwargs": 
{"enable_thinking": False}}, ) assert response.choices[0].message.reasoning_content is None @@ -514,7 +514,11 @@ def test_chat_with_thinking(openai_client, capsys): model="default", messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], temperature=1, - extra_body={"enable_thinking": True, "reasoning_max_tokens": reasoning_max_tokens, "return_token_ids": True}, + extra_body={ + "chat_template_kwargs": {"enable_thinking": True}, + "reasoning_max_tokens": reasoning_max_tokens, + "return_token_ids": True, + }, stream=True, max_tokens=10, )