[Doc] add chat_template_kwargs and update params docs (#3103)

* add chat_template_kwargs and update params docs

* add chat_template_kwargs and update params docs

* update enable_thinking

* pre-commit

* update test case

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
LiqinruiG
2025-07-31 19:44:06 +08:00
committed by GitHub
parent 22cab724e8
commit 25005fee30
11 changed files with 648 additions and 105 deletions

View File

@@ -43,7 +43,7 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
{"type": "text", "text": "Which era does the cultural relic in the picture belong to"}
]}
],
"enable_thinking": true,
"chat_template_kwargs":{"enable_thinking": true},
"reasoning_max_tokens": 1024
}'
```
@@ -70,7 +70,7 @@ chat_response = client.chat.completions.create(
model="vl",
stream=True,
extra_body={
"enable_thinking": True,
"chat_template_kwargs":{"enable_thinking": True},
"reasoning_max_tokens": 1024
}
)

View File

@@ -113,7 +113,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
{"type": "text", "text": "From which era does the artifact in the image originate?"}
]}
],
"enable_thinking": false
"chat_template_kwargs":{"enable_thinking": false}
}'
```

View File

@@ -74,7 +74,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
{"type": "text", "text": "What era does this artifact belong to?"}
]}
],
"enable_thinking": false
"chat_template_kwargs":{"enable_thinking": false}
}'
```

View File

@@ -21,9 +21,10 @@ python -m fastdeploy.entrypoints.openai.api_server \
For more usage methods of the command line during service deployment, refer to [Parameter Descriptions](../parameters.md).
## Sending User Requests
## Chat Completion API
FastDeploy provides a Chat Completion API that is compatible with the OpenAI protocol, allowing user requests to be sent directly using OpenAI's request method.
The FastDeploy interface is compatible with the OpenAI protocol, allowing user requests to be sent directly using OpenAI's request method.
### Sending User Requests
Here is an example of sending a user request using the curl command:
@@ -73,53 +74,169 @@ print('\n')
For a description of the OpenAI protocol, refer to the document [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create).
## Parameter Differences
### Request Parameter Differences
The differences in request parameters between FastDeploy and the OpenAI protocol are as follows. Other request parameters will be ignored:
- `prompt` (supported only in the `v1/completions` interface)
- `messages` (supported only in the `v1/chat/completions` interface)
- `logprobs`: Optional[bool] = False (supported only in the `v1/chat/completions` interface)
- `top_logprobs`: Optional[int] = None (supported only in the `v1/chat/completions` interface; an integer between 0 and 20, `logprobs` must be set to true if this parameter is used)
- `frequency_penalty`: Optional[float] = 0.0
- `max_tokens`: Optional[int] = 16
- `presence_penalty`: Optional[float] = 0.0
- `stream`: Optional[bool] = False
- `stream_options`: Optional[StreamOptions] = None
- `temperature`: Optional[float] = None
- `top_p`: Optional[float] = None
- `extra_body`: Optional[dict] = None (supported only in `v1/chat/completions` for configuring additional parameters, e.g., `extra_body={"enable_thinking": True}`)
- `min_tokens`: Optional[int] = 1 (minimum number of tokens generated)
- `reasoning_max_tokens`: Optional[int] = None (maximum number of tokens for reasoning content, defaults to the same as `max_tokens`)
- `enable_thinking`: Optional[bool] = True (whether to enable reasoning for models that support deep thinking)
- `repetition_penalty`: Optional[float] = None (coefficient for directly penalizing repeated token generation; >1 penalizes repetition, <1 encourages repetition)
- `return_token_ids`: Optional[bool] = False (whether to return token ids as a list)
- `include_stop_str_in_output`: Optional[bool] = False (whether to include the stop strings in the output text; defaults to False)
> Note: For multimodal models, since the reasoning chain is enabled by default, resulting in overly long outputs, `max_tokens` can be set to the model's maximum output length or the default value can be used.
### Return Field Differences
The additional return fields added by FastDeploy are as follows:
- `arrival_time`: Returns the cumulative time taken for all tokens
- `reasoning_content`: The returned result of the reasoning chain
- `prompt_token_ids`: The token id list of the prompt
- `completion_token_ids`: The token id list of the completion
### Compatible OpenAI Parameters
```python
messages: Union[List[Any], List[int]]
# List of input messages, which can be text messages (`List[Any]`, typically `List[dict]`) or token ID lists (`List[int]`).
tools: Optional[List[ChatCompletionToolsParam]] = None
# List of tool call configurations, used for enabling function calling (Function Calling) or tool usage (e.g., ReAct framework).
model: Optional[str] = "default"
# Specifies the model name or version to use, defaulting to `"default"` (which may point to the base model).
frequency_penalty: Optional[float] = None
# Frequency penalty coefficient, reducing the probability of generating the same token repeatedly (`>1.0` suppresses repetition, `<1.0` encourages repetition, default `None` disables).
logprobs: Optional[bool] = False
# Whether to return the log probabilities of each generated token, used for debugging or analysis.
top_logprobs: Optional[int] = 0
# Returns the top `top_logprobs` tokens and their log probabilities for each generated position (default `0` means no return).
max_tokens: Optional[int] = Field(
default=None,
deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
)
# Deprecated: Maximum number of tokens to generate (recommended to use `max_completion_tokens` instead).
max_completion_tokens: Optional[int] = None
# Maximum number of tokens to generate (recommended alternative to `max_tokens`), no default limit (restricted by the model's context window).
presence_penalty: Optional[float] = None
# Presence penalty coefficient, reducing the probability of generating new topics (unseen topics) (`>1.0` suppresses new topics, `<1.0` encourages new topics, default `None` disables).
stream: Optional[bool] = False
# Whether to enable streaming output (return results token by token), default `False` (returns complete results at once).
stream_options: Optional[StreamOptions] = None
# Additional configurations for streaming output (such as chunk size, timeout, etc.), refer to the specific definition of `StreamOptions`.
temperature: Optional[float] = None
# Temperature coefficient, controlling generation randomness (`0.0` for deterministic generation, `>1.0` for more randomness, default `None` uses model default).
top_p: Optional[float] = None
# Nucleus sampling threshold, only retaining tokens whose cumulative probability exceeds `top_p` (default `None` disables).
response_format: Optional[AnyResponseFormat] = None
# Specifies the output format (such as JSON, XML, etc.), requires passing a predefined format configuration object.
user: Optional[str] = None
# User identifier, used for tracking or distinguishing requests from different users (default `None` does not pass).
metadata: Optional[dict] = None
# Additional metadata, used for passing custom information (such as request ID, debug markers, etc.).
```
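For reference, here is a minimal sketch of a request that uses only OpenAI-compatible parameters from the list above; the host, port, and model name are placeholders, and a recent `openai` Python client is assumed for `max_completion_tokens`:
```python
import openai

# Placeholder endpoint and model name; adjust to your deployment.
client = openai.Client(base_url="http://0.0.0.0:8188/v1", api_key="null")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Name three primary colors."}],
    max_completion_tokens=64,  # preferred over the deprecated max_tokens
    temperature=0.7,
    top_p=0.9,
    logprobs=True,
    top_logprobs=5,            # requires logprobs=True
)
print(response.choices[0].message.content)
print(response.choices[0].logprobs)
```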
### Additional Parameters Added by FastDeploy
> Note:
When sending requests using curl, the following parameters can be used directly;
When sending requests using openai.Client, these parameters need to be placed in the `extra_body` parameter, e.g. `extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`.
The following sampling parameters are supported.
```python
top_k: Optional[int] = None
# Limits the consideration to the top K tokens with the highest probability at each generation step, used to control randomness (default None means no limit).
min_p: Optional[float] = None
# Nucleus sampling threshold, only retaining tokens whose cumulative probability exceeds min_p (default None means disabled).
min_tokens: Optional[int] = None
# Forces a minimum number of tokens to be generated, avoiding premature truncation (default None means no limit).
include_stop_str_in_output: Optional[bool] = False
# Whether to include the stop string content in the output (default False, meaning output is truncated when a stop string is encountered).
bad_words: Optional[List[str]] = None
# List of forbidden words (e.g., sensitive words) that the model should avoid generating (default None means no restriction).
repetition_penalty: Optional[float] = None
# Repetition penalty coefficient, reducing the probability of repeating already generated tokens (`>1.0` suppresses repetition, `<1.0` encourages repetition, default None means disabled).
```
The following extra parameters are supported:
```python
chat_template_kwargs: Optional[dict] = None
# Additional parameters passed to the chat template, used for customizing dialogue formats (default None).
reasoning_max_tokens: Optional[int] = None
# Maximum number of tokens to generate during reasoning (e.g., CoT, chain of thought) (default None means using global max_tokens).
structural_tag: Optional[str] = None
# Structural tag, used to mark specific structures of generated content (such as JSON, XML, etc., default None).
guided_json: Optional[Union[str, dict, BaseModel]] = None
# Guides the generation of content conforming to JSON structure, can be a JSON string, dictionary, or Pydantic model (default None).
guided_regex: Optional[str] = None
# Guides the generation of content conforming to regular expression rules (default None means no restriction).
guided_choice: Optional[List[str]] = None
# Guides the generation of content selected from a specified candidate list (default None means no restriction).
guided_grammar: Optional[str] = None
# Guides the generation of content conforming to grammar rules (such as BNF) (default None means no restriction).
return_token_ids: Optional[bool] = None
# Whether to return the token IDs of the generation results instead of text (default None means return text).
prompt_token_ids: Optional[List[int]] = None
# Directly passes the token ID list of the prompt, skipping the text encoding step (default None means using text input).
max_streaming_response_tokens: Optional[int] = None
# Maximum number of tokens returned at a time during streaming output (default None means no limit).
disable_chat_template: Optional[bool] = False
# Whether to disable chat template rendering, using raw input directly (default False means template is enabled).
```
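As an illustration of the note above, a minimal sketch that passes the FastDeploy-specific parameters through `extra_body` with the OpenAI Python client (host, port, and model name are placeholders):
```python
import openai

client = openai.Client(base_url="http://0.0.0.0:8188/v1", api_key="null")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Explain gravity to a five-year-old."}],
    max_tokens=256,
    extra_body={
        "chat_template_kwargs": {"enable_thinking": True},  # forwarded to the chat template
        "reasoning_max_tokens": 128,                        # cap on reasoning-chain tokens
        "top_k": 20,                                        # extra sampling parameter
        "include_stop_str_in_output": False,
        "return_token_ids": True,
    },
)
print(response.choices[0].message.content)
```
When using curl, the same keys can be placed directly in the JSON request body instead.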
### Differences in Return Fields
Additional return fields added by FastDeploy:
- `arrival_time`: Cumulative time consumed for all tokens
- `reasoning_content`: Return results of the chain of thought
- `prompt_token_ids`: List of token IDs for the input sequence
- `completion_token_ids`: List of token IDs for the output sequence
Overview of return parameters:
```python
ChatCompletionResponse:
id: str
object: str = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseChoice]
usage: UsageInfo
ChatCompletionResponseChoice:
index: int
message: ChatMessage
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]]
ChatMessage:
role: str
content: str
reasoning_content: Optional[str] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
# Fields returned for streaming responses
ChatCompletionStreamResponse:
id: str
object: str = "chat.completion.chunk"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]
usage: Optional[UsageInfo] = None
ChatCompletionResponseStreamChoice:
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]] = None
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
arrival_time: Optional[float] = None
DeltaMessage:
role: Optional[str] = None
@@ -128,3 +245,156 @@ DeltaMessage:
completion_token_ids: Optional[List[int]] = None
reasoning_content: Optional[str] = None
```
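As a quick sketch of reading the FastDeploy-specific fields from a non-streaming response (this assumes a request like the one above, i.e. thinking enabled and `return_token_ids` set):
```python
choice = response.choices[0]
print(choice.finish_reason)                  # "stop", "length", "tool_calls" or "recover_stop"
print(choice.message.reasoning_content)      # chain-of-thought text, if the model produced one
print(choice.message.prompt_token_ids)       # token IDs of the rendered prompt
print(choice.message.completion_token_ids)   # token IDs of the generated completion
```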
## Completion API
The Completion API is mainly intended for continuation scenarios: it suits users who have already prepared the full context themselves and expect the model to output only the continuation; the inference process does not concatenate any additional `prompt` template.
### Sending User Requests
Here is an example of sending a user request using the curl command:
```bash
curl -X POST "http://0.0.0.0:8188/v1/completions" \
-H "Content-Type: application/json" \
-d '{
"prompt": "以下是一篇关于深圳文心公园的500字游记和赏析"
}'
```
Here is an example of sending a user request using a Python script:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.completions.create(
model="default",
prompt="以下是一篇关于深圳文心公园的500字游记和赏析",
stream=False,
)
print(response.choices[0].text)
```
For an explanation of the OpenAI protocol, refer to the [OpenAI Completion API](https://platform.openai.com/docs/api-reference/completions/create).
### Compatible OpenAI Parameters
```python
model: Optional[str] = "default"
# Specifies the model name or version to use, defaulting to `"default"` (which may point to the base model).
prompt: Union[List[int], List[List[int]], str, List[str]]
# Input prompt, supporting multiple formats:
# - `str`: Plain text prompt (e.g., `"Hello, how are you?"`).
# - `List[str]`: Multiple text segments (e.g., `["User:", "Hello!", "Assistant:", "Hi!"]`).
# - `List[int]`: Directly passes a list of token IDs (e.g., `[123, 456]`).
# - `List[List[int]]`: List of multiple token ID lists (e.g., `[[123], [456, 789]]`).
best_of: Optional[int] = None
# Generates `best_of` candidate results and returns the highest-scoring one (requires `n=1`).
frequency_penalty: Optional[float] = None
# Frequency penalty coefficient, reducing the probability of generating the same token repeatedly (`>1.0` suppresses repetition, `<1.0` encourages repetition).
logprobs: Optional[int] = None
# Returns the log probabilities of each generated token, can specify the number of candidates to return.
max_tokens: Optional[int] = None
# Maximum number of tokens to generate (including input and output), no default limit (restricted by the model's context window).
presence_penalty: Optional[float] = None
# Presence penalty coefficient, reducing the probability of generating new topics (unseen topics) (`>1.0` suppresses new topics, `<1.0` encourages new topics).
```
### Additional Parameters Added by FastDeploy
> Note:
When sending requests using curl, the following parameters can be used directly;
When sending requests using openai.Client, these parameters need to be placed in the `extra_body` parameter, e.g. `extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`.
The following sampling parameters are supported.
```python
top_k: Optional[int] = None
# Limits the consideration to the top K tokens with the highest probability at each generation step, used to control randomness (default None means no limit).
min_p: Optional[float] = None
# Nucleus sampling threshold, only retaining tokens whose cumulative probability exceeds min_p (default None means disabled).
min_tokens: Optional[int] = None
# Forces a minimum number of tokens to be generated, avoiding premature truncation (default None means no limit).
include_stop_str_in_output: Optional[bool] = False
# Whether to include the stop string content in the output (default False, meaning output is truncated when a stop string is encountered).
bad_words: Optional[List[str]] = None
# List of forbidden words (e.g., sensitive words) that the model should avoid generating (default None means no restriction).
repetition_penalty: Optional[float] = None
# Repetition penalty coefficient, reducing the probability of repeating already generated tokens (`>1.0` suppresses repetition, `<1.0` encourages repetition, default None means disabled).
```
The following extra parameters are supported:
```python
guided_json: Optional[Union[str, dict, BaseModel]] = None
# Guides the generation of content conforming to JSON structure, can be a JSON string, dictionary, or Pydantic model (default None).
guided_regex: Optional[str] = None
# Guides the generation of content conforming to regular expression rules (default None means no restriction).
guided_choice: Optional[List[str]] = None
# Guides the generation of content selected from a specified candidate list (default None means no restriction).
guided_grammar: Optional[str] = None
# Guides the generation of content conforming to grammar rules (such as BNF) (default None means no restriction).
return_token_ids: Optional[bool] = None
# Whether to return the token IDs of the generation results instead of text (default None means return text).
prompt_token_ids: Optional[List[int]] = None
# Directly passes the token ID list of the prompt, skipping the text encoding step (default None means using text input).
max_streaming_response_tokens: Optional[int] = None
# Maximum number of tokens returned at a time during streaming output (default None means no limit).
```
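For example, a minimal sketch of a continuation request that combines sampling and guided-decoding extras via `extra_body` (host, port, and model name are placeholders):
```python
import openai

client = openai.Client(base_url="http://0.0.0.0:8188/v1", api_key="null")

response = client.completions.create(
    model="default",
    prompt="The capital of France is",
    max_tokens=16,
    extra_body={
        "top_k": 50,                         # extra sampling parameter
        "min_tokens": 1,                     # avoid an empty completion
        "guided_choice": ["Paris", "Lyon"],  # constrain output to a candidate list
        "return_token_ids": True,            # also return token IDs
    },
)
print(response.choices[0].text)
```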
### Overview of Return Parameters
```python
CompletionResponse:
id: str
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseChoice]
usage: UsageInfo
CompletionResponseChoice:
index: int
text: str
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
arrival_time: Optional[float] = None
logprobs: Optional[int] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
# Fields returned for streaming responses
CompletionStreamResponse
id: str
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseStreamChoice]
usage: Optional[UsageInfo] = None
CompletionResponseStreamChoice:
index: int
text: str
arrival_time: float = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
logprobs: Optional[float] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
```

View File

@@ -43,7 +43,7 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
{"type": "text", "text": "图中的文物属于哪个年代"}
]}
],
"enable_thinking": true,
"chat_template_kwargs":{"enable_thinking": true},
"reasoning_max_tokens": 1024
}'
@@ -71,7 +71,7 @@ chat_response = client.chat.completions.create(
model="vl",
stream=True,
extra_body={
"enable_thinking": True,
"chat_template_kwargs":{"enable_thinking": True},
"reasoning_max_tokens": 1024
}
)

View File

@@ -110,7 +110,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
{"type": "text", "text": "图中的文物属于哪个年代"}
]}
],
"enable_thinking": false
"chat_template_kwargs":{"enable_thinking": false}
}'
```

View File

@@ -73,7 +73,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
{"type": "text", "text": "图中的文物属于哪个年代"}
]}
],
"enable_thinking": false
"chat_template_kwargs":{"enable_thinking": false}
}'
```

View File

@@ -21,9 +21,10 @@ python -m fastdeploy.entrypoints.openai.api_server \
服务部署时的命令行更多使用方式参考[参数说明](../parameters.md)。
## 发送用户请求
## Chat Completion API
FastDeploy 接口兼容 OpenAI 的 Chat Completion API,用户可以通过 OpenAI 协议发送用户请求。
FastDeploy 接口兼容 OpenAI 协议,可以直接使用 OpenAI 的请求方式发送用户请求
### 发送用户请求
使用 curl 命令发送用户请求示例如下:
@@ -71,32 +72,124 @@ for chunk in response:
print('\n')
```
关于 OpenAI 协议的说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create)。
关于 OpenAI 协议的说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create)。
## 参数差异
### 请求参数差异
FastDeploy 与 OpenAI 协议的请求参数差异如下,其余请求参数会被忽略:
- `prompt` (仅支持 `v1/completions` 接口)
- `messages` (仅支持 `v1/chat/completions` 接口)
- `logprobs`: Optional[bool] = False (仅支持 `v1/chat/completions` 接口)
- `top_logprobs`: Optional[int] = None (仅支持 `v1/chat/completions` 接口。如果使用这个参数必须设置logprobs为True取值大于等于0小于20)
- `frequency_penalty`: Optional[float] = 0.0
- `max_tokens`: Optional[int] = 16
- `presence_penalty`: Optional[float] = 0.0
- `stream`: Optional[bool] = False
- `stream_options`: Optional[StreamOptions] = None
- `temperature`: Optional[float] = None
- `top_p`: Optional[float] = None
- `extra_body`: Optional[dict] = None (仅在 v1/chat/compeltions 中支持,用于配置额外参数, 如 `extra_body={"enable_thinking": True}`)
- `min_tokens`: Optional[int] = 1 最小生成的Token个数
- `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数默认与max_tokens一致
- `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考
- `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数>1时惩罚重复<1时鼓励重复
- `return_token_ids`: Optional[bool] = False: 是否返回 token id 列表
- `include_stop_str_in_output`: Optional[bool] = False: 是否返回结束符
- `top_k`: Optional[bool] = None: TopK-TopP采样参数参考[采样说明](../features/sampling.md)
> 注: 若为多模态模型,由于思考链默认打开导致输出过长,max tokens 可以设置为模型最长输出,或使用默认值。
### 兼容OpenAI 参数
```python
messages: Union[List[Any], List[int]]
# 输入消息列表,可以是文本消息(`List[Any]`,通常为 `List[dict]`)或 token ID 列表(`List[int]`)。
tools: Optional[List[ChatCompletionToolsParam]] = None
# 工具调用配置列表用于启用函数调用Function Calling或工具使用如 ReAct 框架)。
model: Optional[str] = "default"
# 指定使用的模型名称或版本,默认值为 `"default"`(可能指向基础模型)。
frequency_penalty: Optional[float] = None
# 频率惩罚系数,降低重复生成相同 token 的概率(`>1.0` 抑制重复,`<1.0` 鼓励重复,默认 `None` 禁用)。
logprobs: Optional[bool] = False
# 是否返回每个生成 token 的对数概率log probabilities用于调试或分析。
top_logprobs: Optional[int] = 0
# 返回每个生成位置概率最高的 `top_logprobs` 个 token 及其对数概率(默认 `0` 表示不返回)。
max_tokens: Optional[int] = Field(
default=None,
deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
)
# 已弃用:生成的最大 token 数(建议改用 `max_completion_tokens`)。
max_completion_tokens: Optional[int] = None
# 生成的最大 token 数(推荐替代 `max_tokens`),默认无限制(受模型上下文窗口限制)。
presence_penalty: Optional[float] = None
# 存在惩罚系数,降低新主题(未出现过的话题)的生成概率(`>1.0` 抑制新话题,`<1.0` 鼓励新话题,默认 `None` 禁用)。
stream: Optional[bool] = False
# 是否启用流式输出(逐 token 返回结果),默认 `False`(一次性返回完整结果)。
stream_options: Optional[StreamOptions] = None
# 流式输出的额外配置(如分块大小、超时等),需参考 `StreamOptions` 的具体定义。
temperature: Optional[float] = None
# 温度系数,控制生成随机性(`0.0` 确定性生成,`>1.0` 更随机,默认 `None` 使用模型默认值)。
top_p: Optional[float] = None
# 核采样nucleus sampling阈值只保留概率累计超过 `top_p` 的 token默认 `None` 禁用)。
response_format: Optional[AnyResponseFormat] = None
# 指定输出格式(如 JSON、XML 等),需传入预定义的格式配置对象。
user: Optional[str] = None
# 用户标识符,用于跟踪或区分不同用户的请求(默认 `None` 不传递)。
metadata: Optional[dict] = None
# 附加元数据,用于传递自定义信息(如请求 ID、调试标记等)。
```
### FastDeploy 增加额外参数
> 注:
使用 curl 命令发送请求时, 可以直接使用以下参数;
使用 openai.Client 发送请求时,需要将以下参数放入 `extra_body` 参数中, 如:`extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`
额外采样参数的支持如下:
```python
top_k: Optional[int] = None
# 限制每一步生成时只考虑概率最高的 K 个 token用于控制随机性默认 None 表示不限制)。
min_p: Optional[float] = None
# 核采样nucleus sampling阈值只保留概率累计超过 min_p 的 token默认 None 表示禁用)。
min_tokens: Optional[int] = None
# 强制生成的最小 token 数,避免过早截断(默认 None 表示不限制)。
include_stop_str_in_output: Optional[bool] = False
# 是否在输出中包含停止符stop string的内容默认 False即遇到停止符时截断输出
bad_words: Optional[List[str]] = None
# 禁止生成的词汇列表(例如敏感词),模型会避免输出这些词(默认 None 表示不限制)。
repetition_penalty: Optional[float] = None
# 重复惩罚系数,降低已生成 token 的重复概率(>1.0 抑制重复,<1.0 鼓励重复,默认 None 表示禁用)。
```
其他参数的支持如下:
```python
chat_template_kwargs: Optional[dict] = None
# 传递给聊天模板chat template的额外参数用于自定义对话格式默认 None
reasoning_max_tokens: Optional[int] = None
# 推理(如 CoT, 思维链)过程中生成的最大 token 数(默认 None 表示使用全局 max_tokens
structural_tag: Optional[str] = None
# 结构化标签,用于标记生成内容的特定结构(如 JSON、XML 等,默认 None
guided_json: Optional[Union[str, dict, BaseModel]] = None
# 引导生成符合 JSON 结构的内容,可以是 JSON 字符串、字典或 Pydantic 模型(默认 None
guided_regex: Optional[str] = None
# 引导生成符合正则表达式规则的内容(默认 None 表示不限制)。
guided_choice: Optional[List[str]] = None
# 引导生成内容从指定的候选列表中选择(默认 None 表示不限制)。
guided_grammar: Optional[str] = None
# 引导生成符合语法规则(如 BNF的内容默认 None 表示不限制)。
return_token_ids: Optional[bool] = None
# 是否返回生成结果的 token ID 而非文本(默认 None 表示返回文本)。
prompt_token_ids: Optional[List[int]] = None
# 直接传入 prompt 的 token ID 列表,跳过文本编码步骤(默认 None 表示使用文本输入)。
max_streaming_response_tokens: Optional[int] = None
# 流式输出时每次返回的最大 token 数(默认 None 表示不限制)。
disable_chat_template: Optional[bool] = False
# 是否禁用聊天模板渲染,直接使用原始输入(默认 False 表示启用模板)。
```
### 返回字段差异
@@ -110,16 +203,39 @@ FastDeploy 增加的返回字段如下:
返回参数总览:
```python
ChatCompletionResponse:
id: str
object: str = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseChoice]
usage: UsageInfo
ChatCompletionResponseChoice:
index: int
message: ChatMessage
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]]
ChatMessage:
role: str
content: str
reasoning_content: Optional[str] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
# 返回流式响应的字段
ChatCompletionStreamResponse:
id: str
object: str = "chat.completion.chunk"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]
usage: Optional[UsageInfo] = None
ChatCompletionResponseStreamChoice:
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]] = None
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
arrival_time: Optional[float] = None
DeltaMessage:
role: Optional[str] = None
@@ -128,3 +244,155 @@ DeltaMessage:
completion_token_ids: Optional[List[int]] = None
reasoning_content: Optional[str] = None
```
## Completion API
Completion API 接口主要用于续聊场景, 适用于用户自定义好上下文输入, 并希望模型仅输出续写内容的场景; 推理过程不会增加其他 `prompt` 拼接。
### 发送用户请求
使用 curl 命令发送用户请求示例如下:
```bash
curl -X POST "http://0.0.0.0:8188/v1/completions" \
-H "Content-Type: application/json" \
-d '{
"prompt": "以下是一篇关于深圳文心公园的500字游记和赏析"
}'
```
使用 Python 脚本发送用户请求示例如下:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.completions.create(
model="default",
prompt="以下是一篇关于深圳文心公园的500字游记和赏析",
stream=False,
)
print(response.choices[0].text)
```
关于 OpenAI 协议的说明可参考文档 [OpenAI Completion API](https://platform.openai.com/docs/api-reference/completions/create)。
### 兼容OpenAI 参数
```python
model: Optional[str] = "default"
# 指定使用的模型名称或版本,默认值为 `"default"`(可能指向基础模型)。
prompt: Union[List[int], List[List[int]], str, List[str]]
# 输入提示,支持多种格式:
# - `str`: 纯文本提示(如 `"Hello, how are you?"`)。
# - `List[str]`: 多段文本(如 `["User:", "Hello!", "Assistant:", "Hi!"]`)。
# - `List[int]`: 直接传入 token ID 列表(如 `[123, 456]`)。
# - `List[List[int]]`: 多段 token ID 列表(如 `[[123], [456, 789]]`)。
best_of: Optional[int] = None
# 生成 `best_of` 个候选结果,然后返回其中评分最高的一个(需配合 `n=1` 使用)。
frequency_penalty: Optional[float] = None
# 频率惩罚系数,降低重复生成相同 token 的概率(`>1.0` 抑制重复,`<1.0` 鼓励重复)。
logprobs: Optional[int] = None
# 返回每个生成 token 的对数概率log probabilities可指定返回的候选数量。
max_tokens: Optional[int] = None
# 生成的最大 token 数(包括输入和输出),默认无限制(受模型上下文窗口限制)。
presence_penalty: Optional[float] = None
# 存在惩罚系数,降低新主题(未出现过的话题)的生成概率(`>1.0` 抑制新话题,`<1.0` 鼓励新话题)。
```
### FastDeploy 增加额外参数
> 注:
使用 curl 命令发送请求时, 可以直接使用以下参数;
使用 openai.Client 发送请求时,需要将以下参数放入 `extra_body` 参数中, 如:`extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`
额外采样参数的支持如下:
```python
top_k: Optional[int] = None
# 限制每一步生成时只考虑概率最高的 K 个 token用于控制随机性默认 None 表示不限制)。
min_p: Optional[float] = None
# 核采样nucleus sampling阈值只保留概率累计超过 min_p 的 token默认 None 表示禁用)。
min_tokens: Optional[int] = None
# 强制生成的最小 token 数,避免过早截断(默认 None 表示不限制)。
include_stop_str_in_output: Optional[bool] = False
# 是否在输出中包含停止符stop string的内容默认 False即遇到停止符时截断输出
bad_words: Optional[List[str]] = None
# 禁止生成的词汇列表(例如敏感词),模型会避免输出这些词(默认 None 表示不限制)。
repetition_penalty: Optional[float] = None
# 重复惩罚系数,降低已生成 token 的重复概率(>1.0 抑制重复,<1.0 鼓励重复,默认 None 表示禁用)。
```
其他参数的支持如下:
```python
guided_json: Optional[Union[str, dict, BaseModel]] = None
# 引导生成符合 JSON 结构的内容,可以是 JSON 字符串、字典或 Pydantic 模型(默认 None
guided_regex: Optional[str] = None
# 引导生成符合正则表达式规则的内容(默认 None 表示不限制)。
guided_choice: Optional[List[str]] = None
# 引导生成内容从指定的候选列表中选择(默认 None 表示不限制)。
guided_grammar: Optional[str] = None
# 引导生成符合语法规则(如 BNF的内容默认 None 表示不限制)。
return_token_ids: Optional[bool] = None
# 是否返回生成结果的 token ID 而非文本(默认 None 表示返回文本)。
prompt_token_ids: Optional[List[int]] = None
# 直接传入 prompt 的 token ID 列表,跳过文本编码步骤(默认 None 表示使用文本输入)。
max_streaming_response_tokens: Optional[int] = None
# 流式输出时每次返回的最大 token 数(默认 None 表示不限制)。
```
### 返回参数总览
```python
CompletionResponse:
id: str
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseChoice]
usage: UsageInfo
CompletionResponseChoice:
index: int
text: str
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
arrival_time: Optional[float] = None
logprobs: Optional[int] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
# 返回流式响应的字段
CompletionStreamResponse
id: str
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseStreamChoice]
usage: Optional[UsageInfo] = None
CompletionResponseStreamChoice:
index: int
text: str
arrival_time: float = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
logprobs: Optional[float] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
```

View File

@@ -343,28 +343,29 @@ class CompletionRequest(BaseModel):
suffix: Optional[dict] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
top_k: Optional[int] = None
min_p: Optional[float] = None
include_stop_str_in_output: Optional[bool] = False
user: Optional[str] = None
# doc: begin-completion-sampling-params
top_k: Optional[int] = None
min_p: Optional[float] = None
repetition_penalty: Optional[float] = None
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
min_tokens: Optional[int] = None
return_token_ids: Optional[bool] = None
max_streaming_response_tokens: Optional[int] = None
prompt_token_ids: Optional[List[int]] = None
include_stop_str_in_output: Optional[bool] = False
bad_words: Optional[List[str]] = None
# doc: end-completion-sampling-params
# doc: start-completion-extra-params
response_format: Optional[AnyResponseFormat] = None
guided_json: Optional[Union[str, dict, BaseModel]] = None
guided_regex: Optional[str] = None
guided_choice: Optional[list[str]] = None
guided_grammar: Optional[str] = None
# doc: begin-completion-sampling-params
repetition_penalty: Optional[float] = None
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
# doc: end-completion-sampling-params
max_streaming_response_tokens: Optional[int] = None
return_token_ids: Optional[bool] = None
prompt_token_ids: Optional[List[int]] = None
# doc: end-completion-extra-params
def to_dict_for_infer(self, request_id=None, prompt=None):
"""
@@ -477,33 +478,34 @@ class ChatCompletionRequest(BaseModel):
stream_options: Optional[StreamOptions] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
top_k: Optional[int] = None
min_p: Optional[float] = None
user: Optional[str] = None
metadata: Optional[dict] = None
response_format: Optional[AnyResponseFormat] = None
return_token_ids: Optional[bool] = None
prompt_token_ids: Optional[List[int]] = None
disable_chat_template: Optional[bool] = False
# doc: begin-chat-completion-sampling-params
top_k: Optional[int] = None
min_p: Optional[float] = None
min_tokens: Optional[int] = None
enable_thinking: Optional[bool] = None
reasoning_max_tokens: Optional[int] = None
max_streaming_response_tokens: Optional[int] = None
include_stop_str_in_output: Optional[bool] = False
bad_words: Optional[List[str]] = None
repetition_penalty: Optional[float] = None
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
# doc: end-chat-completion-sampling-params
response_format: Optional[AnyResponseFormat] = None
# doc: start-completion-extra-params
chat_template_kwargs: Optional[dict] = None
reasoning_max_tokens: Optional[int] = None
structural_tag: Optional[str] = None
guided_json: Optional[Union[str, dict, BaseModel]] = None
guided_regex: Optional[str] = None
guided_choice: Optional[list[str]] = None
guided_grammar: Optional[str] = None
structural_tag: Optional[str] = None
# doc: begin-chat-completion-sampling-params
repetition_penalty: Optional[float] = None
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
# doc: end-chat-completion-sampling-params
return_token_ids: Optional[bool] = None
prompt_token_ids: Optional[List[int]] = None
max_streaming_response_tokens: Optional[int] = None
disable_chat_template: Optional[bool] = False
# doc: end-chat-completion-extra-params
def to_dict_for_infer(self, request_id=None):
"""

View File

@@ -129,11 +129,11 @@ class OpenAIServingChat:
if request.max_streaming_response_tokens is not None
else (request.metadata or {}).get("max_streaming_response_tokens", 1)
) # directly passed & passed in metadata
enable_thinking = (
request.enable_thinking
if request.enable_thinking is not None
else (request.metadata or {}).get("enable_thinking")
)
enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
if enable_thinking is None:
enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
include_stop_str_in_output = request.include_stop_str_in_output
stream_options = request.stream_options
@@ -330,11 +330,10 @@ class OpenAIServingChat:
"""
created_time = int(time.time())
final_res = None
enable_thinking = (
request.enable_thinking
if request.enable_thinking is not None
else (request.metadata or {}).get("enable_thinking")
)
enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
if enable_thinking is None:
enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
include_stop_str_in_output = request.include_stop_str_in_output
try:

View File

@@ -493,7 +493,7 @@ def test_chat_with_thinking(openai_client, capsys):
temperature=1,
stream=False,
max_tokens=10,
extra_body={"enable_thinking": True},
extra_body={"chat_template_kwargs": {"enable_thinking": True}},
)
assert response.choices[0].message.reasoning_content is not None
@@ -504,7 +504,7 @@ def test_chat_with_thinking(openai_client, capsys):
temperature=1,
stream=False,
max_tokens=10,
extra_body={"enable_thinking": False},
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
assert response.choices[0].message.reasoning_content is None
@@ -514,7 +514,11 @@ def test_chat_with_thinking(openai_client, capsys):
model="default",
messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
temperature=1,
extra_body={"enable_thinking": True, "reasoning_max_tokens": reasoning_max_tokens, "return_token_ids": True},
extra_body={
"chat_template_kwargs": {"enable_thinking": True},
"reasoning_max_tokens": reasoning_max_tokens,
"return_token_ids": True,
},
stream=True,
max_tokens=10,
)