diff --git a/docs/features/reasoning_output.md b/docs/features/reasoning_output.md index 6ea2e928e..f98262d62 100644 --- a/docs/features/reasoning_output.md +++ b/docs/features/reasoning_output.md @@ -43,7 +43,7 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \ {"type": "text", "text": "Which era does the cultural relic in the picture belong to"} ]} ], - "enable_thinking": true, + "chat_template_kwargs":{"enable_thinking": true}, "reasoning_max_tokens": 1024 }' ``` @@ -70,7 +70,7 @@ chat_response = client.chat.completions.create( model="vl", stream=True, extra_body={ - "enable_thinking": True, + "chat_template_kwargs":{"enable_thinking": True}, "reasoning_max_tokens": 1024 } ) diff --git a/docs/get_started/ernie-4.5-vl.md b/docs/get_started/ernie-4.5-vl.md index 14719daca..71b0626ae 100644 --- a/docs/get_started/ernie-4.5-vl.md +++ b/docs/get_started/ernie-4.5-vl.md @@ -113,7 +113,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "From which era does the artifact in the image originate?"} ]} ], - "enable_thinking": false + "chat_template_kwargs":{"enable_thinking": false} }' ``` diff --git a/docs/get_started/quick_start_vl.md b/docs/get_started/quick_start_vl.md index 1e1a37425..83b1b97d7 100644 --- a/docs/get_started/quick_start_vl.md +++ b/docs/get_started/quick_start_vl.md @@ -74,7 +74,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "What era does this artifact belong to?"} ]} ], - "enable_thinking": false + "chat_template_kwargs":{"enable_thinking": false} }' ``` diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md index 536637110..761e79720 100644 --- a/docs/online_serving/README.md +++ b/docs/online_serving/README.md @@ -21,9 +21,10 @@ python -m fastdeploy.entrypoints.openai.api_server \ For more usage methods of the command line during service deployment, refer to [Parameter Descriptions](../parameters.md). -## Sending User Requests +## Chat Completion API +FastDeploy provides a Chat Completion API that is compatible with the OpenAI protocol, allowing user requests to be sent directly using OpenAI's request method. -The FastDeploy interface is compatible with the OpenAI protocol, allowing user requests to be sent directly using OpenAI's request method. +### Sending User Requests Here is an example of sending a user request using the curl command: @@ -73,53 +74,169 @@ print('\n') For a description of the OpenAI protocol, refer to the document [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). -## Parameter Differences -### Request Parameter Differences -The differences in request parameters between FastDeploy and the OpenAI protocol are as follows. Other request parameters will be ignored: +### Compatible OpenAI Parameters +```python +messages: Union[List[Any], List[int]] +# List of input messages, which can be text messages (`List[Any]`, typically `List[dict]`) or token ID lists (`List[int]`). -- `prompt` (supported only in the `v1/completions` interface) -- `messages` (supported only in the `v1/chat/completions` interface) -- `logprobs`: Optional[bool] = False (supported only in the `v1/chat/completions` interface) -- `top_logprobs`: Optional[int] = None (supported only in the `v1/chat/completions` interface. 
An integer between 0 and 20,logprobs must be set to true if this parameter is used)
-- `frequency_penalty`: Optional[float] = 0.0
-- `max_tokens`: Optional[int] = 16
-- `presence_penalty`: Optional[float] = 0.0
-- `stream`: Optional[bool] = False
-- `stream_options`: Optional[StreamOptions] = None
-- `temperature`: Optional[float] = None
-- `top_p`: Optional[float] = None
-- `extra_body`: Optional[dict] = None (supported only in `v1/chat/completions` for configuring additional parameters, e.g., `extra_body={"enable_thinking": True}`)
-  - `min_tokens`: Optional[int] = 1 (minimum number of tokens generated)
-  - `reasoning_max_tokens`: Optional[int] = None (maximum number of tokens for reasoning content, defaults to the same as `max_tokens`)
-  - `enable_thinking`: Optional[bool] = True (whether to enable reasoning for models that support deep thinking)
-  - `repetition_penalty`: Optional[float] = None (coefficient for directly penalizing repeated token generation (>1 penalizes repetition, <1 encourages repetition))
-  - `return_token_ids`: Optional[bool] = False: (whether to return token ids as a list)
-  - `include_stop_str_in_output`: Optional[bool] = False: (whether to include the stop strings in output text. Defaults to False.)

+tools: Optional[List[ChatCompletionToolsParam]] = None
+# List of tool call configurations, used for enabling function calling (Function Calling) or tool usage (e.g., the ReAct framework).

-> Note: For multimodal models, since the reasoning chain is enabled by default, resulting in overly long outputs, `max_tokens` can be set to the model's maximum output length or the default value can be used.

+model: Optional[str] = "default"
+# Specifies the model name or version to use, defaulting to `"default"` (which may point to the base model).

-### Return Field Differences

+frequency_penalty: Optional[float] = None
+# Frequency penalty coefficient, penalizing tokens in proportion to how often they have already appeared (`>0` suppresses repetition, `<0` encourages repetition, default `None` disables).

-The additional return fields added by FastDeploy are as follows:

+logprobs: Optional[bool] = False
+# Whether to return the log probabilities of each generated token, used for debugging or analysis.

-- `arrival_time`: Returns the cumulative time taken for all tokens
-- `reasoning_content`: The returned result of the reasoning chain
-- `prompt_token_ids`: The token id list of the prompt
-- `completion_token_ids`: The token id list of the completion

+top_logprobs: Optional[int] = 0
+# Returns the top `top_logprobs` tokens and their log probabilities for each generated position (default `0` means no return).
+
+max_tokens: Optional[int] = Field(
+    default=None,
+    deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
+)
+# Deprecated: Maximum number of tokens to generate (recommended to use `max_completion_tokens` instead).
+
+max_completion_tokens: Optional[int] = None
+# Maximum number of tokens to generate (recommended alternative to `max_tokens`), no default limit (restricted by the model's context window).
+
+presence_penalty: Optional[float] = None
+# Presence penalty coefficient, penalizing tokens that have already appeared, which encourages the model to move on to new topics (`>0` encourages new topics, `<0` encourages staying on existing content, default `None` disables).
+
+stream: Optional[bool] = False
+# Whether to enable streaming output (return results token by token), default `False` (returns complete results at once).
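+# Illustrative sketch only (not part of the schema): with the OpenAI Python client shown
+# earlier in this document, streaming corresponds to
+#   client.chat.completions.create(model="default", messages=[...], stream=True)
+# after which the response object is iterated chunk by chunk.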
+
+stream_options: Optional[StreamOptions] = None
+# Additional configurations for streaming output (such as chunk size, timeout, etc.), refer to the specific definition of `StreamOptions`.
+
+temperature: Optional[float] = None
+# Temperature coefficient, controlling generation randomness (`0.0` for deterministic generation, `>1.0` for more randomness, default `None` uses the model default).
+
+top_p: Optional[float] = None
+# Nucleus sampling threshold, keeping only the smallest set of highest-probability tokens whose cumulative probability reaches `top_p` (default `None` disables).
+
+response_format: Optional[AnyResponseFormat] = None
+# Specifies the output format (such as JSON, XML, etc.), requires passing a predefined format configuration object.
+
+user: Optional[str] = None
+# User identifier, used for tracking or distinguishing requests from different users (default `None` does not pass).
+
+metadata: Optional[dict] = None
+# Additional metadata, used for passing custom information (such as request ID, debug markers, etc.).
+
+```
+
+### Additional Parameters Added by FastDeploy
+
+> Note:
+When sending requests using curl, the following parameters can be used directly;
+When sending requests using openai.Client, these parameters need to be placed in the `extra_body` parameter, e.g. `extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`.
+
+The following sampling parameters are supported.
+```python
+top_k: Optional[int] = None
+# Limits consideration to the top K highest-probability tokens at each generation step, used to control randomness (default None means no limit).
+
+min_p: Optional[float] = None
+# Minimum probability threshold: tokens whose probability, relative to that of the most likely token, falls below min_p are filtered out (default None means disabled).
+
+min_tokens: Optional[int] = None
+# Forces a minimum number of tokens to be generated, avoiding premature truncation (default None means no limit).
+
+include_stop_str_in_output: Optional[bool] = False
+# Whether to include the stop string content in the output (default False, meaning output is truncated when a stop string is encountered).
+
+bad_words: Optional[List[str]] = None
+# List of forbidden words (e.g., sensitive words) that the model should avoid generating (default None means no restriction).
+
+repetition_penalty: Optional[float] = None
+# Repetition penalty coefficient, reducing the probability of repeating already generated tokens (`>1.0` suppresses repetition, `<1.0` encourages repetition, default None means disabled).
+```
+
+The following extra parameters are supported:
+```python
+chat_template_kwargs: Optional[dict] = None
+# Additional parameters passed to the chat template, used for customizing dialogue formats (default None).
+
+reasoning_max_tokens: Optional[int] = None
+# Maximum number of tokens to generate during reasoning (e.g., CoT, chain of thought) (default None means using global max_tokens).
+
+structural_tag: Optional[str] = None
+# Structural tag, used to mark specific structures of generated content (such as JSON, XML, etc., default None).
+
+guided_json: Optional[Union[str, dict, BaseModel]] = None
+# Guides the generation of content conforming to JSON structure, can be a JSON string, dictionary, or Pydantic model (default None).
+
+guided_regex: Optional[str] = None
+# Guides the generation of content conforming to regular expression rules (default None means no restriction).
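+# Illustrative, hypothetical pattern: guided_regex=r"\d{4}-\d{2}-\d{2}" would constrain
+# the output to a date-shaped string.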
+
+guided_choice: Optional[List[str]] = None
+# Guides the generation of content selected from a specified candidate list (default None means no restriction).
+
+guided_grammar: Optional[str] = None
+# Guides the generation of content conforming to grammar rules (such as BNF) (default None means no restriction).
+
+return_token_ids: Optional[bool] = None
+# Whether to return the token IDs of the generation results instead of text (default None means return text).
+
+prompt_token_ids: Optional[List[int]] = None
+# Directly passes the token ID list of the prompt, skipping the text encoding step (default None means using text input).
+
+max_streaming_response_tokens: Optional[int] = None
+# Maximum number of tokens returned at a time during streaming output (default None means no limit).
+
+disable_chat_template: Optional[bool] = False
+# Whether to disable chat template rendering, using raw input directly (default False means the template is enabled).
+```
+
+### Differences in Return Fields
+
+Additional return fields added by FastDeploy:
+
+- `arrival_time`: Cumulative time consumed for all tokens
+- `reasoning_content`: Return results of the chain of thought
+- `prompt_token_ids`: List of token IDs for the input sequence
+- `completion_token_ids`: List of token IDs for the output sequence
 
 Overview of return parameters:
 
 ```python
+
+ChatCompletionResponse:
+    id: str
+    object: str = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseChoice]
+    usage: UsageInfo
+ChatCompletionResponseChoice:
+    index: int
+    message: ChatMessage
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]]
+ChatMessage:
+    role: str
+    content: str
+    reasoning_content: Optional[str] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
+
+# Fields returned for streaming responses
 ChatCompletionStreamResponse:
     id: str
     object: str = "chat.completion.chunk"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = None
 ChatCompletionResponseStreamChoice:
     index: int
     delta: DeltaMessage
-    finish_reason: Optional[Literal["stop", "length"]] = None
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
     arrival_time: Optional[float] = None
 DeltaMessage:
     role: Optional[str] = None
@@ -128,3 +245,156 @@ DeltaMessage:
     completion_token_ids: Optional[List[int]] = None
     reasoning_content: Optional[str] = None
 ```
+
+## Completion API
+The Completion API is mainly used for continuation scenarios: it suits users who have already constructed their own context and want the model to output only the continuation; no additional `prompt` concatenation is applied during inference, as the sketch below illustrates.
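+
+A minimal sketch of the contrast with the Chat Completion API above (assumptions: an OpenAI-compatible FastDeploy server is already running on port 8188, as in the curl example below; client usage mirrors the earlier examples): the Chat API renders `messages` through the model's chat template, while the Completion API sends `prompt` to the model verbatim.
+
+```python
+# Sketch only; both calls assume a FastDeploy server at 0.0.0.0:8188.
+import openai
+
+client = openai.Client(base_url="http://0.0.0.0:8188/v1", api_key="null")
+
+# /v1/chat/completions: the chat template is applied to `messages` before inference.
+chat = client.chat.completions.create(
+    model="default",
+    messages=[{"role": "user", "content": "Write one sentence about the sea."}],
+)
+
+# /v1/completions: the prompt below is continued as-is, with no template concatenation.
+completion = client.completions.create(
+    model="default",
+    prompt="The sea at dawn is",
+)
+```
+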
+### Sending User Requests
+
+Here is an example of sending a user request using the curl command:
+
+```bash
+curl -X POST "http://0.0.0.0:8188/v1/completions" \
+-H "Content-Type: application/json" \
+-d '{
+  "prompt": "以下是一篇关于深圳文心公园的500字游记和赏析:"
+}'
+```
+
+Here is an example of sending a user request using a Python script:
+
+```python
+import openai
+host = "0.0.0.0"
+port = "8170"
+client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
+
+response = client.completions.create(
+    model="default",
+    prompt="以下是一篇关于深圳文心公园的500字游记和赏析:",
+    stream=False,
+)
+print(response.choices[0].text)
+```
+
+For an explanation of the OpenAI protocol, refer to the [OpenAI Completion API](https://platform.openai.com/docs/api-reference/completions/create).
+
+### Compatible OpenAI Parameters
+```python
+model: Optional[str] = "default"
+# Specifies the model name or version to use, defaulting to `"default"` (which may point to the base model).
+
+prompt: Union[List[int], List[List[int]], str, List[str]]
+# Input prompt, supporting multiple formats:
+# - `str`: Plain text prompt (e.g., `"Hello, how are you?"`).
+# - `List[str]`: Multiple text segments (e.g., `["User:", "Hello!", "Assistant:", "Hi!"]`).
+# - `List[int]`: Directly passes a list of token IDs (e.g., `[123, 456]`).
+# - `List[List[int]]`: List of multiple token ID lists (e.g., `[[123], [456, 789]]`).
+
+best_of: Optional[int] = None
+# Generates `best_of` candidate results and returns the highest-scoring one (requires `n=1`).
+
+frequency_penalty: Optional[float] = None
+# Frequency penalty coefficient, penalizing tokens in proportion to how often they have already appeared (`>0` suppresses repetition, `<0` encourages repetition).
+
+logprobs: Optional[int] = None
+# Returns the log probabilities of each generated token, can specify the number of candidates to return.
+
+max_tokens: Optional[int] = None
+# Maximum number of tokens to generate in the completion; the prompt length plus `max_tokens` is bounded by the model's context window (no explicit default limit).
+
+presence_penalty: Optional[float] = None
+# Presence penalty coefficient, penalizing tokens that have already appeared, which encourages the model to move on to new topics (`>0` encourages new topics, `<0` encourages staying on existing content).
+```
+
+### Additional Parameters Added by FastDeploy
+
+> Note:
+When sending requests using curl, the following parameters can be used directly;
+When sending requests using openai.Client, these parameters need to be placed in the `extra_body` parameter, e.g. `extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`.
+
+The following sampling parameters are supported.
+```python
+top_k: Optional[int] = None
+# Limits consideration to the top K highest-probability tokens at each generation step, used to control randomness (default None means no limit).
+
+min_p: Optional[float] = None
+# Minimum probability threshold: tokens whose probability, relative to that of the most likely token, falls below min_p are filtered out (default None means disabled).
+
+min_tokens: Optional[int] = None
+# Forces a minimum number of tokens to be generated, avoiding premature truncation (default None means no limit).
+
+include_stop_str_in_output: Optional[bool] = False
+# Whether to include the stop string content in the output (default False, meaning output is truncated when a stop string is encountered).
+
+bad_words: Optional[List[str]] = None
+# List of forbidden words (e.g., sensitive words) that the model should avoid generating (default None means no restriction).
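+# Illustrative, hypothetical values: bad_words=["foo", "bar"] keeps both strings out of the output.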
+ +repetition_penalty: Optional[float] = None +# Repetition penalty coefficient, reducing the probability of repeating already generated tokens (`>1.0` suppresses repetition, `<1.0` encourages repetition, default None means disabled). +``` + +The following extra parameters are supported: +```python +guided_json: Optional[Union[str, dict, BaseModel]] = None +# Guides the generation of content conforming to JSON structure, can be a JSON string, dictionary, or Pydantic model (default None). + +guided_regex: Optional[str] = None +# Guides the generation of content conforming to regular expression rules (default None means no restriction). + +guided_choice: Optional[List[str]] = None +# Guides the generation of content selected from a specified candidate list (default None means no restriction). + +guided_grammar: Optional[str] = None +# Guides the generation of content conforming to grammar rules (such as BNF) (default None means no restriction). + +return_token_ids: Optional[bool] = None +# Whether to return the token IDs of the generation results instead of text (default None means return text). + +prompt_token_ids: Optional[List[int]] = None +# Directly passes the token ID list of the prompt, skipping the text encoding step (default None means using text input). + +max_streaming_response_tokens: Optional[int] = None +# Maximum number of tokens returned at a time during streaming output (default None means no limit). +``` + +### Overview of Return Parameters + +```python + +CompletionResponse: + id: str + object: str = "text_completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: List[CompletionResponseChoice] + usage: UsageInfo +CompletionResponseChoice: + index: int + text: str + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None + arrival_time: Optional[float] = None + logprobs: Optional[int] = None + reasoning_content: Optional[str] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls"]] + +# Fields returned for streaming responses +CompletionStreamResponse: + id: str + object: str = "text_completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: List[CompletionResponseStreamChoice] + usage: Optional[UsageInfo] = None +CompletionResponseStreamChoice: + index: int + text: str + arrival_time: float = None + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None + logprobs: Optional[float] = None + reasoning_content: Optional[str] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None + +``` diff --git a/docs/zh/features/reasoning_output.md b/docs/zh/features/reasoning_output.md index f41ba77dd..cd32e4c6c 100644 --- a/docs/zh/features/reasoning_output.md +++ b/docs/zh/features/reasoning_output.md @@ -43,7 +43,7 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \ {"type": "text", "text": "图中的文物属于哪个年代"} ]} ], - "enable_thinking": true, + "chat_template_kwargs":{"enable_thinking": true}, "reasoning_max_tokens": 1024 }' @@ -71,7 +71,7 @@ chat_response = client.chat.completions.create( model="vl", stream=True, extra_body={ - "enable_thinking": True, + "chat_template_kwargs":{"enable_thinking": True}, "reasoning_max_tokens": 1024 } ) diff --git a/docs/zh/get_started/ernie-4.5-vl.md b/docs/zh/get_started/ernie-4.5-vl.md index 5bec9ca20..3922c899f 100644 --- a/docs/zh/get_started/ernie-4.5-vl.md +++ b/docs/zh/get_started/ernie-4.5-vl.md @@ -110,7 +110,7 @@ curl -X POST 
"http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "图中的文物属于哪个年代"} ]} ], - "enable_thinking": false + "chat_template_kwargs":{"enable_thinking": false} }' ``` diff --git a/docs/zh/get_started/quick_start_vl.md b/docs/zh/get_started/quick_start_vl.md index c9fe26a51..0f4c88cc1 100644 --- a/docs/zh/get_started/quick_start_vl.md +++ b/docs/zh/get_started/quick_start_vl.md @@ -73,7 +73,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "图中的文物属于哪个年代"} ]} ], - "enable_thinking": false + "chat_template_kwargs":{"enable_thinking": false} }' ``` diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md index 1c90dc2f3..a68eedbdb 100644 --- a/docs/zh/online_serving/README.md +++ b/docs/zh/online_serving/README.md @@ -21,9 +21,10 @@ python -m fastdeploy.entrypoints.openai.api_server \ 服务部署时的命令行更多使用方式参考[参数说明](../parameters.md)。 -## 发送用户请求 +## Chat Completion API +FastDeploy 接口兼容 OpenAI 的 Chat Completion API,用户可以通过 OpenAI 协议发送用户请求。 -FastDeploy 接口兼容 OpenAI 协议,可以直接使用 OpenAI 的请求方式发送用户请求。 +### 发送用户请求 使用 curl 命令发送用户请求示例如下: @@ -71,32 +72,124 @@ for chunk in response: print('\n') ``` -关于 OpenAI 协议的说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create)。 +关于 OpenAI 协议的说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create)。 -## 参数差异 -### 请求参数差异 -FastDeploy 与 OpenAI 协议的请求参数差异如下,其余请求参数会被忽略: -- `prompt` (仅支持 `v1/completions` 接口) -- `messages` (仅支持 `v1/chat/completions` 接口) -- `logprobs`: Optional[bool] = False (仅支持 `v1/chat/completions` 接口) -- `top_logprobs`: Optional[int] = None (仅支持 `v1/chat/completions` 接口。如果使用这个参数必须设置logprobs为True,取值大于等于0小于20) -- `frequency_penalty`: Optional[float] = 0.0 -- `max_tokens`: Optional[int] = 16 -- `presence_penalty`: Optional[float] = 0.0 -- `stream`: Optional[bool] = False -- `stream_options`: Optional[StreamOptions] = None -- `temperature`: Optional[float] = None -- `top_p`: Optional[float] = None -- `extra_body`: Optional[dict] = None (仅在 v1/chat/compeltions 中支持,用于配置额外参数, 如 `extra_body={"enable_thinking": True}`) - - `min_tokens`: Optional[int] = 1 最小生成的Token个数 - - `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数,默认与max_tokens一致 - - `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考 - - `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数(>1时惩罚重复,<1时鼓励重复) - - `return_token_ids`: Optional[bool] = False: 是否返回 token id 列表 - - `include_stop_str_in_output`: Optional[bool] = False: 是否返回结束符 - - `top_k`: Optional[bool] = None: TopK-TopP采样参数,参考[采样说明](../features/sampling.md) +### 兼容OpenAI 参数 +```python +messages: Union[List[Any], List[int]] +# 输入消息列表,可以是文本消息(`List[Any]`,通常为 `List[dict]`)或 token ID 列表(`List[int]`)。 -> 注: 若为多模态模型 由于思考链默认打开导致输出过长,max tokens 可以设置为模型最长输出,或使用默认值。 +tools: Optional[List[ChatCompletionToolsParam]] = None +# 工具调用配置列表,用于启用函数调用(Function Calling)或工具使用(如 ReAct 框架)。 + +model: Optional[str] = "default" +# 指定使用的模型名称或版本,默认值为 `"default"`(可能指向基础模型)。 + +frequency_penalty: Optional[float] = None +# 频率惩罚系数,降低重复生成相同 token 的概率(`>1.0` 抑制重复,`<1.0` 鼓励重复,默认 `None` 禁用)。 + +logprobs: Optional[bool] = False +# 是否返回每个生成 token 的对数概率(log probabilities),用于调试或分析。 + +top_logprobs: Optional[int] = 0 +# 返回每个生成位置概率最高的 `top_logprobs` 个 token 及其对数概率(默认 `0` 表示不返回)。 + +max_tokens: Optional[int] = Field( + default=None, + deprecated="max_tokens is deprecated in favor of the max_completion_tokens field", +) +# 已弃用:生成的最大 token 数(建议改用 `max_completion_tokens`)。 + +max_completion_tokens: Optional[int] = None +# 生成的最大 
token 数(推荐替代 `max_tokens`),默认无限制(受模型上下文窗口限制)。 + +presence_penalty: Optional[float] = None +# 存在惩罚系数,降低新主题(未出现过的话题)的生成概率(`>1.0` 抑制新话题,`<1.0` 鼓励新话题,默认 `None` 禁用)。 + +stream: Optional[bool] = False +# 是否启用流式输出(逐 token 返回结果),默认 `False`(一次性返回完整结果)。 + +stream_options: Optional[StreamOptions] = None +# 流式输出的额外配置(如分块大小、超时等),需参考 `StreamOptions` 的具体定义。 + +temperature: Optional[float] = None +# 温度系数,控制生成随机性(`0.0` 确定性生成,`>1.0` 更随机,默认 `None` 使用模型默认值)。 + +top_p: Optional[float] = None +# 核采样(nucleus sampling)阈值,只保留概率累计超过 `top_p` 的 token(默认 `None` 禁用)。 + +response_format: Optional[AnyResponseFormat] = None +# 指定输出格式(如 JSON、XML 等),需传入预定义的格式配置对象。 + +user: Optional[str] = None +# 用户标识符,用于跟踪或区分不同用户的请求(默认 `None` 不传递)。 + +metadata: Optional[dict] = None +# 附加元数据,用于传递自定义信息(如请求 ID、调试标记等)。 + +``` + +### FastDeploy 增加额外参数 + +> 注: +使用 curl 命令发送请求时, 可以直接使用以下参数; +使用openai.Client 发送请求时,需要使用将以下参数放入 `extra_body` 参数中, 如:`extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`。 + +额外采样参数的支持如下: +```python +top_k: Optional[int] = None +# 限制每一步生成时只考虑概率最高的 K 个 token,用于控制随机性(默认 None 表示不限制)。 + +min_p: Optional[float] = None +# 核采样(nucleus sampling)阈值,只保留概率累计超过 min_p 的 token(默认 None 表示禁用)。 + +min_tokens: Optional[int] = None +# 强制生成的最小 token 数,避免过早截断(默认 None 表示不限制)。 + +include_stop_str_in_output: Optional[bool] = False +# 是否在输出中包含停止符(stop string)的内容(默认 False,即遇到停止符时截断输出)。 + +bad_words: Optional[List[str]] = None +# 禁止生成的词汇列表(例如敏感词),模型会避免输出这些词(默认 None 表示不限制)。 + +repetition_penalty: Optional[float] = None +# 重复惩罚系数,降低已生成 token 的重复概率(>1.0 抑制重复,<1.0 鼓励重复,默认 None 表示禁用)。 +``` +其他参数的支持如下: +```python +chat_template_kwargs: Optional[dict] = None +# 传递给聊天模板(chat template)的额外参数,用于自定义对话格式(默认 None)。 + +reasoning_max_tokens: Optional[int] = None +# 推理(如 CoT, 思维链)过程中生成的最大 token 数(默认 None 表示使用全局 max_tokens)。 + +structural_tag: Optional[str] = None +# 结构化标签,用于标记生成内容的特定结构(如 JSON、XML 等,默认 None)。 + +guided_json: Optional[Union[str, dict, BaseModel]] = None +# 引导生成符合 JSON 结构的内容,可以是 JSON 字符串、字典或 Pydantic 模型(默认 None)。 + +guided_regex: Optional[str] = None +# 引导生成符合正则表达式规则的内容(默认 None 表示不限制)。 + +guided_choice: Optional[List[str]] = None +# 引导生成内容从指定的候选列表中选择(默认 None 表示不限制)。 + +guided_grammar: Optional[str] = None +# 引导生成符合语法规则(如 BNF)的内容(默认 None 表示不限制)。 + +return_token_ids: Optional[bool] = None +# 是否返回生成结果的 token ID 而非文本(默认 None 表示返回文本)。 + +prompt_token_ids: Optional[List[int]] = None +# 直接传入 prompt 的 token ID 列表,跳过文本编码步骤(默认 None 表示使用文本输入)。 + +max_streaming_response_tokens: Optional[int] = None +# 流式输出时每次返回的最大 token 数(默认 None 表示不限制)。 + +disable_chat_template: Optional[bool] = False +# 是否禁用聊天模板渲染,直接使用原始输入(默认 False 表示启用模板)。 +``` ### 返回字段差异 @@ -110,16 +203,39 @@ FastDeploy 增加的返回字段如下: 返回参数总览: ```python + +ChatCompletionResponse: + id: str + object: str = "chat.completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: List[ChatCompletionResponseChoice] + usage: UsageInfo +ChatCompletionResponseChoice: + index: int + message: ChatMessage + logprobs: Optional[LogProbs] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]] +ChatMessage: + role: str + content: str + reasoning_content: Optional[str] = None + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None + +# 返回流式响应的字段 ChatCompletionStreamResponse: id: str object: str = "chat.completion.chunk" created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[ChatCompletionResponseStreamChoice] + usage: Optional[UsageInfo] = 
None ChatCompletionResponseStreamChoice: index: int delta: DeltaMessage - finish_reason: Optional[Literal["stop", "length"]] = None + logprobs: Optional[LogProbs] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None arrival_time: Optional[float] = None DeltaMessage: role: Optional[str] = None @@ -128,3 +244,155 @@ DeltaMessage: completion_token_ids: Optional[List[int]] = None reasoning_content: Optional[str] = None ``` + +## Completion API +Completion API 接口主要用于续聊场景, 适应于用户自定义好上下文输入, 并希望模型仅输出续写内容的场景; 推理过程不会增加其他 `prompt`拼接。: + +### 发送用户请求 + +使用 curl 命令发送用户请求示例如下: + +```bash +curl -X POST "http://0.0.0.0:8188/v1/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "prompt": "以下是一篇关于深圳文心公园的500字游记和赏析:" +}' +``` + +使用 Python 脚本发送用户请求示例如下: + +```python +import openai +host = "0.0.0.0" +port = "8170" +client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null") + +response = client.completions.create( + model="default", + prompt="以下是一篇关于深圳文心公园的500字游记和赏析:", + stream=False, +) +print(response.choices[0].text) +``` + +关于 OpenAI 协议的说明可参考文档 [OpenAI Completion API](https://platform.openai.com/docs/api-reference/completions/create)。 + +### 兼容OpenAI 参数 +```python +model: Optional[str] = "default" +# 指定使用的模型名称或版本,默认值为 `"default"`(可能指向基础模型)。 + +prompt: Union[List[int], List[List[int]], str, List[str]] +# 输入提示,支持多种格式: +# - `str`: 纯文本提示(如 `"Hello, how are you?"`)。 +# - `List[str]`: 多段文本(如 `["User:", "Hello!", "Assistant:", "Hi!"]`)。 +# - `List[int]`: 直接传入 token ID 列表(如 `[123, 456]`)。 +# - `List[List[int]]`: 多段 token ID 列表(如 `[[123], [456, 789]]`)。 + +best_of: Optional[int] = None +# 生成 `best_of` 个候选结果,然后返回其中评分最高的一个(需配合 `n=1` 使用)。 + +frequency_penalty: Optional[float] = None +# 频率惩罚系数,降低重复生成相同 token 的概率(`>1.0` 抑制重复,`<1.0` 鼓励重复)。 + +logprobs: Optional[int] = None +# 返回每个生成 token 的对数概率(log probabilities),可指定返回的候选数量。 + +max_tokens: Optional[int] = None +# 生成的最大 token 数(包括输入和输出),默认无限制(受模型上下文窗口限制)。 + +presence_penalty: Optional[float] = None +# 存在惩罚系数,降低新主题(未出现过的话题)的生成概率(`>1.0` 抑制新话题,`<1.0` 鼓励新话题)。 +``` + +### FastDeploy 增加额外参数 + +> 注: +使用 curl 命令发送请求时, 可以直接使用以下参数; +使用openai.Client 发送请求时,需要使用将以下参数放入 `extra_body` 参数中, 如:`extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`。 + +额外采样参数的支持如下: +```python +top_k: Optional[int] = None +# 限制每一步生成时只考虑概率最高的 K 个 token,用于控制随机性(默认 None 表示不限制)。 + +min_p: Optional[float] = None +# 核采样(nucleus sampling)阈值,只保留概率累计超过 min_p 的 token(默认 None 表示禁用)。 + +min_tokens: Optional[int] = None +# 强制生成的最小 token 数,避免过早截断(默认 None 表示不限制)。 + +include_stop_str_in_output: Optional[bool] = False +# 是否在输出中包含停止符(stop string)的内容(默认 False,即遇到停止符时截断输出)。 + +bad_words: Optional[List[str]] = None +# 禁止生成的词汇列表(例如敏感词),模型会避免输出这些词(默认 None 表示不限制)。 + +repetition_penalty: Optional[float] = None +# 重复惩罚系数,降低已生成 token 的重复概率(>1.0 抑制重复,<1.0 鼓励重复,默认 None 表示禁用)。 +``` +其他参数的支持如下: +```python +guided_json: Optional[Union[str, dict, BaseModel]] = None +# 引导生成符合 JSON 结构的内容,可以是 JSON 字符串、字典或 Pydantic 模型(默认 None)。 + +guided_regex: Optional[str] = None +# 引导生成符合正则表达式规则的内容(默认 None 表示不限制)。 + +guided_choice: Optional[List[str]] = None +# 引导生成内容从指定的候选列表中选择(默认 None 表示不限制)。 + +guided_grammar: Optional[str] = None +# 引导生成符合语法规则(如 BNF)的内容(默认 None 表示不限制)。 + +return_token_ids: Optional[bool] = None +# 是否返回生成结果的 token ID 而非文本(默认 None 表示返回文本)。 + +prompt_token_ids: Optional[List[int]] = None +# 直接传入 prompt 的 token ID 列表,跳过文本编码步骤(默认 None 表示使用文本输入)。 + +max_streaming_response_tokens: Optional[int] = None +# 流式输出时每次返回的最大 token 数(默认 None 表示不限制)。 +``` + 
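+下面给出一个示意性示例(假设服务已按上文方式启动在 8170 端口;参数取值仅作演示),展示如何通过 `extra_body` 传入上述额外参数:
+
+```python
+# 示意:通过 extra_body 传入 FastDeploy 额外参数(top_k、min_tokens、return_token_ids 取值仅为演示)
+import openai
+
+client = openai.Client(base_url="http://0.0.0.0:8170/v1", api_key="null")
+
+response = client.completions.create(
+    model="default",
+    prompt="以下是一篇关于深圳文心公园的500字游记和赏析:",
+    extra_body={"top_k": 20, "min_tokens": 10, "return_token_ids": True},
+    stream=False,
+)
+print(response.choices[0].text)
+```
+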
+### 返回参数总览
+
+```python
+
+CompletionResponse:
+    id: str
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseChoice]
+    usage: UsageInfo
+CompletionResponseChoice:
+    index: int
+    text: str
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
+    arrival_time: Optional[float] = None
+    logprobs: Optional[int] = None
+    reasoning_content: Optional[str] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
+
+# 返回流式响应的字段
+CompletionStreamResponse:
+    id: str
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = None
+CompletionResponseStreamChoice:
+    index: int
+    text: str
+    arrival_time: float = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
+    logprobs: Optional[float] = None
+    reasoning_content: Optional[str] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
+
+```
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 482399b48..f4fd099f7 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -343,28 +343,29 @@ class CompletionRequest(BaseModel):
     suffix: Optional[dict] = None
     temperature: Optional[float] = None
     top_p: Optional[float] = None
-    top_k: Optional[int] = None
-    min_p: Optional[float] = None
-    include_stop_str_in_output: Optional[bool] = False
     user: Optional[str] = None
+
+    # doc: begin-completion-sampling-params
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    repetition_penalty: Optional[float] = None
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
     min_tokens: Optional[int] = None
-    return_token_ids: Optional[bool] = None
-    max_streaming_response_tokens: Optional[int] = None
-    prompt_token_ids: Optional[List[int]] = None
+    include_stop_str_in_output: Optional[bool] = False
     bad_words: Optional[List[str]] = None
+    # doc: end-completion-sampling-params
+
+    # doc: begin-completion-extra-params
     response_format: Optional[AnyResponseFormat] = None
     guided_json: Optional[Union[str, dict, BaseModel]] = None
     guided_regex: Optional[str] = None
     guided_choice: Optional[list[str]] = None
     guided_grammar: Optional[str] = None
-    # doc: begin-completion-sampling-params
-    repetition_penalty: Optional[float] = None
-    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
-
-    # doc: end-completion-sampling-params
+    max_streaming_response_tokens: Optional[int] = None
+    return_token_ids: Optional[bool] = None
+    prompt_token_ids: Optional[List[int]] = None
+    # doc: end-completion-extra-params
 
     def to_dict_for_infer(self, request_id=None, prompt=None):
         """
@@ -477,33 +478,34 @@ class ChatCompletionRequest(BaseModel):
     stream_options: Optional[StreamOptions] = None
     temperature: Optional[float] = None
     top_p: Optional[float] = None
-    top_k: Optional[int] = None
-    min_p: Optional[float] = None
     user: Optional[str] = None
     metadata: Optional[dict] = None
+    response_format: Optional[AnyResponseFormat] = None
 
-    return_token_ids: Optional[bool] = None
-    prompt_token_ids: Optional[List[int]] = None
-    disable_chat_template: Optional[bool] = False
+    # doc: begin-chat-completion-sampling-params
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
     min_tokens: Optional[int] = None
-    enable_thinking: Optional[bool] = None
-    reasoning_max_tokens: Optional[int] = None
-    max_streaming_response_tokens: Optional[int] = None
     include_stop_str_in_output: Optional[bool] = False
     bad_words: Optional[List[str]] = None
+    repetition_penalty: Optional[float] = None
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
+    # doc: end-chat-completion-sampling-params
 
-    response_format: Optional[AnyResponseFormat] = None
+    # doc: begin-chat-completion-extra-params
+    chat_template_kwargs: Optional[dict] = None
+    reasoning_max_tokens: Optional[int] = None
+    structural_tag: Optional[str] = None
     guided_json: Optional[Union[str, dict, BaseModel]] = None
     guided_regex: Optional[str] = None
     guided_choice: Optional[list[str]] = None
     guided_grammar: Optional[str] = None
-    structural_tag: Optional[str] = None
-    # doc: begin-chat-completion-sampling-params
-    repetition_penalty: Optional[float] = None
-    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
-
-    # doc: end-chat-completion-sampling-params
+    return_token_ids: Optional[bool] = None
+    prompt_token_ids: Optional[List[int]] = None
+    max_streaming_response_tokens: Optional[int] = None
+    disable_chat_template: Optional[bool] = False
+    # doc: end-chat-completion-extra-params
 
     def to_dict_for_infer(self, request_id=None):
         """
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 8b2141a4b..5ebcc98fc 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -129,11 +129,11 @@ class OpenAIServingChat:
             if request.max_streaming_response_tokens is not None
             else (request.metadata or {}).get("max_streaming_response_tokens", 1)
         )  # dierctly passed & passed in metadata
-        enable_thinking = (
-            request.enable_thinking
-            if request.enable_thinking is not None
-            else (request.metadata or {}).get("enable_thinking")
-        )
+
+        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
+        if enable_thinking is None:
+            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
+
         include_stop_str_in_output = request.include_stop_str_in_output
         stream_options = request.stream_options
@@ -330,11 +330,10 @@ class OpenAIServingChat:
         """
         created_time = int(time.time())
         final_res = None
-        enable_thinking = (
-            request.enable_thinking
-            if request.enable_thinking is not None
-            else (request.metadata or {}).get("enable_thinking")
-        )
+        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
+        if enable_thinking is None:
+            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
+
         include_stop_str_in_output = request.include_stop_str_in_output
 
         try:
diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
index f6aa2b424..fb31a655f 100644
--- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
+++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -493,7 +493,7 @@ def test_chat_with_thinking(openai_client, capsys):
         temperature=1,
         stream=False,
         max_tokens=10,
-        extra_body={"enable_thinking": True},
+        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
     )
     assert response.choices[0].message.reasoning_content is not None
 
@@ -504,7 +504,7 @@
         temperature=1,
         stream=False,
         max_tokens=10,
-        extra_body={"enable_thinking": False},
+        extra_body={"chat_template_kwargs": 
{"enable_thinking": False}}, ) assert response.choices[0].message.reasoning_content is None @@ -514,7 +514,11 @@ def test_chat_with_thinking(openai_client, capsys): model="default", messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], temperature=1, - extra_body={"enable_thinking": True, "reasoning_max_tokens": reasoning_max_tokens, "return_token_ids": True}, + extra_body={ + "chat_template_kwargs": {"enable_thinking": True}, + "reasoning_max_tokens": reasoning_max_tokens, + "return_token_ids": True, + }, stream=True, max_tokens=10, )