diff --git a/docs/features/reasoning_output.md b/docs/features/reasoning_output.md
index 5f23e65d5..6ea2e928e 100644
--- a/docs/features/reasoning_output.md
+++ b/docs/features/reasoning_output.md
@@ -8,14 +8,14 @@ Reasoning models return an additional `reasoning_content` field in their output,
 | baidu/ERNIE-4.5-VL-424B-A47B-Paddle | ernie-45-vl | ✓ |
 | baidu/ERNIE-4.5-VL-28B-A3B-Paddle | ernie-45-vl | ✓ |

-The reasoning model requires a specified parser to extract reasoning content. The reasoning mode can be disabled by setting the `enable_thinking=False` parameter.
+The reasoning model requires a specified parser to extract reasoning content. The reasoning mode can be disabled by setting the `"enable_thinking": false` parameter.

 Interfaces that support toggling the reasoning mode:
 1. `/v1/chat/completions` requests in OpenAI services.
 2. `/v1/chat/completions` requests in the OpenAI Python client.
 3. `llm.chat` requests in Offline interfaces.

-For reasoning models, the length of the reasoning content can be controlled via `reasoning_max_tokens`. Add `metadata={"reasoning_max_tokens": 1024}` to the request.
+For reasoning models, the length of the reasoning content can be controlled via `reasoning_max_tokens`. Add `"reasoning_max_tokens": 1024` to the request.

 ### Quick Start
 When launching the model service, specify the parser name using the `--reasoning-parser` argument.
@@ -43,7 +43,8 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
         {"type": "text", "text": "Which era does the cultural relic in the picture belong to"}
     ]}
     ],
-    "metadata": {"enable_thinking": true}
+    "enable_thinking": true,
+    "reasoning_max_tokens": 1024
 }'
 ```
@@ -68,7 +69,10 @@ chat_response = client.chat.completions.create(
     ],
     model="vl",
     stream=True,
-    metadata={"enable_thinking": True}
+    extra_body={
+        "enable_thinking": True,
+        "reasoning_max_tokens": 1024
+    }
 )
 for chunk in chat_response:
     if chunk.choices[0].delta is not None:
diff --git a/docs/get_started/ernie-4.5-vl.md b/docs/get_started/ernie-4.5-vl.md
index f3b0b38d7..14719daca 100644
--- a/docs/get_started/ernie-4.5-vl.md
+++ b/docs/get_started/ernie-4.5-vl.md
@@ -113,7 +113,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
         {"type": "text", "text": "From which era does the artifact in the image originate?"}
     ]}
     ],
-    "metadata": {"enable_thinking": false}
+    "enable_thinking": false
 }'
 ```
diff --git a/docs/get_started/quick_start_vl.md b/docs/get_started/quick_start_vl.md
index 82bc609b1..1e1a37425 100644
--- a/docs/get_started/quick_start_vl.md
+++ b/docs/get_started/quick_start_vl.md
@@ -74,7 +74,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
         {"type": "text", "text": "What era does this artifact belong to?"}
     ]}
     ],
-    "metadata": {"enable_thinking": false}
+    "enable_thinking": false
 }'
 ```
@@ -96,7 +96,7 @@ response = client.chat.completions.create(
         {"type": "text", "text": "What era does this artifact belong to?"},
     ]},
     ],
-    metadata={"enable_thinking": false},
+    extra_body={"enable_thinking": False},
     stream=True,
 )
 for chunk in response:
diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md
index 8062fe76a..691434eed 100644
--- a/docs/online_serving/README.md
+++ b/docs/online_serving/README.md
@@ -88,11 +88,12 @@ The differences in request parameters between FastDeploy and the OpenAI protocol
 - `stream_options`: Optional[StreamOptions] = None
 - `temperature`: Optional[float] = None
 - `top_p`: Optional[float] = None
-- `metadata`: Optional[dict] = None (supported only in `v1/chat/completions` for
configuring additional parameters, e.g., `metadata={"enable_thinking": True}`) +- `extra_body`: Optional[dict] = None (supported only in `v1/chat/completions` for configuring additional parameters, e.g., `extra_body={"enable_thinking": True}`) - `min_tokens`: Optional[int] = 1 (minimum number of tokens generated) - `reasoning_max_tokens`: Optional[int] = None (maximum number of tokens for reasoning content, defaults to the same as `max_tokens`) - `enable_thinking`: Optional[bool] = True (whether to enable reasoning for models that support deep thinking) - `repetition_penalty`: Optional[float] = None (coefficient for directly penalizing repeated token generation (>1 penalizes repetition, <1 encourages repetition)) + - `return_token_ids`: Optional[bool] = False: (whether to return token ids as a list) > Note: For multimodal models, since the reasoning chain is enabled by default, resulting in overly long outputs, `max_tokens` can be set to the model's maximum output length or the default value can be used. @@ -102,6 +103,8 @@ The additional return fields added by FastDeploy are as follows: - `arrival_time`: Returns the cumulative time taken for all tokens - `reasoning_content`: The returned result of the reasoning chain +- `prompt_token_ids`: The token id list of the prompt +- `completion_token_ids`: The token id list of the completion Overview of return parameters: @@ -112,7 +115,7 @@ ChatCompletionStreamResponse: created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[ChatCompletionResponseStreamChoice] - ChatCompletionResponseStreamChoice: +ChatCompletionResponseStreamChoice: index: int delta: DeltaMessage finish_reason: Optional[Literal["stop", "length"]] = None @@ -120,6 +123,7 @@ ChatCompletionStreamResponse: DeltaMessage: role: Optional[str] = None content: Optional[str] = None - token_ids: Optional[List[int]] = None + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None reasoning_content: Optional[str] = None ``` diff --git a/docs/zh/features/reasoning_output.md b/docs/zh/features/reasoning_output.md index 5417f66d7..f41ba77dd 100644 --- a/docs/zh/features/reasoning_output.md +++ b/docs/zh/features/reasoning_output.md @@ -8,18 +8,18 @@ | baidu/ERNIE-4.5-VL-424B-A47B-Paddle | ernie-45-vl | ✓ | | baidu/ERNIE-4.5-VL-28B-A3B-Paddle | ernie-45-vl | ✓ | -思考模型需要指定解析器,以便于对思考内容进行解析. 通过`enable_thinking=False` 参数可以关闭模型思考模式. +思考模型需要指定解析器,以便于对思考内容进行解析. 通过 `"enable_thinking": false` 参数可以关闭模型思考模式. 可以支持思考模式开关的接口: 1. OpenAI 服务中 `/v1/chat/completions` 请求. 2. OpenAI Python客户端中 `/v1/chat/completions` 请求. 3. Offline 接口中 `llm.chat`请求. -同时在思考模型中,支持通过```reasoning_max_tokens```控制思考内容的长度,在请求中添加```metadata={"reasoning_max_tokens": 1024}```即可。 +同时在思考模型中,支持通过 `reasoning_max_tokens` 控制思考内容的长度,在请求中添加 `"reasoning_max_tokens": 1024` 即可。 ## 快速使用 -在启动模型服务时, 通过`--reasoning-parser`参数指定解析器名称. -该解析器会解析思考模型的输出, 提取`reasoning_content`字段. +在启动模型服务时, 通过 `--reasoning-parser` 参数指定解析器名称. +该解析器会解析思考模型的输出, 提取 `reasoning_content` 字段. 
 ```bash
 python -m fastdeploy.entrypoints.openai.api_server \
@@ -43,15 +43,16 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
         {"type": "text", "text": "图中的文物属于哪个年代"}
     ]}
     ],
-    "metadata": {"enable_thinking": true}
+    "enable_thinking": true,
+    "reasoning_max_tokens": 1024
 }'
 ```

-字段`reasoning_content`包含得出最终结论的思考步骤,而`content`字段包含最终结论。
+字段 `reasoning_content` 包含得出最终结论的思考步骤,而 `content` 字段包含最终结论。

 ### 流式会话

-在流式会话中, `reasoning_content`字段会可以在`chat completion response chunks`中的 `delta` 中获取
+在流式会话中, `reasoning_content` 字段可以在 `chat completion response chunks` 中的 `delta` 中获取

 ```python
 from openai import OpenAI
@@ -69,7 +70,10 @@ chat_response = client.chat.completions.create(
     ],
     model="vl",
     stream=True,
-    metadata={"enable_thinking": True}
+    extra_body={
+        "enable_thinking": True,
+        "reasoning_max_tokens": 1024
+    }
 )
 for chunk in chat_response:
     if chunk.choices[0].delta is not None:
diff --git a/docs/zh/get_started/ernie-4.5-vl.md b/docs/zh/get_started/ernie-4.5-vl.md
index a270b2e4a..5bec9ca20 100644
--- a/docs/zh/get_started/ernie-4.5-vl.md
+++ b/docs/zh/get_started/ernie-4.5-vl.md
@@ -110,7 +110,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
         {"type": "text", "text": "图中的文物属于哪个年代"}
     ]}
     ],
-    "metadata": {"enable_thinking": false}
+    "enable_thinking": false
 }'
 ```
diff --git a/docs/zh/get_started/quick_start_vl.md b/docs/zh/get_started/quick_start_vl.md
index deaf3e10d..c9fe26a51 100644
--- a/docs/zh/get_started/quick_start_vl.md
+++ b/docs/zh/get_started/quick_start_vl.md
@@ -73,7 +73,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
         {"type": "text", "text": "图中的文物属于哪个年代"}
     ]}
     ],
-    "metadata": {"enable_thinking": false}
+    "enable_thinking": false
 }'
 ```
@@ -93,7 +93,7 @@ response = client.chat.completions.create(
         {"type": "text", "text": "图中的文物属于哪个年代?"},
     ]},
     ],
-    metadata={"enable_thinking": false},
+    extra_body={"enable_thinking": False},
     stream=True,
 )
 for chunk in response:
diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md
index d2c001037..a2d4f98d2 100644
--- a/docs/zh/online_serving/README.md
+++ b/docs/zh/online_serving/README.md
@@ -87,11 +87,12 @@ FastDeploy 与 OpenAI 协议的请求参数差异如下,其余请求参数会
 - `stream_options`: Optional[StreamOptions] = None
 - `temperature`: Optional[float] = None
 - `top_p`: Optional[float] = None
-- `metadata`: Optional[dict] = None (仅在v1/chat/compeltions中支持,用于配置额外参数, 如metadata={"enable_thinking": True})
+- `extra_body`: Optional[dict] = None (仅在 v1/chat/completions 中支持,用于配置额外参数, 如 `extra_body={"enable_thinking": True}`)
 - `min_tokens`: Optional[int] = 1 最小生成的Token个数
 - `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数,默认与max_tokens一致
 - `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考
 - `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数(>1时惩罚重复,<1时鼓励重复)
+- `return_token_ids`: Optional[bool] = False: 是否返回 token id 列表

 > 注: 若为多模态模型 由于思考链默认打开导致输出过长,max tokens 可以设置为模型最长输出,或使用默认值。

@@ -101,6 +102,8 @@ FastDeploy 增加的返回字段如下:

 - `arrival_time`:返回所有 token 的累计耗时
 - `reasoning_content`: 思考链的返回结果
+- `prompt_token_ids`: 输入序列的 token id 列表
+- `completion_token_ids`: 输出序列的 token id 列表

 返回参数总览:

@@ -111,7 +114,7 @@ ChatCompletionStreamResponse:
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionResponseStreamChoice]
-    ChatCompletionResponseStreamChoice:
+ChatCompletionResponseStreamChoice:
     index: int
     delta: DeltaMessage
     finish_reason: Optional[Literal["stop", "length"]] = None
@@ -119,6 +122,7 @@ ChatCompletionStreamResponse:
 DeltaMessage:
     role:
Optional[str] = None content: Optional[str] = None - token_ids: Optional[List[int]] = None + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None reasoning_content: Optional[str] = None ``` diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index ca6232dfb..8d4c2c545 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -346,8 +346,10 @@ class CompletionRequest(BaseModel): top_k: Optional[int] = None min_p: Optional[float] = None user: Optional[str] = None - extra_body: Optional[dict] = None - return_token_ids: Optional[bool] = False + + min_tokens: Optional[int] = None + return_token_ids: Optional[bool] = None + max_streaming_response_tokens: Optional[int] = None prompt_token_ids: Optional[List[int]] = None response_format: Optional[AnyResponseFormat] = None @@ -373,16 +375,13 @@ class CompletionRequest(BaseModel): if request_id is not None: req_dict["request_id"] = request_id - # parse request model into dict, priority: request > extra_body > suffix + # parse request model into dict + if self.suffix is not None: + for key, value in self.suffix.items(): + req_dict[key] = value for key, value in self.dict().items(): if value is not None: req_dict[key] = value - if self.extra_body is not None: - for key, value in self.extra_body.items(): - req_dict.setdefault(key, value) - if self.suffix is not None: - for key, value in self.suffix.items(): - req_dict.setdefault(key, value) if prompt is not None: req_dict["prompt"] = prompt @@ -480,10 +479,15 @@ class ChatCompletionRequest(BaseModel): min_p: Optional[float] = None user: Optional[str] = None metadata: Optional[dict] = None - extra_body: Optional[dict] = None - return_token_ids: Optional[bool] = False + + return_token_ids: Optional[bool] = None prompt_token_ids: Optional[List[int]] = None disable_chat_template: Optional[bool] = False + min_tokens: Optional[int] = None + enable_thinking: Optional[bool] = None + reasoning_max_tokens: Optional[int] = None + max_streaming_response_tokens: Optional[int] = None + include_stop_str_in_output: Optional[bool] = None response_format: Optional[AnyResponseFormat] = None guided_json: Optional[Union[str, dict, BaseModel]] = None @@ -512,19 +516,16 @@ class ChatCompletionRequest(BaseModel): req_dict["max_tokens"] = self.max_completion_tokens or self.max_tokens req_dict["logprobs"] = self.top_logprobs if self.logprobs else None - # parse request model into dict, priority: request > extra_body > metadata - for key, value in self.dict().items(): - if value is not None: - req_dict[key] = value - if self.extra_body is not None: - for key, value in self.extra_body.items(): - req_dict.setdefault(key, value) + # parse request model into dict, priority: request params > metadata params if self.metadata is not None: assert ( "raw_request" not in self.metadata ), "The parameter `raw_request` is not supported now, please use completion api instead." 
for key, value in self.metadata.items(): - req_dict.setdefault(key, value) + req_dict[key] = value + for key, value in self.dict().items(): + if value is not None: + req_dict[key] = value if "prompt_token_ids" in req_dict: if "messages" in req_dict: diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 86da7eaea..d28eb3c7f 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -124,11 +124,21 @@ class OpenAIServingChat: previous_num_tokens = 0 num_prompt_tokens = 0 num_choices = 1 - max_streaming_response_tokens = 1 - enable_thinking = None - include_stop_str_in_output = False - if request.metadata is not None and request.metadata.get("max_streaming_response_tokens", 1) > 1: - max_streaming_response_tokens = request.metadata["max_streaming_response_tokens"] + max_streaming_response_tokens = ( + request.max_streaming_response_tokens + if request.max_streaming_response_tokens is not None + else (request.metadata or {}).get("max_streaming_response_tokens", 1) + ) # dierctly passed & passed in metadata + enable_thinking = ( + request.enable_thinking + if request.enable_thinking is not None + else (request.metadata or {}).get("enable_thinking") + ) + include_stop_str_in_output = ( + request.include_stop_str_in_output + if request.include_stop_str_in_output is not None + else (request.metadata or {}).get("include_stop_str_in_output", False) + ) stream_options = request.stream_options if stream_options is None: @@ -149,12 +159,6 @@ class OpenAIServingChat: dealer.write([b"", request_id.encode("utf-8")]) choices = [] current_waiting_time = 0 - if request.metadata is not None: - enable_thinking = request.metadata.get("enable_thinking") - include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False) - enable_return_token_ids = request.return_token_ids or ( - request.extra_body is not None and request.extra_body.get("return_token_ids", False) - ) while num_choices > 0: try: raw_data = await asyncio.wait_for(dealer.read(), timeout=10) @@ -204,7 +208,7 @@ class OpenAIServingChat: completion_token_ids=None, ), ) - if enable_return_token_ids: + if request.return_token_ids: choice.delta.prompt_token_ids = list(prompt_token_ids) chunk = ChatCompletionStreamResponse( id=request_id, @@ -274,7 +278,7 @@ class OpenAIServingChat: if res.get("error_msg") is not None and "Recover" in res["error_msg"]: choice.finish_reason = "recover_stop" - if enable_return_token_ids: + if request.return_token_ids: choice.delta.completion_token_ids = list(output["token_ids"]) if include_continuous_usage: chunk.usage = UsageInfo( @@ -330,11 +334,17 @@ class OpenAIServingChat: """ created_time = int(time.time()) final_res = None - enable_thinking = None - include_stop_str_in_output = False - enable_return_token_ids = request.return_token_ids or ( - request.extra_body is not None and request.extra_body.get("return_token_ids", False) + enable_thinking = ( + request.enable_thinking + if request.enable_thinking is not None + else (request.metadata or {}).get("enable_thinking") ) + include_stop_str_in_output = ( + request.include_stop_str_in_output + if request.include_stop_str_in_output is not None + else (request.metadata or {}).get("include_stop_str_in_output", False) + ) + try: dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc") dealer.write([b"", request_id.encode("utf-8")]) @@ -363,9 +373,6 @@ class OpenAIServingChat: for data in response: if 
data.get("error_code", 200) != 200: raise ValueError("{}".format(data["error_msg"])) - if request.metadata is not None: - enable_thinking = request.metadata.get("enable_thinking") - include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False) data = self.engine_client.data_processor.process_response_dict( data, stream=False, @@ -407,8 +414,8 @@ class OpenAIServingChat: content=output["text"], reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call_content"), - prompt_token_ids=prompt_token_ids if enable_return_token_ids else None, - completion_token_ids=(completion_token_ids if enable_return_token_ids else None), + prompt_token_ids=prompt_token_ids if request.return_token_ids else None, + completion_token_ids=completion_token_ids if request.return_token_ids else None, ) logprobs_full_res = None if logprob_contents: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index a7a058858..871604799 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -228,9 +228,11 @@ class OpenAIServingCompletion: output_tokens = [0] * num_choices inference_start_time = [0] * num_choices first_iteration = [True] * num_choices - max_streaming_response_tokens = 1 - if request.suffix is not None and request.suffix.get("max_streaming_response_tokens", 1) > 1: - max_streaming_response_tokens = request.suffix["max_streaming_response_tokens"] + max_streaming_response_tokens = ( + request.max_streaming_response_tokens + if request.max_streaming_response_tokens is not None + else (request.suffix or {}).get("max_streaming_response_tokens", 1) + ) # dierctly passed & passed in suffix choices = [] chunk = CompletionStreamResponse( id=request_id, @@ -238,9 +240,6 @@ class OpenAIServingCompletion: model=model_name, choices=choices, ) - enable_return_token_ids = request.return_token_ids or ( - request.extra_body is not None and request.extra_body.get("return_token_ids", False) - ) current_waiting_time = 0 while num_choices > 0: try: @@ -264,7 +263,7 @@ class OpenAIServingCompletion: raise ValueError("{}".format(res["error_msg"])) if first_iteration[idx]: - if enable_return_token_ids: + if request.return_token_ids: chunk = CompletionStreamResponse( id=request_id, created=created_time, @@ -273,9 +272,7 @@ class OpenAIServingCompletion: CompletionResponseStreamChoice( index=idx, text="", - prompt_token_ids=( - list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None - ), + prompt_token_ids=list(prompt_batched_token_ids[idx]), completion_token_ids=None, ) ], @@ -297,7 +294,7 @@ class OpenAIServingCompletion: index=idx, text=output["text"], prompt_token_ids=None, - completion_token_ids=(output.get("token_ids") if enable_return_token_ids else None), + completion_token_ids=output.get("token_ids") if request.return_token_ids else None, tool_calls=output.get("tool_call_content"), reasoning_content=output.get("reasoning_content"), arrival_time=arrival_time, @@ -366,9 +363,6 @@ class OpenAIServingCompletion: choices: List[CompletionResponseChoice] = [] num_prompt_tokens = 0 num_generated_tokens = 0 - enable_return_token_ids = request.return_token_ids or ( - request.extra_body is not None and request.extra_body.get("return_token_ids", False) - ) for idx in range(len(final_res_batch)): final_res = final_res_batch[idx] @@ -394,8 +388,8 @@ class OpenAIServingCompletion: token_ids=token_ids, index=len(choices), text=output_text, - 
prompt_token_ids=prompt_token_ids if enable_return_token_ids else None, - completion_token_ids=(completion_token_ids if enable_return_token_ids else None), + prompt_token_ids=prompt_token_ids if request.return_token_ids else None, + completion_token_ids=completion_token_ids if request.return_token_ids else None, reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call_content"), logprobs=None, diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 9627ea773..56f00f6e8 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -696,3 +696,20 @@ def test_non_streaming_chat_completion_disable_chat_template(openai_client, caps assert hasattr(disabled_response, "choices") assert len(disabled_response.choices) > 0 assert enabled_response.choices[0].message.content == disabled_response.choices[0].message.content + + +def test_non_streaming_chat_with_min_tokens(openai_client, capsys): + """ + Test min_tokens option in non-streaming chat functionality with the local service + """ + min_tokens = 1000 + response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": "Hello, how are you?"}], + temperature=1, + extra_body={"min_tokens": min_tokens}, + stream=False, + ) + assert hasattr(response, "usage") + assert hasattr(response.usage, "completion_tokens") + assert response.usage.completion_tokens >= min_tokens diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index dccc1f55a..f6aa2b424 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -116,6 +116,8 @@ def setup_and_run_server(): "0.71", "--quantization", "wint4", + "--reasoning-parser", + "ernie-45-vl", ] # Start subprocess in new process group @@ -214,7 +216,11 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): resp1 = requests.post(api_url, headers=headers, json=consistent_payload) assert resp1.status_code == 200 result1 = resp1.json() - content1 = result1["choices"][0]["message"]["content"] + content1 = ( + result1["choices"][0]["message"]["reasoning_content"] + + "" + + result1["choices"][0]["message"]["content"] + ) file_res_temp = "ernie-4_5-vl" f_o = open(file_res_temp, "a") f_o.writelines(content1) @@ -338,10 +344,7 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): response = openai_client.chat.completions.create( model="default", messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 + {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 { "role": "user", "content": [ @@ -373,10 +376,7 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): response = openai_client.chat.completions.create( model="default", messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 + {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 { "role": "user", "content": [ @@ -413,10 +413,7 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): response = openai_client.chat.completions.create( model="default", messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 + {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 { "role": "user", 
"content": [ @@ -455,10 +452,7 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): response = openai_client.chat.completions.create( model="default", messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 + {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 { "role": "user", "content": [ @@ -486,3 +480,54 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): assert chunk.choices[0].delta.prompt_token_ids is None assert hasattr(chunk.choices[0].delta, "completion_token_ids") assert chunk.choices[0].delta.completion_token_ids is None + + +def test_chat_with_thinking(openai_client, capsys): + """ + Test enable_thinking & reasoning_max_tokens option in non-streaming chat functionality with the local service + """ + # enable thinking, non-streaming + response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], + temperature=1, + stream=False, + max_tokens=10, + extra_body={"enable_thinking": True}, + ) + assert response.choices[0].message.reasoning_content is not None + + # disable thinking, non-streaming + response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], + temperature=1, + stream=False, + max_tokens=10, + extra_body={"enable_thinking": False}, + ) + assert response.choices[0].message.reasoning_content is None + + # enable thinking, streaming + reasoning_max_tokens = 3 + response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], + temperature=1, + extra_body={"enable_thinking": True, "reasoning_max_tokens": reasoning_max_tokens, "return_token_ids": True}, + stream=True, + max_tokens=10, + ) + completion_tokens = reasoning_tokens = 1 + total_tokens = 0 + for chunk_id, chunk in enumerate(response): + if chunk_id == 0: # the first chunk is an extra chunk + continue + delta_message = chunk.choices[0].delta + if delta_message.content != "" and delta_message.reasoning_content == "": + completion_tokens += len(delta_message.completion_token_ids) + elif delta_message.reasoning_content != "" and delta_message.content == "": + reasoning_tokens += len(delta_message.completion_token_ids) + total_tokens += len(delta_message.completion_token_ids) + assert completion_tokens + reasoning_tokens == total_tokens + assert reasoning_tokens <= reasoning_max_tokens