diff --git a/docs/features/reasoning_output.md b/docs/features/reasoning_output.md
index 5f23e65d5..6ea2e928e 100644
--- a/docs/features/reasoning_output.md
+++ b/docs/features/reasoning_output.md
@@ -8,14 +8,14 @@ Reasoning models return an additional `reasoning_content` field in their output,
 | baidu/ERNIE-4.5-VL-424B-A47B-Paddle | ernie-45-vl | ✓ |
 | baidu/ERNIE-4.5-VL-28B-A3B-Paddle | ernie-45-vl | ✓ |

-The reasoning model requires a specified parser to extract reasoning content. The reasoning mode can be disabled by setting the `enable_thinking=False` parameter.
+The reasoning model requires a specified parser to extract reasoning content. The reasoning mode can be disabled by setting the `"enable_thinking": false` parameter.

 Interfaces that support toggling the reasoning mode:
 1. `/v1/chat/completions` requests in OpenAI services.
 2. `/v1/chat/completions` requests in the OpenAI Python client.
 3. `llm.chat` requests in Offline interfaces.

-For reasoning models, the length of the reasoning content can be controlled via `reasoning_max_tokens`. Add `metadata={"reasoning_max_tokens": 1024}` to the request.
+For reasoning models, the length of the reasoning content can be controlled via `reasoning_max_tokens`. Add `"reasoning_max_tokens": 1024` to the request.

 ### Quick Start
 When launching the model service, specify the parser name using the `--reasoning-parser` argument.
@@ -43,7 +43,8 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
         {"type": "text", "text": "Which era does the cultural relic in the picture belong to"}
     ]}
     ],
-    "metadata": {"enable_thinking": true}
+    "enable_thinking": true,
+    "reasoning_max_tokens": 1024
 }'
 ```
@@ -68,7 +69,10 @@ chat_response = client.chat.completions.create(
     ],
     model="vl",
     stream=True,
-    metadata={"enable_thinking": True}
+    extra_body={
+        "enable_thinking": True,
+        "reasoning_max_tokens": 1024
+    }
 )
 for chunk in chat_response:
     if chunk.choices[0].delta is not None:
diff --git a/docs/get_started/ernie-4.5-vl.md b/docs/get_started/ernie-4.5-vl.md
index f3b0b38d7..14719daca 100644
--- a/docs/get_started/ernie-4.5-vl.md
+++ b/docs/get_started/ernie-4.5-vl.md
@@ -113,7 +113,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
         {"type": "text", "text": "From which era does the artifact in the image originate?"}
     ]}
     ],
-    "metadata": {"enable_thinking": false}
+    "enable_thinking": false
 }'
 ```
diff --git a/docs/get_started/quick_start_vl.md b/docs/get_started/quick_start_vl.md
index 82bc609b1..1e1a37425 100644
--- a/docs/get_started/quick_start_vl.md
+++ b/docs/get_started/quick_start_vl.md
@@ -74,7 +74,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
         {"type": "text", "text": "What era does this artifact belong to?"}
     ]}
     ],
-    "metadata": {"enable_thinking": false}
+    "enable_thinking": false
 }'
 ```
@@ -96,7 +96,7 @@ response = client.chat.completions.create(
         {"type": "text", "text": "What era does this artifact belong to?"},
     ]},
     ],
-    metadata={"enable_thinking": false},
+    extra_body={"enable_thinking": False},
     stream=True,
 )
 for chunk in response:
diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md
index 8062fe76a..691434eed 100644
--- a/docs/online_serving/README.md
+++ b/docs/online_serving/README.md
@@ -88,11 +88,12 @@ The differences in request parameters between FastDeploy and the OpenAI protocol
 - `stream_options`: Optional[StreamOptions] = None
 - `temperature`: Optional[float] = None
 - `top_p`: Optional[float] = None
-- `metadata`: Optional[dict] = None (supported only in `v1/chat/completions` for
configuring additional parameters, e.g., `metadata={"enable_thinking": True}`) +- `extra_body`: Optional[dict] = None (supported only in `v1/chat/completions` for configuring additional parameters, e.g., `extra_body={"enable_thinking": True}`) - `min_tokens`: Optional[int] = 1 (minimum number of tokens generated) - `reasoning_max_tokens`: Optional[int] = None (maximum number of tokens for reasoning content, defaults to the same as `max_tokens`) - `enable_thinking`: Optional[bool] = True (whether to enable reasoning for models that support deep thinking) - `repetition_penalty`: Optional[float] = None (coefficient for directly penalizing repeated token generation (>1 penalizes repetition, <1 encourages repetition)) + - `return_token_ids`: Optional[bool] = False: (whether to return token ids as a list) > Note: For multimodal models, since the reasoning chain is enabled by default, resulting in overly long outputs, `max_tokens` can be set to the model's maximum output length or the default value can be used. @@ -102,6 +103,8 @@ The additional return fields added by FastDeploy are as follows: - `arrival_time`: Returns the cumulative time taken for all tokens - `reasoning_content`: The returned result of the reasoning chain +- `prompt_token_ids`: The token id list of the prompt +- `completion_token_ids`: The token id list of the completion Overview of return parameters: @@ -112,7 +115,7 @@ ChatCompletionStreamResponse: created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[ChatCompletionResponseStreamChoice] - ChatCompletionResponseStreamChoice: +ChatCompletionResponseStreamChoice: index: int delta: DeltaMessage finish_reason: Optional[Literal["stop", "length"]] = None @@ -120,6 +123,7 @@ ChatCompletionStreamResponse: DeltaMessage: role: Optional[str] = None content: Optional[str] = None - token_ids: Optional[List[int]] = None + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None reasoning_content: Optional[str] = None ``` diff --git a/docs/zh/features/reasoning_output.md b/docs/zh/features/reasoning_output.md index 5417f66d7..f41ba77dd 100644 --- a/docs/zh/features/reasoning_output.md +++ b/docs/zh/features/reasoning_output.md @@ -8,18 +8,18 @@ | baidu/ERNIE-4.5-VL-424B-A47B-Paddle | ernie-45-vl | ✓ | | baidu/ERNIE-4.5-VL-28B-A3B-Paddle | ernie-45-vl | ✓ | -思考模型需要指定解析器,以便于对思考内容进行解析. 通过`enable_thinking=False` 参数可以关闭模型思考模式. +思考模型需要指定解析器,以便于对思考内容进行解析. 通过 `"enable_thinking": false` 参数可以关闭模型思考模式. 可以支持思考模式开关的接口: 1. OpenAI 服务中 `/v1/chat/completions` 请求. 2. OpenAI Python客户端中 `/v1/chat/completions` 请求. 3. Offline 接口中 `llm.chat`请求. -同时在思考模型中,支持通过```reasoning_max_tokens```控制思考内容的长度,在请求中添加```metadata={"reasoning_max_tokens": 1024}```即可。 +同时在思考模型中,支持通过 `reasoning_max_tokens` 控制思考内容的长度,在请求中添加 `"reasoning_max_tokens": 1024` 即可。 ## 快速使用 -在启动模型服务时, 通过`--reasoning-parser`参数指定解析器名称. -该解析器会解析思考模型的输出, 提取`reasoning_content`字段. +在启动模型服务时, 通过 `--reasoning-parser` 参数指定解析器名称. +该解析器会解析思考模型的输出, 提取 `reasoning_content` 字段. 
 ```bash
 python -m fastdeploy.entrypoints.openai.api_server \
@@ -43,15 +43,16 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
         {"type": "text", "text": "图中的文物属于哪个年代"}
     ]}
     ],
-    "metadata": {"enable_thinking": true}
+    "enable_thinking": true,
+    "reasoning_max_tokens": 1024
 }'
 ```

-字段`reasoning_content`包含得出最终结论的思考步骤,而`content`字段包含最终结论。
+字段 `reasoning_content` 包含得出最终结论的思考步骤,而 `content` 字段包含最终结论。

 ### 流式会话

-在流式会话中, `reasoning_content`字段会可以在`chat completion response chunks`中的 `delta` 中获取
+在流式会话中, `reasoning_content` 字段可以在 `chat completion response chunks` 中的 `delta` 中获取

 ```python
 from openai import OpenAI
@@ -69,7 +70,10 @@ chat_response = client.chat.completions.create(
     ],
     model="vl",
     stream=True,
-    metadata={"enable_thinking": True}
+    extra_body={
+        "enable_thinking": True,
+        "reasoning_max_tokens": 1024
+    }
 )
 for chunk in chat_response:
     if chunk.choices[0].delta is not None:
diff --git a/docs/zh/get_started/ernie-4.5-vl.md b/docs/zh/get_started/ernie-4.5-vl.md
index a270b2e4a..5bec9ca20 100644
--- a/docs/zh/get_started/ernie-4.5-vl.md
+++ b/docs/zh/get_started/ernie-4.5-vl.md
@@ -110,7 +110,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
         {"type": "text", "text": "图中的文物属于哪个年代"}
     ]}
     ],
-    "metadata": {"enable_thinking": false}
+    "enable_thinking": false
 }'
 ```
diff --git a/docs/zh/get_started/quick_start_vl.md b/docs/zh/get_started/quick_start_vl.md
index deaf3e10d..c9fe26a51 100644
--- a/docs/zh/get_started/quick_start_vl.md
+++ b/docs/zh/get_started/quick_start_vl.md
@@ -73,7 +73,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
         {"type": "text", "text": "图中的文物属于哪个年代"}
     ]}
     ],
-    "metadata": {"enable_thinking": false}
+    "enable_thinking": false
 }'
 ```
@@ -93,7 +93,7 @@ response = client.chat.completions.create(
         {"type": "text", "text": "图中的文物属于哪个年代?"},
     ]},
     ],
-    metadata={"enable_thinking": false},
+    extra_body={"enable_thinking": False},
     stream=True,
 )
 for chunk in response:
diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md
index d2c001037..a2d4f98d2 100644
--- a/docs/zh/online_serving/README.md
+++ b/docs/zh/online_serving/README.md
@@ -87,11 +87,12 @@ FastDeploy 与 OpenAI 协议的请求参数差异如下,其余请求参数会
 - `stream_options`: Optional[StreamOptions] = None
 - `temperature`: Optional[float] = None
 - `top_p`: Optional[float] = None
-- `metadata`: Optional[dict] = None (仅在v1/chat/compeltions中支持,用于配置额外参数, 如metadata={"enable_thinking": True})
+- `extra_body`: Optional[dict] = None (仅在 v1/chat/completions 中支持,用于配置额外参数, 如 `extra_body={"enable_thinking": True}`)
 - `min_tokens`: Optional[int] = 1 最小生成的Token个数
 - `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数,默认与max_tokens一致
 - `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考
 - `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数(>1时惩罚重复,<1时鼓励重复)
+- `return_token_ids`: Optional[bool] = False: 是否返回 token id 列表

 > 注: 若为多模态模型 由于思考链默认打开导致输出过长,max tokens 可以设置为模型最长输出,或使用默认值。

@@ -101,6 +102,8 @@ FastDeploy 增加的返回字段如下:

 - `arrival_time`:返回所有 token 的累计耗时
 - `reasoning_content`: 思考链的返回结果
+- `prompt_token_ids`: 输入序列的 token id 列表
+- `completion_token_ids`: 输出序列的 token id 列表

 返回参数总览:

@@ -111,7 +114,7 @@ ChatCompletionStreamResponse:
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionResponseStreamChoice]
-    ChatCompletionResponseStreamChoice:
+ChatCompletionResponseStreamChoice:
     index: int
     delta: DeltaMessage
     finish_reason: Optional[Literal["stop", "length"]] = None
@@ -119,6 +122,7 @@ ChatCompletionStreamResponse:
 DeltaMessage:
     role:
Optional[str] = None content: Optional[str] = None - token_ids: Optional[List[int]] = None + prompt_token_ids: Optional[List[int]] = None + completion_token_ids: Optional[List[int]] = None reasoning_content: Optional[str] = None ``` diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index ca6232dfb..8d4c2c545 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -346,8 +346,10 @@ class CompletionRequest(BaseModel): top_k: Optional[int] = None min_p: Optional[float] = None user: Optional[str] = None - extra_body: Optional[dict] = None - return_token_ids: Optional[bool] = False + + min_tokens: Optional[int] = None + return_token_ids: Optional[bool] = None + max_streaming_response_tokens: Optional[int] = None prompt_token_ids: Optional[List[int]] = None response_format: Optional[AnyResponseFormat] = None @@ -373,16 +375,13 @@ class CompletionRequest(BaseModel): if request_id is not None: req_dict["request_id"] = request_id - # parse request model into dict, priority: request > extra_body > suffix + # parse request model into dict + if self.suffix is not None: + for key, value in self.suffix.items(): + req_dict[key] = value for key, value in self.dict().items(): if value is not None: req_dict[key] = value - if self.extra_body is not None: - for key, value in self.extra_body.items(): - req_dict.setdefault(key, value) - if self.suffix is not None: - for key, value in self.suffix.items(): - req_dict.setdefault(key, value) if prompt is not None: req_dict["prompt"] = prompt @@ -480,10 +479,15 @@ class ChatCompletionRequest(BaseModel): min_p: Optional[float] = None user: Optional[str] = None metadata: Optional[dict] = None - extra_body: Optional[dict] = None - return_token_ids: Optional[bool] = False + + return_token_ids: Optional[bool] = None prompt_token_ids: Optional[List[int]] = None disable_chat_template: Optional[bool] = False + min_tokens: Optional[int] = None + enable_thinking: Optional[bool] = None + reasoning_max_tokens: Optional[int] = None + max_streaming_response_tokens: Optional[int] = None + include_stop_str_in_output: Optional[bool] = None response_format: Optional[AnyResponseFormat] = None guided_json: Optional[Union[str, dict, BaseModel]] = None @@ -512,19 +516,16 @@ class ChatCompletionRequest(BaseModel): req_dict["max_tokens"] = self.max_completion_tokens or self.max_tokens req_dict["logprobs"] = self.top_logprobs if self.logprobs else None - # parse request model into dict, priority: request > extra_body > metadata - for key, value in self.dict().items(): - if value is not None: - req_dict[key] = value - if self.extra_body is not None: - for key, value in self.extra_body.items(): - req_dict.setdefault(key, value) + # parse request model into dict, priority: request params > metadata params if self.metadata is not None: assert ( "raw_request" not in self.metadata ), "The parameter `raw_request` is not supported now, please use completion api instead." 
for key, value in self.metadata.items(): - req_dict.setdefault(key, value) + req_dict[key] = value + for key, value in self.dict().items(): + if value is not None: + req_dict[key] = value if "prompt_token_ids" in req_dict: if "messages" in req_dict: diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 86da7eaea..d28eb3c7f 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -124,11 +124,21 @@ class OpenAIServingChat: previous_num_tokens = 0 num_prompt_tokens = 0 num_choices = 1 - max_streaming_response_tokens = 1 - enable_thinking = None - include_stop_str_in_output = False - if request.metadata is not None and request.metadata.get("max_streaming_response_tokens", 1) > 1: - max_streaming_response_tokens = request.metadata["max_streaming_response_tokens"] + max_streaming_response_tokens = ( + request.max_streaming_response_tokens + if request.max_streaming_response_tokens is not None + else (request.metadata or {}).get("max_streaming_response_tokens", 1) + ) # dierctly passed & passed in metadata + enable_thinking = ( + request.enable_thinking + if request.enable_thinking is not None + else (request.metadata or {}).get("enable_thinking") + ) + include_stop_str_in_output = ( + request.include_stop_str_in_output + if request.include_stop_str_in_output is not None + else (request.metadata or {}).get("include_stop_str_in_output", False) + ) stream_options = request.stream_options if stream_options is None: @@ -149,12 +159,6 @@ class OpenAIServingChat: dealer.write([b"", request_id.encode("utf-8")]) choices = [] current_waiting_time = 0 - if request.metadata is not None: - enable_thinking = request.metadata.get("enable_thinking") - include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False) - enable_return_token_ids = request.return_token_ids or ( - request.extra_body is not None and request.extra_body.get("return_token_ids", False) - ) while num_choices > 0: try: raw_data = await asyncio.wait_for(dealer.read(), timeout=10) @@ -204,7 +208,7 @@ class OpenAIServingChat: completion_token_ids=None, ), ) - if enable_return_token_ids: + if request.return_token_ids: choice.delta.prompt_token_ids = list(prompt_token_ids) chunk = ChatCompletionStreamResponse( id=request_id, @@ -274,7 +278,7 @@ class OpenAIServingChat: if res.get("error_msg") is not None and "Recover" in res["error_msg"]: choice.finish_reason = "recover_stop" - if enable_return_token_ids: + if request.return_token_ids: choice.delta.completion_token_ids = list(output["token_ids"]) if include_continuous_usage: chunk.usage = UsageInfo( @@ -330,11 +334,17 @@ class OpenAIServingChat: """ created_time = int(time.time()) final_res = None - enable_thinking = None - include_stop_str_in_output = False - enable_return_token_ids = request.return_token_ids or ( - request.extra_body is not None and request.extra_body.get("return_token_ids", False) + enable_thinking = ( + request.enable_thinking + if request.enable_thinking is not None + else (request.metadata or {}).get("enable_thinking") ) + include_stop_str_in_output = ( + request.include_stop_str_in_output + if request.include_stop_str_in_output is not None + else (request.metadata or {}).get("include_stop_str_in_output", False) + ) + try: dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc") dealer.write([b"", request_id.encode("utf-8")]) @@ -363,9 +373,6 @@ class OpenAIServingChat: for data in response: if 
data.get("error_code", 200) != 200: raise ValueError("{}".format(data["error_msg"])) - if request.metadata is not None: - enable_thinking = request.metadata.get("enable_thinking") - include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False) data = self.engine_client.data_processor.process_response_dict( data, stream=False, @@ -407,8 +414,8 @@ class OpenAIServingChat: content=output["text"], reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call_content"), - prompt_token_ids=prompt_token_ids if enable_return_token_ids else None, - completion_token_ids=(completion_token_ids if enable_return_token_ids else None), + prompt_token_ids=prompt_token_ids if request.return_token_ids else None, + completion_token_ids=completion_token_ids if request.return_token_ids else None, ) logprobs_full_res = None if logprob_contents: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index a7a058858..871604799 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -228,9 +228,11 @@ class OpenAIServingCompletion: output_tokens = [0] * num_choices inference_start_time = [0] * num_choices first_iteration = [True] * num_choices - max_streaming_response_tokens = 1 - if request.suffix is not None and request.suffix.get("max_streaming_response_tokens", 1) > 1: - max_streaming_response_tokens = request.suffix["max_streaming_response_tokens"] + max_streaming_response_tokens = ( + request.max_streaming_response_tokens + if request.max_streaming_response_tokens is not None + else (request.suffix or {}).get("max_streaming_response_tokens", 1) + ) # dierctly passed & passed in suffix choices = [] chunk = CompletionStreamResponse( id=request_id, @@ -238,9 +240,6 @@ class OpenAIServingCompletion: model=model_name, choices=choices, ) - enable_return_token_ids = request.return_token_ids or ( - request.extra_body is not None and request.extra_body.get("return_token_ids", False) - ) current_waiting_time = 0 while num_choices > 0: try: @@ -264,7 +263,7 @@ class OpenAIServingCompletion: raise ValueError("{}".format(res["error_msg"])) if first_iteration[idx]: - if enable_return_token_ids: + if request.return_token_ids: chunk = CompletionStreamResponse( id=request_id, created=created_time, @@ -273,9 +272,7 @@ class OpenAIServingCompletion: CompletionResponseStreamChoice( index=idx, text="", - prompt_token_ids=( - list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None - ), + prompt_token_ids=list(prompt_batched_token_ids[idx]), completion_token_ids=None, ) ], @@ -297,7 +294,7 @@ class OpenAIServingCompletion: index=idx, text=output["text"], prompt_token_ids=None, - completion_token_ids=(output.get("token_ids") if enable_return_token_ids else None), + completion_token_ids=output.get("token_ids") if request.return_token_ids else None, tool_calls=output.get("tool_call_content"), reasoning_content=output.get("reasoning_content"), arrival_time=arrival_time, @@ -366,9 +363,6 @@ class OpenAIServingCompletion: choices: List[CompletionResponseChoice] = [] num_prompt_tokens = 0 num_generated_tokens = 0 - enable_return_token_ids = request.return_token_ids or ( - request.extra_body is not None and request.extra_body.get("return_token_ids", False) - ) for idx in range(len(final_res_batch)): final_res = final_res_batch[idx] @@ -394,8 +388,8 @@ class OpenAIServingCompletion: token_ids=token_ids, index=len(choices), text=output_text, - 
prompt_token_ids=prompt_token_ids if enable_return_token_ids else None, - completion_token_ids=(completion_token_ids if enable_return_token_ids else None), + prompt_token_ids=prompt_token_ids if request.return_token_ids else None, + completion_token_ids=completion_token_ids if request.return_token_ids else None, reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call_content"), logprobs=None, diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 9627ea773..56f00f6e8 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -696,3 +696,20 @@ def test_non_streaming_chat_completion_disable_chat_template(openai_client, caps assert hasattr(disabled_response, "choices") assert len(disabled_response.choices) > 0 assert enabled_response.choices[0].message.content == disabled_response.choices[0].message.content + + +def test_non_streaming_chat_with_min_tokens(openai_client, capsys): + """ + Test min_tokens option in non-streaming chat functionality with the local service + """ + min_tokens = 1000 + response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": "Hello, how are you?"}], + temperature=1, + extra_body={"min_tokens": min_tokens}, + stream=False, + ) + assert hasattr(response, "usage") + assert hasattr(response.usage, "completion_tokens") + assert response.usage.completion_tokens >= min_tokens diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index dccc1f55a..f6aa2b424 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -116,6 +116,8 @@ def setup_and_run_server(): "0.71", "--quantization", "wint4", + "--reasoning-parser", + "ernie-45-vl", ] # Start subprocess in new process group @@ -214,7 +216,11 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): resp1 = requests.post(api_url, headers=headers, json=consistent_payload) assert resp1.status_code == 200 result1 = resp1.json() - content1 = result1["choices"][0]["message"]["content"] + content1 = ( + result1["choices"][0]["message"]["reasoning_content"] + + "" + + result1["choices"][0]["message"]["content"] + ) file_res_temp = "ernie-4_5-vl" f_o = open(file_res_temp, "a") f_o.writelines(content1) @@ -338,10 +344,7 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): response = openai_client.chat.completions.create( model="default", messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 + {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 { "role": "user", "content": [ @@ -373,10 +376,7 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): response = openai_client.chat.completions.create( model="default", messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 + {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 { "role": "user", "content": [ @@ -413,10 +413,7 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): response = openai_client.chat.completions.create( model="default", messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 + {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 { "role": "user", 
"content": [ @@ -455,10 +452,7 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): response = openai_client.chat.completions.create( model="default", messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 + {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 { "role": "user", "content": [ @@ -486,3 +480,54 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): assert chunk.choices[0].delta.prompt_token_ids is None assert hasattr(chunk.choices[0].delta, "completion_token_ids") assert chunk.choices[0].delta.completion_token_ids is None + + +def test_chat_with_thinking(openai_client, capsys): + """ + Test enable_thinking & reasoning_max_tokens option in non-streaming chat functionality with the local service + """ + # enable thinking, non-streaming + response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], + temperature=1, + stream=False, + max_tokens=10, + extra_body={"enable_thinking": True}, + ) + assert response.choices[0].message.reasoning_content is not None + + # disable thinking, non-streaming + response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], + temperature=1, + stream=False, + max_tokens=10, + extra_body={"enable_thinking": False}, + ) + assert response.choices[0].message.reasoning_content is None + + # enable thinking, streaming + reasoning_max_tokens = 3 + response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], + temperature=1, + extra_body={"enable_thinking": True, "reasoning_max_tokens": reasoning_max_tokens, "return_token_ids": True}, + stream=True, + max_tokens=10, + ) + completion_tokens = reasoning_tokens = 1 + total_tokens = 0 + for chunk_id, chunk in enumerate(response): + if chunk_id == 0: # the first chunk is an extra chunk + continue + delta_message = chunk.choices[0].delta + if delta_message.content != "" and delta_message.reasoning_content == "": + completion_tokens += len(delta_message.completion_token_ids) + elif delta_message.reasoning_content != "" and delta_message.content == "": + reasoning_tokens += len(delta_message.completion_token_ids) + total_tokens += len(delta_message.completion_token_ids) + assert completion_tokens + reasoning_tokens == total_tokens + assert reasoning_tokens <= reasoning_max_tokens