Add support for QWQ enable_thinking (#2706)

* Add support for QWQ enable_thinking

* add stream=True

* fix stream=true

* fix qwen

---------

Co-authored-by: lizexu <lizexu@baidu.com>
This commit is contained in:
lizexu123
2025-07-04 20:55:23 +08:00
committed by GitHub
parent dacc46f04c
commit 9cb08e71e8

View File

@@ -17,7 +17,7 @@ from collections.abc import Sequence
from typing import Optional, Union from typing import Optional, Union
from fastdeploy.entrypoints.openai.protocol import (ChatCompletionRequest, from fastdeploy.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage) DeltaMessage)
from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager
@@ -67,47 +67,47 @@ class Qwen3ReasoningParser(ReasoningParser):
- 'abc' goes to reasoning_content - 'abc' goes to reasoning_content
- 'xyz' goes to content - 'xyz' goes to content
""" """
# Skip single special tokens
if len(delta_token_ids) == 1 and (delta_token_ids[0] in [ if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
self.think_start_token_id, self.think_end_token_id self.think_start_token_id, self.think_end_token_id
]): ]):
return "", "" return "", ""
if self.think_start_token_id in previous_token_ids: # </think> in delta
if self.think_end_token_id in delta_token_ids: if self.think_end_token_id in delta_token_ids:
# <think> in previous, </think> in delta, #<think> in delta, </think> in delta, extract reasoning content
# extract reasoning content if self.think_start_token_id in delta_token_ids:
start_index = delta_text.find(self.think_start_token)
end_index = delta_token_ids.find(self.think_end_token)
reasoning_content = delta_text[start_index +
len(self.think_start_token
):end_index]
content = delta_text[end_index+len(self.think_end_token):]
return reasoning_content, content
# <think> in previous, </think> in delta,
else:
end_index = delta_text.find(self.think_end_token) end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[:end_index] reasoning_content = delta_text[:end_index]
content = delta_text[end_index + len(self.think_end_token):] content = delta_text[end_index + len(self.think_end_token):]
content = content if content else None content = content if content else None
return reasoning_content, content return reasoning_content, content
elif self.think_end_token_id in previous_token_ids: # </think> in previous reasoning content continues
# <think> in previous, </think> in previous, elif self.think_end_token_id in previous_token_ids:
# reasoning content continues
return "", delta_text
else:
# <think> in previous, no </think> in previous or delta,
# reasoning content continues
return delta_text, ""
elif self.think_start_token_id in delta_token_ids:
if self.think_end_token_id in delta_token_ids:
# <think> in delta, </think> in delta, extract reasoning content
start_index = delta_text.find(self.think_start_token)
end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[start_index +
len(self.think_start_token
):end_index]
content = delta_text[end_index + len(self.think_end_token):]
content = content if content else None
return reasoning_content, content
else:
# <think> in delta, no </think> in delta,
# reasoning content continues
return delta_text, ""
else:
# thinking is disabled, just content
return "", delta_text return "", delta_text
# <think> in previous
elif self.think_start_token_id in previous_token_ids:
return delta_text,""
# <think> in delta
elif self.think_start_token_id in delta_token_ids:
start_index=delta_text.find(self.think_start_token)
reasoning_content=delta_text[start_index + len(self.think_start_token):]
content = ""
return reasoning_content, content
else:
return delta_text, ""
def extract_reasoning_content( def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
@@ -115,31 +115,47 @@ class Qwen3ReasoningParser(ReasoningParser):
""" """
Extract reasoning content from the model output. Extract reasoning content from the model output.
For text abc</think>xyz: 支持两种格式:
- 'abc' goes to reasoning_content 1. <think>abc</think>xyz - 标准格式
- 'xyz' goes to content 2. abc</think>xyz - 缺少起始标签的格式
Returns: Returns:
tuple[Optional[str], Optional[str]]: reasoning content and content tuple[Optional[str], Optional[str]]: reasoning content and content
""" """
# Check if the model output contains the <think> and </think> tokens. # 检查是否包含结束标签
if (self.think_start_token not in model_output
or self.think_end_token not in model_output):
return None, model_output
# Check if the <think> is present in the model output, remove it
# if it is present.
model_output_parts = model_output.partition(self.think_start_token)
model_output = model_output_parts[2] if model_output_parts[
1] else model_output_parts[0]
# Check if the model output contains the </think> tokens.
# If the end token is not found, return the model output as is.
if self.think_end_token not in model_output: if self.think_end_token not in model_output:
return None, model_output return None, model_output
# Extract reasoning content from the model output. # 检查是否有起始标签
reasoning_content, _, content = model_output.partition( if self.think_start_token in model_output:
self.think_end_token) # 标准格式:<think>content</think>answer
if (self.think_start_token not in model_output
or self.think_end_token not in model_output):
return None, model_output
# Check if the <think> is present in the model output, remove it
# if it is present.
model_output_parts = model_output.partition(self.think_start_token)
model_output = model_output_parts[2] if model_output_parts[
1] else model_output_parts[0]
# Check if the model output contains the </think> tokens.
# If the end token is not found, return the model output as is.
if self.think_end_token not in model_output:
return None, model_output
final_content = content or None # Extract reasoning content from the model output.
return reasoning_content, final_content reasoning_content, _, content = model_output.partition(
self.think_end_token)
final_content = content or None
return reasoning_content, final_content
else:
# Format missing the start tag: content</think>answer
parts = model_output.split(self.think_end_token, 1)
if len(parts) == 2:
reasoning_content = parts[0].strip()
final_content = parts[1].strip() if parts[1].strip() else None
return reasoning_content, final_content
return None, model_output