[feat] extra parameters are now all passed directly in the HTTP payload, or via extra_body when using the OpenAI client (#3077)

* [feat] extra parameters are now all passed directly in the HTTP payload, or via extra_body when using the OpenAI client (see the client-side sketch after this list)

* [fix] remove the CI test case for enable_thinking

* [fix] add reasoning_parser when the server starts

* [doc] update docs related to metadata

* [fix] fix the CI consistency test error with the reasoning parser

* [fix] remove the enable_thinking default value
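
For reference, a minimal client-side sketch of the new usage (the base URL, API key, model name, and prompt below are placeholders, not part of this PR). The official OpenAI Python client merges anything placed in extra_body into the top-level JSON payload, which is how the custom fields reach the server after this change:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")  # placeholder endpoint

response = client.completions.create(
    model="default",  # placeholder model name
    prompt="Hello, ",
    extra_body={
        "return_token_ids": True,            # ask the server to echo token ids
        "max_streaming_response_tokens": 8,  # tokens batched per streamed chunk
    },
)
print(response.choices[0].text)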
Author: 李泳桦
Date: 2025-07-30 19:25:39 +08:00
Committed by: GitHub
Parent: 0463797fc2
Commit: eca8fc7ca6
13 changed files with 178 additions and 98 deletions


@@ -228,9 +228,11 @@ class OpenAIServingCompletion:
         output_tokens = [0] * num_choices
         inference_start_time = [0] * num_choices
         first_iteration = [True] * num_choices
-        max_streaming_response_tokens = 1
-        if request.suffix is not None and request.suffix.get("max_streaming_response_tokens", 1) > 1:
-            max_streaming_response_tokens = request.suffix["max_streaming_response_tokens"]
+        max_streaming_response_tokens = (
+            request.max_streaming_response_tokens
+            if request.max_streaming_response_tokens is not None
+            else (request.suffix or {}).get("max_streaming_response_tokens", 1)
+        )  # directly passed & passed in suffix
         choices = []
         chunk = CompletionStreamResponse(
             id=request_id,
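
The hunk above establishes a precedence order: a directly passed max_streaming_response_tokens wins over one tucked into suffix, and the default is 1. A self-contained sketch of that resolution (the request objects here are stand-ins, not the project's real request model):

from types import SimpleNamespace

def resolve_max_streaming_response_tokens(request) -> int:
    # Mirrors the diff: direct field first, then the suffix dict, then 1.
    if request.max_streaming_response_tokens is not None:
        return request.max_streaming_response_tokens
    return (request.suffix or {}).get("max_streaming_response_tokens", 1)

assert resolve_max_streaming_response_tokens(
    SimpleNamespace(max_streaming_response_tokens=4, suffix=None)) == 4
assert resolve_max_streaming_response_tokens(
    SimpleNamespace(max_streaming_response_tokens=None,
                    suffix={"max_streaming_response_tokens": 2})) == 2
assert resolve_max_streaming_response_tokens(
    SimpleNamespace(max_streaming_response_tokens=None, suffix=None)) == 1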
@@ -238,9 +240,6 @@ class OpenAIServingCompletion:
             model=model_name,
             choices=choices,
         )
-        enable_return_token_ids = request.return_token_ids or (
-            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
-        )
         current_waiting_time = 0
         while num_choices > 0:
             try:
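
The enable_return_token_ids helper disappears because the server no longer needs to probe an extra_body field: the OpenAI client flattens extra_body into the request JSON, so a field declared on the request model parses directly. A hedged sketch with pydantic (CompletionRequestSketch is illustrative, not the project's actual model):

from typing import Optional
from pydantic import BaseModel

class CompletionRequestSketch(BaseModel):
    prompt: str
    return_token_ids: Optional[bool] = None
    max_streaming_response_tokens: Optional[int] = None

# extra_body={"return_token_ids": True} arrives as a top-level payload key
req = CompletionRequestSketch.model_validate({"prompt": "hi", "return_token_ids": True})
assert req.return_token_ids is True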
@@ -264,7 +263,7 @@ class OpenAIServingCompletion:
                         raise ValueError("{}".format(res["error_msg"]))
                     if first_iteration[idx]:
-                        if enable_return_token_ids:
+                        if request.return_token_ids:
                             chunk = CompletionStreamResponse(
                                 id=request_id,
                                 created=created_time,
@@ -273,9 +272,7 @@ class OpenAIServingCompletion:
                                     CompletionResponseStreamChoice(
                                         index=idx,
                                         text="",
-                                        prompt_token_ids=(
-                                            list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None
-                                        ),
+                                        prompt_token_ids=list(prompt_batched_token_ids[idx]),
                                         completion_token_ids=None,
                                     )
                                 ],
@@ -297,7 +294,7 @@ class OpenAIServingCompletion:
                             index=idx,
                             text=output["text"],
                             prompt_token_ids=None,
-                            completion_token_ids=(output.get("token_ids") if enable_return_token_ids else None),
+                            completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
                             tool_calls=output.get("tool_call_content"),
                             reasoning_content=output.get("reasoning_content"),
                             arrival_time=arrival_time,
@@ -366,9 +363,6 @@ class OpenAIServingCompletion:
         choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
         num_generated_tokens = 0
-        enable_return_token_ids = request.return_token_ids or (
-            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
-        )
 
         for idx in range(len(final_res_batch)):
             final_res = final_res_batch[idx]
@@ -394,8 +388,8 @@ class OpenAIServingCompletion:
                 token_ids=token_ids,
                 index=len(choices),
                 text=output_text,
-                prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
-                completion_token_ids=(completion_token_ids if enable_return_token_ids else None),
+                prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
+                completion_token_ids=completion_token_ids if request.return_token_ids else None,
                 reasoning_content=output.get("reasoning_content"),
                 tool_calls=output.get("tool_call_content"),
                 logprobs=None,
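
Putting it together over plain HTTP (the endpoint and model are placeholders): the extra parameter rides in the payload itself, and the prompt_token_ids / completion_token_ids fields from the diff come back on each choice when return_token_ids is set:

import requests

payload = {
    "model": "default",        # placeholder
    "prompt": "Hello, ",
    "return_token_ids": True,  # extra parameter, passed directly in the payload
}
resp = requests.post("http://localhost:8188/v1/completions", json=payload, timeout=60).json()
choice = resp["choices"][0]
print(choice["text"])
print(choice.get("prompt_token_ids"), choice.get("completion_token_ids"))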