[Feature] Add return_token_ids and prompt_token_ids; remove training and raw_request from the request body (#2940)

* [feat] add return_token_ids and prompt_token_ids; remove raw_request from the request body

* [fix] return_token_ids not working in curl requests

* [test] improve test cases for return_token_ids and prompt_token_ids

* [fix] the server responded OK even when request.messages was an empty list
李泳桦
2025-07-21 19:31:14 +08:00
committed by GitHub
parent 2845bde964
commit 8a619e9db5
8 changed files with 506 additions and 59 deletions
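
In practice the new fields ride along in the completion request body. A minimal sketch of exercising them from a client; the URL, port, and model name are assumptions, not taken from this commit:

import requests

# Hypothetical request against a locally running server; endpoint and model
# name are placeholders. return_token_ids is the new top-level field; SDK
# clients that cannot send it directly can tunnel it through extra_body.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "default",
        "prompt": "Hello",
        "return_token_ids": True,
    },
    timeout=30,
)
choice = resp.json()["choices"][0]
print(choice.get("prompt_token_ids"))      # token ids of the prompt
print(choice.get("completion_token_ids"))  # token ids of the generation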


@@ -226,7 +226,7 @@ class OpenAIServingCompletion:
             model=model_name,
             choices=choices,
         )
+        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
         current_waiting_time = 0
         while num_choices > 0:
             try:
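
The flag added above folds the top-level field together with the extra_body fallback. A standalone sketch of the same resolution logic; the request object here is a stand-in, not the project's actual schema class:

def resolve_return_token_ids(request) -> bool:
    # True when the client set the top-level field, or when an
    # OpenAI-SDK-style client tunneled it through extra_body.
    if getattr(request, "return_token_ids", False):
        return True
    extra_body = getattr(request, "extra_body", None)
    return bool(extra_body and extra_body.get("return_token_ids", False))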
@@ -250,18 +250,17 @@ class OpenAIServingCompletion:
                         raise ValueError("{}".format(res["error_msg"]))
                     if first_iteration[idx]:
-                        if request.suffix is not None and request.suffix.get("training", False):
+                        if enable_return_token_ids:
                             chunk = CompletionStreamResponse(
                                 id=request_id,
                                 created=created_time,
                                 model=model_name,
-                                choices=[
-                                    CompletionResponseStreamChoice(
-                                        index=idx,
-                                        text="",
-                                        token_ids=list(prompt_batched_token_ids[idx]),
-                                    )
-                                ],
+                                choices=[CompletionResponseStreamChoice(
+                                    index=idx,
+                                    text="",
+                                    prompt_token_ids=list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None,
+                                    completion_token_ids=None,
+                                )]
                             )
                             yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
                             first_iteration[idx] = False
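
With the flag enabled, the first streamed chunk for each choice now carries the prompt token ids (with empty text) instead of being gated on the removed suffix/training check. A rough client-side sketch of picking those ids out of the SSE stream; the parsing loop is illustrative, only the field names come from the diff:

import json

def read_prompt_token_ids(sse_lines):
    """Yield (choice index, prompt_token_ids) from 'data: ...' SSE lines."""
    for line in sse_lines:
        if not line.startswith("data: ") or line.strip() == "data: [DONE]":
            continue
        chunk = json.loads(line[len("data: "):])
        for choice in chunk.get("choices", []):
            ids = choice.get("prompt_token_ids")
            if ids is not None:
                yield choice["index"], ids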
@@ -275,16 +274,15 @@ class OpenAIServingCompletion:
                         output = res["outputs"]
-                        choices.append(
-                            CompletionResponseStreamChoice(
-                                index=idx,
-                                text=output["text"],
-                                token_ids=output.get("token_ids"),
-                                tool_calls=output.get("tool_call_content"),
-                                reasoning_content=output.get("reasoning_content"),
-                                arrival_time=arrival_time,
-                            )
-                        )
+                        choices.append(CompletionResponseStreamChoice(
+                            index=idx,
+                            text=output["text"],
+                            prompt_token_ids=None,
+                            completion_token_ids=output.get("token_ids") if enable_return_token_ids else None,
+                            tool_calls=output.get("tool_call_content"),
+                            reasoning_content=output.get("reasoning_content"),
+                            arrival_time=arrival_time
+                        ))
                         if res["finished"]:
                             if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens:
                                 chunk.choices[0].finish_reason = "stop"
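
Subsequent chunks set prompt_token_ids to None and carry the newly generated ids in completion_token_ids, so a client reassembles the full generation by concatenating the per-chunk lists. A hedged sketch, assuming the same SSE framing as above:

import json
from collections import defaultdict

def collect_completion_token_ids(sse_lines):
    """Accumulate completion_token_ids per choice across stream chunks."""
    ids_by_choice = defaultdict(list)
    for line in sse_lines:
        if not line.startswith("data: ") or line.strip() == "data: [DONE]":
            continue
        chunk = json.loads(line[len("data: "):])
        for choice in chunk.get("choices", []):
            ids_by_choice[choice["index"]].extend(
                choice.get("completion_token_ids") or []
            )
    return dict(ids_by_choice)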
@@ -347,6 +345,7 @@ class OpenAIServingCompletion:
         choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
         num_generated_tokens = 0
+        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
         for idx in range(len(final_res_batch)):
             final_res = final_res_batch[idx]
@@ -371,7 +370,9 @@ class OpenAIServingCompletion:
                     token_ids=token_ids,
                     index=len(choices),
                     text=output_text,
-                    reasoning_content=output.get("reasoning_content"),
+                    prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
+                    completion_token_ids=output["token_ids"] if enable_return_token_ids else None,
+                    reasoning_content=output.get('reasoning_content'),
                     tool_calls=output.get("tool_call_content"),
                     logprobs=None,
                     finish_reason=None,
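
On the non-streaming path both id lists land on the same choice object. A sketch of the resulting JSON shape when the flag is on; the literal token values are made up:

choice = {
    "index": 0,
    "text": "Hello world",
    "prompt_token_ids": [101, 7592],      # echoed input tokens (made-up values)
    "completion_token_ids": [2088, 102],  # generated tokens (made-up values)
    "finish_reason": "stop",
}
assert choice["prompt_token_ids"] is not None  # only populated when enabled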