[Feature] Add return_token_ids and prompt_token_ids; remove training and raw_request from the request body (#2940)

* [feat] add return_token_ids and prompt_token_ids; remove raw_request from the request body

* [fix] return_token_ids not working in curl requests

* [test] improve the test cases for return_token_ids and prompt_token_ids

* [fix] the server responded OK even when request.messages was an empty list; an empty list is now rejected unless prompt_token_ids is provided
Author: 李泳桦
Date: 2025-07-21 19:31:14 +08:00
Committed by: GitHub
Parent: 2845bde964
Commit: 8a619e9db5
8 changed files with 506 additions and 59 deletions
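At a glance, the two new options are consumed through the OpenAI-compatible API. The sketch below mirrors the test cases added in this commit; it is a minimal usage illustration, not code from the commit, and the base_url, port, and api_key are placeholder assumptions for a locally deployed server.

from openai import OpenAI

# Placeholder endpoint for a locally deployed OpenAI-compatible server.
client = OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

# return_token_ids=True echoes the prompt and completion token ids back in
# the response (the OpenAI SDK merges extra_body into the request body).
response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    max_tokens=5,
    extra_body={"return_token_ids": True},
)
print(response.choices[0].message.prompt_token_ids)
print(response.choices[0].message.completion_token_ids)

# prompt_token_ids bypasses server-side tokenization entirely; prompt (or
# messages) may then be empty, and usage.prompt_tokens equals len(ids).
response = client.completions.create(
    model="default",
    prompt="",
    max_tokens=5,
    extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
)
print(response.usage.prompt_tokens)  # 9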


@@ -46,7 +46,6 @@ class Request:
         preprocess_end_time: Optional[float] = None,
         multimodal_inputs: Optional[dict] = None,
         multimodal_data: Optional[dict] = None,
-        raw_request: bool = True,
         disaggregate_info: Optional[dict] = None,
         draft_token_ids: Optional[list[int]] = None,
         guided_json: Optional[Any] = None,
@@ -74,7 +73,6 @@ class Request:
         self.arrival_time = arrival_time
         self.preprocess_start_time = preprocess_start_time
         self.preprocess_end_time = preprocess_end_time
-        self.raw_request = raw_request
         self.disaggregate_info = disaggregate_info
         # speculative method in disaggregate-mode
@@ -117,7 +115,6 @@ class Request:
             multimodal_data=d.get("multimodal_data"),
             disaggregate_info=d.get("disaggregate_info"),
             draft_token_ids=d.get("draft_token_ids"),
-            raw_request=d.get("raw_request", True),
             guided_json=d.get("guided_json", None),
             guided_regex=d.get("guided_regex", None),
             guided_choice=d.get("guided_choice", None),
@@ -145,7 +142,6 @@ class Request:
             "preprocess_end_time": self.preprocess_end_time,
             "multimodal_inputs": self.multimodal_inputs,
             "multimodal_data": self.multimodal_data,
-            "raw_request": self.raw_request,
             "disaggregate_info": self.disaggregate_info,
             "draft_token_ids": self.draft_token_ids,
             "enable_thinking": self.enable_thinking,


@@ -124,6 +124,8 @@ class ChatMessage(BaseModel):
     content: str
     reasoning_content: Optional[str] = None
     tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None


 class ChatCompletionResponseChoice(BaseModel):
@@ -177,7 +179,8 @@ class DeltaMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
-    token_ids: Optional[List[int]] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
     reasoning_content: Optional[str] = None
     tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
@@ -214,7 +217,8 @@ class CompletionResponseChoice(BaseModel):
     index: int
     text: str
-    token_ids: Optional[List[int]] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
     arrival_time: Optional[float] = None
     logprobs: Optional[int] = None
     reasoning_content: Optional[str] = None
@@ -243,7 +247,8 @@ class CompletionResponseStreamChoice(BaseModel):
     index: int
     text: str
     arrival_time: float = None
-    token_ids: Optional[List[int]] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
     logprobs: Optional[float] = None
     reasoning_content: Optional[str] = None
     finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
@@ -341,6 +346,9 @@ class CompletionRequest(BaseModel):
     top_k: Optional[int] = None
     min_p: Optional[float] = None
     user: Optional[str] = None
+    extra_body: Optional[dict] = None
+    return_token_ids: Optional[bool] = False
+    prompt_token_ids: Optional[List[int]] = None

     response_format: Optional[AnyResponseFormat] = None
     guided_json: Optional[Union[str, dict, BaseModel]] = None
@@ -373,9 +381,13 @@ class CompletionRequest(BaseModel):
         if prompt is not None:
             req_dict["prompt"] = prompt
-            if isinstance(prompt[0], int):
-                req_dict["prompt_token_ids"] = prompt
-                del req_dict["prompt"]
+
+        if self.prompt_token_ids is not None or \
+                (self.extra_body is not None and self.extra_body.get("prompt_token_ids") is not None):
+            req_dict["prompt_token_ids"] = self.prompt_token_ids
+            if "prompt" in req_dict:
+                del req_dict["prompt"]
+        else:
+            assert len(prompt) > 0

         guided_json_object = None
         if self.response_format is not None:
@@ -464,6 +476,9 @@ class ChatCompletionRequest(BaseModel):
     min_p: Optional[float] = None
     user: Optional[str] = None
     metadata: Optional[dict] = None
+    extra_body: Optional[dict] = None
+    return_token_ids: Optional[bool] = False
+    prompt_token_ids: Optional[List[int]] = None

     response_format: Optional[AnyResponseFormat] = None
     guided_json: Optional[Union[str, dict, BaseModel]] = None
@@ -499,12 +514,14 @@ class ChatCompletionRequest(BaseModel):
         for key, value in self.dict().items():
             if value is not None:
                 req_dict[key] = value
-        if isinstance(self.messages[0], int):
-            req_dict["prompt_token_ids"] = self.messages
-            del req_dict["messages"]
-        if "raw_request" in req_dict and not req_dict["raw_request"]:
-            req_dict["prompt"] = req_dict["messages"][0]["content"]
-            del req_dict["messages"]
+
+        if self.prompt_token_ids is not None or \
+                (self.extra_body is not None and self.extra_body.get("prompt_token_ids") is not None):
+            req_dict["prompt_token_ids"] = self.prompt_token_ids
+            if "messages" in req_dict:
+                del req_dict["messages"]
+        else:
+            assert len(self.messages) > 0

         guided_json_object = None
         if self.response_format is not None:
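Both request models now resolve prompt token ids the same way: either as a top-level field (the OpenAI SDK merges extra_body into the request body, so ids sent that way arrive top-level) or nested under an explicit "extra_body" object, as in a hand-written curl body. Distilled into a standalone sketch (a hypothetical helper, not part of the commit):

def resolve_prompt_token_ids(req_dict, prompt_token_ids, extra_body):
    # Ids may be top-level (SDK path) or inside an "extra_body" object (raw
    # HTTP path); when present, they take precedence over textual input.
    if prompt_token_ids is not None or \
            (extra_body is not None and extra_body.get("prompt_token_ids") is not None):
        req_dict["prompt_token_ids"] = prompt_token_ids
        req_dict.pop("prompt", None)    # CompletionRequest drops "prompt"
        req_dict.pop("messages", None)  # ChatCompletionRequest drops "messages"
    return req_dict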


@@ -144,6 +144,7 @@ class OpenAIServingChat:
         if request.metadata is not None:
             enable_thinking = request.metadata.get("enable_thinking")
             include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False)
+        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
         while num_choices > 0:
             try:
                 raw_data = await asyncio.wait_for(dealer.read(), timeout=10)
@@ -185,14 +186,16 @@ class OpenAIServingChat:
                 choice = ChatCompletionResponseStreamChoice(
                     index=i,
                     delta=DeltaMessage(
                         role="assistant",
                         content="",
                         reasoning_content="",
                         tool_calls=None,
-                    ),
+                        prompt_token_ids=None,
+                        completion_token_ids=None,
+                    )
                 )
-                if request.metadata is not None and request.metadata.get("training", False):
-                    choice.delta.token_ids = prompt_token_ids
+                if enable_return_token_ids:
+                    choice.delta.prompt_token_ids = list(prompt_token_ids)
                 chunk = ChatCompletionStreamResponse(
                     id=request_id,
                     object=chunk_object_type,
@@ -228,9 +231,10 @@ class OpenAIServingChat:
                     previous_num_tokens += len(output["token_ids"])
                     delta_message = DeltaMessage(
                         content=delta_text,
-                        reasoning_content=output.get("reasoning_content"),
-                        token_ids=output.get("token_ids"),
+                        reasoning_content=output.get("reasoning_content"), \
+                        prompt_token_ids=None,
+                        completion_token_ids=None,
                         tool_calls=output.get("tool_call_content", []),
                     )
@@ -260,8 +264,8 @@ class OpenAIServingChat:
                 if res.get("error_msg") is not None and "Recover" in res["error_msg"]:
                     choice.finish_reason = "recover_stop"
-                if request.metadata is not None and request.metadata.get("training", False) and delta_text != "":
-                    choice.delta.token_ids = output["token_ids"]
+                if enable_return_token_ids:
+                    choice.delta.completion_token_ids = list(output["token_ids"])
                 if include_continuous_usage:
                     chunk.usage = UsageInfo(
                         prompt_tokens=num_prompt_tokens,
@@ -318,6 +322,7 @@ class OpenAIServingChat:
         final_res = None
         enable_thinking = None
         include_stop_str_in_output = False
+        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
         try:
             dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc")
             dealer.write([b"", request_id.encode("utf-8")])
@@ -388,7 +393,8 @@ class OpenAIServingChat:
             content=output["text"],
             reasoning_content=output.get("reasoning_content"),
             tool_calls=output.get("tool_call_content"),
-            token_ids=output.get("token_ids"),
+            prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
+            completion_token_ids=output.get("token_ids") if enable_return_token_ids else None,
         )
         logprobs_full_res = None
         if logprob_contents:
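The enable_return_token_ids expression above appears verbatim in both the streaming and full-response paths; in isolation it reduces to the following (a hypothetical helper, not in the commit):

def wants_token_ids(request):
    # True when return_token_ids is set top-level (OpenAI SDK path) or
    # inside an explicit "extra_body" object (raw HTTP / curl path).
    if request.return_token_ids:
        return True
    return bool(request.extra_body and request.extra_body.get("return_token_ids", False))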


@@ -226,7 +226,7 @@ class OpenAIServingCompletion:
             model=model_name,
             choices=choices,
         )
+        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
         current_waiting_time = 0
         while num_choices > 0:
             try:
@@ -250,18 +250,17 @@ class OpenAIServingCompletion:
                         raise ValueError("{}".format(res["error_msg"]))
                     if first_iteration[idx]:
-                        if request.suffix is not None and request.suffix.get("training", False):
+                        if enable_return_token_ids:
                             chunk = CompletionStreamResponse(
                                 id=request_id,
                                 created=created_time,
                                 model=model_name,
-                                choices=[
-                                    CompletionResponseStreamChoice(
-                                        index=idx,
-                                        text="",
-                                        token_ids=list(prompt_batched_token_ids[idx]),
-                                    )
-                                ],
+                                choices=[CompletionResponseStreamChoice(
+                                    index=idx,
+                                    text="",
+                                    prompt_token_ids=list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None,
+                                    completion_token_ids=None,
+                                )]
                             )
                             yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
                             first_iteration[idx] = False
@@ -275,16 +274,15 @@ class OpenAIServingCompletion:
                     output = res["outputs"]
-                    choices.append(
-                        CompletionResponseStreamChoice(
-                            index=idx,
-                            text=output["text"],
-                            token_ids=output.get("token_ids"),
-                            tool_calls=output.get("tool_call_content"),
-                            reasoning_content=output.get("reasoning_content"),
-                            arrival_time=arrival_time,
-                        )
-                    )
+                    choices.append(CompletionResponseStreamChoice(
+                        index=idx,
+                        text=output["text"],
+                        prompt_token_ids=None,
+                        completion_token_ids=output.get("token_ids") if enable_return_token_ids else None,
+                        tool_calls=output.get("tool_call_content"),
+                        reasoning_content=output.get("reasoning_content"),
+                        arrival_time=arrival_time
+                    ))
                     if res["finished"]:
                         if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens:
                             chunk.choices[0].finish_reason = "stop"
@@ -347,6 +345,7 @@ class OpenAIServingCompletion:
         choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
         num_generated_tokens = 0
+        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
         for idx in range(len(final_res_batch)):
             final_res = final_res_batch[idx]
@@ -371,7 +370,9 @@ class OpenAIServingCompletion:
                 token_ids=token_ids,
                 index=len(choices),
                 text=output_text,
-                reasoning_content=output.get("reasoning_content"),
+                prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
+                completion_token_ids=output["token_ids"] if enable_return_token_ids else None,
+                reasoning_content=output.get('reasoning_content'),
                 tool_calls=output.get("tool_call_content"),
                 logprobs=None,
                 finish_reason=None,
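Under this scheme a streaming client receives prompt_token_ids only in the first chunk and completion_token_ids in the chunks that follow, which is exactly what the new tests assert. A consumer-side sketch, reusing the placeholder client from the example near the top:

stream = client.completions.create(
    model="default",
    prompt="Hello, how are you?",
    max_tokens=5,
    extra_body={"return_token_ids": True},
    stream=True,
)
prompt_ids, completion_ids = [], []
for chunk in stream:
    choice = chunk.choices[0]
    if choice.prompt_token_ids is not None:
        prompt_ids = list(choice.prompt_token_ids)  # first chunk only
    if choice.completion_token_ids is not None:
        completion_ids.extend(choice.completion_token_ids)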


@@ -99,8 +99,9 @@ class ErnieProcessor(BaseDataProcessor):
         if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
             if request.prompt is None and request.messages is None:
-                raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")
-            if request.prompt is not None or not request.raw_request:
+                raise ValueError(
+                    f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
+            if request.prompt is not None:
                 prompt = request.prompt if request.prompt is not None else request.messages[0]
                 prompt = prompt[0] if isinstance(prompt, list) else prompt
                 tokens = self.tokenizer.tokenize(prompt)


@@ -231,7 +231,7 @@ class DataProcessor(BaseDataProcessor):
         if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
             if request.prompt is not None:
-                request.prompt_token_ids = self.text2ids(request.prompt, max_model_len, request.raw_request)
+                request.prompt_token_ids = self.text2ids(request.prompt, max_model_len)
             elif request.messages is not None:
                 if self.tokenizer.chat_template is None:
                     raise ValueError("This model does not support chat_template.")
@@ -266,7 +266,7 @@ class DataProcessor(BaseDataProcessor):
         if not request.get("eos_token_ids"):
             request["eos_token_ids"] = self.eos_token_ids

-        # 处理stop_sequences
+        # processing stop_sequences
         stop_sequences = request.get("stop", [])
         if stop_sequences:
             stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
@@ -274,12 +274,11 @@ class DataProcessor(BaseDataProcessor):
             request["stop_seqs_len"] = stop_seqs_len

         data_processor_logger.info(f"Processing request {request}")
-        # 处理prompt_token_ids
-        if not request.get("prompt_token_ids"):
-            if "prompt" in request:
-                raw_request = request.get("raw_request", True)
-                request["prompt_token_ids"] = self.text2ids(request["prompt"], max_model_len, raw_request).tolist()
-            elif "messages" in request:
+        # processing prompt_token_ids
+        if not request.get('prompt_token_ids'):
+            if 'prompt' in request:
+                request['prompt_token_ids'] = self.text2ids(request['prompt'], max_model_len).tolist()
+            elif 'messages' in request:
                 if self.tokenizer.chat_template is None:
                     raise ValueError("This model does not support chat_template.")
                 request["prompt_token_ids"] = self.messages2ids(request)
@@ -416,7 +415,7 @@ class DataProcessor(BaseDataProcessor):
             **kwargs,
         )

-    def text2ids(self, text, max_model_len, raw_request=True):
+    def text2ids(self, text, max_model_len):
         """
         text to token ids


@@ -342,6 +342,9 @@ def test_streaming(openai_client, capsys):
             output.append(chunk.choices[0].text)
     assert len(output) > 0

+
+# ==========================
+# OpenAI Client additional chat/completions test
+# ==========================

 def test_non_streaming_with_stop_str(openai_client):
     """
@@ -405,3 +408,256 @@ def test_streaming_with_stop_str(openai_client):
     for chunk in response:
         last_token = chunk.choices[0].delta.content
     assert last_token != "</s>"
+
+
+def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in non-streaming chat functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello, how are you?"}],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": True},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert isinstance(response.choices[0].message.prompt_token_ids, list)
+    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert isinstance(response.choices[0].message.completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello, how are you?"}],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": False},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert response.choices[0].message.prompt_token_ids is None
+    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert response.choices[0].message.completion_token_ids is None
+
+
+def test_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in streaming chat functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello, how are you?"}],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": True},
+        stream=True,
+    )
+    is_first_chunk = True
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        if is_first_chunk:
+            is_first_chunk = False
+            assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
+            assert chunk.choices[0].delta.completion_token_ids is None
+        else:
+            assert chunk.choices[0].delta.prompt_token_ids is None
+            assert isinstance(chunk.choices[0].delta.completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello, how are you?"}],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": False},
+        stream=True,
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert chunk.choices[0].delta.prompt_token_ids is None
+        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert chunk.choices[0].delta.completion_token_ids is None
+
+
+def test_non_streaming_completion_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in non-streaming completion functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": True},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'prompt_token_ids')
+    assert isinstance(response.choices[0].prompt_token_ids, list)
+    assert hasattr(response.choices[0], 'completion_token_ids')
+    assert isinstance(response.choices[0].completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": False},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'prompt_token_ids')
+    assert response.choices[0].prompt_token_ids is None
+    assert hasattr(response.choices[0], 'completion_token_ids')
+    assert response.choices[0].completion_token_ids is None
+
+
+def test_streaming_completion_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in streaming completion functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": True},
+        stream=True,
+    )
+    is_first_chunk = True
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        if is_first_chunk:
+            is_first_chunk = False
+            assert isinstance(chunk.choices[0].prompt_token_ids, list)
+            assert chunk.choices[0].completion_token_ids is None
+        else:
+            assert chunk.choices[0].prompt_token_ids is None
+            assert isinstance(chunk.choices[0].completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": False},
+        stream=True,
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'prompt_token_ids')
+        assert chunk.choices[0].prompt_token_ids is None
+        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        assert chunk.choices[0].completion_token_ids is None
+
+
+def test_non_streaming_chat_with_prompt_token_ids(openai_client, capsys):
+    """
+    Test prompt_token_ids option in non-streaming chat functionality with the local service
+    """
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response, 'usage')
+    assert hasattr(response.usage, 'prompt_tokens')
+    assert response.usage.prompt_tokens == 9
+
+
+def test_streaming_chat_with_prompt_token_ids(openai_client, capsys):
+    """
+    Test prompt_token_ids option in streaming chat functionality with the local service
+    """
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, 'usage')
+        if len(chunk.choices) > 0:
+            assert chunk.usage is None
+        else:
+            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert chunk.usage.prompt_tokens == 9
+
+
+def test_non_streaming_completion_with_prompt_token_ids(openai_client, capsys):
+    """
+    Test prompt_token_ids option in non-streaming completion functionality with the local service
+    """
+    response = openai_client.completions.create(
+        model="default",
+        prompt="",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response, 'usage')
+    assert hasattr(response.usage, 'prompt_tokens')
+    assert response.usage.prompt_tokens == 9
+
+
+def test_streaming_completion_with_prompt_token_ids(openai_client, capsys):
+    """
+    Test prompt_token_ids option in streaming completion functionality with the local service
+    """
+    response = openai_client.completions.create(
+        model="default",
+        prompt="",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, 'usage')
+        if len(chunk.choices) > 0:
+            assert chunk.usage is None
+        else:
+            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert chunk.usage.prompt_tokens == 9


@@ -323,3 +323,174 @@ def test_streaming_chat(openai_client, capsys):
         if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"):
             output.append(chunk.choices[0].delta.content)
     assert len(output) > 2
+
+
+# ==========================
+# OpenAI Client additional chat/completions test
+# ==========================
+
+def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in non-streaming chat functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful AI assistant."
+            },  # the system message is optional
+            {
+                "role":
+                "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url":
+                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high"
+                    }
+                }, {
+                    "type": "text",
+                    "text": "请描述图片内容"  # i.e. "Please describe the image content"
+                }]
+            }
+        ],
+        temperature=1,
+        max_tokens=53,
+        extra_body={"return_token_ids": True},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert isinstance(response.choices[0].message.prompt_token_ids, list)
+    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert isinstance(response.choices[0].message.completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful AI assistant."
+            },  # the system message is optional
+            {
+                "role":
+                "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url":
+                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high"
+                    }
+                }, {
+                    "type": "text",
+                    "text": "请描述图片内容"  # i.e. "Please describe the image content"
+                }]
+            }
+        ],
+        temperature=1,
+        max_tokens=53,
+        extra_body={"return_token_ids": False},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert response.choices[0].message.prompt_token_ids is None
+    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert response.choices[0].message.completion_token_ids is None
+
+
+def test_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in streaming chat functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful AI assistant."
+            },  # the system message is optional
+            {
+                "role":
+                "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url":
+                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high"
+                    }
+                }, {
+                    "type": "text",
+                    "text": "请描述图片内容"  # i.e. "Please describe the image content"
+                }]
+            }
+        ],
+        temperature=1,
+        max_tokens=53,
+        extra_body={"return_token_ids": True},
+        stream=True,
+    )
+    is_first_chunk = True
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        if is_first_chunk:
+            is_first_chunk = False
+            assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
+            assert chunk.choices[0].delta.completion_token_ids is None
+        else:
+            assert chunk.choices[0].delta.prompt_token_ids is None
+            assert isinstance(chunk.choices[0].delta.completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful AI assistant."
+            },  # the system message is optional
+            {
+                "role":
+                "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url":
+                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high"
+                    }
+                }, {
+                    "type": "text",
+                    "text": "请描述图片内容"  # i.e. "Please describe the image content"
+                }]
+            }
+        ],
+        temperature=1,
+        max_tokens=53,
+        extra_body={"return_token_ids": False},
+        stream=True,
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert chunk.choices[0].delta.prompt_token_ids is None
+        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert chunk.choices[0].delta.completion_token_ids is None