diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py
index 541254ed6..f31a00ce0 100644
--- a/fastdeploy/engine/request.py
+++ b/fastdeploy/engine/request.py
@@ -46,7 +46,6 @@ class Request:
         preprocess_end_time: Optional[float] = None,
         multimodal_inputs: Optional[dict] = None,
         multimodal_data: Optional[dict] = None,
-        raw_request: bool = True,
         disaggregate_info: Optional[dict] = None,
         draft_token_ids: Optional[list[int]] = None,
         guided_json: Optional[Any] = None,
@@ -74,7 +73,6 @@ class Request:
         self.arrival_time = arrival_time
         self.preprocess_start_time = preprocess_start_time
         self.preprocess_end_time = preprocess_end_time
-        self.raw_request = raw_request
         self.disaggregate_info = disaggregate_info
 
         # speculative method in disaggregate-mode
@@ -117,7 +115,6 @@ class Request:
             multimodal_data=d.get("multimodal_data"),
             disaggregate_info=d.get("disaggregate_info"),
             draft_token_ids=d.get("draft_token_ids"),
-            raw_request=d.get("raw_request", True),
             guided_json=d.get("guided_json", None),
             guided_regex=d.get("guided_regex", None),
             guided_choice=d.get("guided_choice", None),
@@ -145,7 +142,6 @@ class Request:
             "preprocess_end_time": self.preprocess_end_time,
             "multimodal_inputs": self.multimodal_inputs,
             "multimodal_data": self.multimodal_data,
-            "raw_request": self.raw_request,
             "disaggregate_info": self.disaggregate_info,
             "draft_token_ids": self.draft_token_ids,
             "enable_thinking": self.enable_thinking,
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index fd8c970bb..af257a8bd 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -124,6 +124,8 @@ class ChatMessage(BaseModel):
     content: str
     reasoning_content: Optional[str] = None
     tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
 
 
 class ChatCompletionResponseChoice(BaseModel):
@@ -177,7 +179,8 @@ class DeltaMessage(BaseModel):
 
     role: Optional[str] = None
     content: Optional[str] = None
-    token_ids: Optional[List[int]] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
     reasoning_content: Optional[str] = None
     tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
 
@@ -214,7 +217,8 @@ class CompletionResponseChoice(BaseModel):
 
     index: int
     text: str
-    token_ids: Optional[List[int]] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
     arrival_time: Optional[float] = None
     logprobs: Optional[int] = None
     reasoning_content: Optional[str] = None
@@ -243,7 +247,8 @@ class CompletionResponseStreamChoice(BaseModel):
     index: int
     text: str
     arrival_time: float = None
-    token_ids: Optional[List[int]] = None
+    prompt_token_ids: Optional[List[int]] = None
+    completion_token_ids: Optional[List[int]] = None
     logprobs: Optional[float] = None
     reasoning_content: Optional[str] = None
     finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
@@ -341,6 +346,9 @@ class CompletionRequest(BaseModel):
     top_k: Optional[int] = None
     min_p: Optional[float] = None
     user: Optional[str] = None
+    extra_body: Optional[dict] = None
+    return_token_ids: Optional[bool] = False
+    prompt_token_ids: Optional[List[int]] = None
 
     response_format: Optional[AnyResponseFormat] = None
     guided_json: Optional[Union[str, dict, BaseModel]] = None
@@ -373,9 +381,13 @@ class CompletionRequest(BaseModel):
         if prompt is not None:
             req_dict["prompt"] = prompt
 
-        if isinstance(prompt[0], int):
-            req_dict["prompt_token_ids"] = prompt
-            del req_dict["prompt"]
+        if self.prompt_token_ids is not None or \
+                (self.extra_body is not None and self.extra_body.get("prompt_token_ids") is not None):
+            req_dict["prompt_token_ids"] = self.prompt_token_ids
+            if "prompt" in req_dict:
+                del req_dict["prompt"]
+        else:
+            assert len(prompt) > 0
 
         guided_json_object = None
         if self.response_format is not None:
@@ -464,6 +476,9 @@ class ChatCompletionRequest(BaseModel):
     min_p: Optional[float] = None
     user: Optional[str] = None
     metadata: Optional[dict] = None
+    extra_body: Optional[dict] = None
+    return_token_ids: Optional[bool] = False
+    prompt_token_ids: Optional[List[int]] = None
 
     response_format: Optional[AnyResponseFormat] = None
     guided_json: Optional[Union[str, dict, BaseModel]] = None
@@ -499,12 +514,14 @@ class ChatCompletionRequest(BaseModel):
         for key, value in self.dict().items():
             if value is not None:
                 req_dict[key] = value
-        if isinstance(self.messages[0], int):
-            req_dict["prompt_token_ids"] = self.messages
-            del req_dict["messages"]
-        if "raw_request" in req_dict and not req_dict["raw_request"]:
-            req_dict["prompt"] = req_dict["messages"][0]["content"]
-            del req_dict["messages"]
+
+        if self.prompt_token_ids is not None or \
+                (self.extra_body is not None and self.extra_body.get("prompt_token_ids") is not None):
+            req_dict["prompt_token_ids"] = self.prompt_token_ids
+            if "messages" in req_dict:
+                del req_dict["messages"]
+        else:
+            assert len(self.messages) > 0
 
         guided_json_object = None
         if self.response_format is not None:
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 653d1e171..bc4fd679e 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -144,6 +144,7 @@ class OpenAIServingChat:
         if request.metadata is not None:
             enable_thinking = request.metadata.get("enable_thinking")
             include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False)
+        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
         while num_choices > 0:
             try:
                 raw_data = await asyncio.wait_for(dealer.read(), timeout=10)
@@ -185,14 +186,16 @@ class OpenAIServingChat:
                     choice = ChatCompletionResponseStreamChoice(
                         index=i,
                         delta=DeltaMessage(
                             role="assistant",
                             content="",
                             reasoning_content="",
                             tool_calls=None,
-                        ),
+                            prompt_token_ids=None,
+                            completion_token_ids=None,
+                        )
                     )
-                    if request.metadata is not None and request.metadata.get("training", False):
-                        choice.delta.token_ids = prompt_token_ids
+                    if enable_return_token_ids:
+                        choice.delta.prompt_token_ids = list(prompt_token_ids)
                     chunk = ChatCompletionStreamResponse(
                         id=request_id,
                         object=chunk_object_type,
                         created=created_time,
@@ -228,9 +231,10 @@ class OpenAIServingChat:
                         previous_num_tokens += len(output["token_ids"])
 
                         delta_message = DeltaMessage(
                             content=delta_text,
                             reasoning_content=output.get("reasoning_content"),
-                            token_ids=output.get("token_ids"),
+                            prompt_token_ids=None,
+                            completion_token_ids=None,
                             tool_calls=output.get("tool_call_content", []),
                         )
@@ -260,8 +264,8 @@ class OpenAIServingChat:
                             if res.get("error_msg") is not None and "Recover" in res["error_msg"]:
                                 choice.finish_reason = "recover_stop"
-                        if request.metadata is not None and request.metadata.get("training", False) and delta_text != "":
-                            choice.delta.token_ids = output["token_ids"]
+                        if enable_return_token_ids:
+                            choice.delta.completion_token_ids = list(output["token_ids"])
                         if include_continuous_usage:
                             chunk.usage = UsageInfo(
                                 prompt_tokens=num_prompt_tokens,
@@ -318,6 +322,7 @@ class OpenAIServingChat:
         final_res = None
         enable_thinking = None
         include_stop_str_in_output = False
+        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
         try:
             dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc")
             dealer.write([b"", request_id.encode("utf-8")])
@@ -388,7 +393,8 @@ class OpenAIServingChat:
             content=output["text"],
             reasoning_content=output.get("reasoning_content"),
             tool_calls=output.get("tool_call_content"),
-            token_ids=output.get("token_ids"),
+            prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
+            completion_token_ids=output.get("token_ids") if enable_return_token_ids else None,
         )
         logprobs_full_res = None
         if logprob_contents:
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 648376d3d..5ad554566 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -226,7 +226,7 @@ class OpenAIServingCompletion:
                 model=model_name,
                 choices=choices,
             )
-
+            enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
             current_waiting_time = 0
             while num_choices > 0:
                 try:
@@ -250,18 +250,17 @@ class OpenAIServingCompletion:
                         raise ValueError("{}".format(res["error_msg"]))
 
                     if first_iteration[idx]:
-                        if request.suffix is not None and request.suffix.get("training", False):
+                        if enable_return_token_ids:
                             chunk = CompletionStreamResponse(
                                 id=request_id,
                                 created=created_time,
                                 model=model_name,
-                                choices=[
-                                    CompletionResponseStreamChoice(
-                                        index=idx,
-                                        text="",
-                                        token_ids=list(prompt_batched_token_ids[idx]),
-                                    )
-                                ],
+                                choices=[CompletionResponseStreamChoice(
+                                    index=idx,
+                                    text="",
+                                    prompt_token_ids=list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None,
+                                    completion_token_ids=None,
+                                )]
                             )
                             yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
                         first_iteration[idx] = False
@@ -275,16 +274,15 @@ class OpenAIServingCompletion:
                     output = res["outputs"]
 
-                    choices.append(
-                        CompletionResponseStreamChoice(
-                            index=idx,
-                            text=output["text"],
-                            token_ids=output.get("token_ids"),
-                            tool_calls=output.get("tool_call_content"),
-                            reasoning_content=output.get("reasoning_content"),
-                            arrival_time=arrival_time,
-                        )
-                    )
+                    choices.append(CompletionResponseStreamChoice(
+                        index=idx,
+                        text=output["text"],
+                        prompt_token_ids=None,
+                        completion_token_ids=output.get("token_ids") if enable_return_token_ids else None,
+                        tool_calls=output.get("tool_call_content"),
+                        reasoning_content=output.get("reasoning_content"),
+                        arrival_time=arrival_time
+                    ))
                     if res["finished"]:
                         if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens:
                             chunk.choices[0].finish_reason = "stop"
@@ -347,6 +345,7 @@ class OpenAIServingCompletion:
         choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
        num_generated_tokens = 0
+        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
 
         for idx in range(len(final_res_batch)):
             final_res = final_res_batch[idx]
@@ -371,7 +370,9 @@ class OpenAIServingCompletion:
                 token_ids=token_ids,
                 index=len(choices),
                 text=output_text,
-                reasoning_content=output.get("reasoning_content"),
+                prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
+                completion_token_ids=output["token_ids"] if enable_return_token_ids else None,
+                reasoning_content=output.get("reasoning_content"),
                 tool_calls=output.get("tool_call_content"),
                 logprobs=None,
                 finish_reason=None,
diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py
index 0fe996d4f..6bb7b5011 100644
--- a/fastdeploy/input/ernie_processor.py
+++ b/fastdeploy/input/ernie_processor.py
@@ -99,8 +99,9 @@ class ErnieProcessor(BaseDataProcessor):
 
         if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
             if request.prompt is None and request.messages is None:
-                raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")
-            if request.prompt is not None or not request.raw_request:
+                raise ValueError(
+                    f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
+            if request.prompt is not None:
                 prompt = request.prompt if request.prompt is not None else request.messages[0]
                 prompt = prompt[0] if isinstance(prompt, list) else prompt
                 tokens = self.tokenizer.tokenize(prompt)
diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py
index b79d469c5..a60b0a781 100644
--- a/fastdeploy/input/text_processor.py
+++ b/fastdeploy/input/text_processor.py
@@ -231,7 +231,7 @@ class DataProcessor(BaseDataProcessor):
 
         if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
             if request.prompt is not None:
-                request.prompt_token_ids = self.text2ids(request.prompt, max_model_len, request.raw_request)
+                request.prompt_token_ids = self.text2ids(request.prompt, max_model_len)
             elif request.messages is not None:
                 if self.tokenizer.chat_template is None:
                     raise ValueError("This model does not support chat_template.")
@@ -266,7 +266,7 @@ class DataProcessor(BaseDataProcessor):
         if not request.get("eos_token_ids"):
             request["eos_token_ids"] = self.eos_token_ids
 
-        # 处理stop_sequences
+        # processing stop_sequences
         stop_sequences = request.get("stop", [])
         if stop_sequences:
             stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
             request["stop_seqs_len"] = stop_seqs_len
         data_processor_logger.info(f"Processing request {request}")
-        # 处理prompt_token_ids
-        if not request.get("prompt_token_ids"):
-            if "prompt" in request:
-                raw_request = request.get("raw_request", True)
-                request["prompt_token_ids"] = self.text2ids(request["prompt"], max_model_len, raw_request).tolist()
-            elif "messages" in request:
+        # processing prompt_token_ids
+        if not request.get('prompt_token_ids'):
+            if 'prompt' in request:
+                request['prompt_token_ids'] = self.text2ids(request['prompt'], max_model_len).tolist()
+            elif 'messages' in request:
                 if self.tokenizer.chat_template is None:
                     raise ValueError("This model does not support chat_template.")
                 request["prompt_token_ids"] = self.messages2ids(request)
@@ -416,7 +415,7 @@ class DataProcessor(BaseDataProcessor):
             **kwargs,
         )
 
-    def text2ids(self, text, max_model_len, raw_request=True):
+    def text2ids(self, text, max_model_len):
         """
         text to token ids
diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py
index 645997673..8d08cbaa2 100644
--- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py
+++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py
@@ -342,6 +342,9 @@ def test_streaming(openai_client, capsys):
                 output.append(chunk.choices[0].text)
     assert len(output) > 0
 
+# ==========================
+# OpenAI Client additional chat/completions test
+# ==========================
 
 def test_non_streaming_with_stop_str(openai_client):
     """
@@ -405,3 +408,256 @@ def test_streaming_with_stop_str(openai_client):
     for chunk in response:
         last_token = chunk.choices[0].delta.content
         assert last_token != ""
+
+
+def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in non-streaming chat functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello, how are you?"}],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": True},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert isinstance(response.choices[0].message.prompt_token_ids, list)
+    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert isinstance(response.choices[0].message.completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello, how are you?"}],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": False},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert response.choices[0].message.prompt_token_ids is None
+    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert response.choices[0].message.completion_token_ids is None
+
+
+def test_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in streaming chat functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello, how are you?"}],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": True},
+        stream=True,
+    )
+    is_first_chunk = True
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        if is_first_chunk:
+            is_first_chunk = False
+            assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
+            assert chunk.choices[0].delta.completion_token_ids is None
+        else:
+            assert chunk.choices[0].delta.prompt_token_ids is None
+            assert isinstance(chunk.choices[0].delta.completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello, how are you?"}],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": False},
+        stream=True,
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert chunk.choices[0].delta.prompt_token_ids is None
+        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert chunk.choices[0].delta.completion_token_ids is None
+
+
+def test_non_streaming_completion_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in non-streaming completion functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": True},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'prompt_token_ids')
+    assert isinstance(response.choices[0].prompt_token_ids, list)
+    assert hasattr(response.choices[0], 'completion_token_ids')
+    assert isinstance(response.choices[0].completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": False},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'prompt_token_ids')
+    assert response.choices[0].prompt_token_ids is None
+    assert hasattr(response.choices[0], 'completion_token_ids')
+    assert response.choices[0].completion_token_ids is None
+
+
+def test_streaming_completion_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in streaming completion functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": True},
+        stream=True,
+    )
+    is_first_chunk = True
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        if is_first_chunk:
+            is_first_chunk = False
+            assert isinstance(chunk.choices[0].prompt_token_ids, list)
+            assert chunk.choices[0].completion_token_ids is None
+        else:
+            assert chunk.choices[0].prompt_token_ids is None
+            assert isinstance(chunk.choices[0].completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"return_token_ids": False},
+        stream=True,
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'prompt_token_ids')
+        assert chunk.choices[0].prompt_token_ids is None
+        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        assert chunk.choices[0].completion_token_ids is None
+
+
+def test_non_streaming_chat_with_prompt_token_ids(openai_client, capsys):
+    """
+    Test prompt_token_ids option in non-streaming chat functionality with the local service
+    """
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response, 'usage')
+    assert hasattr(response.usage, 'prompt_tokens')
+    assert response.usage.prompt_tokens == 9
+
+
+def test_streaming_chat_with_prompt_token_ids(openai_client, capsys):
+    """
+    Test prompt_token_ids option in streaming chat functionality with the local service
+    """
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[],
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, 'usage')
+        if len(chunk.choices) > 0:
+            assert chunk.usage is None
+        else:
+            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert chunk.usage.prompt_tokens == 9
+
+
+def test_non_streaming_completion_with_prompt_token_ids(openai_client, capsys):
+    """
+    Test prompt_token_ids option in non-streaming completion functionality with the local service
+    """
+    response = openai_client.completions.create(
+        model="default",
+        prompt="",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response, 'usage')
+    assert hasattr(response.usage, 'prompt_tokens')
+    assert response.usage.prompt_tokens == 9
+
+
+def test_streaming_completion_with_prompt_token_ids(openai_client, capsys):
+    """
+    Test prompt_token_ids option in streaming completion functionality with the local service
+    """
+    response = openai_client.completions.create(
+        model="default",
+        prompt="",
+        temperature=1,
+        max_tokens=5,
+        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, 'usage')
+        if len(chunk.choices) > 0:
+            assert chunk.usage is None
+        else:
+            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert chunk.usage.prompt_tokens == 9
+
diff --git a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
index b362ba0bd..083f4fc74 100644
--- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
+++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -323,3 +323,174 @@ def test_streaming_chat(openai_client, capsys):
         if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"):
             output.append(chunk.choices[0].delta.content)
     assert len(output) > 2
+
+
+
+# ==========================
+# OpenAI Client additional chat/completions test
+# ==========================
+
+def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in non-streaming chat functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful AI assistant."
+            },  # the system message is optional
+            {
+                "role": "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high"
+                    }
+                }, {
+                    "type": "text",
+                    "text": "请描述图片内容"
+                }]
+            }
+        ],
+        temperature=1,
+        max_tokens=53,
+        extra_body={"return_token_ids": True},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert isinstance(response.choices[0].message.prompt_token_ids, list)
+    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert isinstance(response.choices[0].message.completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful AI assistant."
+            },  # the system message is optional
+            {
+                "role": "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high"
+                    }
+                }, {
+                    "type": "text",
+                    "text": "请描述图片内容"
+                }]
+            }
+        ],
+        temperature=1,
+        max_tokens=53,
+        extra_body={"return_token_ids": False},
+        stream=False,
+    )
+    assert hasattr(response, 'choices')
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert response.choices[0].message.prompt_token_ids is None
+    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert response.choices[0].message.completion_token_ids is None
+
+
+def test_streaming_chat_with_return_token_ids(openai_client, capsys):
+    """
+    Test return_token_ids option in streaming chat functionality with the local service
+    """
+    # enable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful AI assistant."
+            },  # the system message is optional
+            {
+                "role": "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high"
+                    }
+                }, {
+                    "type": "text",
+                    "text": "请描述图片内容"
+                }]
+            }
+        ],
+        temperature=1,
+        max_tokens=53,
+        extra_body={"return_token_ids": True},
+        stream=True,
+    )
+    is_first_chunk = True
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        if is_first_chunk:
+            is_first_chunk = False
+            assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
+            assert chunk.choices[0].delta.completion_token_ids is None
+        else:
+            assert chunk.choices[0].delta.prompt_token_ids is None
+            assert isinstance(chunk.choices[0].delta.completion_token_ids, list)
+
+    # disable return_token_ids
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful AI assistant."
+            },  # the system message is optional
+            {
+                "role": "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                        "detail": "high"
+                    }
+                }, {
+                    "type": "text",
+                    "text": "请描述图片内容"
+                }]
+            }
+        ],
+        temperature=1,
+        max_tokens=53,
+        extra_body={"return_token_ids": False},
+        stream=True,
+    )
+    for chunk in response:
+        assert hasattr(chunk, 'choices')
+        assert len(chunk.choices) > 0
+        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert chunk.choices[0].delta.prompt_token_ids is None
+        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert chunk.choices[0].delta.completion_token_ids is None