Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 08:16:42 +08:00)
[Feature] Support include_stop_str_in_output in completion api (#3096)
* [Feature] Support include_stop_str_in_output in completion api
* Fix ci test

Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
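The tests in this commit drive the new flag through the OpenAI Python client's extra_body argument. As a minimal client-side sketch of the feature (the base_url, api_key, max_tokens value, and model name below are placeholders, not values taken from this commit):

# Placeholder endpoint and model; FastDeploy exposes an OpenAI-compatible API.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.completions.create(
    model="default",
    prompt="Hello, how are you?",
    max_tokens=32,
    # Forwarded to the server-side CompletionRequest.include_stop_str_in_output field.
    extra_body={"include_stop_str_in_output": True},
)
# With the flag set, the returned text keeps the stop string (e.g. "</s>").
print(response.choices[0].text)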
@@ -345,6 +345,7 @@ class CompletionRequest(BaseModel):
     top_p: Optional[float] = None
     top_k: Optional[int] = None
     min_p: Optional[float] = None
+    include_stop_str_in_output: Optional[bool] = False
     user: Optional[str] = None

     min_tokens: Optional[int] = None
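This first hunk adds the field to CompletionRequest with a default of False, so clients that never send it keep the old behaviour (stop string stripped from the output). A minimal sketch of how such a Pydantic field resolves, using a simplified stand-in model rather than the real CompletionRequest:

from typing import Optional
from pydantic import BaseModel

# Simplified stand-in: only the fields needed to illustrate the default are shown.
class CompletionRequestSketch(BaseModel):
    prompt: str
    include_stop_str_in_output: Optional[bool] = False

assert CompletionRequestSketch(prompt="hi").include_stop_str_in_output is False  # omitted -> False
assert CompletionRequestSketch(prompt="hi", include_stop_str_in_output=True).include_stop_str_in_output is True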
@@ -488,7 +489,7 @@ class ChatCompletionRequest(BaseModel):
     enable_thinking: Optional[bool] = None
     reasoning_max_tokens: Optional[int] = None
     max_streaming_response_tokens: Optional[int] = None
-    include_stop_str_in_output: Optional[bool] = None
+    include_stop_str_in_output: Optional[bool] = False
    bad_words: Optional[List[str]] = None

     response_format: Optional[AnyResponseFormat] = None
@@ -134,11 +134,7 @@ class OpenAIServingChat:
                 if request.enable_thinking is not None
                 else (request.metadata or {}).get("enable_thinking")
             )
-            include_stop_str_in_output = (
-                request.include_stop_str_in_output
-                if request.include_stop_str_in_output is not None
-                else (request.metadata or {}).get("include_stop_str_in_output", False)
-            )
+            include_stop_str_in_output = request.include_stop_str_in_output

             stream_options = request.stream_options
             if stream_options is None:
@@ -339,11 +335,7 @@ class OpenAIServingChat:
                 if request.enable_thinking is not None
                 else (request.metadata or {}).get("enable_thinking")
             )
-            include_stop_str_in_output = (
-                request.include_stop_str_in_output
-                if request.include_stop_str_in_output is not None
-                else (request.metadata or {}).get("include_stop_str_in_output", False)
-            )
+            include_stop_str_in_output = request.include_stop_str_in_output

             try:
                 dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc")
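The two OpenAIServingChat hunks drop the metadata fallback: now that ChatCompletionRequest.include_stop_str_in_output defaults to False rather than None, the handler can read the field directly. A rough before/after sketch of the resolution logic (here `request` stands in for the parsed ChatCompletionRequest; these helpers are illustrative, not functions in the codebase):

def resolve_flag_old(request):
    # Before: a None field value fell back to request.metadata.
    if request.include_stop_str_in_output is not None:
        return request.include_stop_str_in_output
    return (request.metadata or {}).get("include_stop_str_in_output", False)

def resolve_flag_new(request):
    # After: the field always carries a boolean default, so a direct read suffices.
    return request.include_stop_str_in_output

Note that enable_thinking keeps its metadata fallback in the unchanged context lines; only include_stop_str_in_output is simplified here.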
@@ -182,7 +182,9 @@ class OpenAIServingCompletion:
                 if data.get("error_code", 200) != 200:
                     raise ValueError("{}".format(data["error_msg"]))

-                self.engine_client.data_processor.process_response_dict(data, stream=False)
+                self.engine_client.data_processor.process_response_dict(
+                    data, stream=False, include_stop_str_in_output=request.include_stop_str_in_output
+                )
                 output_tokens[rid] += len(data["outputs"]["token_ids"])
                 completion_batched_token_ids[rid].extend(data["outputs"]["token_ids"])
                 if data.get("finished", False):
@@ -280,7 +282,9 @@ class OpenAIServingCompletion:
                     yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
                     first_iteration[idx] = False

-                self.engine_client.data_processor.process_response_dict(res, stream=True)
+                self.engine_client.data_processor.process_response_dict(
+                    res, stream=True, include_stop_str_in_output=request.include_stop_str_in_output
+                )
                 if res["metrics"].get("first_token_time") is not None:
                     arrival_time = res["metrics"]["first_token_time"]
                     inference_start_time[idx] = res["metrics"]["inference_start_time"]
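Both OpenAIServingCompletion hunks forward request.include_stop_str_in_output into process_response_dict, once on the non-streaming path and once on the streaming path. The data processor's internals are not part of this diff; as a rough illustration of the kind of post-processing the flag controls (an assumption-level sketch, not the FastDeploy implementation):

def apply_stop_str(text: str, stop_str: str = "</s>", include_stop_str_in_output: bool = False) -> str:
    # Illustration only: keep or strip a trailing stop string depending on the flag.
    if not include_stop_str_in_output and text.endswith(stop_str):
        return text[: -len(stop_str)]
    return text

assert apply_stop_str("Hi there</s>") == "Hi there"
assert apply_stop_str("Hi there</s>", include_stop_str_in_output=True) == "Hi there</s>"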
@@ -357,7 +357,7 @@ def test_non_streaming_with_stop_str(openai_client):
         messages=[{"role": "user", "content": "Hello, how are you?"}],
         temperature=1,
         max_tokens=5,
-        metadata={"include_stop_str_in_output": True},
+        extra_body={"include_stop_str_in_output": True},
         stream=False,
     )
     # Assertions to check the response structure
@@ -370,7 +370,7 @@ def test_non_streaming_with_stop_str(openai_client):
         messages=[{"role": "user", "content": "Hello, how are you?"}],
         temperature=1,
         max_tokens=5,
-        metadata={"include_stop_str_in_output": False},
+        extra_body={"include_stop_str_in_output": False},
         stream=False,
     )
     # Assertions to check the response structure
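The chat tests switch from metadata={...} to extra_body={...}. With the OpenAI Python SDK, extra_body entries are merged into the top level of the JSON request body, so the server now receives include_stop_str_in_output as a first-class request field rather than nested under metadata; since the metadata fallback for this flag was removed above, the old form would no longer take effect. Roughly equivalent raw request (the endpoint URL is a placeholder, and the exact wire format here is an assumption):

import requests

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "Hello, how are you?"}],
    "temperature": 1,
    "max_tokens": 5,
    "include_stop_str_in_output": True,  # top-level field, no longer under "metadata"
    "stream": False,
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload)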
@@ -378,6 +378,25 @@ def test_non_streaming_with_stop_str(openai_client):
     assert len(response.choices) > 0
     assert not response.choices[0].message.content.endswith("</s>")

+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=1024,
+        stream=False,
+    )
+    assert not response.choices[0].text.endswith("</s>")
+
+    response = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        temperature=1,
+        max_tokens=1024,
+        extra_body={"include_stop_str_in_output": True},
+        stream=False,
+    )
+    assert response.choices[0].text.endswith("</s>")
+

 def test_streaming_with_stop_str(openai_client):
     """
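The new completions-API tests use max_tokens=1024, presumably so that the short prompt terminates at the stop token rather than at the length cap, which is what makes the "</s>" suffix check meaningful. A natural tightening, hypothetical and not part of this commit, would also assert why generation ended:

# Hypothetical extra assertion, assuming the server reports OpenAI-style finish reasons.
assert response.choices[0].finish_reason == "stop"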
@@ -388,7 +407,7 @@ def test_streaming_with_stop_str(openai_client):
         messages=[{"role": "user", "content": "Hello, how are you?"}],
         temperature=1,
         max_tokens=5,
-        metadata={"include_stop_str_in_output": True},
+        extra_body={"include_stop_str_in_output": True},
         stream=True,
     )
     # Assertions to check the response structure
@@ -402,7 +421,7 @@ def test_streaming_with_stop_str(openai_client):
         messages=[{"role": "user", "content": "Hello, how are you?"}],
         temperature=1,
         max_tokens=5,
-        metadata={"include_stop_str_in_output": False},
+        extra_body={"include_stop_str_in_output": False},
         stream=True,
     )
     # Assertions to check the response structure
@@ -411,6 +430,29 @@ def test_streaming_with_stop_str(openai_client):
         last_token = chunk.choices[0].delta.content
     assert last_token != "</s>"

+    response_1 = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        max_tokens=10,
+        stream=True,
+    )
+    last_token = ""
+    for chunk in response_1:
+        last_token = chunk.choices[0].text
+    assert not last_token.endswith("</s>")
+
+    response_1 = openai_client.completions.create(
+        model="default",
+        prompt="Hello, how are you?",
+        max_tokens=10,
+        extra_body={"include_stop_str_in_output": True},
+        stream=True,
+    )
+    last_token = ""
+    for chunk in response_1:
+        last_token = chunk.choices[0].text
+    assert last_token.endswith("</s>")
+

 def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
     """
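The streaming additions keep only the last chunk's text, since the stop string can only arrive in the final chunk. An equivalent check that accumulates the whole streamed completion instead (a sketch reusing the same test client and flag):

pieces = []
for chunk in openai_client.completions.create(
    model="default",
    prompt="Hello, how are you?",
    max_tokens=10,
    extra_body={"include_stop_str_in_output": True},
    stream=True,
):
    pieces.append(chunk.choices[0].text)

# With the flag set, the concatenated output keeps the trailing stop string.
assert "".join(pieces).endswith("</s>")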