From 0616c208d23cf62d2b311a1d66d44879bb6192c6 Mon Sep 17 00:00:00 2001
From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Date: Wed, 30 Jul 2025 22:18:48 +0800
Subject: [PATCH] [Feature] Support include_stop_str_in_output in completion api (#3096)

* [Feature] Support include_stop_str_in_output in completion api

* Fix ci test

---------

Co-authored-by: Jiang-Jia-Jun
---
 fastdeploy/entrypoints/openai/protocol.py     |  3 +-
 fastdeploy/entrypoints/openai/serving_chat.py | 12 +----
 .../entrypoints/openai/serving_completion.py  |  8 ++-
 test/ci_use/EB_Lite/test_EB_Lite_serving.py   | 50 +++++++++++++++++--
 4 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 94f2d5757..482399b48 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -345,6 +345,7 @@ class CompletionRequest(BaseModel):
     top_p: Optional[float] = None
     top_k: Optional[int] = None
     min_p: Optional[float] = None
+    include_stop_str_in_output: Optional[bool] = False
     user: Optional[str] = None
     min_tokens: Optional[int] = None
 
@@ -488,7 +489,7 @@ class ChatCompletionRequest(BaseModel):
     enable_thinking: Optional[bool] = None
     reasoning_max_tokens: Optional[int] = None
     max_streaming_response_tokens: Optional[int] = None
-    include_stop_str_in_output: Optional[bool] = None
+    include_stop_str_in_output: Optional[bool] = False
     bad_words: Optional[List[str]] = None
 
     response_format: Optional[AnyResponseFormat] = None
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index d28eb3c7f..8b2141a4b 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -134,11 +134,7 @@ class OpenAIServingChat:
             if request.enable_thinking is not None
             else (request.metadata or {}).get("enable_thinking")
         )
-        include_stop_str_in_output = (
-            request.include_stop_str_in_output
-            if request.include_stop_str_in_output is not None
-            else (request.metadata or {}).get("include_stop_str_in_output", False)
-        )
+        include_stop_str_in_output = request.include_stop_str_in_output
 
         stream_options = request.stream_options
         if stream_options is None:
@@ -339,11 +335,7 @@ class OpenAIServingChat:
             if request.enable_thinking is not None
             else (request.metadata or {}).get("enable_thinking")
         )
-        include_stop_str_in_output = (
-            request.include_stop_str_in_output
-            if request.include_stop_str_in_output is not None
-            else (request.metadata or {}).get("include_stop_str_in_output", False)
-        )
+        include_stop_str_in_output = request.include_stop_str_in_output
 
         try:
             dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc")
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 871604799..268cae2ff 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -182,7 +182,9 @@ class OpenAIServingCompletion:
                 if data.get("error_code", 200) != 200:
                     raise ValueError("{}".format(data["error_msg"]))
 
-                self.engine_client.data_processor.process_response_dict(data, stream=False)
+                self.engine_client.data_processor.process_response_dict(
+                    data, stream=False, include_stop_str_in_output=request.include_stop_str_in_output
+                )
                 output_tokens[rid] += len(data["outputs"]["token_ids"])
                 completion_batched_token_ids[rid].extend(data["outputs"]["token_ids"])
                 if data.get("finished", False):
data.get("finished", False): @@ -280,7 +282,9 @@ class OpenAIServingCompletion: yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" first_iteration[idx] = False - self.engine_client.data_processor.process_response_dict(res, stream=True) + self.engine_client.data_processor.process_response_dict( + res, stream=True, include_stop_str_in_output=request.include_stop_str_in_output + ) if res["metrics"].get("first_token_time") is not None: arrival_time = res["metrics"]["first_token_time"] inference_start_time[idx] = res["metrics"]["inference_start_time"] diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 01ddcfa3d..eefd653d2 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -357,7 +357,7 @@ def test_non_streaming_with_stop_str(openai_client): messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=1, max_tokens=5, - metadata={"include_stop_str_in_output": True}, + extra_body={"include_stop_str_in_output": True}, stream=False, ) # Assertions to check the response structure @@ -370,7 +370,7 @@ def test_non_streaming_with_stop_str(openai_client): messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=1, max_tokens=5, - metadata={"include_stop_str_in_output": False}, + extra_body={"include_stop_str_in_output": False}, stream=False, ) # Assertions to check the response structure @@ -378,6 +378,25 @@ def test_non_streaming_with_stop_str(openai_client): assert len(response.choices) > 0 assert not response.choices[0].message.content.endswith("") + response = openai_client.completions.create( + model="default", + prompt="Hello, how are you?", + temperature=1, + max_tokens=1024, + stream=False, + ) + assert not response.choices[0].text.endswith("") + + response = openai_client.completions.create( + model="default", + prompt="Hello, how are you?", + temperature=1, + max_tokens=1024, + extra_body={"include_stop_str_in_output": True}, + stream=False, + ) + assert response.choices[0].text.endswith("") + def test_streaming_with_stop_str(openai_client): """ @@ -388,7 +407,7 @@ def test_streaming_with_stop_str(openai_client): messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=1, max_tokens=5, - metadata={"include_stop_str_in_output": True}, + extra_body={"include_stop_str_in_output": True}, stream=True, ) # Assertions to check the response structure @@ -402,7 +421,7 @@ def test_streaming_with_stop_str(openai_client): messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=1, max_tokens=5, - metadata={"include_stop_str_in_output": False}, + extra_body={"include_stop_str_in_output": False}, stream=True, ) # Assertions to check the response structure @@ -411,6 +430,29 @@ def test_streaming_with_stop_str(openai_client): last_token = chunk.choices[0].delta.content assert last_token != "" + response_1 = openai_client.completions.create( + model="default", + prompt="Hello, how are you?", + max_tokens=10, + stream=True, + ) + last_token = "" + for chunk in response_1: + last_token = chunk.choices[0].text + assert not last_token.endswith("") + + response_1 = openai_client.completions.create( + model="default", + prompt="Hello, how are you?", + max_tokens=10, + extra_body={"include_stop_str_in_output": True}, + stream=True, + ) + last_token = "" + for chunk in response_1: + last_token = chunk.choices[0].text + assert last_token.endswith("") + def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): 
"""