From 0616c208d23cf62d2b311a1d66d44879bb6192c6 Mon Sep 17 00:00:00 2001
From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Date: Wed, 30 Jul 2025 22:18:48 +0800
Subject: [PATCH] [Feature] Support include_stop_str_in_output in completion api (#3096)

* [Feature] Support include_stop_str_in_output in completion api

* Fix ci test

---------

Co-authored-by: Jiang-Jia-Jun
---
 fastdeploy/entrypoints/openai/protocol.py     |  3 +-
 fastdeploy/entrypoints/openai/serving_chat.py | 12 +----
 .../entrypoints/openai/serving_completion.py  |  8 ++-
 test/ci_use/EB_Lite/test_EB_Lite_serving.py   | 50 +++++++++++++++++--
 4 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 94f2d5757..482399b48 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -345,6 +345,7 @@ class CompletionRequest(BaseModel):
     top_p: Optional[float] = None
     top_k: Optional[int] = None
     min_p: Optional[float] = None
+    include_stop_str_in_output: Optional[bool] = False
     user: Optional[str] = None
     min_tokens: Optional[int] = None
 
@@ -488,7 +489,7 @@ class ChatCompletionRequest(BaseModel):
     enable_thinking: Optional[bool] = None
     reasoning_max_tokens: Optional[int] = None
     max_streaming_response_tokens: Optional[int] = None
-    include_stop_str_in_output: Optional[bool] = None
+    include_stop_str_in_output: Optional[bool] = False
     bad_words: Optional[List[str]] = None
 
     response_format: Optional[AnyResponseFormat] = None
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index d28eb3c7f..8b2141a4b 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -134,11 +134,7 @@ class OpenAIServingChat:
             if request.enable_thinking is not None
             else (request.metadata or {}).get("enable_thinking")
         )
-        include_stop_str_in_output = (
-            request.include_stop_str_in_output
-            if request.include_stop_str_in_output is not None
-            else (request.metadata or {}).get("include_stop_str_in_output", False)
-        )
+        include_stop_str_in_output = request.include_stop_str_in_output
 
         stream_options = request.stream_options
         if stream_options is None:
@@ -339,11 +335,7 @@ class OpenAIServingChat:
             if request.enable_thinking is not None
             else (request.metadata or {}).get("enable_thinking")
         )
-        include_stop_str_in_output = (
-            request.include_stop_str_in_output
-            if request.include_stop_str_in_output is not None
-            else (request.metadata or {}).get("include_stop_str_in_output", False)
-        )
+        include_stop_str_in_output = request.include_stop_str_in_output
 
         try:
             dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc")
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 871604799..268cae2ff 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -182,7 +182,9 @@ class OpenAIServingCompletion:
                 if data.get("error_code", 200) != 200:
                     raise ValueError("{}".format(data["error_msg"]))
 
-                self.engine_client.data_processor.process_response_dict(data, stream=False)
+                self.engine_client.data_processor.process_response_dict(
+                    data, stream=False, include_stop_str_in_output=request.include_stop_str_in_output
+                )
                 output_tokens[rid] += len(data["outputs"]["token_ids"])
                 completion_batched_token_ids[rid].extend(data["outputs"]["token_ids"])
                 if data.get("finished", False):
data.get("finished", False): @@ -280,7 +282,9 @@ class OpenAIServingCompletion: yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" first_iteration[idx] = False - self.engine_client.data_processor.process_response_dict(res, stream=True) + self.engine_client.data_processor.process_response_dict( + res, stream=True, include_stop_str_in_output=request.include_stop_str_in_output + ) if res["metrics"].get("first_token_time") is not None: arrival_time = res["metrics"]["first_token_time"] inference_start_time[idx] = res["metrics"]["inference_start_time"] diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 01ddcfa3d..eefd653d2 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -357,7 +357,7 @@ def test_non_streaming_with_stop_str(openai_client): messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=1, max_tokens=5, - metadata={"include_stop_str_in_output": True}, + extra_body={"include_stop_str_in_output": True}, stream=False, ) # Assertions to check the response structure @@ -370,7 +370,7 @@ def test_non_streaming_with_stop_str(openai_client): messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=1, max_tokens=5, - metadata={"include_stop_str_in_output": False}, + extra_body={"include_stop_str_in_output": False}, stream=False, ) # Assertions to check the response structure @@ -378,6 +378,25 @@ def test_non_streaming_with_stop_str(openai_client): assert len(response.choices) > 0 assert not response.choices[0].message.content.endswith("") + response = openai_client.completions.create( + model="default", + prompt="Hello, how are you?", + temperature=1, + max_tokens=1024, + stream=False, + ) + assert not response.choices[0].text.endswith("") + + response = openai_client.completions.create( + model="default", + prompt="Hello, how are you?", + temperature=1, + max_tokens=1024, + extra_body={"include_stop_str_in_output": True}, + stream=False, + ) + assert response.choices[0].text.endswith("") + def test_streaming_with_stop_str(openai_client): """ @@ -388,7 +407,7 @@ def test_streaming_with_stop_str(openai_client): messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=1, max_tokens=5, - metadata={"include_stop_str_in_output": True}, + extra_body={"include_stop_str_in_output": True}, stream=True, ) # Assertions to check the response structure @@ -402,7 +421,7 @@ def test_streaming_with_stop_str(openai_client): messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=1, max_tokens=5, - metadata={"include_stop_str_in_output": False}, + extra_body={"include_stop_str_in_output": False}, stream=True, ) # Assertions to check the response structure @@ -411,6 +430,29 @@ def test_streaming_with_stop_str(openai_client): last_token = chunk.choices[0].delta.content assert last_token != "" + response_1 = openai_client.completions.create( + model="default", + prompt="Hello, how are you?", + max_tokens=10, + stream=True, + ) + last_token = "" + for chunk in response_1: + last_token = chunk.choices[0].text + assert not last_token.endswith("") + + response_1 = openai_client.completions.create( + model="default", + prompt="Hello, how are you?", + max_tokens=10, + extra_body={"include_stop_str_in_output": True}, + stream=True, + ) + last_token = "" + for chunk in response_1: + last_token = chunk.choices[0].text + assert last_token.endswith("") + def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): 
"""