diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py
index e1c255b49..db183bb27 100644
--- a/fastdeploy/engine/request.py
+++ b/fastdeploy/engine/request.py
@@ -60,6 +60,7 @@ class Request:
         preprocess_end_time: Optional[float] = None,
         multimodal_inputs: Optional[dict] = None,
         multimodal_data: Optional[dict] = None,
+        disable_chat_template: bool = False,
         disaggregate_info: Optional[dict] = None,
         draft_token_ids: Optional[list[int]] = None,
         guided_json: Optional[Any] = None,
@@ -87,6 +88,7 @@ class Request:
         self.arrival_time = arrival_time
         self.preprocess_start_time = preprocess_start_time
         self.preprocess_end_time = preprocess_end_time
+        self.disable_chat_template = disable_chat_template
         self.disaggregate_info = disaggregate_info
 
         # speculative method in disaggregate-mode
@@ -136,6 +138,7 @@ class Request:
             preprocess_end_time=d.get("preprocess_end_time"),
             multimodal_inputs=d.get("multimodal_inputs"),
             multimodal_data=d.get("multimodal_data"),
+            disable_chat_template=d.get("disable_chat_template", False),
             disaggregate_info=d.get("disaggregate_info"),
             draft_token_ids=d.get("draft_token_ids"),
             guided_json=d.get("guided_json", None),
@@ -180,6 +183,7 @@ class Request:
             "preprocess_end_time": self.preprocess_end_time,
             "multimodal_inputs": self.multimodal_inputs,
             "multimodal_data": self.multimodal_data,
+            "disable_chat_template": self.disable_chat_template,
             "disaggregate_info": self.disaggregate_info,
             "draft_token_ids": self.draft_token_ids,
             "enable_thinking": self.enable_thinking,
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index e3c759e57..ca6232dfb 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -483,6 +483,7 @@ class ChatCompletionRequest(BaseModel):
     extra_body: Optional[dict] = None
     return_token_ids: Optional[bool] = False
     prompt_token_ids: Optional[List[int]] = None
+    disable_chat_template: Optional[bool] = False
 
     response_format: Optional[AnyResponseFormat] = None
     guided_json: Optional[Union[str, dict, BaseModel]] = None
@@ -531,6 +532,11 @@ class ChatCompletionRequest(BaseModel):
         else:
            assert len(self.messages) > 0
 
+        # If disable_chat_template is set, skip template rendering and use the first message's content verbatim as the prompt.
+        if self.disable_chat_template:
+            req_dict["prompt"] = req_dict["messages"][0]["content"]
+            del req_dict["messages"]
+
         guided_json_object = None
         if self.response_format is not None:
             if self.response_format.type == "json_object":
diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py
index fe615e465..9627ea773 100644
--- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py
+++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py
@@ -662,3 +662,37 @@ def test_streaming_completion_with_prompt_token_ids(openai_client, capsys):
     else:
         assert hasattr(chunk.usage, "prompt_tokens")
         assert chunk.usage.prompt_tokens == 9
+
+
+def test_non_streaming_chat_completion_disable_chat_template(openai_client, capsys):
+    """
+    Test the disable_chat_template option in chat functionality with the local service.
+ """ + enabled_response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": "Hello, how are you?"}], + max_tokens=10, + temperature=0.0, + top_p=0, + extra_body={"disable_chat_template": False}, + stream=False, + ) + assert hasattr(enabled_response, "choices") + assert len(enabled_response.choices) > 0 + + # from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer + # tokenizer = ErnieBotTokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True) + # prompt = tokenizer.apply_chat_template([{"role": "user", "content": "Hello, how are you?"}], tokenize=False) + prompt = "<|begin_of_sentence|>User: Hello, how are you?\nAssistant: " + disabled_response = openai_client.chat.completions.create( + model="default", + messages=[{"role": "user", "content": prompt}], + max_tokens=10, + temperature=0, + top_p=0, + extra_body={"disable_chat_template": True}, + stream=False, + ) + assert hasattr(disabled_response, "choices") + assert len(disabled_response.choices) > 0 + assert enabled_response.choices[0].message.content == disabled_response.choices[0].message.content