From 73c97a22fe6efb7e9bce417fe706073f6d8a3a9b Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Fri, 15 Aug 2025 11:18:15 +0800
Subject: [PATCH] fix chat_template_args

---
 fastdeploy/engine/engine.py            | 5 +----
 fastdeploy/entrypoints/llm.py          | 7 ++-----
 fastdeploy/input/ernie_processor.py    | 3 ++-
 fastdeploy/input/ernie_vl_processor.py | 3 ++-
 fastdeploy/input/text_processor.py     | 3 ++-
 5 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index db3bdefff..fa7be2f98 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -465,10 +465,7 @@ class LLMEngine:
 
         request.sampling_params = sampling_params
         request.preprocess_start_time = time.time()
-        enable_thinking = None
-        if kwargs is not None:
-            enable_thinking = kwargs.get("enable_thinking", None)
-        request = self.data_processor.process_request(request, self.cfg.max_model_len, enable_thinking=enable_thinking)
+        request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs)
         request.prompt_token_ids_len = len(request.prompt_token_ids)
         request.need_prefill_tokens = request.prompt_token_ids_len
         input_ids_len = request.prompt_token_ids_len
diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py
index 001cfad3e..1b0d9ddac 100644
--- a/fastdeploy/entrypoints/llm.py
+++ b/fastdeploy/entrypoints/llm.py
@@ -248,7 +248,7 @@ class LLM:
         self,
         prompts,
         sampling_params,
-        chat_template_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
     ):
         """
         添加一个请求到 LLM Engine，并返回该请求的 ID。
@@ -289,10 +289,7 @@ class LLM:
                 current_sampling_params = sampling_params[i]
             else:
                 current_sampling_params = sampling_params
-            enable_thinking = None
-            if chat_template_kwargs is not None:
-                enable_thinking = chat_template_kwargs.get("enable_thinking", None)
-            self.llm_engine.add_requests(tasks, current_sampling_params, enable_thinking=enable_thinking)
+            self.llm_engine.add_requests(tasks, current_sampling_params, **kwargs)
         return req_ids
 
     def _decode_token(self, token_id: int) -> str:
diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py
index e4424a0b8..4ff4d5e14 100644
--- a/fastdeploy/input/ernie_processor.py
+++ b/fastdeploy/input/ernie_processor.py
@@ -90,6 +90,7 @@ class ErnieProcessor(BaseDataProcessor):
         request = self._apply_default_parameters(request)
         if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
             request.eos_token_ids = self.eos_token_ids
+        request.enable_thinking = kwargs.get("chat_template_kwargs", {}).get("enable_thinking")
         stop_sequences = request.get("stop", [])
         if stop_sequences is not None and len(stop_sequences) != 0:
             stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
@@ -140,7 +141,7 @@ class ErnieProcessor(BaseDataProcessor):
         request = self._apply_default_parameters(request)
         if not request.get("eos_token_ids"):
             request["eos_token_ids"] = self.eos_token_ids
-
+        request["enable_thinking"] = request.get("chat_template_kwargs", {}).get("enable_thinking")
         # processing stop_sequences
         stop_sequences = request.get("stop", [])
         if stop_sequences:
diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py
index e8239f7ad..756011553 100644
--- a/fastdeploy/input/ernie_vl_processor.py
+++ b/fastdeploy/input/ernie_vl_processor.py
@@ -110,7 +110,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
     def process_request(self, request, max_model_len=None, **kwargs):
         """process the input data"""
         task = request.to_dict()
-        task["enable_thinking"] = kwargs.get("enable_thinking", True)
+        task["chat_template_kwargs"] = kwargs.get("chat_template_kwargs", {})
         self.process_request_dict(task, max_model_len)
         request = Request.from_dict(task)
         request = self._apply_default_parameters(request)
@@ -198,6 +198,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
         request = self._apply_default_parameters(request)
         if not request.get("eos_token_ids"):
             request["eos_token_ids"] = self.eos_token_ids
+        request["enable_thinking"] = request.get("chat_template_kwargs", {}).get("enable_thinking")
 
         stop_sequences = request.get("stop", [])
         if stop_sequences:
diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py
index e842e964b..10f5081b4 100644
--- a/fastdeploy/input/text_processor.py
+++ b/fastdeploy/input/text_processor.py
@@ -207,7 +207,7 @@ class DataProcessor(BaseDataProcessor):
         request = self._apply_default_parameters(request)
         if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
             request.eos_token_ids = self.eos_token_ids
-
+        request.enable_thinking = kwargs.get("chat_template_kwargs", {}).get("enable_thinking")
         stop_sequences = request.get("stop", [])
         if stop_sequences is not None and len(stop_sequences) != 0:
             stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
@@ -254,6 +254,7 @@ class DataProcessor(BaseDataProcessor):
         request = self._apply_default_parameters(request)
         if not request.get("eos_token_ids"):
             request["eos_token_ids"] = self.eos_token_ids
+        request["enable_thinking"] = request.get("chat_template_kwargs", {}).get("enable_thinking")
         # processing stop_sequences
         stop_sequences = request.get("stop", [])