diff --git a/fastdeploy/engine/sampling_params.py b/fastdeploy/engine/sampling_params.py
index 46d9fd8ac..1cd77d2b1 100644
--- a/fastdeploy/engine/sampling_params.py
+++ b/fastdeploy/engine/sampling_params.py
@@ -218,20 +218,22 @@ class SamplingParams:
                 prompt_token_ids = tokenizer.encode(text=prompt, add_special_tokens=False)["input_ids"]
 
                 if len(prompt_token_ids) != 1:
-                    logger.warning(
-                        f"Skip bad_words: {prompt}."
-                        f"Bad words should be a single token."
-                        f"Got tokens: {prompt_token_ids}."
-                    )
+                    if not add_prefix_space:
+                        logger.warning(
+                            f"Skip bad_words: <{prompt}>."
+                            f"Bad words should be a single token."
+                            f"Got tokens: {prompt_token_ids}."
+                        )
                     continue
 
                 if prompt_token_ids[0] > tokenizer.vocab_size:
-                    logger.warning(
-                        f"Skip bad_words: {prompt}."
-                        f"All token id values should be satisfying:"
-                        f" 0 <= token_id < {tokenizer.vocab_size}."
-                        f"Got token: {prompt_token_ids}."
-                    )
+                    if not add_prefix_space:
+                        logger.warning(
+                            f"Skip bad_words: <{prompt}>."
+                            f"All token id values should be satisfying:"
+                            f" 0 <= token_id < {tokenizer.vocab_size}."
+                            f"Got token: {prompt_token_ids}."
+                        )
                     continue
 
                 if prompt_token_ids not in self._bad_words_token_ids:
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index e35dc4d23..e0086b503 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -272,13 +272,15 @@ class GCUModelRunner(ModelRunnerBase):
                 request.block_tables, dtype="int32"
             )
 
-            if request.get("bad_words_token_ids") is not None:
+            if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
                 bad_words_len = len(request.get("bad_words_token_ids"))
-                if bad_words_len > 0:
-                    self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
-                    self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
-                        request.get("bad_words_token_ids"), dtype="int64"
-                    )
+                self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
+                self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
+                    request.get("bad_words_token_ids"), dtype="int64"
+                )
+            else:
+                self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
+                self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")
 
             if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
                 stop_seqs_num = len(request.get("stop_seqs_len"))
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index c551364ef..447f249d4 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -495,13 +495,15 @@ class GPUModelRunner(ModelRunnerBase):
                 request.block_tables, dtype="int32"
             )
 
-            if request.get("bad_words_token_ids") is not None:
+            if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
                 bad_words_len = len(request.get("bad_words_token_ids"))
-                if bad_words_len > 0:
-                    self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
-                    self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
-                        request.get("bad_words_token_ids"), dtype="int64"
-                    )
+                self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
+                self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
+                    request.get("bad_words_token_ids"), dtype="int64"
+                )
+            else:
+                self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
+                self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")
 
             if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
                 stop_seqs_num = len(request.get("stop_seqs_len"))
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
index 526f3361e..4a7aaaf8d 100644
--- a/fastdeploy/worker/iluvatar_model_runner.py
+++ b/fastdeploy/worker/iluvatar_model_runner.py
@@ -243,13 +243,15 @@ class IluvatarModelRunner(ModelRunnerBase):
                 request.block_tables, dtype="int32"
             )
 
-            if request.get("bad_words_token_ids") is not None:
+            if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
                 bad_words_len = len(request.get("bad_words_token_ids"))
-                if bad_words_len > 0:
-                    self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
-                    self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
-                        request.get("bad_words_token_ids"), dtype="int64"
-                    )
+                self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
+                self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
+                    request.get("bad_words_token_ids"), dtype="int64"
+                )
+            else:
+                self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
+                self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")
 
             if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
                 stop_seqs_num = len(request.get("stop_seqs_len"))
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index b61e84b9f..a153e5556 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -507,13 +507,15 @@ class XPUModelRunner(ModelRunnerBase):
                 request.block_tables, dtype="int32"
             )
 
-            if request.get("bad_words_token_ids") is not None:
+            if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
                 bad_words_len = len(request.get("bad_words_token_ids"))
-                if bad_words_len > 0:
-                    self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
-                    self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
-                        request.get("bad_words_token_ids"), dtype="int64"
-                    )
+                self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
+                self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
+                    request.get("bad_words_token_ids"), dtype="int64"
+                )
+            else:
+                self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
+                self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")
 
             if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
                 stop_seqs_num = len(request.get("stop_seqs_len"))