[Feature] Support limit thinking len for text models (#3527)

* support limit thinking len

* remove default think_end_id

* remove reasoning_max_tokens

* update think_end_id for ernie

---------

Co-authored-by: K11OntheBoat <ruianmaidanglao@163.com>
Co-authored-by: luukunn <981429396@qq.com>
Author: K11OntheBoat
Date: 2025-08-22 14:48:15 +08:00
Committed by: GitHub
parent 4d6fb96cd6
commit 93d999b830
6 changed files with 64 additions and 26 deletions
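
The change below resolves think_end_id once, at config-initialization time; a serving loop can then use that id to cap how long a model "thinks" before being forced to close its reasoning block. As a rough illustration (not FastDeploy's actual sampling code), a budget check like the following could inject the end-of-thinking token once the limit is hit; the helper name and the max_thinking_tokens parameter are hypothetical:

    # Minimal sketch of a thinking-length cap, assuming think_end_id has been
    # resolved as in the diff below. Helper and parameter names are hypothetical.
    from typing import Optional

    def maybe_force_think_end(
        generated_ids: list[int],
        think_end_id: Optional[int],
        max_thinking_tokens: int,
    ) -> Optional[int]:
        """Return think_end_id if the thinking budget is spent, else None."""
        if think_end_id is None:
            # Vocabulary has no </think> token; nothing to enforce.
            return None
        if think_end_id in generated_ids:
            # The model already closed its reasoning block on its own.
            return None
        if len(generated_ids) >= max_thinking_tokens:
            # Budget exhausted: force </think> as the next token.
            return think_end_id
        return None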


@@ -123,6 +123,28 @@ def update_fd_config_for_mm(fd_config: FDConfig) -> None:
    fd_config.model_config.sequence_parallel = fd_config.parallel_config.sequence_parallel


def update_think_end_id_for_ernie(fd_config: FDConfig) -> None:
    """
    Updates the think_end_id in the model config. Uses the ID of '</think>'
    if it exists, otherwise defaults to None.
    """
    is_ernie = ErnieArchitectures.contains_ernie_arch(fd_config.model_config.architectures)
    if is_ernie:
        tokenizer = ErnieBotTokenizer.from_pretrained(
            fd_config.model_config.model,
            model_max_length=fd_config.parallel_config.max_model_len,
            padding_side="right",
            use_fast=False,
        )
        vocab = tokenizer.get_vocab()
        fd_config.model_config.think_end_id = vocab.get("</think>", None)
        if fd_config.model_config.think_end_id is not None:
            logger.info(f"Got think_end_id {fd_config.model_config.think_end_id} from vocab.")
        else:
            logger.info("No </think> token found in vocabulary; the model cannot do reasoning.")


class PaddleDisWorkerProc:
    """
    Paddle Distributed wrapper for fastdeploy.worker.Worker,
@@ -710,7 +732,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
        cache_config=cache_config,
    )
    update_fd_config_for_mm(fd_config)
    update_think_end_id_for_ernie(fd_config)
    return fd_config
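
After this change, initialize_fd_config leaves model_config.think_end_id set to the vocabulary id of </think> for ERNIE architectures, or None otherwise. A hedged usage sketch (the args value is a placeholder for the worker's parsed arguments):

    # Hedged sketch: check whether the loaded model can emit </think>.
    fd_config = initialize_fd_config(args, ranks=1, local_rank=0)

    if fd_config.model_config.think_end_id is not None:
        print(f"Reasoning enabled: </think> id = {fd_config.model_config.think_end_id}")
    else:
        print("No </think> token in vocab; thinking-length limiting is unavailable.")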