[Feature] Support limit thinking len for text models (#3527)

* support limit thinking len

* remove default think_end_id

* remove reasoning_max_tokens

* update think_end_id for ernie

---------

Co-authored-by: K11OntheBoat <ruianmaidanglao@163.com>
Co-authored-by: luukunn <981429396@qq.com>
Author: K11OntheBoat
Date: 2025-08-22 14:48:15 +08:00
Committed by: GitHub
parent 4d6fb96cd6
commit 93d999b830
6 changed files with 64 additions and 26 deletions
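
The change below resolves think_end_id once, at config-initialization time; a serving loop can then use that id to cap how long a model "thinks" before being forced to close its reasoning block. As a rough illustration (not FastDeploy's actual sampling code), a budget check like the following could inject the end-of-thinking token once the limit is hit; the helper name and the max_thinking_tokens parameter are hypothetical:

    # Minimal sketch of a thinking-length cap, assuming think_end_id has been
    # resolved as in the diff below. Helper and parameter names are hypothetical.
    from typing import Optional

    def maybe_force_think_end(
        generated_ids: list[int],
        think_end_id: Optional[int],
        max_thinking_tokens: int,
    ) -> Optional[int]:
        """Return think_end_id if the thinking budget is spent, else None."""
        if think_end_id is None:
            # Vocabulary has no </think> token; nothing to enforce.
            return None
        if think_end_id in generated_ids:
            # The model already closed its reasoning block on its own.
            return None
        if len(generated_ids) >= max_thinking_tokens:
            # Budget exhausted: force </think> as the next token.
            return think_end_id
        return None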


@@ -123,6 +123,28 @@ def update_fd_config_for_mm(fd_config: FDConfig) -> None:
    fd_config.model_config.sequence_parallel = fd_config.parallel_config.sequence_parallel


def update_think_end_id_for_ernie(fd_config: FDConfig) -> None:
    """
    Updates the think_end_id in the model config. Uses the ID of '</think>'
    if it exists, otherwise defaults to None.
    """
    is_ernie = ErnieArchitectures.contains_ernie_arch(fd_config.model_config.architectures)
    if is_ernie:
        tokenizer = ErnieBotTokenizer.from_pretrained(
            fd_config.model_config.model,
            model_max_length=fd_config.parallel_config.max_model_len,
            padding_side="right",
            use_fast=False,
        )
        vocab = tokenizer.get_vocab()
        fd_config.model_config.think_end_id = vocab.get("</think>", None)
        if fd_config.model_config.think_end_id is not None:
            logger.info(f"Got think_end_id {fd_config.model_config.think_end_id} from vocab.")
        else:
            logger.info("No </think> token found in vocabulary; the model cannot do reasoning.")


class PaddleDisWorkerProc:
    """
    Paddle Distributed wrapper for fastdeploy.worker.Worker,
@@ -710,7 +732,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
        cache_config=cache_config,
    )
    update_fd_config_for_mm(fd_config)
    update_think_end_id_for_ernie(fd_config)
    return fd_config
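
After this change, initialize_fd_config leaves model_config.think_end_id set to the vocabulary id of </think> for ERNIE architectures, or None otherwise. A hedged usage sketch (the args value is a placeholder for the worker's parsed arguments):

    # Hedged sketch: check whether the loaded model can emit </think>.
    fd_config = initialize_fd_config(args, ranks=1, local_rank=0)

    if fd_config.model_config.think_end_id is not None:
        print(f"Reasoning enabled: </think> id = {fd_config.model_config.think_end_id}")
    else:
        print("No </think> token in vocab; thinking-length limiting is unavailable.")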