spec token map lazy. (#2715)

Ting, 2025-07-05 00:14:54 +08:00, committed by GitHub
parent b37585e693 | commit 90ef28d982


@@ -83,7 +83,6 @@ class ErnieBotTokenizer(PretrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(vocab_file)
         # pre-process map-type all spec token for decode accelerate.
-        self.all_spec_tok = set(self.all_special_tokens)
 
     @property
     def space_token(self):
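
One plausible reason for dropping the eager snapshot: on HF-style tokenizers, `all_special_tokens` is a computed property, so a set captured partway through `__init__` freezes whatever happens to be registered at that moment. A minimal sketch of that hazard, with purely illustrative class and token names:

class EagerTokenizer:
    def __init__(self):
        self.special = ["<s>"]
        # Eager capture: snapshot taken before all tokens are registered.
        self.all_spec_tok = set(self.special)

    def add_special_token(self, tok):
        self.special.append(tok)


t = EagerTokenizer()
t.add_special_token("</s>")
print("</s>" in t.all_spec_tok)  # False: the eager snapshot is stale

Deferring the snapshot to first use (next hunk) at least guarantees that construction has finished before the set is built.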
@@ -138,8 +137,13 @@ class ErnieBotTokenizer(PretrainedTokenizer):
         """doc"""
         return self.sp_model.id_to_piece(id)
 
+    def spec_init(self):
+        if not hasattr(self, "all_spec_tok"):
+            self.all_spec_tok = set(self.all_special_tokens)
+
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
+        self.spec_init()
         current_sub_tokens = []
         out_string = ""
         # prev_is_special = False
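
The added `spec_init` is a standard hasattr-guarded lazy initializer: the set is built once, on first demand, instead of in `__init__`. A self-contained sketch of the same pattern outside the repo (ToyTokenizer and its token list are invented for illustration):

class ToyTokenizer:
    """Illustrative stand-in for the lazily built special-token set."""

    def __init__(self):
        # Nothing token-set related is built here anymore.
        self.all_special_tokens = ["<s>", "</s>", "<pad>"]

    def spec_init(self):
        # hasattr guard: build the set once; later calls are a cheap check.
        if not hasattr(self, "all_spec_tok"):
            self.all_spec_tok = set(self.all_special_tokens)

    def convert_tokens_to_string(self, tokens):
        self.spec_init()
        # O(1) membership tests against the set while decoding.
        return "".join(t for t in tokens if t not in self.all_spec_tok)


tok = ToyTokenizer()
assert not hasattr(tok, "all_spec_tok")  # not built at construction
print(tok.convert_tokens_to_string(["<s>", "a", "b", "</s>"]))  # ab
assert hasattr(tok, "all_spec_tok")      # built on first decode

Because the guard makes repeated calls effectively free, every entry point that needs the set can call `spec_init()` unconditionally, as the hunks here do.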
@@ -212,6 +216,7 @@ class ErnieBotTokenizer(PretrainedTokenizer):
         #     if isinstance(t, AddedToken)
         # )
+        self.spec_init()
         text, kwargs = self.prepare_for_tokenization(text, **kwargs)
         # TODO: should this be in the base class?
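
Since both the decode and tokenize paths must now remember the explicit `self.spec_init()` call, an alternative shape worth noting is `functools.cached_property`, which centralizes the same one-shot laziness. This is a sketch of that alternative, not what the commit does:

from functools import cached_property


class CachedTokenizer:
    """Alternative shape: centralize the laziness in a property."""

    def __init__(self):
        self.all_special_tokens = ["<s>", "</s>"]

    @cached_property
    def all_spec_tok(self):
        # Computed on first attribute access and stored in the instance
        # __dict__, so no call site needs an explicit spec_init().
        return set(self.all_special_tokens)


tok = CachedTokenizer()
print("<s>" in tok.all_spec_tok)  # True; the set is built on this access

Both forms snapshot the token list once, so special tokens registered after first use are invisible either way; the difference is ergonomic, since the property removes the need for call-site discipline.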