Sync v2.0 version of code to github repo

Author: Jiang-Jia-Jun
Date:   2025-06-29 23:29:37 +00:00
Commit: 92c2cfa2e7
Parent: d151496038

597 changed files with 78776 additions and 22905 deletions


@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
 """
 ErnieVLTokenizer
 """
@@ -25,12 +24,11 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 import paddle
 import sentencepiece as spm
-from paddlenlp.transformers import PretrainedTokenizer
-from paddlenlp.transformers.tokenizer_utils_base import (
-    PaddingStrategy,
-    TextInput,
-)
-from paddlenlp.utils.log import logger
+from paddleformers.transformers import PretrainedTokenizer
+from paddleformers.transformers.tokenizer_utils_base import (PaddingStrategy,
+                                                              TextInput)
+
+from fastdeploy.utils import console_logger as logger


 class ErnieVLTokenizer(PretrainedTokenizer):
@@ -43,7 +41,9 @@ class ErnieVLTokenizer(PretrainedTokenizer):
     pretrained_init_configuration = {
         "ernie-bot-10b": {},
     }
-    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
+    model_input_names = [
+        "input_ids", "position_ids", "attention_mask", "labels"
+    ]
     padding_side = "right"

     def __init__(
@@ -114,7 +114,10 @@ class ErnieVLTokenizer(PretrainedTokenizer):
     def get_vocab(self):
         """doc"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab = {
+            self.convert_ids_to_tokens(i): i
+            for i in range(self.vocab_size)
+        }
         vocab.update(self.added_tokens_encoder)
         return vocab
@@ -157,7 +160,9 @@ class ErnieVLTokenizer(PretrainedTokenizer):
         # logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
         return super().prepare_for_model(*args, **kwargs)

-    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+    def save_vocabulary(self,
+                        save_directory,
+                        filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
         Save the vocabulary and special tokens file to a directory.
         Args:
@@ -167,19 +172,22 @@ class ErnieVLTokenizer(PretrainedTokenizer):
             `Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            logger.error(
+                f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory,
-            (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
+            (filename_prefix + "-" if filename_prefix else "") +
+            self.resource_files_names["vocab_file"],
         )
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+                out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
         elif not os.path.isfile(self.vocab_file):
             with open(out_vocab_file, "wb") as fi:
                 content_spiece_model = self.sp_model.serialized_model_proto()
                 fi.write(content_spiece_model)
-        return (out_vocab_file,)
+        return (out_vocab_file, )

     def tokenize(self, text: TextInput, **kwargs) -> List[str]:
         """
@@ -203,10 +211,13 @@ class ErnieVLTokenizer(PretrainedTokenizer):
         if hasattr(self, "do_lower_case") and self.do_lower_case:
             # convert non-special tokens to lowercase
             escaped_special_toks = [
-                re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
+                re.escape(s_tok) for s_tok in (self.unique_no_split_tokens +
+                                               self.all_special_tokens)
             ]
             pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
-            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
+            text = re.sub(pattern,
+                          lambda m: m.groups()[0] or m.groups()[1].lower(),
+                          text)

         no_split_token = set(self.unique_no_split_tokens)
         tokens = self.tokens_trie.split(text)
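
A standalone sketch (toy special tokens, not taken from the commit) of the lowercasing trick used in tokenize() above: group 1 of the pattern matches escaped special tokens and keeps them verbatim, while the non-greedy group 2 lowercases every other character.

import re

special_tokens = ["<|IMAGE|>", "[CLS]"]  # assumed token strings
escaped_special_toks = [re.escape(s_tok) for s_tok in special_tokens]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"

text = "Hello <|IMAGE|> World"
lowered = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
print(lowered)  # hello <|IMAGE|> world
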
@@ -248,19 +259,27 @@ class ErnieVLTokenizer(PretrainedTokenizer):
         required_input = encoded_inputs[self.model_input_names[0]]
         if padding_strategy == PaddingStrategy.LONGEST:
             max_length = len(required_input)
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-        if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None:
+        if max_length is not None and pad_to_multiple_of is not None and (
+                max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) +
+                          1) * pad_to_multiple_of
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
+            required_input) != max_length
+        if "attention_mask" in encoded_inputs and encoded_inputs[
+                "attention_mask"] is not None:
             attention_mask = encoded_inputs.pop("attention_mask")
             if isinstance(attention_mask, paddle.Tensor):
                 attention_mask = attention_mask.numpy()
             elif isinstance(attention_mask, list):
                 attention_mask = np.array(attention_mask)
             elif not isinstance(attention_mask, np.ndarray):
-                raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ")
+                raise ValueError(
+                    f"Unexpected type {type(attention_mask)} of attention_mask, "
+                )
         else:
-            attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64))
+            attention_mask = np.tril(
+                np.ones((len(required_input), len(required_input)),
+                        dtype=np.int64))
             attention_mask = np.expand_dims(attention_mask, axis=0)
         if needs_to_be_padded:
             difference = max_length - len(required_input)
@@ -275,7 +294,8 @@ class ErnieVLTokenizer(PretrainedTokenizer):
                 else:
                     pad_width = [(0, 0), (difference, 0), (difference, 0)]
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" +
+                                 str(self.padding_side))
             attention_mask = np.pad(
                 attention_mask,
                 pad_width=pad_width,
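
A self-contained numpy sketch (toy lengths, not part of the commit) of what the padding path above produces when no attention_mask is supplied: a lower-triangular causal mask with a batch axis, then zero-padded on the left so padded positions attend to nothing.

import numpy as np

seq_len, max_length = 3, 5  # assumed toy sizes
difference = max_length - seq_len

# default mask when none is provided: causal (lower-triangular) plus a batch axis
attention_mask = np.tril(np.ones((seq_len, seq_len), dtype=np.int64))
attention_mask = np.expand_dims(attention_mask, axis=0)  # shape (1, 3, 3)

# "left" padding branch: pad rows and columns in front with zeros
pad_width = [(0, 0), (difference, 0), (difference, 0)]
attention_mask = np.pad(attention_mask, pad_width=pad_width,
                        mode="constant", constant_values=0)
print(attention_mask.shape)  # (1, 5, 5)
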
@@ -342,7 +362,8 @@ def add_special_tokens(
     # check
     first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]
-    assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
+    assert first_special_tokens[
+        0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
     assert (
         len(tokenizer.get_vocab()) < special_token_ids_end
     ), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} too many special tokens added!"