Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-07 01:22:59 +08:00
Sync v2.0 version of code to github repo
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-
 """
 ErnieVLTokenizer
 """
@@ -25,12 +24,11 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 import paddle
 import sentencepiece as spm
-from paddlenlp.transformers import PretrainedTokenizer
-from paddlenlp.transformers.tokenizer_utils_base import (
-    PaddingStrategy,
-    TextInput,
-)
-from paddlenlp.utils.log import logger
+from paddleformers.transformers import PretrainedTokenizer
+from paddleformers.transformers.tokenizer_utils_base import (PaddingStrategy,
+                                                             TextInput)
+
+from fastdeploy.utils import console_logger as logger
 
 
 class ErnieVLTokenizer(PretrainedTokenizer):
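The hunk above swaps the paddlenlp imports for their paddleformers counterparts and routes logging through FastDeploy's console logger. A minimal usage sketch of the migrated tokenizer follows; the module path and checkpoint directory are assumptions for illustration and do not come from this diff.

# Minimal sketch, assuming a hypothetical module path and a local checkpoint
# directory that contains the SentencePiece vocab file.
from fastdeploy.input.ernie_tokenizer import ErnieVLTokenizer  # hypothetical path

tokenizer = ErnieVLTokenizer.from_pretrained("./ernie_vl_checkpoint")  # assumed directory
encoded = tokenizer("a short prompt")  # dict-like output with "input_ids"
print(encoded["input_ids"])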
@@ -43,7 +41,9 @@ class ErnieVLTokenizer(PretrainedTokenizer):
     pretrained_init_configuration = {
         "ernie-bot-10b": {},
     }
-    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
+    model_input_names = [
+        "input_ids", "position_ids", "attention_mask", "labels"
+    ]
     padding_side = "right"
 
     def __init__(
@@ -114,7 +114,10 @@ class ErnieVLTokenizer(PretrainedTokenizer):
 
     def get_vocab(self):
         """doc"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab = {
+            self.convert_ids_to_tokens(i): i
+            for i in range(self.vocab_size)
+        }
         vocab.update(self.added_tokens_encoder)
         return vocab
 
@@ -157,7 +160,9 @@ class ErnieVLTokenizer(PretrainedTokenizer):
         # logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
         return super().prepare_for_model(*args, **kwargs)
 
-    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+    def save_vocabulary(self,
+                        save_directory,
+                        filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
         Save the vocabulary and special tokens file to a directory.
         Args:
@@ -167,19 +172,22 @@ class ErnieVLTokenizer(PretrainedTokenizer):
             `Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            logger.error(
+                f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory,
-            (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
+            (filename_prefix + "-" if filename_prefix else "") +
+            self.resource_files_names["vocab_file"],
         )
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+                out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
         elif not os.path.isfile(self.vocab_file):
             with open(out_vocab_file, "wb") as fi:
                 content_spiece_model = self.sp_model.serialized_model_proto()
                 fi.write(content_spiece_model)
-        return (out_vocab_file,)
+        return (out_vocab_file, )
 
     def tokenize(self, text: TextInput, **kwargs) -> List[str]:
         """
@@ -203,10 +211,13 @@ class ErnieVLTokenizer(PretrainedTokenizer):
         if hasattr(self, "do_lower_case") and self.do_lower_case:
             # convert non-special tokens to lowercase
             escaped_special_toks = [
-                re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
+                re.escape(s_tok) for s_tok in (self.unique_no_split_tokens +
+                                               self.all_special_tokens)
             ]
             pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
-            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
+            text = re.sub(pattern,
+                          lambda m: m.groups()[0] or m.groups()[1].lower(),
+                          text)
 
         no_split_token = set(self.unique_no_split_tokens)
         tokens = self.tokens_trie.split(text)
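As a standalone illustration of the lowercasing trick re-wrapped above: the first alternation group captures special tokens and passes them through unchanged, while every other character falls into the lazy second group and is lowercased. The token strings below are made-up examples, not tokens defined by this tokenizer.

import re

special_tokens = ["<|IMAGE|>", "<mask:0>"]  # hypothetical special tokens
escaped_special_toks = [re.escape(s_tok) for s_tok in special_tokens]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"

text = "Hello <|IMAGE|> World"
lowered = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
print(lowered)  # hello <|IMAGE|> world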
@@ -248,19 +259,27 @@ class ErnieVLTokenizer(PretrainedTokenizer):
         required_input = encoded_inputs[self.model_input_names[0]]
         if padding_strategy == PaddingStrategy.LONGEST:
             max_length = len(required_input)
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-        if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None:
+        if max_length is not None and pad_to_multiple_of is not None and (
+                max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) +
+                          1) * pad_to_multiple_of
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
+            required_input) != max_length
+        if "attention_mask" in encoded_inputs and encoded_inputs[
+                "attention_mask"] is not None:
             attention_mask = encoded_inputs.pop("attention_mask")
             if isinstance(attention_mask, paddle.Tensor):
                 attention_mask = attention_mask.numpy()
             elif isinstance(attention_mask, list):
                 attention_mask = np.array(attention_mask)
             elif not isinstance(attention_mask, np.ndarray):
-                raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ")
+                raise ValueError(
+                    f"Unexpected type {type(attention_mask)} of attention_mask, "
+                )
         else:
-            attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64))
+            attention_mask = np.tril(
+                np.ones((len(required_input), len(required_input)),
+                        dtype=np.int64))
             attention_mask = np.expand_dims(attention_mask, axis=0)
         if needs_to_be_padded:
             difference = max_length - len(required_input)
@@ -275,7 +294,8 @@ class ErnieVLTokenizer(PretrainedTokenizer):
             else:
                 pad_width = [(0, 0), (difference, 0), (difference, 0)]
         else:
-            raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+            raise ValueError("Invalid padding strategy:" +
+                             str(self.padding_side))
         attention_mask = np.pad(
             attention_mask,
             pad_width=pad_width,
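The two hunks above only re-wrap existing logic, but the mask handling is easier to follow end to end: when no attention mask is supplied, _pad builds a causal (lower-triangular) mask with a leading batch axis and, if padding is needed, grows it with np.pad. The sketch below replays that with made-up sizes; the mode and constant_values arguments are assumptions, since the np.pad call is truncated in this hunk.

import numpy as np

seq_len, max_length = 3, 5  # assumed sizes for illustration
difference = max_length - seq_len

# Causal mask for the unpadded sequence, with a leading batch axis.
attention_mask = np.tril(np.ones((seq_len, seq_len), dtype=np.int64))
attention_mask = np.expand_dims(attention_mask, axis=0)  # shape (1, 3, 3)

# Pad rows and columns at the front, as in the pad_width branch shown above.
pad_width = [(0, 0), (difference, 0), (difference, 0)]
attention_mask = np.pad(attention_mask, pad_width=pad_width,
                        mode="constant", constant_values=0)  # assumed kwargs
print(attention_mask.shape)  # (1, 5, 5)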
@@ -342,7 +362,8 @@ def add_special_tokens(
     # check
     first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]
 
-    assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
+    assert first_special_tokens[
+        0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
     assert (
         len(tokenizer.get_vocab()) < special_token_ids_end
     ), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} added too many special tokens!"