Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-25 17:40:35 +08:00)
			
		
		
		
	Sync the v2.0 version of the code to the GitHub repo
This commit is contained in:
		| @@ -13,7 +13,6 @@ | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| """ | ||||
|  | ||||
| """ | ||||
| ErnieVLTokenizer | ||||
| """ | ||||
| @@ -25,12 +24,11 @@ from typing import Dict, List, Optional, Tuple | ||||
| import numpy as np | ||||
| import paddle | ||||
| import sentencepiece as spm | ||||
| from paddlenlp.transformers import PretrainedTokenizer | ||||
| from paddlenlp.transformers.tokenizer_utils_base import ( | ||||
|     PaddingStrategy, | ||||
|     TextInput, | ||||
| ) | ||||
| from paddlenlp.utils.log import logger | ||||
| from paddleformers.transformers import PretrainedTokenizer | ||||
| from paddleformers.transformers.tokenizer_utils_base import (PaddingStrategy, | ||||
|                                                              TextInput) | ||||
|  | ||||
| from fastdeploy.utils import console_logger as logger | ||||
|  | ||||
|  | ||||
| class ErnieVLTokenizer(PretrainedTokenizer): | ||||
| @@ -43,7 +41,9 @@ class ErnieVLTokenizer(PretrainedTokenizer): | ||||
|     pretrained_init_configuration = { | ||||
|         "ernie-bot-10b": {}, | ||||
|     } | ||||
|     model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"] | ||||
|     model_input_names = [ | ||||
|         "input_ids", "position_ids", "attention_mask", "labels" | ||||
|     ] | ||||
|     padding_side = "right" | ||||
|  | ||||
|     def __init__( | ||||
| @@ -114,7 +114,10 @@ class ErnieVLTokenizer(PretrainedTokenizer): | ||||
|  | ||||
|     def get_vocab(self): | ||||
|         """doc""" | ||||
|         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} | ||||
|         vocab = { | ||||
|             self.convert_ids_to_tokens(i): i | ||||
|             for i in range(self.vocab_size) | ||||
|         } | ||||
|         vocab.update(self.added_tokens_encoder) | ||||
|         return vocab | ||||
|  | ||||
| @@ -157,7 +160,9 @@ class ErnieVLTokenizer(PretrainedTokenizer): | ||||
|             # logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`') | ||||
|         return super().prepare_for_model(*args, **kwargs) | ||||
|  | ||||
|     def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: | ||||
|     def save_vocabulary(self, | ||||
|                         save_directory, | ||||
|                         filename_prefix: Optional[str] = None) -> Tuple[str]: | ||||
|         """ | ||||
|         Save the vocabulary and special tokens file to a directory. | ||||
|         Args: | ||||
| @@ -167,19 +172,22 @@ class ErnieVLTokenizer(PretrainedTokenizer): | ||||
|             `Tuple(str)`: Paths to the files saved. | ||||
|         """ | ||||
|         if not os.path.isdir(save_directory): | ||||
|             logger.error(f"Vocabulary path ({save_directory}) should be a directory") | ||||
|             logger.error( | ||||
|                 f"Vocabulary path ({save_directory}) should be a directory") | ||||
|             return | ||||
|         out_vocab_file = os.path.join( | ||||
|             save_directory, | ||||
|             (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"], | ||||
|             (filename_prefix + "-" if filename_prefix else "") + | ||||
|             self.resource_files_names["vocab_file"], | ||||
|         ) | ||||
|         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): | ||||
|         if os.path.abspath(self.vocab_file) != os.path.abspath( | ||||
|                 out_vocab_file) and os.path.isfile(self.vocab_file): | ||||
|             copyfile(self.vocab_file, out_vocab_file) | ||||
|         elif not os.path.isfile(self.vocab_file): | ||||
|             with open(out_vocab_file, "wb") as fi: | ||||
|                 content_spiece_model = self.sp_model.serialized_model_proto() | ||||
|                 fi.write(content_spiece_model) | ||||
|         return (out_vocab_file,) | ||||
|         return (out_vocab_file, ) | ||||
|  | ||||
|     def tokenize(self, text: TextInput, **kwargs) -> List[str]: | ||||
|         """ | ||||
| @@ -203,10 +211,13 @@ class ErnieVLTokenizer(PretrainedTokenizer): | ||||
|         if hasattr(self, "do_lower_case") and self.do_lower_case: | ||||
|             # convert non-special tokens to lowercase | ||||
|             escaped_special_toks = [ | ||||
|                 re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) | ||||
|                 re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + | ||||
|                                                self.all_special_tokens) | ||||
|             ] | ||||
|             pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" | ||||
|             text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) | ||||
|             text = re.sub(pattern, | ||||
|                           lambda m: m.groups()[0] or m.groups()[1].lower(), | ||||
|                           text) | ||||
|  | ||||
|         no_split_token = set(self.unique_no_split_tokens) | ||||
|         tokens = self.tokens_trie.split(text) | ||||
| @@ -248,19 +259,27 @@ class ErnieVLTokenizer(PretrainedTokenizer): | ||||
|             required_input = encoded_inputs[self.model_input_names[0]] | ||||
|             if padding_strategy == PaddingStrategy.LONGEST: | ||||
|                 max_length = len(required_input) | ||||
|             if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): | ||||
|                 max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of | ||||
|             needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length | ||||
|             if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None: | ||||
|             if max_length is not None and pad_to_multiple_of is not None and ( | ||||
|                     max_length % pad_to_multiple_of != 0): | ||||
|                 max_length = ((max_length // pad_to_multiple_of) + | ||||
|                               1) * pad_to_multiple_of | ||||
|             needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( | ||||
|                 required_input) != max_length | ||||
|             if "attention_mask" in encoded_inputs and encoded_inputs[ | ||||
|                     "attention_mask"] is not None: | ||||
|                 attention_mask = encoded_inputs.pop("attention_mask") | ||||
|                 if isinstance(attention_mask, paddle.Tensor): | ||||
|                     attention_mask = attention_mask.numpy() | ||||
|                 elif isinstance(attention_mask, list): | ||||
|                     attention_mask = np.array(attention_mask) | ||||
|                 elif not isinstance(attention_mask, np.ndarray): | ||||
|                     raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ") | ||||
|                     raise ValueError( | ||||
|                         f"Unexpected type {type(attention_mask)} of attention_mask, " | ||||
|                     ) | ||||
|             else: | ||||
|                 attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64)) | ||||
|                 attention_mask = np.tril( | ||||
|                     np.ones((len(required_input), len(required_input)), | ||||
|                             dtype=np.int64)) | ||||
|                 attention_mask = np.expand_dims(attention_mask, axis=0) | ||||
|             if needs_to_be_padded: | ||||
|                 difference = max_length - len(required_input) | ||||
| @@ -275,7 +294,8 @@ class ErnieVLTokenizer(PretrainedTokenizer): | ||||
|                     else: | ||||
|                         pad_width = [(0, 0), (difference, 0), (difference, 0)] | ||||
|                 else: | ||||
|                     raise ValueError("Invalid padding strategy:" + str(self.padding_side)) | ||||
|                     raise ValueError("Invalid padding strategy:" + | ||||
|                                      str(self.padding_side)) | ||||
|                 attention_mask = np.pad( | ||||
|                     attention_mask, | ||||
|                     pad_width=pad_width, | ||||
| @@ -342,7 +362,8 @@ def add_special_tokens( | ||||
|     # check | ||||
|     first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"] | ||||
|  | ||||
|     assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}" | ||||
|     assert first_special_tokens[ | ||||
|         0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}" | ||||
|     assert ( | ||||
|         len(tokenizer.get_vocab()) < special_token_ids_end | ||||
|     ), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} 增加过多special token了!" | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Jiang-Jia-Jun
					Jiang-Jia-Jun