Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00
[LLM] First commit the llm deployment code
This commit is contained in:
15
fastdeploy/input/__init__.py
Normal file
@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
269
fastdeploy/input/ernie_tokenizer.py
Normal file
@@ -0,0 +1,269 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm
from paddlenlp.transformers import AddedToken, PretrainedTokenizer
from paddlenlp.utils import logger

__all__ = ["ErnieBotTokenizer"]

VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}


class ErnieBotTokenizer(PretrainedTokenizer):
    """
    Construct an ErnieBot tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    resource_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        bos_token = AddedToken(bos_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   bos_token, str) else bos_token
        eos_token = AddedToken(eos_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   eos_token, str) else eos_token
        unk_token = AddedToken(unk_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   unk_token, str) else unk_token
        pad_token = AddedToken(pad_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   pad_token, str) else pad_token
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            verbose=False,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        # for eb35 reader
        self.bos_id = self.bos_token_id
        self.eos_id = self.eos_token_id
        self.sep_id = self.sep_token_id
        self.pad_id = self.pad_token_id
        self.unk_id = self.unk_token_id

    def __getstate__(self):
        # SentencePieceProcessor objects cannot be pickled; drop and reload in __setstate__.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {
            self.convert_ids_to_tokens(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab

    def tokenize(self, text):
        """Returns a tokenized string."""
        return self._tokenize(text)

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def decode(self,
               tokens,
               skip_special_tokens=False,
               clean_up_tokenization_spaces=False):
        """Decodes a sequence of tokens back into a string."""
        return self.sp_model.decode(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self,
                        save_directory,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(
                f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") +
            VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(
                out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file, )

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Builds model inputs by adding the configured BOS/EOS tokens around the sequence(s)."""
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None,
            already_has_special_tokens: bool = False) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True)

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
                bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)

    def create_token_type_ids_from_sequences(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Creates a mask from the two sequences passed, to be used in a sequence-pair classification task. The
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence | second sequence |
        ```

        If token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
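For orientation, a minimal usage sketch of the tokenizer above. The directory path is hypothetical; `from_pretrained` resolution follows PaddleNLP's `PretrainedTokenizer`, and the directory is assumed to hold the `spm.model` file named in `VOCAB_FILES_NAMES` plus the usual tokenizer config:

```python
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer

# Hypothetical local directory containing the spm.model vocabulary file.
tokenizer = ErnieBotTokenizer.from_pretrained("./ernie_tokenizer_dir")

pieces = tokenizer.tokenize("Hello world")             # SentencePiece pieces
ids = tokenizer.convert_tokens_to_ids(pieces)
ids = tokenizer.build_inputs_with_special_tokens(ids)  # prepends BOS (add_bos_token=True by default)
print(tokenizer.convert_tokens_to_string(pieces))      # round-trips back to the original text
```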
23
fastdeploy/input/mm_processor/__init__.py
Normal file
@@ -0,0 +1,23 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from .process import DataProcessor, fancy_print, IDS_TYPE_FLAG

__all__ = [
    'DataProcessor',
    'fancy_print',
    'IDS_TYPE_FLAG',
]
20
fastdeploy/input/mm_processor/image_preprocessor/__init__.py
Normal file
@@ -0,0 +1,20 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from .get_image_preprocessor import get_image_preprocessor
from .image_preprocessor_adaptive import AdaptiveImageProcessor

__all__ = ['get_image_preprocessor', 'AdaptiveImageProcessor']
@@ -0,0 +1,33 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

"""get image preprocessor"""

from .image_preprocessor_adaptive import AdaptiveImageProcessor
from fastdeploy.utils import data_processor_logger


def get_image_preprocessor(args):
    """
    get_image_preprocessor from args
    """

    if args.vision_model_name_or_path is None:
        return None

    data_processor_logger.info("use AdaptiveImageProcessor")
    image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
    return image_preprocess
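A small illustration of the factory above. `args` only needs a `vision_model_name_or_path` attribute, so a `SimpleNamespace` works as a stand-in (the model path in the comment is hypothetical):

```python
from types import SimpleNamespace

# No vision model configured: the factory returns None.
args = SimpleNamespace(vision_model_name_or_path=None)
assert get_image_preprocessor(args) is None

# With a model path (hypothetical), an AdaptiveImageProcessor would be loaded:
# args = SimpleNamespace(vision_model_name_or_path="./vision_model_dir")
# preprocessor = get_image_preprocessor(args)
```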
@@ -0,0 +1,568 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

"""image preprocessor adaptive"""

import math
from typing import List, Optional, Union

import numpy as np
import paddle
import PIL
from paddlenlp.transformers.feature_extraction_utils import BatchFeature
from paddlenlp.transformers.image_processing_utils import BaseImageProcessor
from paddlenlp.transformers.image_transforms import (
    convert_to_rgb,
    normalize,
    rescale,
    resize,
    to_channel_dimension_format,
)
from paddlenlp.transformers.image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_valid_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)
from paddlenlp.transformers.tokenizer_utils_base import (
    TensorType,
)
from PIL import Image

from fastdeploy.utils import data_processor_logger

OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200

VIDEO_MIN_PIXELS = 128 * 28 * 28
VIDEO_MAX_PIXELS = 768 * 28 * 28
VIDEO_TOTAL_PIXELS = 24576 * 28 * 28
FRAME_FACTOR = 2
FPS = 2.0
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768


VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]


__all__ = [
    "AdaptiveImageProcessor",
]


def is_scaled_image(image: np.ndarray) -> bool:
    """
    Checks to see whether the pixel values have already been rescaled to [0, 1].
    """
    if image.dtype == np.uint8:
        return False

    # It's possible the image has pixel values in [0, 255] but is of floating type
    return np.min(image) >= 0 and np.max(image) <= 1


def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A list of images.
    """
    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        return [img for img_list in images for img in img_list]

    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
        return images

    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")


# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
def make_batched_videos(videos) -> List[VideoInput]:
    """Accepts videos in list or nested list format, and makes a list of videos for preprocessing."""
    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
        return videos

    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
        if isinstance(videos[0], Image.Image):
            return [videos]
        elif len(videos[0].shape) == 4:
            return [list(video) for video in videos]

    elif is_valid_image(videos) and len(videos.shape) == 4:
        return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")


class AdaptiveImageProcessor(BaseImageProcessor):
    r"""
    Constructs an adaptive image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
            in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The minimum number of pixels to resize the image to.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The maximum number of pixels to resize the image to.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_conv_size (`int`, *optional*, defaults to 2):
            The temporal conv size in the resampler.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size from the vision encoder to the LLM encoder.
    """

    model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_conv_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """init"""
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_conv_size = temporal_conv_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
        """Set the pixel bounds."""
        if min_pixels is not None:
            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be a non-negative int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
            self.min_pixels = min_pixels
            self.size["min_pixels"] = int(min_pixels)
        if max_pixels is not None:
            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be a positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
            self.max_pixels = max_pixels
            self.size["max_pixels"] = int(max_pixels)

    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
        """Returns the resized (height, width) and the corresponding (height, width) patch grid."""
        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,
            min_pixels=actual_min_pixels,
            max_pixels=actual_max_pixels,
        )
        return (resized_height, resized_width), (resized_height // self.patch_size, resized_width // self.patch_size)

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = True,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
                If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []

        if predetermined_grid_thw is not None:
            assert len(predetermined_grid_thw) == len(
                images
            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"

        for img_idx, image in enumerate(images):
            if do_resize:
                if predetermined_grid_thw is not None:
                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                    resized_height *= self.patch_size
                    resized_width *= self.patch_size
                else:
                    resized_height, resized_width = smart_resize(
                        height,
                        width,
                        factor=self.patch_size * self.merge_size,
                        min_pixels=self.min_pixels,
                        max_pixels=self.max_pixels,
                    )
                image = image.astype("uint8")  # TODO: must be cast manually, otherwise the image is divided by 255 twice and the result is wrong
                # Call Image.fromarray directly instead of relying on paddlenlp's conversion
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale:
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = normalize(image=image, mean=image_mean, std=image_std, data_format=input_data_format)

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]

            processed_images.append(image)
        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])

        channel = patches.shape[1]  # [time, C, H, W]
        grid_t = patches.shape[0]
        grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
        patches = patches.reshape(
            [
                grid_t,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])

        flatten_patches = patches.reshape(
            [grid_t * grid_h * grid_w, channel * self.patch_size * self.patch_size]
        )  # [grid_t * grid_h * grid_w, C * psz * psz]

        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = True,
        size: Optional[Union[int, List[int]]] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"],
                with the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PADDLE` or `'pd'`: Return a batch of type `paddle.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or paddle.Tensor.")

        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for img_idx, image in enumerate(images):
                if predetermined_grid_thw is not None:
                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
                else:
                    predetermined_grid_thw_one = None
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw_one,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}

        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)

            # NOTE: if both images and videos are passed, this rebinding of `data`
            # discards the image entries built above.
            data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}

        return BatchFeature(data=data, tensor_type=return_tensors)
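A minimal sketch of calling the processor directly. It is constructed with defaults here (a real deployment would load it via `from_pretrained`), and the expected shapes assume the default 14-pixel patches, merge size 2, and that PaddleNLP's image transforms behave like their Hugging Face counterparts:

```python
import numpy as np
from PIL import Image

processor = AdaptiveImageProcessor()  # defaults; real use: AdaptiveImageProcessor.from_pretrained(...)

img = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))
batch = processor.preprocess(images=[img], return_tensors="np")

# 480x640 is smart-resized to 476x644, i.e. a 34x46 grid of 14-pixel patches:
print(batch["pixel_values"].shape)  # expected (34 * 46, 3 * 14 * 14) == (1564, 588)
print(batch["image_grid_thw"])      # expected [[1, 34, 46]]
```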
def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    return round(number / factor) * factor


def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    return math.ceil(number / factor) * factor


def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    return math.floor(number / factor) * factor


def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
):
    """
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.
    """
    if max(height, width) / min(height, width) > MAX_RATIO:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * MAX_RATIO, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * MAX_RATIO, factor)

        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
            resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )

        height = new_height
        width = new_width

    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)

    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")

    return h_bar, w_bar
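To make the resizing arithmetic concrete, a worked example with the module defaults (factor=28, MIN_PIXELS=4*28*28=3,136, MAX_PIXELS=16384*28*28=12,845,056):

```python
# smart_resize(480, 640):
#   aspect ratio 640/480 ≈ 1.33 < MAX_RATIO, so no ratio clamping
#   h_bar = round(480 / 28) * 28 = 17 * 28 = 476
#   w_bar = round(640 / 28) * 28 = 23 * 28 = 644
#   476 * 644 = 306,544 pixels, inside [3,136, 12,845,056] -> returned as-is
assert smart_resize(480, 640) == (476, 644)
```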
388
fastdeploy/input/mm_processor/process.py
Normal file
@@ -0,0 +1,388 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

""" process.py """
import copy
import io
from collections import defaultdict
from typing import Any, Dict, List, Union

import numpy as np
from paddlenlp.transformers.image_utils import ChannelDimension
from PIL import Image

# NOTE: ErnieVLTokenizer is used in DataProcessor.__init__ below but is not
# imported in this hunk; the import presumably lives elsewhere in the commit.

from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
from .process_video import read_frames_decord, read_video_decord
from .utils.io_utils import RAW_IMAGE_DIR, get_downloadable
from .utils.render_timestamp import render_frame_timestamp

IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}


def fancy_print(input_ids, tokenizer, image_patch_id=None):
    """
    Decode input_ids into readable text, collapsing each run of image patch
    tokens into a single `<|IMAGE@N|>` marker.

    Args:
        input_ids: token ids to render
        tokenizer: the tokenizer of the model
        image_patch_id: id of the image patch placeholder token
    """
    i = 0
    res = ""
    text_ids = []
    real_image_token_len = 0
    while i < len(input_ids):
        if input_ids[i] == image_patch_id:
            if len(text_ids) > 0:
                res += tokenizer.decode(text_ids)
                text_ids = []

            real_image_token_len += 1
        else:
            if real_image_token_len != 0:
                res += f"<|IMAGE@{real_image_token_len}|>"
                real_image_token_len = 0

            text_ids.append(input_ids[i])

        i += 1
    # NOTE: a run of image patch tokens at the very end of input_ids is not flushed.
    if len(text_ids) > 0:
        res += tokenizer.decode(text_ids)
        text_ids = []
    return res
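An illustration of `fancy_print` with a minimal stub tokenizer (both the stub and the ids are hypothetical; any object with a `decode(ids) -> str` method works):

```python
class _StubTokenizer:
    def decode(self, ids):
        return "".join(chr(ord("a") + i) for i in ids)

ids = [0, 1, 7, 7, 7, 2]  # 7 plays the role of the image patch id here
print(fancy_print(ids, _StubTokenizer(), image_patch_id=7))
# -> "ab<|IMAGE@3|>c"
```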


class DataProcessor:
    """
    Processes multimodal chat messages into model-ready inputs,
    handling text, images, and videos with 3D positional embeddings.
    """

    CLS_TOKEN = "<|begin_of_sentence|>"
    SEP_TOKEN = "<|end_of_sentence|>"
    IMG_START = "<|IMAGE_START|>"
    IMG_END = "<|IMAGE_END|>"
    VID_START = "<|VIDEO_START|>"
    VID_END = "<|VIDEO_END|>"

    def __init__(
        self,
        tokenizer_name: str,
        image_preprocessor_name: str,
        spatial_conv_size: int = 2,
        temporal_conv_size: int = 2,
        image_min_pixels: int = 4 * 28 * 28,
        image_max_pixels: int = 6177 * 28 * 28,
        video_min_pixels: int = 299 * 28 * 28,
        video_max_pixels: int = 1196 * 28 * 28,
        video_target_frames: int = -1,
        video_frames_sample: str = "leading",
        video_max_frames: int = 180,
        video_min_frames: int = 16,
        video_fps: int = 2,
    ) -> None:
        # Tokenizer and image preprocessor
        self.tokenizer = ErnieVLTokenizer.from_pretrained(tokenizer_name, verbose=False)
        self.tokenizer.ignored_index = -100
        self.image_preprocessor = AdaptiveImageProcessor.from_pretrained(image_preprocessor_name)

        # Convolution sizes for patch aggregation
        self.spatial_conv_size = spatial_conv_size
        self.temporal_conv_size = temporal_conv_size

        # Pixel constraints
        self.image_min_pixels = image_min_pixels
        self.image_max_pixels = image_max_pixels
        self.video_min_pixels = video_min_pixels
        self.video_max_pixels = video_max_pixels

        # Video sampling parameters
        self.target_frames = video_target_frames
        self.frames_sample = video_frames_sample
        self.max_frames = video_max_frames
        self.min_frames = video_min_frames
        self.fps = video_fps

        # Special tokens and IDs
        self.cls_token = self.CLS_TOKEN
        self.sep_token = self.SEP_TOKEN
        self.image_start = self.IMG_START
        self.image_end = self.IMG_END
        self.video_start = self.VID_START
        self.video_end = self.VID_END
        self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")

        self.token_type_mapping = self._build_token_type_mapping()
        self.is_training = True
        self.role_prefixes = {"system": "", "user": "User: ", "bot": "Assistant: ", "assistant": "Assistant: "}

    def _build_token_type_mapping(self) -> Dict[Any, int]:
        mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
        for token in (self.IMG_START, self.IMG_END, self.VID_START, self.VID_END):
            mapping[token] = IDS_TYPE_FLAG["image"]
        mapping[self.image_patch_id] = IDS_TYPE_FLAG["image"]
        return mapping

    def train(self) -> None:
        """Enable training mode (produces labels)."""
        self.is_training = True

    def eval(self) -> None:
        """Enable evaluation mode (doesn't produce labels)."""
        self.is_training = False

    def process(self, messages: List[Dict[str, Any]]) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
        """
        Convert chat messages into model inputs.
        Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
        """
        outputs = {
            "input_ids": [],
            "token_type_ids": [],
            "position_ids": [],
            "images": [],
            "grid_thw": [],
            "image_type_ids": [],
            "labels": [],
            "cur_position": 0,
            "pic_cnt": 0,
            "video_cnt": 0,
        }
        self._add_special_token(self.cls_token, outputs)

        for msg in messages:
            role = msg.get("role")
            assert role in self.role_prefixes, f"Unsupported role: {role}"
            prefix = self.role_prefixes[role]
            if prefix:
                self._add_text(prefix, outputs)

            content_items = msg.get("content")
            if not isinstance(content_items, list):
                content_items = [content_items]

            for item in content_items:
                if isinstance(item, str) or item.get("type") == "text":
                    text = item if isinstance(item, str) else item.get("text", "")
                    self._add_text(text, outputs)
                elif item.get("type") == "image_url" or item.get("type") == "image":
                    self._add_image(item, outputs)
                elif item.get("type") == "video_url" or item.get("type") == "video":
                    self._add_video(item, outputs)

            if role in ("user", "system"):
                self._add_text("\n", outputs)
            else:
                self._add_special_token(self.sep_token, outputs)

        if not self.is_training:
            # Append assistant prefix in eval
            self._add_text(self.role_prefixes["bot"], outputs)

        return outputs
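A usage sketch of the processor (the model directories and URL are hypothetical placeholders; running this requires the tokenizer/preprocessor assets and network access for the image download):

```python
processor = DataProcessor(
    tokenizer_name="./ernie_tokenizer_dir",          # hypothetical path
    image_preprocessor_name="./vision_preprocessor_dir",  # hypothetical path
)
processor.eval()  # inference mode: appends the assistant prefix, no labels

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
        ],
    }
]
outputs = processor.process(messages)
print(len(outputs["input_ids"]), outputs["grid_thw"])
```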

    def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:
        token_id = token if isinstance(token, int) else self.tokenizer.convert_tokens_to_ids(token)
        outputs["input_ids"].append(token_id)
        outputs["token_type_ids"].append(self.token_type_mapping[token])
        pos = outputs["cur_position"]
        outputs["position_ids"].append([pos] * 3)
        outputs["cur_position"] += 1

    def _add_text(self, text: str, outputs: Dict) -> None:
        tokens = self.tokenizer.encode(text, add_special_tokens=False)["input_ids"]
        outputs["input_ids"].extend(tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))

        start = outputs["cur_position"]
        for i in range(len(tokens)):
            outputs["position_ids"].append([start + i] * 3)
        outputs["cur_position"] += len(tokens)

    def _add_image(self, item: Dict, outputs: Dict) -> None:
        url_info = item.get("image_url", {})
        w = url_info.get("image_width", None)
        h = url_info.get("image_height", None)

        if "image" in item:
            img = item["image"]
        else:
            url = url_info.get("url")
            data = get_downloadable(url, download_dir=RAW_IMAGE_DIR, save_to_disk=False)
            img = Image.open(io.BytesIO(data) if isinstance(data, bytes) else data)

        if w and h:
            img = img.resize((w, h))

        outputs["pic_cnt"] += 1
        self._add_text(f"Picture {outputs['pic_cnt']}:", outputs)
        self._add_special_token(self.IMG_START, outputs)

        patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
            img.height,
            img.width,
            min_pixels=self.image_min_pixels,
            max_pixels=self.image_max_pixels,
        )[1]
        num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)

        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)

        pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1

        # Preprocess pixels
        ret = self.image_preprocessor.preprocess(
            images=[img.convert("RGB")],
            do_normalize=False,
            do_rescale=False,
            predetermined_grid_thw=np.array([[patches_h, patches_w]]),
            do_convert_rgb=True,
            input_data_format=ChannelDimension.LAST,
        )
        outputs["images"].append(ret["pixel_values"])
        outputs["grid_thw"].append(ret["image_grid_thw"])
        outputs["image_type_ids"].append(0)

        self._add_special_token(self.IMG_END, outputs)

    def _add_video(self, item: Dict, outputs: Dict) -> None:
        url_info = item.get("video_url", {})
        url = url_info.get("url")
        outputs["video_cnt"] += 1
        self._add_text(f"Video {outputs['video_cnt']}:", outputs)
        self._add_special_token(self.VID_START, outputs)

        if "video" in item:
            video_path = item["video"]
            frames = self._load_and_process_video(video_path, item)
        else:
            video_path = get_downloadable(url, save_to_disk=False)
            frames = self._load_and_process_video(video_path, item)
        patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
            frames[0].height,
            frames[0].width,
            min_pixels=self.video_min_pixels,
            max_pixels=self.video_max_pixels,
        )[1]
        num_frames = len(frames)
        num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)

        pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
        ret = self.image_preprocessor.preprocess(
            images=None,
            videos=pixel_stack,
            do_normalize=False,
            do_rescale=False,
            predetermined_grid_thw=np.array([[patches_h, patches_w]] * num_frames),
            do_convert_rgb=True,
            input_data_format=ChannelDimension.LAST,
        )
        outputs["images"].append(ret["pixel_values_videos"])
        outputs["grid_thw"].append(ret["video_grid_thw"])
        outputs["image_type_ids"].extend([1] * num_frames)

        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)

        pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1

        self._add_special_token(self.VID_END, outputs)

    def _load_and_process_video(self, url: str, item: Dict) -> List[Image.Image]:
        reader, meta, path = read_video_decord(url, save_to_disk=False)

        video_frame_args = dict()
        video_frame_args["fps"] = item.get("fps", self.fps)
        video_frame_args["min_frames"] = item.get("min_frames", self.min_frames)
        video_frame_args["max_frames"] = item.get("max_frames", self.max_frames)
        video_frame_args["target_frames"] = item.get("target_frames", self.target_frames)
        video_frame_args["frames_sample"] = item.get("frames_sample", self.frames_sample)

        video_frame_args = self._set_video_frame_args(video_frame_args, meta)

        frames_data, _, timestamps = read_frames_decord(
            path,
            reader,
            meta,
            target_frames=video_frame_args["target_frames"],
            target_fps=video_frame_args["fps"],
            frames_sample=video_frame_args["frames_sample"],
            save_to_disk=False,
        )

        frames: List[Image.Image] = []
        for img_array, ts in zip(frames_data, timestamps):
            frames.append(render_frame_timestamp(img_array, ts))
        # Ensure an even number of frames for the temporal conv
        if len(frames) % 2 != 0:
            frames.append(copy.deepcopy(frames[-1]))
        return frames

    def _set_video_frame_args(self, video_frame_args, video_meta):
        """
        Determine the final frame-sampling arguments from the known parameters and their priority.
        """
        # Priority: video_target_frames > (video_min_frames, video_max_frames) > video_fps
        if video_frame_args["target_frames"] > 0:
            if video_frame_args["fps"] >= 0:
                raise ValueError("fps must be negative if target_frames is given")
            if (
                video_frame_args["min_frames"] > 0
                and video_frame_args["target_frames"] < video_frame_args["min_frames"]
            ):
                raise ValueError("target_frames must be larger than min_frames")
            if (
                video_frame_args["max_frames"] > 0
                and video_frame_args["target_frames"] > video_frame_args["max_frames"]
            ):
                raise ValueError("target_frames must be smaller than max_frames")
        else:
            if video_frame_args["fps"] < 0:
                raise ValueError("Must provide either positive target_fps or positive target_frames.")
            # First compute how many frames sampling at video_fps would extract
            frames_to_extract = int(video_meta["duration"] * video_frame_args["fps"])
            # Check whether that count falls inside the target interval; if not,
            # clamp target_frames to the lower or upper bound
            if (
                video_frame_args["min_frames"] > 0
                and video_frame_args["max_frames"] > 0
                and video_frame_args["min_frames"] > video_frame_args["max_frames"]
            ):
                raise ValueError("min_frames must be smaller than max_frames")
            if video_frame_args["min_frames"] > 0 and frames_to_extract < video_frame_args["min_frames"]:
                video_frame_args["target_frames"] = video_frame_args["min_frames"]
                video_frame_args["fps"] = -1
            if video_frame_args["max_frames"] > 0 and frames_to_extract > video_frame_args["max_frames"]:
                video_frame_args["target_frames"] = video_frame_args["max_frames"]
                video_frame_args["fps"] = -1

        return video_frame_args

    def _compute_3d_positions(self, t: int, h: int, w: int, start_idx: int) -> List[List[int]]:
        # Downsample time if needed
        t_eff = t // self.temporal_conv_size if t != 1 else 1
        gh, gw = h // self.spatial_conv_size, w // self.spatial_conv_size
        time_idx = np.repeat(np.arange(t_eff), gh * gw)
        h_idx = np.tile(np.repeat(np.arange(gh), gw), t_eff)
        w_idx = np.tile(np.arange(gw), t_eff * gh)

        coords = list(zip(time_idx, h_idx, w_idx))
        return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords]
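To make the 3D position scheme concrete, a standalone replica of `_compute_3d_positions` (for illustration only; names are local to this sketch) with the default conv sizes. Text tokens, by contrast, use the same index on all three axes (see `_add_text`):

```python
import numpy as np

def compute_3d_positions(t, h, w, start_idx, spatial_conv=2, temporal_conv=2):
    # Standalone replica of DataProcessor._compute_3d_positions.
    t_eff = t // temporal_conv if t != 1 else 1
    gh, gw = h // spatial_conv, w // spatial_conv
    time_idx = np.repeat(np.arange(t_eff), gh * gw)
    h_idx = np.tile(np.repeat(np.arange(gh), gw), t_eff)
    w_idx = np.tile(np.arange(gw), t_eff * gh)
    return [[start_idx + ti, start_idx + hi, start_idx + wi]
            for ti, hi, wi in zip(time_idx, h_idx, w_idx)]

# A single image with a 4x4 patch grid collapses to a 2x2 merged grid:
print(compute_3d_positions(1, 4, 4, start_idx=10))
# -> [[10, 10, 10], [10, 10, 11], [10, 11, 10], [10, 11, 11]]
```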
201
fastdeploy/input/mm_processor/process_video.py
Normal file
@@ -0,0 +1,201 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import io
import os
import random

import numpy as np
from PIL import Image

from .utils.io_utils import EXTRACTED_FRAME_DIR, get_downloadable, get_filename
from .utils.video_utils import VideoReaderWrapper
from fastdeploy.utils import data_processor_logger


def read_video_decord(video_path, save_to_disk):
    """get reader and meta by decord"""
    data_in_mem = False
    # video_path = get_downloadable(video_path, save_to_disk=save_to_disk)
    if isinstance(video_path, VideoReaderWrapper):
        data_in_mem = True
        video_reader = video_path
    else:
        if isinstance(video_path, bytes):
            video_path = io.BytesIO(video_path)
        video_reader = VideoReaderWrapper(video_path, num_threads=1)
    vlen = len(video_reader)
    fps = video_reader.get_avg_fps()
    duration = vlen / float(fps)

    video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen}

    return video_reader, video_meta, video_path


def get_frame_indices(
    vlen,
    target_frames=-1,
    target_fps=-1,
    frames_sample="middle",
    fix_start=None,
    input_fps=-1,
):
    """
    Select the frame indices to extract from a video of `vlen` frames.
    """
    assert frames_sample in ["rand", "middle", "leading"]
    if target_frames > 0:
        assert target_fps <= 0, "target_fps must be negative if target_frames is given."
        if target_frames > vlen:
            acc_samples = vlen
            data_processor_logger.info(
                f"target_frames={target_frames} is larger than video length {vlen}, "
                f"will sample {acc_samples} frames."
            )
        else:
            acc_samples = target_frames
        data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}")

        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = []
        for idx, interv in enumerate(intervals[:-1]):
            ranges.append((interv, intervals[idx + 1] - 1))
        if frames_sample == "rand":
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except Exception:
                # fall back to a sorted random sample when an interval is empty
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif frames_sample == "leading":
            frame_indices = [x[0] for x in ranges]
        elif frames_sample == "middle":
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

    elif target_fps > 0:
        assert target_frames <= 0, "target_frames must be negative if target_fps is given."
        assert input_fps > 0, "input_fps must be provided if target_fps is given."
        data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}")
        duration = float(vlen) / input_fps
        delta = 1 / target_fps  # gap between frames; this is also the clip length each frame represents
        if frames_sample == "middle":
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        elif frames_sample == "leading":
            frame_seconds = np.arange(0, duration, delta)
        if frames_sample == "rand":
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
            rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5
            frame_seconds += rand_offset * delta
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]

    else:
        raise ValueError("Must provide either positive target_fps or positive target_frames.")

    return frame_indices
|
||||
|
||||
def read_frames_decord(
|
||||
video_path,
|
||||
video_reader,
|
||||
video_meta,
|
||||
target_frames=-1,
|
||||
target_fps=-1,
|
||||
frames_sample="middle",
|
||||
fix_start=None,
|
||||
save_to_disk=False,
|
||||
cache_dir=EXTRACTED_FRAME_DIR,
|
||||
frame_indices=None,
|
||||
tol=10,
|
||||
):
|
||||
"""get frames by decord"""
|
||||
|
||||
if frame_indices is None:
|
||||
frame_indices = get_frame_indices(
|
||||
video_meta["num_of_frame"],
|
||||
target_frames=target_frames,
|
||||
target_fps=target_fps,
|
||||
frames_sample=frames_sample,
|
||||
fix_start=fix_start,
|
||||
input_fps=video_meta["fps"],
|
||||
)
|
||||
|
||||
frames = []
|
||||
for frame_indice_index in range(0, len(frame_indices)):
|
||||
frame_indice = frame_indices[frame_indice_index]
|
||||
try:
|
||||
frames.append(video_reader[frame_indice].asnumpy()) # (T, H, W, C)
|
||||
except Exception as e:
|
||||
data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}")
|
||||
previous_counter = 1
|
||||
later_counter = 1
|
||||
previous_after_flag = True
|
||||
if frame_indice == 0 or frame_indice == len(video_reader) - 1:
|
||||
cur_tol = tol * 2
|
||||
else:
|
||||
cur_tol = tol
|
||||
while previous_counter < cur_tol or later_counter < cur_tol:
|
||||
if previous_after_flag:
|
||||
if frame_indice - previous_counter < 0:
|
||||
previous_counter += 1
|
||||
previous_after_flag = not previous_after_flag
|
||||
continue
|
||||
try:
|
||||
frames.append(video_reader[frame_indice - previous_counter].asnumpy())
|
||||
data_processor_logger.info(f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame")
|
||||
frame_indices[frame_indice_index] = frame_indice - previous_counter
|
||||
break
|
||||
except Exception as e:
|
||||
previous_counter += 1
|
||||
else:
|
||||
if frame_indice + later_counter >= len(video_reader):
|
||||
later_counter += 1
|
||||
previous_after_flag = not previous_after_flag
|
||||
continue
|
||||
try:
|
||||
frames.append(video_reader[frame_indice + later_counter].asnumpy())
|
||||
data_processor_logger.info(f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame")
|
||||
frame_indices[frame_indice_index] = frame_indice + later_counter
|
||||
break
|
||||
except Exception as e:
|
||||
later_counter += 1
|
||||
previous_after_flag = not previous_after_flag
|
||||
|
||||
frames = np.stack(frames, axis=0)
|
||||
assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}"
|
||||
|
||||
ret = []
|
||||
|
||||
url_sha1 = get_filename()
|
||||
for idx, frame in enumerate(frames):
|
||||
tmp = Image.fromarray(frame, "RGB")
|
||||
if save_to_disk:
|
||||
save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png")
|
||||
if not os.path.exists(os.path.dirname(save_path)):
|
||||
os.makedirs(os.path.dirname(save_path))
|
||||
tmp.save(save_path)
|
||||
tmp = save_path
|
||||
ret.append(tmp)
|
||||
|
||||
time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices]
|
||||
|
||||
return ret, frame_indices, time_stamps
|
||||
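For reference, a small worked example of get_frame_indices with "middle" sampling (assumed inputs, not from the source):

# Sketch (assumed inputs): sample 4 frames from a 100-frame video.
#   get_frame_indices(vlen=100, target_frames=4, frames_sample="middle")
#   intervals = linspace(0, 100, 5) = [0, 25, 50, 75, 100]
#   ranges    = [(0, 24), (25, 49), (50, 74), (75, 99)]
#   indices   = [(0+24)//2, (25+49)//2, (50+74)//2, (75+99)//2] = [12, 37, 62, 87]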
19
fastdeploy/input/mm_processor/tokenizer/__init__.py
Normal file
@@ -0,0 +1,19 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from .tokenizer_vl import ErnieVLTokenizer

__all__ = ['ErnieVLTokenizer']
348
fastdeploy/input/mm_processor/tokenizer/tokenizer_vl.py
Normal file
@@ -0,0 +1,348 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

"""
ErnieVLTokenizer
"""
import os
import re
from shutil import copyfile
from typing import Dict, List, Optional, Tuple

import numpy as np
import paddle
import sentencepiece as spm
from paddlenlp.transformers import PretrainedTokenizer
from paddlenlp.transformers.tokenizer_utils_base import (
    PaddingStrategy,
    TextInput,
)
from paddlenlp.utils.log import logger


class ErnieVLTokenizer(PretrainedTokenizer):
    """doc"""

    resource_files_names = {
        "vocab_file": "tokenizer.model",
    }
    pretrained_resource_files_map = {"vocab_file": {"ernie-bot-10b": None}}
    pretrained_init_configuration = {
        "ernie-bot-10b": {},
    }
    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
    padding_side = "right"

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        cls_token="<cls>",
        eos_token="</s>",
        mask_token="<mask:0>",
        pad_token="<pad>",
        sep_token="<sep>",
        unk_token="<unk>",
        additional_special_tokens=None,
        **kwargs,
    ):
        """doc"""
        if additional_special_tokens is None:
            additional_special_tokens = ["<mask:1>", "<mask:7>"]
        super().__init__(
            bos_token=bos_token,
            cls_token=cls_token,
            eos_token=eos_token,
            mask_token=mask_token,
            pad_token=pad_token,
            sep_token=sep_token,
            unk_token=unk_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def space_token(self):
        """doc"""
        return "<mask:1>"

    @property
    def space_token_id(self):
        """doc"""
        return self.sp_model.piece_to_id("<mask:1>")

    @property
    def gend_token(self):
        """doc"""
        return "<mask:7>"

    @property
    def gend_token_id(self):
        """doc"""
        return self.sp_model.piece_to_id("<mask:7>")

    @property
    def im_start_id(self):
        """doc"""
        return self.sp_model.piece_to_id("<|im_start|>")

    @property
    def im_end_id(self):
        """doc"""
        return self.sp_model.piece_to_id("<|im_end|>")

    @property
    def vocab_size(self):
        """doc"""
        return self.sp_model.vocab_size()

    def get_vocab(self):
        """doc"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """doc"""
        return self.sp_model.encode_as_pieces(text)

    def _convert_token_to_id(self, token):
        """doc"""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, id):
        """doc"""
        return self.sp_model.id_to_piece(id)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        # prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                # if not prev_is_special:
                #     out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                # prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                # prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string  # .strip()

    def prepare_for_model(self, *args, **kwargs):
        """doc"""
        if "add_special_tokens" in kwargs:
            kwargs.pop("add_special_tokens")
            # logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
        return super().prepare_for_model(*args, **kwargs)

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.
        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
        )
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)
        return (out_vocab_file,)

    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string in a sequence of tokens, using the tokenizer.

        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
        (BPE/SentencePieces/WordPieces). Takes care of added tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            `List[str]`: The list of tokens.
        """
        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        # TODO: should this be in the base class?
        if hasattr(self, "do_lower_case") and self.do_lower_case:
            # convert non-special tokens to lowercase
            escaped_special_toks = [
                re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        no_split_token = set(self.unique_no_split_tokens)
        tokens = self.tokens_trie.split(text)

        tokenized_text = []
        for token in tokens:
            # Need to skip eventual empty (fully stripped) tokens
            if not token:
                continue
            if token in no_split_token:
                tokenized_text.append(token)
            else:
                tokenized_text.extend(self._tokenize(token))
        return tokenized_text

    def _decode(self, *args, **kwargs):
        """doc"""
        kwargs.pop("clean_up_tokenization_spaces", None)
        kwargs.pop("spaces_between_special_tokens", None)
        return super()._decode(
            *args,
            **kwargs,
            clean_up_tokenization_spaces=False,
            spaces_between_special_tokens=False,
        )

    def _pad(
        self,
        encoded_inputs: Dict,
        max_length: Optional[int] = None,
        padding_strategy=PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """doc"""
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names
        if return_attention_mask:
            required_input = encoded_inputs[self.model_input_names[0]]
            if padding_strategy == PaddingStrategy.LONGEST:
                max_length = len(required_input)
            if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
                max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
            needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
            if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None:
                attention_mask = encoded_inputs.pop("attention_mask")
                if isinstance(attention_mask, paddle.Tensor):
                    attention_mask = attention_mask.numpy()
                elif isinstance(attention_mask, list):
                    attention_mask = np.array(attention_mask)
                elif not isinstance(attention_mask, np.ndarray):
                    raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ")
            else:
                # Default to a causal (lower-triangular) mask with a batch dimension
                attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64))
                attention_mask = np.expand_dims(attention_mask, axis=0)
            if needs_to_be_padded:
                difference = max_length - len(required_input)
                if self.padding_side == "right":
                    if attention_mask.ndim == 1:
                        pad_width = [(0, difference)]
                    else:
                        pad_width = [(0, 0), (0, difference), (0, difference)]
                elif self.padding_side == "left":
                    if attention_mask.ndim == 1:
                        pad_width = [(difference, 0)]
                    else:
                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
                else:
                    raise ValueError("Invalid padding strategy:" + str(self.padding_side))
                attention_mask = np.pad(
                    attention_mask,
                    pad_width=pad_width,
                    mode="constant",
                    constant_values=0,
                )
        encoded_inputs = super()._pad(
            encoded_inputs,
            max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=False,
        )
        if return_attention_mask:
            encoded_inputs["attention_mask"] = attention_mask.tolist()
        return encoded_inputs


def add_special_tokens(
    tokenizer,
    special_tokens_info,
    use_ocr_specialtoken=False,
    use_crop_specialtoken=False,
    special_token_ids_start=254208,
    special_token_ids_end=256256,
):
    """
    Add special tokens.

    Placeholders [<|IMAGE_PLACEHOLDER|>, <|AUDIO_PLACEHOLDER|>, <|VIDEO_PLACEHOLDER|>], 3 in total.

    Modality begin/end special tokens [<|BOI|> <|EOI|> <|BOA|> <|EOA|> <|BOV|> <|EOV|>].

    OCR special tokens [<|LOC_0|> <|LOC_1|> ... <|LOC_1000|>], 1001 in total.

    Crop special tokens [<|CROP_COL_SEP|>, <|CROP_ROW_SEP|>, <|CROP_IMAGE_SEP|>], 3 in total:
        <|CROP_COL_SEP|> splits along the column (image width) dimension (replaces the plain-text comma)
        <|CROP_ROW_SEP|> splits along the row (image height) dimension (replaces the plain-text newline)
        <|CROP_IMAGE_SEP|> separates the original image from its crops (replaces two plain-text newlines)

    2048 unused tokens in total.

    Args:
        tokenizer (ErnieTokenizer): tokenizer
        special_token_ids_start (int, optional): start id of the special tokens. Defaults to 254208.
        special_token_ids_end (int, optional): maximum supported vocabulary size. Defaults to 256256.
    """
    special_tokens = [
        special_tokens_info["image_placeholder"],
        special_tokens_info["audio_placeholder"],
    ]

    if use_ocr_specialtoken:
        special_tokens.extend(special_tokens_info["ocr_coor"])
        special_tokens.extend(special_tokens_info["ocr_begin_end"])

    if use_crop_specialtoken:
        special_tokens.extend(special_tokens_info["crop"])

    # add special_tokens
    additional_special_tokens = {"additional_special_tokens": special_tokens}
    tokenizer.add_special_tokens(additional_special_tokens)

    # check
    first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]

    assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
    assert (
        len(tokenizer.get_vocab()) < special_token_ids_end
    ), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end}, too many special tokens added!"
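The _pad override above pads a 2-D causal attention mask along both sequence axes; a minimal sketch with assumed shapes:

# Sketch (assumed shapes): right-padding a causal mask from length 3 to max_length 5.
#   mask = np.tril(np.ones((3, 3)))[None]         # shape (1, 3, 3)
#   pad_width = [(0, 0), (0, 2), (0, 2)]          # pad rows and columns with zeros
#   np.pad(mask, pad_width).shape == (1, 5, 5)    # padded positions attend to nothing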
BIN
fastdeploy/input/mm_processor/utils/Roboto-Regular.ttf
Normal file
Binary file not shown.
15
fastdeploy/input/mm_processor/utils/__init__.py
Normal file
@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
253
fastdeploy/input/mm_processor/utils/io_utils.py
Normal file
@@ -0,0 +1,253 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import base64
import datetime
import hashlib
import io
import os
import threading
import uuid
from pathlib import Path

import numpy as np
import requests
from PIL import Image
from PIL.ExifTags import TAGS

RAW_VIDEO_DIR = "./download_tmp/raw_video/"
RAW_IMAGE_DIR = "./download_tmp/raw_images/"
EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
TMP_DIR = "./download_tmp/upload_tmp/"


def file_download(url, download_dir, save_to_disk=False, retry=0, retry_interval=3):
    """
    Description: download a URL; if the input is already a PIL image, return it as-is.
    Args:
        url (str, PIL): http URL / local path / io.Bytes; note that io.Bytes is an image byte stream
        download_dir: takes effect when save_to_disk=True; the directory the file is saved to
        save_to_disk: whether to save the data to a local path
    """
    from .video_utils import VideoReaderWrapper

    if isinstance(url, Image.Image):
        return url
    elif isinstance(url, VideoReaderWrapper):
        return url
    elif url.startswith("http"):
        response = requests.get(url)
        bytes_data = response.content
    elif os.path.isfile(url):
        if save_to_disk:
            return url
        with open(url, "rb") as f:
            bytes_data = f.read()
    else:
        bytes_data = base64.b64decode(url)
    if not save_to_disk:
        return bytes_data

    download_path = os.path.join(download_dir, get_filename(url))
    Path(download_path).parent.mkdir(parents=True, exist_ok=True)
    with open(download_path, "wb") as f:
        f.write(bytes_data)
    return download_path


def get_filename(url=None):
    """
    Build a unique filename for a download.
    """
    if url is None:
        return str(uuid.uuid4()).replace("-", "")
    t = datetime.datetime.now()
    if not isinstance(url, bytes):
        url = url.encode("utf-8")

    md5_hash = hashlib.md5(url).hexdigest()
    pid = os.getpid()
    tid = threading.get_ident()

    # Drop the extension to avoid errors when saving as jpg
    image_filename = f"{t.year}-{t.month:02d}-{t.day:02d}-{pid}-{tid}-{md5_hash}"
    return image_filename


def get_downloadable(url, download_dir=RAW_VIDEO_DIR, save_to_disk=False, retry=0, retry_interval=3):
    """download video and store it in the disk

    return downloaded **path** if save_to_disk is set to true
    return downloaded **bytes** if save_to_disk is set to false
    """

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    downloaded_path = file_download(
        url,
        download_dir,
        save_to_disk=save_to_disk,
        retry=retry,
        retry_interval=retry_interval,
    )
    return downloaded_path


def get_downloadable_image(download_path, need_exif_info, retry_max_time=0, retry_interval=3):
    """
    get_downloadable variant that also extracts EXIF info and applies image preprocessing
    """

    def get_image_exif(image):
        exif_data = image._getexif()
        exif_info = {}
        if exif_data is not None:
            for tag, value in exif_data.items():
                tag_name = TAGS.get(tag, tag)
                exif_info[tag_name] = value.strip()
        return exif_info

    def has_transparent_background(img):
        """Check whether the image has a transparent background"""
        if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info):
            # Check for any pixel with alpha channel less than 255 (fully opaque)
            alpha = img.convert("RGBA").split()[-1]
            if alpha.getextrema()[0] < 255:
                return True
        return False

    def add_white_background(img):
        """
        Add a white background to an image with a transparent background
        """
        if img.mode != "RGBA":
            img = img.convert("RGBA")
        # Create a white background image of the same size as the original
        img_white_background = Image.new("RGBA", img.size, (255, 255, 255))

        # Paste the original image onto the white background
        img_white_background.paste(img, (0, 0), img)

        return img_white_background

    def change_I16_to_L(img):
        """
        Convert an image from I;16 mode to L mode
        """
        # point() in I mode only supports add/subtract/multiply, so the
        # `* (1 / 256)` below must not be rewritten as a division
        return img.point(lambda i: i * (1 / 256)).convert("L")

    image = get_downloadable(download_path, save_to_disk=False, retry=retry_max_time, retry_interval=retry_interval)
    if isinstance(image, Image.Image):
        pil_image = image
    else:
        pil_image = Image.open(io.BytesIO(image))
    if need_exif_info:
        try:
            exif_info = get_image_exif(pil_image)
        except Exception:
            exif_info = {}
    else:
        exif_info = {}

    try:
        if pil_image.mode == "I;16":
            pil_image = change_I16_to_L(pil_image)
        if has_transparent_background(pil_image):
            pil_image = add_white_background(pil_image)
    except Exception:
        pass

    return pil_image.convert("RGB"), exif_info


def str2hash(url):
    """
    SHA-256 hash of a string
    """
    return hashlib.sha256(url.encode()).hexdigest()


def pil2hash(pil):
    """
    SHA-256 hash of a PIL.Image
    """
    byte_io = io.BytesIO()
    pil.save(byte_io, format="PNG")  # use a lossless format so compression does not affect the hash
    image_bytes = byte_io.getvalue()

    return hashlib.sha256(image_bytes).hexdigest()


def imagepath_to_base64(image_path):
    """imagepath_to_base64"""
    image = Image.open(image_path).convert("RGB")
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    image_bytes = buffer.getvalue()
    base64_encoded = base64.b64encode(image_bytes).decode("utf-8")
    return base64_encoded


def pil_image_to_base64(image):
    """pil_image_to_base64"""
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    image_bytes = buffer.getvalue()
    base64_encoded = base64.b64encode(image_bytes).decode("utf-8")
    return base64_encoded


def http_to_pil_image(url):
    """http_to_pil_image"""
    response = requests.get(url)
    image_data = io.BytesIO(response.content)
    pil_image = Image.open(image_data).convert("RGB")
    return pil_image


def http_to_image_base64(url):
    """http_to_image_base64"""
    response = requests.get(url)
    image_data = io.BytesIO(response.content)
    return base64.b64encode(image_data.getvalue()).decode("utf-8")


def base64_to_pil_image(base64_string):
    """base64_to_pil_image"""
    image_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(image_bytes)
    image = Image.open(buffer)
    return image


def get_hashable(to_be_hashed):
    """get hashable"""
    if isinstance(to_be_hashed, bytes):
        return to_be_hashed
    elif isinstance(to_be_hashed, Image.Image):
        return to_be_hashed.tobytes()
    elif isinstance(to_be_hashed, str):
        return to_be_hashed.encode("utf-8")
    else:
        raise ValueError(f"not support type: {type(to_be_hashed)}")


def load_dict_from_npz(npzfile):
    """Load a dict from an npz file"""
    with np.load(npzfile, allow_pickle=True) as data:
        loaded_dict = {key: data[key] for key in data.files}
    return loaded_dict
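file_download dispatches on the input type; a sketch of the main paths, with assumed inputs:

# Sketch (assumed inputs) of file_download's dispatch:
#   file_download("https://example.com/a.png", RAW_IMAGE_DIR)       -> bytes (downloaded)
#   file_download("/tmp/a.png", RAW_IMAGE_DIR, save_to_disk=True)   -> "/tmp/a.png" (already local)
#   file_download("<base64 string>", RAW_IMAGE_DIR)                 -> decoded bytes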
96
fastdeploy/input/mm_processor/utils/render_timestamp.py
Normal file
@@ -0,0 +1,96 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import os
from pathlib import Path

from PIL import Image, ImageDraw, ImageFont

cur_directory = Path(__file__).parent.absolute()
FONT_PATH = os.path.join(cur_directory, "Roboto-Regular.ttf")


def render_single_image_with_timestamp(image: Image, number: str, rate: float, font_path: str = FONT_PATH):
    """
    Render a timestamp onto a PIL.Image.
    The font size is min(width, height) * rate.
    The font color is black with a white outline; the outline size is 10% of the font size.
    Returns an Image object.
    """
    draw = ImageDraw.Draw(image)  # drawing handle
    width, height = image.size  # image size
    font_size = int(min(width, height) * rate)  # font size
    outline_size = int(font_size * 0.1)  # outline size
    font = ImageFont.truetype(font_path, font_size)  # load the font file at the given size
    x = 0
    y = 0  # x and y coordinates of the text

    # draw the timestamp in black with a white outline
    draw.text((x, y), number, font=font, fill=(0, 0, 0), stroke_width=outline_size, stroke_fill=(255, 255, 255))

    return image


def timestamp_converting(time_stamp_in_seconds):
    """
    convert timestamp format from seconds to hr:min:sec
    """
    # get hours
    hours = 0
    while time_stamp_in_seconds >= 3600:
        hours += 1
        time_stamp_in_seconds -= 3600
    # get minutes
    mins = 0
    while time_stamp_in_seconds >= 60:
        mins += 1
        time_stamp_in_seconds -= 60
    time_hours = f"{int(hours):02d}"
    time_mins = f"{int(mins):02d}"
    time_secs = f"{time_stamp_in_seconds:05.02f}"
    fi_time_stamp = time_hours + ":" + time_mins + ":" + time_secs

    return fi_time_stamp


def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration):
    """
    function: get the timestamp of a frame; used for uniform frame extraction.

    num_frames: total number of frames
    frame_id: index of the extracted frame
    duration: total duration of the video
    return: timestamp in seconds (float)
    """
    time_stamp = duration * 1.0 * frame_id / num_frames

    return time_stamp


def render_frame_timestamp(frame, timestamp, font_rate=0.1):
    """
    Render the timestamp onto a frame, at the top-left corner of the image.

    frame: the frame, a PIL.Image object
    timestamp: timestamp in seconds
    font_rate: font size as a fraction of min(width, height)
    """

    time_stamp = "time: " + timestamp_converting(timestamp)
    new_frame = render_single_image_with_timestamp(frame, time_stamp, font_rate)

    return new_frame
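A worked example of timestamp_converting (assumed input):

# Sketch: timestamp_converting(3725.5)
#   3725.5 s -> 1 h (3600 s), 2 min (120 s), 5.5 s remaining
#   -> "01:02:05.50"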
83
fastdeploy/input/mm_processor/utils/video_utils.py
Normal file
@@ -0,0 +1,83 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import io
import os
from tempfile import NamedTemporaryFile as ntf

import decord

try:
    # moviepy 1.0
    import moviepy.editor as mp
except ImportError:
    # moviepy 2.0
    import moviepy as mp


def is_gif(data: bytes) -> bool:
    """
    check if a bytes is a gif based on the magic head
    """
    return data[:6] in (b"GIF87a", b"GIF89a")


class VideoReaderWrapper(decord.VideoReader):
    """
    Solving memory leak bug

    https://github.com/dmlc/decord/issues/208
    """

    def __init__(self, video_path, *args, **kwargs):
        with ntf(delete=True, suffix=".gif") as gif_file:
            gif_input = None
            self.original_file = None
            if isinstance(video_path, str):
                self.original_file = video_path
                if video_path.lower().endswith(".gif"):
                    gif_input = video_path
            elif isinstance(video_path, bytes):
                if is_gif(video_path):
                    gif_file.write(video_path)
                    gif_input = gif_file.name
            elif isinstance(video_path, io.BytesIO):
                video_path.seek(0)
                tmp_bytes = video_path.read()
                video_path.seek(0)
                if is_gif(tmp_bytes):
                    gif_file.write(tmp_bytes)
                    gif_input = gif_file.name

            # GIF inputs are converted to a temporary mp4 first, since decord cannot read GIFs
            if gif_input is not None:
                clip = mp.VideoFileClip(gif_input)
                mp4_file = ntf(delete=False, suffix=".mp4")
                clip.write_videofile(mp4_file.name, verbose=False, logger=None)
                clip.close()
                video_path = mp4_file.name
                self.original_file = video_path

        super().__init__(video_path, *args, **kwargs)
        self.seek(0)

    def __getitem__(self, key):
        frames = super().__getitem__(key)
        # seek back to the start after every read to work around decord's memory leak
        self.seek(0)
        return frames

    def __del__(self):
        if self.original_file and os.path.exists(self.original_file):
            os.remove(self.original_file)
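A usage sketch for VideoReaderWrapper (assumed local file name):

# Sketch (assumed file): read one frame from an in-memory video.
#   with open("clip.mp4", "rb") as f:
#       vr = VideoReaderWrapper(io.BytesIO(f.read()), num_threads=1)
#   frame = vr[0].asnumpy()   # (H, W, C); the wrapper seeks back to 0 after each read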
15
fastdeploy/input/multimodal/__init__.py
Normal file
@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
127
fastdeploy/input/multimodal/audio.py
Normal file
@@ -0,0 +1,127 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import base64
from io import BytesIO
from pathlib import Path

import numpy as np
import numpy.typing as npt

from .base import MediaIO, MultiModalPlugin
from .inputs import AudioItem, ModalityData, MultiModalKwargs

# TODO: multimodal data processing
# try:
#     import librosa
# except ImportError:
#     librosa = PlaceholderModule("librosa")  # type: ignore[assignment]

# try:
#     import soundfile
# except ImportError:
#     soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]


def resample_audio(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> npt.NDArray[np.floating]:
    """
    Resample audio data from the original sampling rate (`orig_sr`) to the target sampling rate (`target_sr`).

    Args:
        audio (npt.NDArray[np.floating]): numpy ndarray of single-channel floating-point audio data, shape `(samples,)`.
        orig_sr (float): original sampling rate of the audio data.
        target_sr (float): target sampling rate to convert to.

    Returns:
        npt.NDArray[np.floating]: numpy ndarray of single-channel floating-point audio data, shape `(samples,)`,
        resampled to the target sampling rate.
    """
    import librosa
    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)


class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """
        Load byte data and return the audio signal and sampling rate.
        Args:
            data (bytes): byte data containing the contents of an audio file.
        Returns (tuple):
            (array, float): the first element is a numpy array holding the audio signal,
            the second is a float holding the sampling rate.
        """
        import librosa
        return librosa.load(BytesIO(data), sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """
        Decode a base64-encoded string into an audio signal and sampling rate.

        Args:
            media_type (str): media type, e.g. 'audio/wav'.
            data (str): base64-encoded string holding the audio data.

        Returns:
            tuple[npt.NDArray, float]: the decoded audio signal and its sampling rate.
        """
        return self.load_bytes(base64.b64decode(data))

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """
        Load an audio file and return the audio data and sampling rate.
        Args:
            filepath (Path): path of the audio file.
        Returns:
            tuple[npt.NDArray, float]: a two-element tuple; the first element is the audio data
            (npt.NDArray), the second is the sampling rate (float).
        """
        import librosa
        return librosa.load(filepath, sr=None)

    def encode_base64(self, media: tuple[npt.NDArray, float]) -> str:
        """
        Encode audio data and its sampling rate as a base64 string.
        Args:
            media (tuple[numpy.ndarray, float]): tuple of audio data (numpy array) and sampling rate (float).
        Returns (str): base64-encoded string representing the audio data.
        """
        audio, sr = media

        with BytesIO() as buffer:
            import soundfile
            soundfile.write(buffer, audio, sr, format="WAV")
            data = buffer.getvalue()

        return base64.b64encode(data).decode('utf-8')
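A round-trip sketch for AudioMediaIO, assuming a synthetic 16 kHz tone and that librosa and soundfile are installed:

# Sketch (assumed values): encode a 1-second 440 Hz tone and decode it back.
#   io_ = AudioMediaIO()
#   sr = 16000
#   tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
#   b64 = io_.encode_base64((tone, sr))
#   audio, sr2 = io_.load_base64("audio/wav", b64)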
69
fastdeploy/input/multimodal/base.py
Normal file
@@ -0,0 +1,69 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from abc import ABC, abstractmethod
from collections import defaultdict
from collections.abc import Sequence
from pathlib import Path
from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple,
                    Optional, TypeVar, Union)

_T = TypeVar("_T")


class MediaIO(ABC, Generic[_T]):

    @abstractmethod
    def load_bytes(self, data: bytes) -> _T:
        """
        Load byte data into an object and return it.
        Raises an exception if loading fails.

        Args:
            data (bytes): the byte data to load.

        Raises:
            NotImplementedError: not implemented by the base class.

        Returns:
            _T: the loaded object.
        """
        raise NotImplementedError

    @abstractmethod
    def load_base64(self, media_type: str, data: str) -> _T:
        """
        List of media types:
        https://www.iana.org/assignments/media-types/media-types.xhtml
        """
        raise NotImplementedError

    @abstractmethod
    def load_file(self, filepath: Path) -> _T:
        """
        Load a file and return the parsed data.

        Args:
            filepath (Path): file path; must be an absolute path.

        Raises:
            NotImplementedError: not implemented by the base class.

        Returns:
            _T: the parsed data, of any type.
        """
        raise NotImplementedError
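MediaIO is a small generic ABC; a minimal sketch of a custom implementation (the TextMediaIO name and behavior are hypothetical, not part of this commit):

import base64
from pathlib import Path

class TextMediaIO(MediaIO[str]):
    """Hypothetical example: a MediaIO implementation for plain text."""

    def load_bytes(self, data: bytes) -> str:
        return data.decode("utf-8")

    def load_base64(self, media_type: str, data: str) -> str:
        return self.load_bytes(base64.b64decode(data))

    def load_file(self, filepath: Path) -> str:
        return filepath.read_text(encoding="utf-8")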
145
fastdeploy/input/multimodal/image.py
Normal file
@@ -0,0 +1,145 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import base64
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional

import requests
from PIL import Image

from .base import MediaIO


class ImageMediaIO(MediaIO[Image.Image]):

    def __init__(self, *, image_mode: str = "RGB") -> None:
        """
        Initializes the object.

        Args:
            image_mode (str, optional): The mode of the image, defaults to "RGB". Should be one of "L", "LA", "P",
                "RGB", "RGBA", "CMYK", or "YCbCr".

        Raises:
            ValueError: If `image_mode` is not a valid mode.

        Returns:
            None: This method does not return anything. It initializes the object with the given parameters.
        """
        super().__init__()

        self.image_mode = image_mode

    def load_bytes(self, data: bytes) -> Image.Image:
        """
        Convert byte data into an image object and return it.
        Calls Image.open and Image.load, then convert() to switch the image to the
        configured mode (RGB by default).

        Args:
            data (bytes): byte object containing the image data.

        Returns:
            Image.Image: an Image object holding the original image data, converted to the configured mode.
        """
        image = Image.open(BytesIO(data))
        image.load()
        return image.convert(self.image_mode)

    def load_base64(self, media_type: str, data: str) -> Image.Image:
        """
        Convert a base64-encoded string into an image object.

        Args:
            media_type (str): media type, e.g. "image/jpeg".
            data (str): base64-encoded string data.

        Returns:
            Image.Image: a PIL image object.
        """
        return self.load_bytes(base64.b64decode(data))

    def load_file(self, filepath: Path) -> Image.Image:
        """
        Load a file and convert it to the configured mode.

        Args:
            filepath (Path): file path (a pathlib.Path object).

        Returns:
            Image.Image: an Image.Image object holding the loaded and converted image.

        Raises:
            FileNotFoundError: raised when the file does not exist or cannot be opened.
        """
        image = Image.open(filepath)
        image.load()
        return image.convert(self.image_mode)

    def load_file_request(self, request: Any) -> Image.Image:
        """
        Load an image from a request and return a PIL Image object.
        Expects a string containing the image URL (or a compatible object, such as a
        requests Response). The image is downloaded, loaded, and converted to the
        configured mode (RGB by default).

        Args:
            request (Any): a string containing the image URL (or a compatible object).

        Returns:
            Image.Image: a PIL Image object holding the downloaded and converted image.
        """
        image = Image.open(requests.get(request, stream=True).raw)
        image.load()
        return image.convert(self.image_mode)

    def encode_base64(
        self,
        media: Image.Image,
        *,
        image_format: str = "JPEG",
    ) -> str:
        """
        Encode an image as a base64 string.

        Args:
            media (Image.Image): the image to encode; a PIL Image object.
            image_format (str, optional): the image format, "JPEG" by default. Options include "PNG",
                "JPEG", "BMP", "TIFF", and the other formats supported by PIL.

        Returns:
            str: the base64-encoded string, suitable for embedding in HTML or JSON.
        """
        image = media

        with BytesIO() as buffer:
            image = image.convert(self.image_mode)
            image.save(buffer, image_format)
            data = buffer.getvalue()

        return base64.b64encode(data).decode('utf-8')
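A round-trip sketch for ImageMediaIO (assumed 8x8 image):

# Sketch: round-trip an image through base64.
#   io_ = ImageMediaIO()
#   img = Image.new("RGB", (8, 8), (255, 0, 0))
#   b64 = io_.encode_base64(img, image_format="PNG")
#   img2 = io_.load_base64("image/png", b64)   # back to a PIL image in RGB mode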
192
fastdeploy/input/multimodal/utils.py
Normal file
@@ -0,0 +1,192 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import base64
import io
import os
import random

import socket
from urllib.parse import urlparse
import ipaddress

import requests
from PIL import Image, ImageOps
from fastdeploy.utils import data_processor_logger

import pyheif
from pdf2image import convert_from_path
import cairosvg
import subprocess
import tempfile
import mimetypes


def process_image_data(image_data, mime_type, url):
    """Handle the different kinds of image payloads and return a PIL image object"""

    if mime_type in ['image/heif', 'image/heic'] or url.lower().endswith('.heif') or url.lower().endswith('.heic'):
        heif_file = pyheif.read(image_data)
        pil_image = Image.frombytes(
            heif_file.mode, heif_file.size, heif_file.data,
            "raw", heif_file.mode, heif_file.stride
        )
    elif mime_type == 'application/pdf' or url.lower().endswith('.pdf'):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            temp_pdf.write(image_data.getvalue())
            temp_pdf_path = temp_pdf.name
        images = convert_from_path(temp_pdf_path)
        pil_image = images[0]
        os.remove(temp_pdf_path)
    elif mime_type == 'image/svg+xml' or url.lower().endswith('.svg'):
        png_data = cairosvg.svg2png(bytestring=image_data.getvalue())
        pil_image = Image.open(io.BytesIO(png_data))
    elif mime_type in ['application/postscript', 'application/illustrator'] or url.lower().endswith('.ai'):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.ai') as ai_temp, tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as pdf_temp:
            ai_temp_path = ai_temp.name
            pdf_temp_path = pdf_temp.name
            ai_temp.write(image_data.getvalue())
            ai_temp.close()
            subprocess.run(['inkscape', ai_temp_path, '--export-pdf=' + pdf_temp_path], check=True)
            images = convert_from_path(pdf_temp_path)
            pil_image = images[0]
            os.remove(ai_temp_path)
            os.remove(pdf_temp_path)

    elif mime_type == 'image/gif' or url.lower().endswith('.gif'):
        pil_image = Image.open(image_data)
    else:
        pil_image = Image.open(image_data)

    return pil_image


def http_to_pil_image(url):
    """http_to_pil_image"""
    if is_public_url(url) and int(os.getenv("DOWNLOAD_WITH_TP_SERVER", "0")):
        return http_to_pil_image_with_tp_server(url)

    response = requests.get(url)
    if response.status_code != 200:
        raise Exception("Failed to download the image from URL.")
    image_data = io.BytesIO(response.content)

    mime_type = response.headers.get('Content-Type')
    if mime_type is None:
        mime_type, _ = mimetypes.guess_type(url)

    data_processor_logger.info(f"Detected MIME type: {mime_type}")  # debug info
    pil_image = process_image_data(image_data, mime_type, url)

    return pil_image


def http_to_pil_image_with_tp_server(url, retry_time=6):
    """The CNAP platform has no public-network access, so images are downloaded through the TP service"""
    proxies = [{"http": "http://10.229.197.142:8807"}, {"http": "http://10.229.197.161:8804"},
               {"http": "http://10.229.198.143:8804"}, {"http": "http://10.122.108.164:8807"},
               {"http": "http://10.122.108.165:8807"}, {"http": "http://10.122.108.166:8807"},
               {"http": "http://10.122.108.168:8801"}, {"http": "http://10.122.150.146:8802"},
               {"http": "http://10.122.150.158:8802"}, {"http": "http://10.122.150.164:8801"},
               {"http": "http://10.143.51.38:8813"}, {"http": "http://10.143.103.42:8810"},
               {"http": "http://10.143.194.45:8804"}, {"http": "http://10.143.226.25:8801"},
               {"http": "http://10.143.236.12:8807"}, {"http": "http://10.143.238.36:8807"},
               {"http": "http://10.144.71.30:8807"}, {"http": "http://10.144.73.16:8804"},
               {"http": "http://10.144.138.36:8801"}, {"http": "http://10.144.152.40:8810"},
               {"http": "http://10.144.199.29:8810"}, {"http": "http://10.144.251.29:8813"},
               ]
    headers = {
        "X-Tp-Authorization": "Basic RVJOSUVMaXRlVjpFUk5JRUxpdGVWXzFxYXo0cmZ2M2VkYzV0Z2Iyd3N4LWJmZS10cA==",
        "scheme": "https"
    }

    new_url = url.replace("https://", "http://") if url.startswith("https://") else url

    # Proxies may be unstable, so retry
    for idx in range(retry_time):
        try:
            response = requests.get(new_url, headers=headers, proxies=random.choice(proxies))
            if response.status_code == 200:
                image_data = io.BytesIO(response.content)

                mime_type = response.headers.get('Content-Type')
                if mime_type is None:
                    mime_type, _ = mimetypes.guess_type(url)

                data_processor_logger.info(f"Detected MIME type: {mime_type}")  # debug info
                pil_image = process_image_data(image_data, mime_type, url)

                return pil_image
        except Exception as e:
            data_processor_logger.error(f"Failed to download the image, idx: {idx}, URL: {url}, error: {e}")

    raise Exception(f"Failed to download the image from URL: {url}")


def base64_to_pil_image(base64_string):
    """base64_to_pil_image"""
    image_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(image_bytes)
    pil_image = Image.open(buffer)
    return pil_image


def is_public_url(url):
    """Check whether the URL points to a public network address"""
    try:
        # parse the URL
        parsed_url = urlparse(url)
        hostname = parsed_url.hostname
        if hostname is None:
            return False
        # try to resolve the hostname to an IP address
        ip_address = socket.gethostbyname(hostname)
        # convert to an IP address object
        ip_obj = ipaddress.ip_address(ip_address)
        # check whether it is a private or reserved IP address
        if ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_link_local or ip_obj.is_reserved:
            return False
        else:
            return True
    except Exception as e:
        print(f"Error checking URL: {e}")
        return False


def process_transparency(image):
    """ process transparency. """
    def _is_transparent(image):
        # check whether the image has an alpha channel
        if image.mode in ('RGBA', 'LA') or (image.mode == 'P' and 'transparency' in image.info):
            # get the alpha channel
            alpha = image.convert('RGBA').split()[-1]
            # if the alpha channel contains values below 255, the image has transparent parts
            if alpha.getextrema()[0] < 255:
                return True
        return False

    def _convert_transparent_paste(image):
        width, height = image.size
        new_image = Image.new("RGB", (width, height), (255, 255, 255))  # white background image
        new_image.paste(image, (0, 0), image)
        return new_image

    try:
        if _is_transparent(image):  # Check and fix transparent images
            data_processor_logger.info("Image has transparent background, adding white background.")
            image = _convert_transparent_paste(image)
    except Exception:
        pass

    return ImageOps.exif_transpose(image)
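process_image_data dispatches on the MIME type first and falls back to the URL suffix; a sketch of how the dispatch resolves (assumed inputs):

# Sketch (assumed inputs):
#   process_image_data(buf, "image/heic", "a.bin")   -> pyheif path (MIME type wins)
#   process_image_data(buf, None, "scan.pdf")        -> pdf2image path (suffix fallback)
#   process_image_data(buf, "image/png", "a.png")    -> plain Image.open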
241
fastdeploy/input/multimodal/video.py
Normal file
@@ -0,0 +1,241 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import base64
|
||||
from functools import partial
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from PIL import Image
|
||||
|
||||
from .base import MediaIO
|
||||
from .image import ImageMediaIO
|
||||
|
||||
|
||||
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
    """
    Resize video frames so that every frame matches the given height and width.

    Args:
        frames (npt.NDArray, shape=(N, H, W, C)): Array of N frames, where H is
            the height, W the width and C the number of channels. All frames
            must share the same number of channels.
        size (tuple[int, int]): Target (height, width).

    Returns:
        npt.NDArray, shape=(N, new_height, new_width, C): A new array in which
            every frame has been resized to the requested height and width.
            The channel count is unchanged.
    """
    num_frames, _, _, channels = frames.shape
    new_height, new_width = size
    resized_frames = np.empty((num_frames, new_height, new_width, channels),
                              dtype=frames.dtype)
    # lazy import cv2 to avoid bothering users who only use text models
    import cv2
    for i, frame in enumerate(frames):
        resized_frame = cv2.resize(frame, (new_width, new_height))
        resized_frames[i] = resized_frame
    return resized_frames


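# Illustrative sketch (not part of the original commit): resizing a tiny
# synthetic clip. Requires opencv-python, which resize_video imports lazily.
def _demo_resize_video():
    clip = np.zeros((8, 64, 48, 3), dtype=np.uint8)  # 8 frames of 64x48 RGB
    resized = resize_video(clip, (32, 24))
    assert resized.shape == (8, 32, 24, 3)

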
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """
    Rescale video frames, multiplying the height and width of every frame by a factor.

    Args:
        frames (npt.NDArray): Array of shape (T, H, W, C) holding T frames of
            height H, width W and C channels.
        size_factor (float): Scale factor; the new height and width are the
            original height and width multiplied by size_factor.

    Returns:
        npt.NDArray: Array of shape (T, new_H, new_W, C), where new_H and new_W
            are computed from size_factor.
    """
    _, height, width, _ = frames.shape
    new_height = int(height * size_factor)
    new_width = int(width * size_factor)

    return resize_video(frames, (new_height, new_width))


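# Illustrative sketch (not part of the original commit): halving the resolution
# of a clip with rescale_video_size.
def _demo_rescale_video_size():
    clip = np.zeros((4, 64, 48, 3), dtype=np.uint8)
    halved = rescale_video_size(clip, 0.5)
    assert halved.shape == (4, 32, 24, 3)

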
def sample_frames_from_video(frames: npt.NDArray,
                             num_frames: int) -> npt.NDArray:
    """
    Uniformly sample a given number of frames from a video and return them as a
    numpy array.

    Args:
        frames (npt.NDArray): Array of shape (T, H, W, C) holding all frames of
            the video, where T is the total frame count, H and W the frame
            height and width, and C the number of channels.
        num_frames (int): Number of frames to sample. If set to -1, all frames
            are returned.

    Returns:
        npt.NDArray: Array of shape (num_frames, H, W, C) holding the sampled
            frames. If num_frames == -1, the original frames are returned
            unchanged.
    """
    total_frames = frames.shape[0]
    if num_frames == -1:
        return frames

    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    sampled_frames = frames[frame_indices, ...]
    return sampled_frames


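# Illustrative sketch (not part of the original commit): uniform sampling keeps
# the first and last frames and spaces the rest evenly over the clip.
def _demo_sample_frames_from_video():
    clip = np.arange(10, dtype=np.uint8).reshape(10, 1, 1, 1) * np.ones(
        (10, 2, 2, 3), dtype=np.uint8)
    picked = sample_frames_from_video(clip, 4)
    assert picked.shape == (4, 2, 2, 3)
    assert picked[0, 0, 0, 0] == 0 and picked[-1, 0, 0, 0] == 9  # indices 0, 3, 6, 9

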
class VideoMediaIO(MediaIO[npt.NDArray]):

    def __init__(
        self,
        image_io: ImageMediaIO,
        *,
        num_frames: int = 32,
    ) -> None:
        """
        Initialize a VideoMediaIO object.

        Args:
            image_io (ImageMediaIO): ImageMediaIO object used to read and write
                individual frames.
            num_frames (int, optional): Number of frames to keep per video.
                Defaults to 32.

        Raises:
            TypeError: If image_io is not an ImageMediaIO.
            ValueError: If num_frames is less than or equal to 0.
        """
        super().__init__()

        self.image_io = image_io
        self.num_frames = num_frames

    def load_bytes(self, data: bytes) -> npt.NDArray:
        """
        Load video frames from raw bytes and return them as a numpy ndarray.
        If the video holds more frames than `num_frames`, the frames are
        sampled uniformly over the clip; otherwise all frames are returned.

        Args:
            data (bytes): Byte buffer holding the encoded video.

        Returns:
            npt.NDArray, shape=(num_frames, height, width, channels): Array
                holding the decoded frames.
        """
        import decord
        vr = decord.VideoReader(BytesIO(data), num_threads=1)
        total_frame_num = len(vr)

        num_frames = self.num_frames
        if total_frame_num > num_frames:
            uniform_sampled_frames = np.linspace(0,
                                                 total_frame_num - 1,
                                                 num_frames,
                                                 dtype=int)
            frame_idx = uniform_sampled_frames.tolist()
        else:
            frame_idx = list(range(0, total_frame_num))

        return vr.get_batch(frame_idx).asnumpy()

    def load_base64(self, media_type: str, data: str) -> npt.NDArray:
        """
        Load base64-encoded data and return a numpy ndarray.

        Args:
            media_type (str): Media type. For "video/jpeg" the payload is a
                comma-separated list of base64-encoded JPEG frames, each of
                which is decoded individually and stacked into an array. Any
                other media type is treated as an encoded video and passed to
                load_bytes.
            data (str): Base64-encoded payload.

        Returns:
            npt.NDArray: The decoded video frames.
        """
        if media_type.lower() == "video/jpeg":
            load_frame = partial(
                self.image_io.load_base64,
                "image/jpeg",
            )

            return np.stack([
                np.array(load_frame(frame_data))
                for frame_data in data.split(",")
            ])

        return self.load_bytes(base64.b64decode(data))

    def load_file(self, filepath: Path) -> npt.NDArray:
        """
        Read a video file and convert its contents to a numpy array.

        Args:
            filepath (Path): Path of the file to read.

        Returns:
            npt.NDArray: Array holding the decoded frames.
        """
        with filepath.open("rb") as f:
            data = f.read()

        return self.load_bytes(data)

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        """
        Encode a video as base64 strings, one per frame, joined with ",".
        With the "JPEG" format every frame is converted to a JPEG image before
        encoding.

        Args:
            media (npt.NDArray): Video to encode, of shape (T, H, W, C) where T
                is the number of frames, H and W the height and width, and C
                the channel count.
            video_format (str, optional): Target format; only "JPEG" is
                supported. Defaults to "JPEG".

        Raises:
            NotImplementedError: If video_format is not "JPEG".

        Returns:
            str: Comma-joined base64 strings, one per frame.
        """
        video = media

        if video_format == "JPEG":
            encode_frame = partial(
                self.image_io.encode_base64,
                image_format=video_format,
            )

            return ",".join(
                encode_frame(Image.fromarray(frame)) for frame in video)

        msg = "Only JPEG format is supported for now."
        raise NotImplementedError(msg)


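# Illustrative sketch (not part of the original commit): a round trip through
# the "video/jpeg" base64 path. It assumes ImageMediaIO can be constructed
# without arguments and exposes the load_base64/encode_base64 methods used above.
def _demo_video_media_io():
    video_io = VideoMediaIO(ImageMediaIO(), num_frames=4)
    clip = np.zeros((4, 8, 8, 3), dtype=np.uint8)
    payload = video_io.encode_base64(clip)  # "b64frame0,b64frame1,..."
    restored = video_io.load_base64("video/jpeg", payload)
    assert restored.shape == clip.shape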
59
fastdeploy/input/preprocess.py
Normal file
@@ -0,0 +1,59 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from fastdeploy.engine.config import ModelConfig


class InputPreprocessor:
    """
    Args:
        model_name_or_path (str):
            Model name or path to the pretrained model. If a model name is provided, it should be a
            key in the Hugging Face Transformers' model registry (https://huggingface.co/models).
            The model will be downloaded from the Hugging Face model hub if necessary.
            If a path is provided, the model will be loaded from that path.
        enable_mm (bool, optional):
            Whether to use the multi-modal model processor. Defaults to False.

    Raises:
        ValueError:
            If the model name is not found in the Hugging Face Transformers' model registry and the path does not
            exist.
    """

    def __init__(
        self,
        model_name_or_path: str,
        enable_mm: bool = False,
    ) -> None:
        self.model_name_or_path = model_name_or_path
        self.enable_mm = enable_mm

    def create_processor(self):
        """
        Create the data processor. Currently a text DataProcessor is always
        built from the model name or path passed to the constructor; the
        architectures read from the model config are reserved for dispatching
        to a multi-modal processor.

        Returns:
            DataProcessor: the data processor.
        """
        architectures = ModelConfig(self.model_name_or_path).architectures
        from fastdeploy.input.text_processor import DataProcessor
        self.processor = DataProcessor(model_name_or_path=self.model_name_or_path)
        return self.processor


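# Illustrative sketch (not part of the original commit): building a processor
# for a local checkpoint. The path is a placeholder for any directory that
# ModelConfig and the tokenizer can load from.
def _demo_create_processor():
    preprocessor = InputPreprocessor("/path/to/your/model", enable_mm=False)
    processor = preprocessor.create_processor()
    print(type(processor).__name__)  # DataProcessor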
533
fastdeploy/input/text_processor.py
Normal file
@@ -0,0 +1,533 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import os
from abc import ABC, abstractmethod

import numpy as np
from paddlenlp.generation import GenerationConfig
from paddlenlp.transformers import Llama3Tokenizer, LlamaTokenizer

from fastdeploy.utils import data_processor_logger


class BaseDataProcessor(ABC):
    """base class for data processor"""

    def __init__(self):
        """
        Returns:
            None
        """
        self.tokenizer = self._load_tokenizer()
        self.tokenizer.bos_token_id = self.tokenizer._convert_token_to_id(
            self.tokenizer.bos_token)
        self.tokenizer.cls_token_id = self.tokenizer._convert_token_to_id(
            self.tokenizer.cls_token)
        self.tokenizer.sep_token_id = self.tokenizer._convert_token_to_id(
            self.tokenizer.sep_token)
        self.tokenizer.eos_token_id = self.tokenizer._convert_token_to_id(
            self.tokenizer.eos_token)
        self.tokenizer.mask_token_id = self.tokenizer._convert_token_to_id(
            self.tokenizer.mask_token)
        # Concatenate the f-strings into one message; the original passed a
        # tuple to the logger, which would log the tuple repr instead.
        data_processor_logger.info(
            f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, "
            f"cls_token is {self.tokenizer.cls_token}, {self.tokenizer.cls_token_id}, "
            f"sep_token is {self.tokenizer.sep_token}, {self.tokenizer.sep_token_id}, "
            f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}, "
            f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}")

    @abstractmethod
    def process_request(self, request, **kwargs):
        """
        Preprocess the request.

        Args:
            request (Dict): may contain text and messages fields
            **kwargs: others

        Returns:
            the preprocessed request
        """
        raise NotImplementedError

    @abstractmethod
    def process_response(self, response_dict):
        """
        Postprocess the response.

        Args:
            response_dict (Dict): response from the engine, containing ids fields

        Returns:
            Dict: response containing text fields
        """
        raise NotImplementedError

    def text2ids(self, text, max_model_len=None):
        """
        Convert text to token ids.

        Args:
            text (str): the input text
            max_model_len (int, optional): maximum sequence length of the model

        Returns:
            List[int]: token ids list
        """
        raise NotImplementedError

    def messages2ids(self, messages):
        """
        Convert multi-turn messages into ID sequences.

        Args:
            messages (List[List[Dict[str, Any]]]): multi-turn messages.

        Returns:
            List[int]: ID sequences
        """
        raise NotImplementedError

    def ids2tokens(self, token_id, task_id=None):
        """
        Convert token ids to strings.

        Args:
            token_id (List[int]): token ids
            task_id (str): task id

        Returns:
            List[str]: strings
        """
        raise NotImplementedError

    @abstractmethod
    def _load_tokenizer(self):
        """
        Load the tokenizer.

        Returns:
            tokenizer (AutoTokenizer)
        """
        raise NotImplementedError


class DataProcessor(BaseDataProcessor):

    def __init__(self, model_name_or_path):
        """
        Initializes the DataProcessor object.

        Args:
            model_name_or_path (str): The name or path of the pre-trained model to be loaded.
                Can also be a path to a directory containing the pre-trained model file.
        """
        self.model_name_or_path = model_name_or_path
        self._init_config()

        self.decode_status = dict()
        self.tokenizer = self._load_tokenizer()
        data_processor_logger.info(
            f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, "
            f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}")

        from paddlenlp.trl.llm_utils import get_eos_token_id

        self.eos_token_ids = get_eos_token_id(self.tokenizer,
                                              self.generation_config)
        self.eos_token_id_len = len(self.eos_token_ids)
        self.pad_token_id = self.get_pad_id()
        self.tokenizer.pad_token_id = self.pad_token_id

    def _init_config(self):
        """
        Initialize the configuration, e.g. whether to use the Hugging Face
        tokenizer and the generation config, reading some settings from
        environment variables.
        """
        self.use_hf_tokenizer = int(os.getenv("USE_HF_TOKENIZER", "0")) == 1

        # Generation config
        try:
            self.generation_config = GenerationConfig.from_pretrained(
                self.model_name_or_path)
        except Exception as e:
            data_processor_logger.warning(
                f"Can't find generation config: {e}, so it will not use generation_config field in the model config"
            )
            self.generation_config = None

    def process_request(self, request, max_model_len=None):
        """
        Preprocess the request.

        Args:
            request: may contain prompt, messages, eos_token_ids and stop fields
            max_model_len (int, optional): maximum sequence length of the model

        Returns:
            the preprocessed request
        """
        if request.get("eos_token_ids") is None or len(
                request.eos_token_ids) == 0:
            request.eos_token_ids = self.eos_token_ids

        stop_sequences = request.get("stop", [])
        if stop_sequences is not None and len(stop_sequences) != 0:
            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
            request.set("stop_token_ids", stop_seqs)
            request.set("stop_seqs_len", stop_seqs_len)

        if request.prompt_token_ids is None or len(
                request.prompt_token_ids) == 0:
            if request.prompt is not None:
                request.prompt_token_ids = self.text2ids(
                    request.prompt, max_model_len, request.raw_request)
            elif request.messages is not None:
                if self.tokenizer.chat_template is None:
                    raise ValueError(
                        "This model does not support chat_template.")
                request.prompt_token_ids = self.messages2ids(request.messages)
            else:
                raise ValueError(
                    f"The request should have `input_ids`, `text` or `messages`: {request}."
                )

        # Truncate prompts that exceed the length limit
        if max_model_len is not None and len(
                request.prompt_token_ids) > max_model_len:
            request.prompt_token_ids = request.prompt_token_ids[:max_model_len - 1]
        return request

    def process_request_dict(self, request, max_model_len=None):
        """
        Preprocess the request dict.

        Args:
            request (Dict): may contain text and messages fields
            max_model_len (int, optional): maximum sequence length of the model

        Returns:
            Dict: the preprocessed request
        """
        if not request.get('eos_token_ids'):
            request['eos_token_ids'] = self.eos_token_ids

        # Handle stop sequences
        stop_sequences = request.get('stop', [])
        if stop_sequences:
            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
            request['stop_token_ids'] = stop_seqs
            request['stop_seqs_len'] = stop_seqs_len

        # Build prompt_token_ids
        if not request.get('prompt_token_ids'):
            if 'prompt' in request:
                raw_request = request.get('raw_request', True)
                request['prompt_token_ids'] = self.text2ids(
                    request['prompt'], max_model_len, raw_request).tolist()
            elif 'messages' in request:
                if self.tokenizer.chat_template is None:
                    raise ValueError(
                        "This model does not support chat_template.")
                request['prompt_token_ids'] = self.messages2ids(
                    request['messages']).tolist()
            else:
                raise ValueError(
                    f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
                )

        # Truncate prompts that exceed the length limit
        if max_model_len is not None and len(
                request['prompt_token_ids']) > max_model_len:
            request['prompt_token_ids'] = request['prompt_token_ids'][:max_model_len - 1]

        return request

    def process_response(self, response_dict, **kwargs):
        """
        Postprocess the response.

        Args:
            response_dict (Dict): response from the engine, containing ids fields

        Returns:
            Dict: response containing text fields
        """
        is_end = response_dict.finished
        req_id = response_dict.request_id

        token_ids = response_dict.outputs.token_ids
        response_dict.outputs.text = self.ids2tokens(token_ids, req_id)
        response_dict.usage = {
            "completion_tokens": response_dict.outputs.index + 1
        }

        if is_end:
            # Log the request id (the original formatted token_ids here) and
            # clear the decode state exactly once; the original decoded and
            # cleared twice.
            data_processor_logger.debug(
                "Request id: {} has been completed.".format(req_id))
            self.clear_request_status(req_id)
        return response_dict

    def process_response_dict(self, response_dict, stream=True):
        """
        Postprocess the response dict.

        Args:
            response_dict (Dict): response from the engine, containing ids fields

        Returns:
            Dict: response containing text fields
        """
        is_end = response_dict["finished"]
        req_id = response_dict["request_id"]

        token_ids = response_dict["outputs"]["token_ids"]

        if is_end:
            data_processor_logger.debug(
                "Request id: {} has been completed.".format(req_id))
            full_text = self.clear_request_status(req_id)
            if not stream:
                response_dict["outputs"]["text"] = full_text
            else:
                response_dict["outputs"]["text"] = ""
        else:
            response_dict["outputs"]["text"] = self.ids2tokens(
                token_ids, req_id)
        return response_dict

    def text2ids(self, text, max_model_len, raw_request=True):
        """
        Convert text to token ids.

        Args:
            text (str): the input text
            max_model_len (int): maximum sequence length of the model
            raw_request (bool): whether the text comes from a raw request, in
                which case the chat template, if any, is applied first

        Returns:
            List[int]: token ids list
        """
        if self.use_hf_tokenizer:
            tokens = self.tokenizer(
                text,
                return_tensors="np",
                padding=True,
                truncation=True,
            )
        else:
            if not raw_request or self.tokenizer.chat_template is None:
                text = [text] if isinstance(text, str) else text
                chat_template = False
            elif self.tokenizer.chat_template is not None:
                text = [text] if isinstance(text, str) else text
                text = [
                    self.tokenizer.apply_chat_template(sentence,
                                                       tokenize=False)
                    for sentence in text
                ]
                chat_template = True
            tokens = self.tokenizer(
                text,
                return_tensors="np",
                padding=True,
                truncation=True,
                max_length=max_model_len,
                add_special_tokens=chat_template,
            )
        return tokens["input_ids"][0]

    def messages2ids(self, messages):
        """
        Convert multi-turn messages into ID sequences.

        Args:
            messages (List[List[Dict[str, Any]]]): multi-turn messages.

        Returns:
            List[int]: ID sequences
        """
        message_result = self.tokenizer.apply_chat_template(
            messages, return_tensors="pd")
        return np.array(message_result["input_ids"][0])

    def ids2tokens(self, token_id, task_id):
        """
        Convert token ids to strings, decoding incrementally per task.

        Args:
            token_id (List[int]): newly generated token ids
            task_id (str): task id

        Returns:
            str: the newly decoded text for this step
        """
        if self.use_hf_tokenizer:
            if task_id not in self.decode_status:
                # history token ids & history token strings & previously decoded string
                self.decode_status[task_id] = [[], [], ""]

            previous_token_ids = self.decode_status[task_id][0]
            decode_str = self.tokenizer.batch_decode(
                [previous_token_ids + token_id],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False)
            if isinstance(decode_str, list) and len(decode_str):
                new_str = decode_str[0].replace(self.decode_status[task_id][2],
                                                "", 1)
                self.decode_status[task_id][1].append(new_str)
                self.decode_status[task_id][2] = decode_str[0]
            else:
                new_str = ""
            self.decode_status[task_id][0] += token_id
            return new_str
        else:
            if task_id not in self.decode_status:
                # prefix offset & read offset & history token ids & history token strings
                self.decode_status[task_id] = [0, 0, [], []]

            prefix_offset = self.decode_status[task_id][0]
            read_offset = self.decode_status[task_id][1]
            previous_token_ids = self.decode_status[task_id][2]
            decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
                previous_token_ids + token_id, prefix_offset, read_offset)
            self.decode_status[task_id][0] = prefix_offset
            self.decode_status[task_id][1] = read_offset
            self.decode_status[task_id][2] += token_id
            self.decode_status[task_id][3].append(decode_str)
            return decode_str

    def _load_tokenizer(self):
        """
        Load the tokenizer.

        Returns:
            tokenizer (AutoTokenizer)
        """
        if self.use_hf_tokenizer:
            from transformers import AutoTokenizer
            return AutoTokenizer.from_pretrained(self.model_name_or_path,
                                                 use_fast=False)
        else:
            from paddlenlp.transformers import AutoTokenizer
            return AutoTokenizer.from_pretrained(self.model_name_or_path,
                                                 padding_side="left",
                                                 use_fast=True)

    def clear_request_status(self, task_id):
        """
        Clear the decode status of a request and return the accumulated text.

        Args:
            task_id (str): task id

        Returns:
            results_all (str): all decoded token strings for the request
        """
        results_all = ""
        if task_id in self.decode_status:
            if self.use_hf_tokenizer:
                results_all = self.decode_status[task_id][2]
            else:
                results_all = "".join(self.decode_status[task_id][3])
            del self.decode_status[task_id]
        return results_all

    def get_pad_id(self):
        """
        Get pad_token_id; if there is no pad_token_id, fall back to eos_token_id.

        Returns:
            int: pad_token_id
        """
        if isinstance(self.tokenizer,
                      (LlamaTokenizer,
                       Llama3Tokenizer)) and not self.tokenizer.pad_token_id:
            # Return the id, not the token string, as documented above.
            return self.tokenizer.eos_token_id
        return self.tokenizer.pad_token_id

    def pad_batch_data(self,
                       insts,
                       pad_id=0,
                       return_seq_len=False,
                       return_array=True,
                       pad_style="right"):
        """Pad the instances to the max sequence length in batch."""
        if len(insts) == 0:
            padded_insts = np.array([[]],
                                    dtype=np.int64) if return_array else [[]]
            if return_seq_len:
                seq_len = np.array([], dtype=np.int64) if return_array else []
                return padded_insts, seq_len
            return padded_insts

        max_len = max(map(len, insts))
        if pad_style == "left":
            padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst)
                            for inst in insts]
        else:
            padded_insts = [
                list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts
            ]
        if return_array:
            padded_insts = np.array(padded_insts,
                                    dtype=np.int64).reshape([-1, max_len])

        if return_seq_len:
            seq_len = [len(inst) for inst in insts]
            if return_array:
                seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1)
            return padded_insts, seq_len
        return padded_insts

    def update_stop_seq(self, stop_sequences):
        """
        Update stop sequences from the request, tokenizing each stop string and
        padding the resulting id lists to a common length.
        """
        stop_seqs = []
        for seq in stop_sequences:
            if seq != self.tokenizer.eos_token_id:
                stop_seqs.append(
                    self.tokenizer.convert_tokens_to_ids(
                        self.tokenizer.tokenize(seq)))
        stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs,
                                                       pad_id=-1,
                                                       return_seq_len=True,
                                                       return_array=False)
        data_processor_logger.debug(
            f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
        return stop_seqs, stop_seqs_len

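# Illustrative sketch (not part of the original commit): end-to-end request
# preprocessing with a dict request. The model path is a placeholder; field
# names follow process_request_dict above.
def _demo_process_request_dict():
    processor = DataProcessor("/path/to/your/model")
    request = {"request_id": "req-0", "prompt": "Hello!", "stop": ["\n\n"]}
    request = processor.process_request_dict(request, max_model_len=1024)
    # The request now also carries eos_token_ids, stop_token_ids, stop_seqs_len
    # and prompt_token_ids (a plain list of ints, truncated to max_model_len - 1).
    print(request["prompt_token_ids"])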