[LLM] First commit the llm deployment code

jiangjiajun
2025-06-09 19:20:15 +08:00
committed by XieYunshen
parent 8513414112
commit 149c79699d
11814 changed files with 127294 additions and 1293102 deletions

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,269 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from paddlenlp.transformers import AddedToken, PretrainedTokenizer
from paddlenlp.utils import logger
__all__ = ["ErnieBotTokenizer"]
VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {},
"tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
class ErnieBotTokenizer(PretrainedTokenizer):
"""
Construct an ErnieBot tokenizer, based on SentencePiece.
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
vocab_files_names = VOCAB_FILES_NAMES
resource_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
**kwargs,
):
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
verbose=False,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
# for eb35 reader
self.bos_id = self.bos_token_id
self.eos_id = self.eos_token_id
self.sep_id = self.sep_token_id
self.pad_id = self.pad_token_id
self.unk_id = self.unk_token_id
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
def tokenize(self, text):
"""Returns a tokenized string."""
return self._tokenize(text)
def _tokenize(self, text):
"""Returns a tokenized string."""
return self.sp_model.encode(text, out_type=str)
def decode(self,
tokens,
skip_special_tokens=False,
clean_up_tokenization_spaces=False):
"""Returns a tokenized string."""
return self.sp_model.decode(tokens)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special and i != 0:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(
f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") +
VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file, )
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
""" build_inputs_with_special_tokens """
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True)
bos_token_id = [1] if self.add_bos_token else []
eos_token_id = [1] if self.add_eos_token else []
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
def create_token_type_ids_from_sequences(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Creates a mask from the two sequences passed, to be used in a sequence-pair classification task. A
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
if token_ids_1 is not None:
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
return output
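
A minimal usage sketch for ErnieBotTokenizer (illustrative, not part of the commit); "./spm.model" is a placeholder path to a real SentencePiece model file:

tok = ErnieBotTokenizer(vocab_file="./spm.model")  # placeholder path
ids = tok.convert_tokens_to_ids(tok.tokenize("Hello world"))
ids = tok.build_inputs_with_special_tokens(ids)  # add_bos_token=True by default
assert ids[0] == tok.bos_token_id                # bos prefix added, no eos by default
print(tok.decode(ids))                           # round-trip through sp_model.decode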

View File

@@ -0,0 +1,23 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .process import DataProcessor, fancy_print, IDS_TYPE_FLAG
__all__ = [
'DataProcessor',
'fancy_print',
'IDS_TYPE_FLAG',
]

View File

@@ -0,0 +1,20 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .get_image_preprocessor import get_image_preprocessor
from .image_preprocessor_adaptive import AdaptiveImageProcessor
__all__ = ['get_image_preprocessor', 'AdaptiveImageProcessor']

View File

@@ -0,0 +1,33 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""get image preprocessor"""
from .image_preprocessor_adaptive import AdaptiveImageProcessor
from fastdeploy.utils import data_processor_logger
def get_image_preprocessor(args):
"""
get_image_preprocessor from args
"""
if args.vision_model_name_or_path is None:
return None
data_processor_logger.info("use AdaptiveImageProcessor")
image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
return image_preprocess
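
get_image_preprocessor only reads one attribute from args, so any object exposing vision_model_name_or_path works; a minimal sketch with a placeholder checkpoint path:

from types import SimpleNamespace
args = SimpleNamespace(vision_model_name_or_path="./vision_model")  # placeholder dir
preprocessor = get_image_preprocessor(args)  # AdaptiveImageProcessor, or None when the path is None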

View File

@@ -0,0 +1,568 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""image preprocessor adaptive"""
import math
from typing import List, Optional, Union
import numpy as np
import paddle
import PIL
from paddlenlp.transformers.feature_extraction_utils import BatchFeature
from paddlenlp.transformers.image_processing_utils import BaseImageProcessor
from paddlenlp.transformers.image_transforms import (
convert_to_rgb,
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from paddlenlp.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_valid_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddlenlp.transformers.tokenizer_utils_base import (
TensorType,
)
from PIL import Image
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200
VIDEO_MIN_PIXELS = 128 * 28 * 28
VIDEO_MAX_PIXELS = 768 * 28 * 28
VIDEO_TOTAL_PIXELS = 24576 * 28 * 28
FRAME_FACTOR = 2
FPS = 2.0
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768
VideoInput = Union[
List["PIL.Image.Image"],
"np.ndarray",
"paddle.Tensor",
List["np.ndarray"],
List["paddle.Tensor"],
List[List["PIL.Image.Image"]],
List[List["np.ndarrray"]],
List[List["paddle.Tensor"]],
]
__all__ = [
"AdaptiveImageProcessor",
]
def is_scaled_image(image: np.ndarray) -> bool:
"""
Checks to see whether the pixel values have already been rescaled to [0, 1].
"""
if image.dtype == np.uint8:
return False
# It's possible the image has pixel values in [0, 255] but is of floating type
return np.min(image) >= 0 and np.max(image) <= 1
def make_batched_images(images) -> List[List[ImageInput]]:
"""
Accepts images in list or nested list format, and makes a list of images for preprocessing.
Args:
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
The input image.
Returns:
list: A list of images.
"""
if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
return [img for img_list in images for img in img_list]
elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
return images
elif is_valid_image(images):
return [images]
raise ValueError(f"Could not make batched images from {images}")
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
def make_batched_videos(videos) -> List[VideoInput]:
"""dummy"""
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
if isinstance(videos[0], Image.Image):
return [videos]
elif len(videos[0].shape) == 4:
return [list(video) for video in videos]
elif is_valid_image(videos) and len(videos.shape) == 4:
return [list(videos)]
raise ValueError(f"Could not make batched video from {videos}")
class AdaptiveImageProcessor(BaseImageProcessor):
r"""
Constructs an adaptive image processor that dynamically resizes images based on their original size.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
in the image.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
min_pixels (`int`, *optional*, defaults to `56 * 56`):
The min pixels of the image to resize the image.
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
The max pixels of the image to resize the image.
patch_size (`int`, *optional*, defaults to 14):
The spatial patch size of the vision encoder.
temporal_conv_size (`int`, *optional*, defaults to 2):
The temporal conv size in resampler.
merge_size (`int`, *optional*, defaults to 2):
The merge size of the vision encoder to llm encoder.
"""
model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
def __init__(
self,
do_resize: bool = True,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = True,
min_pixels: int = 56 * 56,
max_pixels: int = 28 * 28 * 1280,
patch_size: int = 14,
temporal_conv_size: int = 2,
merge_size: int = 2,
**kwargs,
) -> None:
"""init"""
super().__init__(**kwargs)
self.do_resize = do_resize
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.patch_size = patch_size
self.temporal_conv_size = temporal_conv_size
self.merge_size = merge_size
self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
self.do_convert_rgb = do_convert_rgb
def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
"""设定pixels"""
if min_pixels is not None:
assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be a non-negative int"
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
self.min_pixels = min_pixels
self.size["min_pixels"] = int(min_pixels)
if max_pixels is not None:
assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be a positive int"
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
self.max_pixels = max_pixels
self.size["max_pixels"] = int(max_pixels)
def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
"""dummy"""
actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
resized_height, resized_width = smart_resize(
height,
width,
factor=self.patch_size * self.merge_size,
min_pixels=actual_min_pixels,
max_pixels=actual_max_pixels,
)
return (resized_height, resized_width), (resized_height // self.patch_size, resized_width // self.patch_size)
def _preprocess(
self,
images: Union[ImageInput, VideoInput],
do_resize: bool = True,
resample: PILImageResampling = None,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = False,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
predetermined_grid_thw=None,
):
"""
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
Args:
images (`ImageInput`):
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
If pixel values range from 0 to 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Scale factor to use if rescaling the image.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Mean to use if normalizing the image.
Can be a float or a list of floats corresponding to the number of channels in the image.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Standard deviation to use if normalizing the image.
Can be a float or a list of floats corresponding to the number of channels in the image.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
images = make_list_of_images(images)
if do_convert_rgb:
images = [convert_to_rgb(image) for image in images]
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if is_scaled_image(images[0]) and do_rescale:
data_processor_logger.warning(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
height, width = get_image_size(images[0], channel_dim=input_data_format)
resized_height, resized_width = height, width
processed_images = []
if predetermined_grid_thw is not None:
assert len(predetermined_grid_thw) == len(
images
), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
for img_idx, image in enumerate(images):
if do_resize:
if predetermined_grid_thw is not None:
(resized_height, resized_width) = predetermined_grid_thw[img_idx]
resized_height *= self.patch_size
resized_width *= self.patch_size
else:
resized_height, resized_width = smart_resize(
height,
width,
factor=self.patch_size * self.merge_size,
min_pixels=self.min_pixels,
max_pixels=self.max_pixels,
)
image = image.astype("uint8")  # TODO: cast to uint8 manually; otherwise the image is divided by 255 twice and the result is wrong
# Build with Image.fromarray directly instead of the paddlenlp helper
image = Image.fromarray(image)
image = resize(
image,
size=(resized_height, resized_width),
resample=resample,
data_format=input_data_format,
)
if do_rescale:
image = rescale(image, scale=rescale_factor, data_format=input_data_format)
if do_normalize:
image = normalize(image=image, mean=image_mean, std=image_std, data_format=input_data_format)
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
processed_images.append(image)
patches = np.array(processed_images)
if data_format == ChannelDimension.LAST:
patches = patches.transpose([0, 3, 1, 2])
channel = patches.shape[1] # [time, C, H, W]
grid_t = patches.shape[0]
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
patches = patches.reshape(
[
grid_t,
channel,
grid_h // self.merge_size,
self.merge_size,
self.patch_size,
grid_w // self.merge_size,
self.merge_size,
self.patch_size,
]
)
# [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
flatten_patches = patches.reshape(
[grid_t * grid_h * grid_w, channel * self.patch_size * self.patch_size]
) # [grid_t * grid_h * grid_w, C * psz * psz]
return flatten_patches, (grid_t, grid_h, grid_w)
def preprocess(
self,
images: ImageInput,
videos: VideoInput = None,
do_resize: bool = True,
size: Optional[Union[int, List[int]]] = None,
resample: PILImageResampling = None,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
predetermined_grid_thw=None,
):
"""
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
videos (`VideoInput`):
Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
`True`.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.PADDLE` or `'pd'`: Return a batch of type `paddle.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
if images is not None:
images = make_batched_images(images)
if videos is not None:
videos = make_batched_videos(videos)
if images is not None and not valid_images(images):
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
if images is not None:
pixel_values, vision_grid_thws = [], []
for img_idx, image in enumerate(images):
if predetermined_grid_thw is not None:
predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
else:
predetermined_grid_thw_one = None
patches, image_grid_thw = self._preprocess(
image,
do_resize=do_resize,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
predetermined_grid_thw=predetermined_grid_thw_one,
)
pixel_values.extend(patches)
vision_grid_thws.append(image_grid_thw)
pixel_values = np.array(pixel_values)
vision_grid_thws = np.array(vision_grid_thws)
data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
if videos is not None:
pixel_values, vision_grid_thws = [], []
for images in videos:
patches, video_grid_thw = self._preprocess(
images,
do_resize=do_resize,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
predetermined_grid_thw=predetermined_grid_thw,
)
pixel_values.extend(patches)
vision_grid_thws.append(video_grid_thw)
pixel_values = np.array(pixel_values)
vision_grid_thws = np.array(vision_grid_thws)
data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
return BatchFeature(data=data, tensor_type=return_tensors)
def round_by_factor(number: int, factor: int) -> int:
"""Returns the closest integer to 'number' that is divisible by 'factor'."""
return round(number / factor) * factor
def ceil_by_factor(number: int, factor: int) -> int:
"""Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
return math.ceil(number / factor) * factor
def floor_by_factor(number: int, factor: int) -> int:
"""Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
return math.floor(number / factor) * factor
def smart_resize(
height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
):
"""
Rescales the image so that the following conditions are met:
1. Both dimensions (height and width) are divisible by 'factor'.
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
"""
if max(height, width) / min(height, width) > MAX_RATIO:
if height > width:
new_width = max(factor, round_by_factor(width, factor))
new_height = floor_by_factor(new_width * MAX_RATIO, factor)
else:
new_height = max(factor, round_by_factor(height, factor))
new_width = floor_by_factor(new_height * MAX_RATIO, factor)
data_processor_logger.info(
f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}, "
f"resize to {max(new_height, new_width) / min(new_height, new_width)}"
)
height = new_height
width = new_width
h_bar = max(factor, round_by_factor(height, factor))
w_bar = max(factor, round_by_factor(width, factor))
if h_bar * w_bar > max_pixels:
beta = math.sqrt((height * width) / max_pixels)
h_bar = floor_by_factor(height / beta, factor)
w_bar = floor_by_factor(width / beta, factor)
elif h_bar * w_bar < min_pixels:
beta = math.sqrt(min_pixels / (height * width))
h_bar = ceil_by_factor(height * beta, factor)
w_bar = ceil_by_factor(width * beta, factor)
if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
return h_bar, w_bar
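
A worked example of smart_resize under the module defaults (factor=28 and the MIN_PIXELS/MAX_PIXELS constants above); expected values are computed by hand from the code:

# 1000x650: each side rounds to the nearest multiple of 28 and the area stays in bounds.
assert smart_resize(1000, 650) == (1008, 644)  # 36*28 and 23*28
# 8000x8000 exceeds MAX_PIXELS, so both sides shrink by beta = sqrt(h*w / MAX_PIXELS)
# and are floored to a multiple of 28: 128*28 = 3584, and 3584*3584 == MAX_PIXELS.
assert smart_resize(8000, 8000) == (3584, 3584)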

View File

@@ -0,0 +1,388 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
""" process.py """
import copy
import io
from collections import defaultdict
from typing import Any, Dict, List, Union
import numpy as np
from paddlenlp.transformers.image_utils import ChannelDimension
from PIL import Image
from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
from .tokenizer import ErnieVLTokenizer  # assumed relative path; the tokenizer package exports ErnieVLTokenizer
from .process_video import read_frames_decord, read_video_decord
from .utils.io_utils import RAW_IMAGE_DIR, get_downloadable
from .utils.render_timestamp import render_frame_timestamp
IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
def fancy_print(input_ids, tokenizer, image_patch_id=None):
"""
input_ids: input_ids
tokenizer: the tokenizer of models
"""
i = 0
res = ""
text_ids = []
real_image_token_len = 0
while i < len(input_ids):
if input_ids[i] == image_patch_id:
if len(text_ids) > 0:
res += tokenizer.decode(text_ids)
text_ids = []
real_image_token_len += 1
else:
if real_image_token_len != 0:
res += f"<|IMAGE@{real_image_token_len}|>"
real_image_token_len = 0
text_ids.append(input_ids[i])
i += 1
if len(text_ids) > 0:
res += tokenizer.decode(text_ids)
text_ids = []
return res
class DataProcessor:
"""
Processes multimodal chat messages into model-ready inputs,
handling text, images, and videos with 3D positional embeddings.
"""
CLS_TOKEN = "<|begin_of_sentence|>"
SEP_TOKEN = "<|end_of_sentence|>"
IMG_START = "<|IMAGE_START|>"
IMG_END = "<|IMAGE_END|>"
VID_START = "<|VIDEO_START|>"
VID_END = "<|VIDEO_END|>"
def __init__(
self,
tokenizer_name: str,
image_preprocessor_name: str,
spatial_conv_size: int = 2,
temporal_conv_size: int = 2,
image_min_pixels: int = 4 * 28 * 28,
image_max_pixels: int = 6177 * 28 * 28,
video_min_pixels: int = 299 * 28 * 28,
video_max_pixels: int = 1196 * 28 * 28,
video_target_frames: int = -1,
video_frames_sample: str = "leading",
video_max_frames: int = 180,
video_min_frames: int = 16,
video_fps: int = 2,
) -> None:
# Tokenizer and image preprocessor
self.tokenizer = ErnieVLTokenizer.from_pretrained(tokenizer_name, verbose=False)
self.tokenizer.ignored_index = -100
self.image_preprocessor = AdaptiveImageProcessor.from_pretrained(image_preprocessor_name)
# Convolution sizes for patch aggregation
self.spatial_conv_size = spatial_conv_size
self.temporal_conv_size = temporal_conv_size
# Pixel constraints
self.image_min_pixels = image_min_pixels
self.image_max_pixels = image_max_pixels
self.video_min_pixels = video_min_pixels
self.video_max_pixels = video_max_pixels
# Video sampling parameters
self.target_frames = video_target_frames
self.frames_sample = video_frames_sample
self.max_frames = video_max_frames
self.min_frames = video_min_frames
self.fps = video_fps
# Special tokens and IDs
self.cls_token = self.CLS_TOKEN
self.sep_token = self.SEP_TOKEN
self.image_start = self.IMG_START
self.image_end = self.IMG_END
self.video_start = self.VID_START
self.video_end = self.VID_END
self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")
self.token_type_mapping = self._build_token_type_mapping()
self.is_training = True
self.role_prefixes = {"system": "", "user": "User: ", "bot": "Assistant: ", "assistant": "Assistant: "}
def _build_token_type_mapping(self) -> Dict[Any, int]:
mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
for token in (self.IMG_START, self.IMG_END, self.VID_START, self.VID_END):
mapping[token] = IDS_TYPE_FLAG["image"]
mapping[self.image_patch_id] = IDS_TYPE_FLAG["image"]
return mapping
def train(self) -> None:
"""Enable training mode (produces labels)."""
self.is_training = True
def eval(self) -> None:
"""Enable evaluation mode (doesn't produce labels)."""
self.is_training = False
def process(self, messages: List[Dict[str, Any]]) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
"""
Convert chat messages into model inputs.
Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
"""
outputs = {
"input_ids": [],
"token_type_ids": [],
"position_ids": [],
"images": [],
"grid_thw": [],
"image_type_ids": [],
"labels": [],
"cur_position": 0,
"pic_cnt": 0,
"video_cnt": 0,
}
self._add_special_token(self.cls_token, outputs)
for msg in messages:
role = msg.get("role")
assert role in self.role_prefixes, f"Unsupported role: {role}"
prefix = self.role_prefixes[role]
if prefix:
self._add_text(prefix, outputs)
content_items = msg.get("content")
if not isinstance(content_items, list):
content_items = [content_items]
for item in content_items:
if isinstance(item, str) or item.get("type") == "text":
text = item if isinstance(item, str) else item.get("text", "")
self._add_text(text, outputs)
elif item.get("type") == "image_url" or item.get("type") == "image":
self._add_image(item, outputs)
elif item.get("type") == "video_url" or item.get("type") == "video":
self._add_video(item, outputs)
if role in ("user", "system"):
self._add_text("\n", outputs)
else:
self._add_special_token(self.sep_token, outputs)
if not self.is_training:
# Append assistant prefix in eval
self._add_text(self.role_prefixes["bot"], outputs)
return outputs
def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:
token_id = token if isinstance(token, int) else self.tokenizer.convert_tokens_to_ids(token)
outputs["input_ids"].append(token_id)
outputs["token_type_ids"].append(self.token_type_mapping[token])
pos = outputs["cur_position"]
outputs["position_ids"].append([pos] * 3)
outputs["cur_position"] += 1
def _add_text(self, text: str, outputs: Dict) -> None:
tokens = self.tokenizer.encode(text, add_special_tokens=False)["input_ids"]
outputs["input_ids"].extend(tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))
start = outputs["cur_position"]
for i in range(len(tokens)):
outputs["position_ids"].append([start + i] * 3)
outputs["cur_position"] += len(tokens)
def _add_image(self, item: Dict, outputs: Dict) -> None:
url_info = item.get("image_url", {})
w = url_info.get("image_width", None)
h = url_info.get("image_height", None)
if "image" in item:
img = item["image"]
else:
url = url_info.get("url")
data = get_downloadable(url, download_dir=RAW_IMAGE_DIR, save_to_disk=False)
img = Image.open(io.BytesIO(data) if isinstance(data, bytes) else data)
if w and h:
img = img.resize((w, h))
outputs["pic_cnt"] += 1
self._add_text(f"Picture {outputs['pic_cnt']}:", outputs)
self._add_special_token(self.IMG_START, outputs)
patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
img.height,
img.width,
min_pixels=self.image_min_pixels,
max_pixels=self.image_max_pixels,
)[1]
num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
outputs["position_ids"].extend(pos_ids)
outputs["cur_position"] = np.max(pos_ids) + 1
# Preprocess pixels
ret = self.image_preprocessor.preprocess(
images=[img.convert("RGB")],
do_normalize=False,
do_rescale=False,
predetermined_grid_thw=np.array([[patches_h, patches_w]]),
do_convert_rgb=True,
input_data_format=ChannelDimension.LAST,
)
outputs["images"].append(ret["pixel_values"])
outputs["grid_thw"].append(ret["image_grid_thw"])
outputs["image_type_ids"].append(0)
self._add_special_token(self.IMG_END, outputs)
def _add_video(self, item: Dict, outputs: Dict) -> None:
url_info = item.get("video_url", {})
url = url_info.get("url")
outputs["video_cnt"] += 1
self._add_text(f"Video {outputs['video_cnt']}:", outputs)
self._add_special_token(self.VID_START, outputs)
if "video" in item:
video_path = item["video"]
frames = self._load_and_process_video(video_path, item)
else:
video_path = get_downloadable(url, save_to_disk=False)
frames = self._load_and_process_video(video_path, item)
patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
frames[0].height,
frames[0].width,
min_pixels=self.video_min_pixels,
max_pixels=self.video_max_pixels,
)[1]
num_frames = len(frames)
num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
ret = self.image_preprocessor.preprocess(
images=None,
videos=pixel_stack,
do_normalize=False,
do_rescale=False,
predetermined_grid_thw=np.array([[patches_h, patches_w]] * num_frames),
do_convert_rgb=True,
input_data_format=ChannelDimension.LAST,
)
outputs["images"].append(ret["pixel_values_videos"])
outputs["grid_thw"].append(ret["video_grid_thw"])
outputs["image_type_ids"].extend([1] * num_frames)
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
outputs["position_ids"].extend(pos_ids)
outputs["cur_position"] = np.max(pos_ids) + 1
self._add_special_token(self.VID_END, outputs)
def _load_and_process_video(self, url: str, item: Dict) -> List[Image.Image]:
reader, meta, path = read_video_decord(url, save_to_disk=False)
video_frame_args = dict()
video_frame_args["fps"] = item.get("fps", self.fps)
video_frame_args["min_frames"] = item.get("min_frames", self.min_frames)
video_frame_args["max_frames"] = item.get("max_frames", self.max_frames)
video_frame_args["target_frames"] = item.get("target_frames", self.target_frames)
video_frame_args["frames_sample"] = item.get("frames_sample", self.frames_sample)
video_frame_args = self._set_video_frame_args(video_frame_args, meta)
frames_data, _, timestamps = read_frames_decord(
path,
reader,
meta,
target_frames=video_frame_args["target_frames"],
target_fps=video_frame_args["fps"],
frames_sample=video_frame_args["frames_sample"],
save_to_disk=False,
)
frames: List[Image.Image] = []
for img_array, ts in zip(frames_data, timestamps):
frames.append(render_frame_timestamp(img_array, ts))
# Ensure even number of frames for temporal conv
if len(frames) % 2 != 0:
frames.append(copy.deepcopy(frames[-1]))
return frames
def _set_video_frame_args(self, video_frame_args, video_meta):
"""
根据已知参数和优先级,设定最终的抽帧参数
"""
# 优先级video_target_frames > (video_min_frames, video_max_frames) > video_fps
if video_frame_args["target_frames"] > 0:
if video_frame_args["fps"] >= 0:
raise ValueError("fps must be negative if target_frames is given")
if (
video_frame_args["min_frames"] > 0
and video_frame_args["target_frames"] < video_frame_args["min_frames"]
):
raise ValueError("target_frames must be larger than min_frames")
if (
video_frame_args["max_frames"] > 0
and video_frame_args["target_frames"] > video_frame_args["max_frames"]
):
raise ValueError("target_frames must be smaller than max_frames")
else:
if video_frame_args["fps"] < 0:
raise ValueError("Must provide either positive target_fps or positive target_frames.")
# First compute how many frames would be sampled at video_fps
frames_to_extract = int(video_meta["duration"] * video_frame_args["fps"])
# Check whether that count falls inside the target range; if not, clamp target_frames to the bound
if (
video_frame_args["min_frames"] > 0
and video_frame_args["max_frames"] > 0
and video_frame_args["min_frames"] > video_frame_args["max_frames"]
):
raise ValueError("min_frames must be smaller than max_frames")
if video_frame_args["min_frames"] > 0 and frames_to_extract < video_frame_args["min_frames"]:
video_frame_args["target_frames"] = video_frame_args["min_frames"]
video_frame_args["fps"] = -1
if video_frame_args["max_frames"] > 0 and frames_to_extract > video_frame_args["max_frames"]:
video_frame_args["target_frames"] = video_frame_args["max_frames"]
video_frame_args["fps"] = -1
return video_frame_args
def _compute_3d_positions(self, t: int, h: int, w: int, start_idx: int) -> List[List[int]]:
# Downsample time if needed
t_eff = t // self.temporal_conv_size if t != 1 else 1
gh, gw = h // self.spatial_conv_size, w // self.spatial_conv_size
time_idx = np.repeat(np.arange(t_eff), gh * gw)
h_idx = np.tile(np.repeat(np.arange(gh), gw), t_eff)
w_idx = np.tile(np.arange(gw), t_eff * gh)
coords = list(zip(time_idx, h_idx, w_idx))
return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords]
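
A hedged end-to-end sketch of DataProcessor; the checkpoint directories and image URL are placeholders, and the message schema follows the process method above:

processor = DataProcessor(
tokenizer_name="./ernie_vl_tokenizer",                    # placeholder checkpoint dirs
image_preprocessor_name="./adaptive_image_preprocessor",
)
processor.eval()  # eval mode appends the "Assistant: " prefix instead of emitting labels
messages = [
{"role": "user", "content": [
{"type": "text", "text": "Describe this picture."},
{"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},  # placeholder URL
]},
]
outputs = processor.process(messages)
# input_ids / token_type_ids / position_ids stay aligned; each image contributes
# (patches_h * patches_w) // spatial_conv_size**2 copies of image_patch_id.
assert len(outputs["input_ids"]) == len(outputs["token_type_ids"]) == len(outputs["position_ids"])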

View File

@@ -0,0 +1,201 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import io
import os
import random
import numpy as np
from PIL import Image
from .utils.io_utils import EXTRACTED_FRAME_DIR, get_downloadable, get_filename
from .utils.video_utils import VideoReaderWrapper
from fastdeploy.utils import data_processor_logger
def read_video_decord(video_path, save_to_disk):
"""get reader and meta by decord"""
data_in_mem = False
# video_path = get_downloadable(video_path, save_to_disk=save_to_disk)
if isinstance(video_path, VideoReaderWrapper):
data_in_mem = True
video_reader = video_path
else:
if isinstance(video_path, bytes):
video_path = io.BytesIO(video_path)
video_reader = VideoReaderWrapper(video_path, num_threads=1)
vlen = len(video_reader)
fps = video_reader.get_avg_fps()
duration = vlen / float(fps)
video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen}
return video_reader, video_meta, video_path
def get_frame_indices(
vlen,
target_frames=-1,
target_fps=-1,
frames_sample="middle",
fix_start=None,
input_fps=-1,
):
"""
Compute the frame indices to sample.
"""
assert frames_sample in ["rand", "middle", "leading"]
if target_frames > 0:
assert target_fps <= 0, "target_fps must be negative if target_frames is given."
if target_frames > vlen:
acc_samples = vlen
data_processor_logger.info(
f"target_frames={target_frames} is larger than video length {vlen}, "
f"will sample {acc_samples} frames."
)
else:
acc_samples = target_frames
data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}")
# split the video into `acc_samples` intervals, and sample from each interval.
intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
ranges = []
for idx, interv in enumerate(intervals[:-1]):
ranges.append((interv, intervals[idx + 1] - 1))
if frames_sample == "rand":
try:
frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
except Exception as e:
frame_indices = np.random.permutation(vlen)[:acc_samples]
frame_indices.sort()
frame_indices = list(frame_indices)
elif fix_start is not None:
frame_indices = [x[0] + fix_start for x in ranges]
elif frames_sample == "leading":
frame_indices = [x[0] for x in ranges]
elif frames_sample == "middle":
frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
else:
raise NotImplementedError
elif target_fps > 0:
assert target_frames <= 0, "target_frames must be negative if target_fps is given."
assert input_fps > 0, "input_fps must be provided if target_fps is given."
data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}")
duration = float(vlen) / input_fps
delta = 1 / target_fps # gap between frames, this is also the clip length each frame represents
if frames_sample == "middle":
frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
elif frames_sample == "leading":
frame_seconds = np.arange(0, duration, delta)
if frames_sample == "rand":
frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5
frame_seconds += rand_offset * delta
frame_indices = np.around(frame_seconds * input_fps).astype(int)
frame_indices = [e for e in frame_indices if e < vlen]
else:
raise ValueError("Must provide either positive target_fps or positive target_frames.")
return frame_indices
def read_frames_decord(
video_path,
video_reader,
video_meta,
target_frames=-1,
target_fps=-1,
frames_sample="middle",
fix_start=None,
save_to_disk=False,
cache_dir=EXTRACTED_FRAME_DIR,
frame_indices=None,
tol=10,
):
"""get frames by decord"""
if frame_indices is None:
frame_indices = get_frame_indices(
video_meta["num_of_frame"],
target_frames=target_frames,
target_fps=target_fps,
frames_sample=frames_sample,
fix_start=fix_start,
input_fps=video_meta["fps"],
)
frames = []
for frame_indice_index in range(0, len(frame_indices)):
frame_indice = frame_indices[frame_indice_index]
try:
frames.append(video_reader[frame_indice].asnumpy())  # each decoded frame is (H, W, C)
except Exception as e:
data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}")
previous_counter = 1
later_counter = 1
previous_after_flag = True
if frame_indice == 0 or frame_indice == len(video_reader) - 1:
cur_tol = tol * 2
else:
cur_tol = tol
while previous_counter < cur_tol or later_counter < cur_tol:
if previous_after_flag:
if frame_indice - previous_counter < 0:
previous_counter += 1
previous_after_flag = not previous_after_flag
continue
try:
frames.append(video_reader[frame_indice - previous_counter].asnumpy())
data_processor_logger.info(f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame")
frame_indices[frame_indice_index] = frame_indice - previous_counter
break
except Exception as e:
previous_counter += 1
else:
if frame_indice + later_counter >= len(video_reader):
later_counter += 1
previous_after_flag = not previous_after_flag
continue
try:
frames.append(video_reader[frame_indice + later_counter].asnumpy())
data_processor_logger.info(f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame")
frame_indices[frame_indice_index] = frame_indice + later_counter
break
except Exception as e:
later_counter += 1
previous_after_flag = not previous_after_flag
frames = np.stack(frames, axis=0)
assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}"
ret = []
url_sha1 = get_filename()
for idx, frame in enumerate(frames):
tmp = Image.fromarray(frame, "RGB")
if save_to_disk:
save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png")
if not os.path.exists(os.path.dirname(save_path)):
os.makedirs(os.path.dirname(save_path))
tmp.save(save_path)
tmp = save_path
ret.append(tmp)
time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices]
return ret, frame_indices, time_stamps
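
A quick illustration of the two sampling modes in get_frame_indices for a 300-frame clip at 30 fps; expected values are computed from the code above:

idx = get_frame_indices(vlen=300, target_fps=2, frames_sample="middle", input_fps=30)
assert len(idx) == 20 and idx[0] == 8  # samples at t = 0.25s, 0.75s, ...; round(7.5) -> 8
idx = get_frame_indices(vlen=300, target_frames=4, frames_sample="leading")
assert list(idx) == [0, 75, 150, 225]  # four equal intervals, leading frame of each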

View File

@@ -0,0 +1,19 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .tokenizer_vl import ErnieVLTokenizer
__all__ = ['ErnieVLTokenizer']

View File

@@ -0,0 +1,348 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""
ErnieVLTokenizer
"""
import os
import re
from shutil import copyfile
from typing import Dict, List, Optional, Tuple
import numpy as np
import paddle
import sentencepiece as spm
from paddlenlp.transformers import PretrainedTokenizer
from paddlenlp.transformers.tokenizer_utils_base import (
PaddingStrategy,
TextInput,
)
from paddlenlp.utils.log import logger
class ErnieVLTokenizer(PretrainedTokenizer):
"""doc"""
resource_files_names = {
"vocab_file": "tokenizer.model",
}
pretrained_resource_files_map = {"vocab_file": {"ernie-bot-10b": None}}
pretrained_init_configuration = {
"ernie-bot-10b": {},
}
model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
padding_side = "right"
def __init__(
self,
vocab_file,
bos_token="<s>",
cls_token="<cls>",
eos_token="</s>",
mask_token="<mask:0>",
pad_token="<pad>",
sep_token="<sep>",
unk_token="<unk>",
additional_special_tokens=None,
**kwargs,
):
"""doc"""
if additional_special_tokens is None:
additional_special_tokens = ["<mask:1>", "<mask:7>"]
super().__init__(
bos_token=bos_token,
cls_token=cls_token,
eos_token=eos_token,
mask_token=mask_token,
pad_token=pad_token,
sep_token=sep_token,
unk_token=unk_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
@property
def space_token(self):
"""doc"""
return "<mask:1>"
@property
def space_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:1>")
@property
def gend_token(self):
"""doc"""
return "<mask:7>"
@property
def gend_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:7>")
@property
def im_start_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_start|>")
@property
def im_end_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_end|>")
@property
def vocab_size(self):
"""doc"""
return self.sp_model.vocab_size()
def get_vocab(self):
"""doc"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""doc"""
return self.sp_model.encode_as_pieces(text)
def _convert_token_to_id(self, token):
"""doc"""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, id):
"""doc"""
return self.sp_model.id_to_piece(id)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
# prev_is_special = False
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
# if not prev_is_special:
# out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
# prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
# prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string # .strip()
def prepare_for_model(self, *args, **kwargs):
"""doc"""
if "add_special_tokens" in kwargs:
kwargs.pop("add_special_tokens")
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def tokenize(self, text: TextInput, **kwargs) -> List[str]:
"""
Converts a string in a sequence of tokens, using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
(BPE/SentencePieces/WordPieces). Takes care of added tokens.
Args:
text (`str`):
The sequence to be encoded.
**kwargs (additional keyword arguments):
Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
Returns:
`List[str]`: The list of tokens.
"""
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
# TODO: should this be in the base class?
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
escaped_special_toks = [
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
no_split_token = set(self.unique_no_split_tokens)
tokens = self.tokens_trie.split(text)
tokenized_text = []
for token in tokens:
# Need to skip eventual empty (fully stripped) tokens
if not token:
continue
if token in no_split_token:
tokenized_text.append(token)
else:
tokenized_text.extend(self._tokenize(token))
return tokenized_text
def _decode(self, *args, **kwargs):
"""doc"""
kwargs.pop("clean_up_tokenization_spaces", None)
kwargs.pop("spaces_between_special_tokens", None)
return super()._decode(
*args,
**kwargs,
clean_up_tokenization_spaces=False,
spaces_between_special_tokens=False,
)
def _pad(
self,
encoded_inputs: Dict,
max_length: Optional[int] = None,
padding_strategy=PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""doc"""
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
if return_attention_mask:
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None:
attention_mask = encoded_inputs.pop("attention_mask")
if isinstance(attention_mask, paddle.Tensor):
attention_mask = attention_mask.numpy()
elif isinstance(attention_mask, list):
attention_mask = np.array(attention_mask)
elif not isinstance(attention_mask, np.ndarray):
raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ")
else:
attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64))
attention_mask = np.expand_dims(attention_mask, axis=0)
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
if attention_mask.ndim == 1:
pad_width = [(0, difference)]
else:
pad_width = [(0, 0), (0, difference), (0, difference)]
elif self.padding_side == "left":
if attention_mask.ndim == 1:
pad_width = [(difference, 0)]
else:
pad_width = [(0, 0), (difference, 0), (difference, 0)]
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
attention_mask = np.pad(
attention_mask,
pad_width=pad_width,
mode="constant",
constant_values=0,
)
encoded_inputs = super()._pad(
encoded_inputs,
max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=False,
)
if return_attention_mask:
encoded_inputs["attention_mask"] = attention_mask.tolist()
return encoded_inputs
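# --- Illustrative sketch: the 2D causal mask built by `_pad` above ---
# A minimal numpy-only reproduction of the mask construction, assuming right
# padding from sequence length 4 to max length 6. Not used by the tokenizer;
# kept purely as documentation of the padding behaviour.
def _causal_mask_demo(seq_len: int = 4, max_len: int = 6):
    """Build the padded lower-triangular attention mask the way `_pad` does."""
    mask = np.tril(np.ones((seq_len, seq_len), dtype=np.int64))
    mask = np.expand_dims(mask, axis=0)  # shape (1, seq_len, seq_len)
    difference = max_len - seq_len
    # right padding: pad the last two axes with zeros
    mask = np.pad(mask, pad_width=[(0, 0), (0, difference), (0, difference)],
                  mode="constant", constant_values=0)
    assert mask.shape == (1, max_len, max_len)
    return mask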
def add_special_tokens(
tokenizer,
special_tokens_info,
use_ocr_specialtoken=False,
use_crop_specialtoken=False,
special_token_ids_start=254208,
special_token_ids_end=256256,
):
"""
增加 special token
placeholder [<|IMAGE_PLACEHOLDER|>, <|AUDIO_PLACEHOLDER|>, <|VIDEO_PLACEHOLDER|>] 共3个
模态起始截止 special tokens [<|BOI|> <|EOI|> <|BOA|> <|EOA|> <|BOV|> <|EOV|>]
ocr special tokens [<|LOC_0|> <|LOC_1|> ... <|LOC_1000|>] 共1001个
crop special tokens [<|CROP_COL_SEP|>, <|CROP_ROW_SEP|>, <|CROP_IMAGE_SEP|>] 共3个
<|CROP_COL_SEP|> for col 维度切 图片width替换原明文逗号
<|CROP_ROW_SEP|> for row 维度切 图片height替换原明文回车
<|CROP_IMAGE_SEP|> for 区分原图和crop图 图片width替换原明文两个回车
共2048个 unsed token
Args:
tokenizer (ErnieTokenizer): tokenizer
special_token_ids_start (int, optional): special token 起点 ids. Defaults to 254208.
special_token_ids_end (int, optional): 词表最多支持大小. Defaults to 256256.
"""
special_tokens = [
special_tokens_info["image_placeholder"],
special_tokens_info["audio_placeholder"],
]
if use_ocr_specialtoken:
special_tokens.extend(special_tokens_info["ocr_coor"])
special_tokens.extend(special_tokens_info["ocr_begin_end"])
if use_crop_specialtoken:
special_tokens.extend(special_tokens_info["crop"])
# add special_tokens
additional_special_tokens = {"additional_special_tokens": special_tokens}
tokenizer.add_special_tokens(additional_special_tokens)
# check
first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]
assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
    assert (
        len(tokenizer.get_vocab()) < special_token_ids_end
    ), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end}, too many special tokens added!"

Binary file not shown.

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,253 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import base64
import datetime
import hashlib
import io
import os
import threading
import uuid
from pathlib import Path
import numpy as np
import requests
from PIL import Image
from PIL.ExifTags import TAGS
RAW_VIDEO_DIR = "./download_tmp/raw_video/"
RAW_IMAGE_DIR = "./download_tmp/raw_images/"
EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
TMP_DIR = "./download_tmp/upload_tmp/"
def file_download(url, download_dir, save_to_disk=False, retry=0, retry_interval=3):
"""
Description: 下载url如果url是PIL直接返回
Args:
url(str, PIL): http/本地路径/io.Bytes注意io.Bytes是图片字节流
download_path: 在save_to_disk=True的情况下生效返回保存地址
save_to_disk: 是否保存在本地路径
"""
from .video_utils import VideoReaderWrapper
if isinstance(url, Image.Image):
return url
elif isinstance(url, VideoReaderWrapper):
return url
elif url.startswith("http"):
response = requests.get(url)
bytes_data = response.content
elif os.path.isfile(url):
if save_to_disk:
return url
bytes_data = open(url, "rb").read()
else:
bytes_data = base64.b64decode(url)
if not save_to_disk:
return bytes_data
download_path = os.path.join(download_dir, get_filename(url))
Path(download_path).parent.mkdir(parents=True, exist_ok=True)
with open(download_path, "wb") as f:
f.write(bytes_data)
return download_path
def get_filename(url=None):
"""
Get Filename
"""
if url is None:
return str(uuid.uuid4()).replace("-", "")
t = datetime.datetime.now()
if not isinstance(url, bytes):
url = url.encode("utf-8")
md5_hash = hashlib.md5(url).hexdigest()
pid = os.getpid()
tid = threading.get_ident()
    # drop the extension to avoid errors when saving as jpg
    image_filename = f"{t.year}-{t.month:02d}-{t.day:02d}-{pid}-{tid}-{md5_hash}"
    return image_filename
def get_downloadable(url, download_dir=RAW_VIDEO_DIR, save_to_disk=False, retry=0, retry_interval=3):
"""download video and store it in the disk
return downloaded **path** if save_to_disk is set to true
return downloaded **bytes** if save_to_disk is set to false
"""
if not os.path.exists(download_dir):
os.makedirs(download_dir)
downloaded_path = file_download(
url,
download_dir,
save_to_disk=save_to_disk,
retry=retry,
retry_interval=retry_interval,
)
return downloaded_path
def get_downloadable_image(download_path, need_exif_info, retry_max_time=0, retry_interval=3):
"""
带上exif info和图像处理的get downloadable
"""
def get_image_exif(image):
exif_data = image._getexif()
exif_info = {}
if exif_data is not None:
for tag, value in exif_data.items():
tag_name = TAGS.get(tag, tag)
exif_info[tag_name] = value.strip()
return exif_info
def has_transparent_background(img):
"""判断图片是否有背景"""
if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info):
# Check for any pixel with alpha channel less than 255 (fully opaque)
alpha = img.convert("RGBA").split()[-1]
if alpha.getextrema()[0] < 255:
return True
return False
def add_white_background(img):
"""
给透明背景的图,加个白色背景
"""
if img.mode != "RGBA":
img = img.convert("RGBA")
        # create a white background image with the same size as the original
img_white_background = Image.new("RGBA", img.size, (255, 255, 255))
        # paste the original image onto the white background
img_white_background.paste(img, (0, 0), img)
return img_white_background
def change_I16_to_L(img):
"""
将图片从I;16模式转换为L模式
"""
# 由于I模式的point函数只支持加减乘所以下面的* (1 / 256)不能改成除法
return img.point(lambda i: i * (1 / 256)).convert("L")
image = get_downloadable(download_path, save_to_disk=False, retry=retry_max_time, retry_interval=retry_interval)
if isinstance(image, Image.Image):
pil_image = image
else:
pil_image = Image.open(io.BytesIO(image))
if need_exif_info:
try:
exif_info = get_image_exif(pil_image)
        except Exception:
exif_info = {}
else:
exif_info = {}
try:
if pil_image.mode == "I;16":
pil_image = change_I16_to_L(pil_image)
if has_transparent_background(pil_image):
pil_image = add_white_background(pil_image)
except Exception as e:
pass
return pil_image.convert("RGB"), exif_info
def str2hash(url):
"""
从一个str的到url
"""
return hashlib.sha256(url.encode()).hexdigest()
def pil2hash(pil):
"""
从一个PIL.Image到hash
"""
byte_io = io.BytesIO()
pil.save(byte_io, format="PNG") # 选择无损格式,避免压缩影响
image_bytes = byte_io.getvalue()
return hashlib.sha256(image_bytes).hexdigest()
def imagepath_to_base64(image_path):
"""imagepath_to_base64"""
image = Image.open(image_path).convert("RGB")
buffer = io.BytesIO()
image.save(buffer, format="JPEG")
image_bytes = buffer.getvalue()
base64_encoded = base64.b64encode(image_bytes).decode("utf-8")
return base64_encoded
def pil_image_to_base64(image):
"""pil_image_to_base64"""
buffer = io.BytesIO()
image.save(buffer, format="JPEG")
image_bytes = buffer.getvalue()
base64_encoded = base64.b64encode(image_bytes).decode("utf-8")
return base64_encoded
def http_to_pil_image(url):
"""http_to_pil_image"""
response = requests.get(url)
image_data = io.BytesIO(response.content)
pil_image = Image.open(image_data).convert("RGB")
return pil_image
def http_to_image_base64(url):
"""http_to_image_base64"""
response = requests.get(url)
image_data = io.BytesIO(response.content)
return base64.b64encode(image_data.getvalue()).decode("utf-8")
def base64_to_pil_image(base64_string):
""" " base64_to_pil_image"""
image_bytes = base64.b64decode(base64_string)
buffer = io.BytesIO(image_bytes)
image = Image.open(buffer)
return image
def get_hashable(to_be_hashed):
"""get hashable"""
if isinstance(to_be_hashed, bytes):
return to_be_hashed
elif isinstance(to_be_hashed, Image.Image):
return to_be_hashed.tobytes()
elif isinstance(to_be_hashed, str):
return to_be_hashed.encode("utf-8")
else:
raise ValueError(f"not support type: {type(to_be_hashed)}")
def load_dict_from_npz(npzfile):
"""从npz文件读取数据"""
with np.load(npzfile, allow_pickle=True) as data:
loaded_dict = {key: data[key] for key in data.files}
return loaded_dict
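# --- Usage sketch (illustrative) ---
# Round-trip a dict of arrays through .npz with the helper above.
if __name__ == "__main__":
    os.makedirs(TMP_DIR, exist_ok=True)
    demo_path = os.path.join(TMP_DIR, "demo.npz")
    np.savez(demo_path, a=np.arange(3), b=np.ones((2, 2)))
    restored = load_dict_from_npz(demo_path)
    print({key: value.shape for key, value in restored.items()})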

View File

@@ -0,0 +1,96 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
cur_directory = Path(__file__).parent.absolute()
FONT_PATH = os.path.join(cur_directory, "Roboto-Regular.ttf")
def render_single_image_with_timestamp(image: Image.Image, number: str, rate: float, font_path: str = FONT_PATH):
    """
    Render a timestamp string onto a PIL image.
    The font size is min(width, height) * rate; the text is black with a white
    outline whose width is 10% of the font size.
    Returns an Image object.
    """
    draw = ImageDraw.Draw(image)  # create a drawing handle
    width, height = image.size  # image size
    font_size = int(min(width, height) * rate)  # font size
    outline_size = int(font_size * 0.1)  # outline width
    font = ImageFont.truetype(font_path, font_size)  # load the font file at the chosen size
    x = 0
    y = 0  # x and y coordinates of the text
    # draw the timestamp in black with a white outline
draw.text((x, y), number, font=font, fill=(0, 0, 0), stroke_width=outline_size, stroke_fill=(255, 255, 255))
return image
def timestamp_converting(time_stamp_in_seconds):
    """
    convert timestamp format from seconds to hr:min:sec
    """
    hours, remainder = divmod(time_stamp_in_seconds, 3600)
    mins, secs = divmod(remainder, 60)
    return f"{int(hours):02d}:{int(mins):02d}:{secs:05.2f}"
def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration):
"""
function: get the timestamp of a frame, 在均匀抽帧时用。
num_frames: 总帧数
frameid_list: 被抽帧的帧的索引
duration: 视频的总时长
return: timestamp; xx:xx:xx (str)
"""
time_stamp = duration * 1.0 * frame_id / num_frames
return time_stamp
def render_frame_timestamp(frame, timestamp, font_rate=0.1):
"""
函数功能, 给frame, 按照顺序将 index 渲染上去
逻辑思路: 把index渲染到图片的左上方
frame: 帧PIL.Image object
timestamp: 时间戳,单位是秒
font_rate: 字体大小占 min(wi, hei)的比率
"""
time_stamp = "time: " + timestamp_converting(timestamp)
new_frame = render_single_image_with_timestamp(frame, time_stamp, font_rate)
return new_frame
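# --- Usage sketch (illustrative; assumes the bundled Roboto-Regular.ttf is present) ---
# Render "time: 00:01:05.50" onto a synthetic gray 256x256 frame.
if __name__ == "__main__":
    frame = Image.new("RGB", (256, 256), (128, 128, 128))
    stamped = render_frame_timestamp(frame, timestamp=65.5, font_rate=0.1)
    stamped.save("frame_with_timestamp.jpg")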

View File

@@ -0,0 +1,83 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import io
import os
from tempfile import NamedTemporaryFile as ntf
import decord
try:
    # moviepy 1.x
    import moviepy.editor as mp
except ImportError:
    # moviepy 2.x
    import moviepy as mp
def is_gif(data: bytes) -> bool:
"""
check if a bytes is a gif based on the magic head
"""
return data[:6] in (b"GIF87a", b"GIF89a")
class VideoReaderWrapper(decord.VideoReader):
"""
Solving memory leak bug
https://github.com/dmlc/decord/issues/208
"""
def __init__(self, video_path, *args, **kwargs):
with ntf(delete=True, suffix=".gif") as gif_file:
gif_input = None
self.original_file = None
if isinstance(video_path, str):
self.original_file = video_path
if video_path.lower().endswith(".gif"):
gif_input = video_path
elif isinstance(video_path, bytes):
if is_gif(video_path):
gif_file.write(video_path)
gif_input = gif_file.name
elif isinstance(video_path, io.BytesIO):
video_path.seek(0)
tmp_bytes = video_path.read()
video_path.seek(0)
if is_gif(tmp_bytes):
gif_file.write(tmp_bytes)
gif_input = gif_file.name
if gif_input is not None:
clip = mp.VideoFileClip(gif_input)
mp4_file = ntf(delete=False, suffix=".mp4")
clip.write_videofile(mp4_file.name, verbose=False, logger=None)
clip.close()
video_path = mp4_file.name
self.original_file = video_path
super().__init__(video_path, *args, **kwargs)
self.seek(0)
def __getitem__(self, key):
frames = super().__getitem__(key)
self.seek(0)
return frames
def __del__(self):
if self.original_file and os.path.exists(self.original_file):
os.remove(self.original_file)
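# --- Usage sketch (illustrative; "sample.mp4" is a placeholder path) ---
# VideoReaderWrapper behaves like decord.VideoReader but seeks back to frame 0
# after each read. Note that __del__ above removes `original_file`, so pass a
# disposable copy rather than an original you want to keep.
if __name__ == "__main__":
    vr = VideoReaderWrapper("sample.mp4", num_threads=1)
    print(len(vr), vr[0].shape)  # frame count and (H, W, C) of the first frame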

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,127 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import base64
from io import BytesIO
from pathlib import Path
import numpy as np
import numpy.typing as npt
from .base import MediaIO, MultiModalPlugin
from .inputs import AudioItem, ModalityData, MultiModalKwargs
# TODO: multi-modal data processing
# try:
# import librosa
# except ImportError:
# librosa = PlaceholderModule("librosa") # type: ignore[assignment]
# try:
# import soundfile
# except ImportError:
# soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
def resample_audio(
audio: npt.NDArray[np.floating],
*,
orig_sr: float,
target_sr: float,
) -> npt.NDArray[np.floating]:
"""
将音频数据从原始采样率(`orig_sr`)重采样到目标采样率(`target_sr`)。
Args:
audio (npt.NDArray[np.floating]): 带有单通道浮点型音频数据的 numpy ndarray形状为 `(samples,)`。
orig_sr (float): 音频数据的原始采样率。
target_sr (float): 需要转换到的目标采样率。
Returns:
npt.NDArray[np.floating]: 带有单通道浮点型音频数据的 numpy ndarray形状为 `(samples,)`,已经被重采样到目标采样率。
Raises:
None.
"""
import librosa
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
"""
加载字节数据,返回音频信号和采样率。
参数:
data (bytes) - 字节数据,包含音频文件的内容。
返回值tuple
(array, float) - 第一个元素是一个numpy数组表示音频信号第二个元素是一个浮点数表示采样率。
如果解码失败,则返回 None。
"""
import librosa
return librosa.load(BytesIO(data), sr=None)
def load_base64(
self,
media_type: str,
data: str,
) -> tuple[npt.NDArray, float]:
"""
将 base64 编码的字符串转换为 numpy 数组和尺度。
Args:
media_type (str): 媒体类型,例如 'image/jpeg''image/png' 等。
data (str): base64 编码的字符串,表示图像或其他二进制数据。
Returns:
tuple[npt.NDArray, float]: 包含以下两个元素:
- npt.NDArray: 形状为HWC的 numpy 数组,表示图像或其他二进制数据。
- float: 图像的尺度,单位为像素。
Raises:
ValueError: 当 media_type 不是有效的媒体类型时引发。
"""
return self.load_bytes(base64.b64decode(data))
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
"""
加载音频文件,返回音频数据和采样率。
参数:
filepath (Path): 音频文件路径Path类型
返回值:
tuple[npt.NDArray, float]包含两个元素的元组第一个是音频数据npt.NDArray类型
第二个是采样率float类型
"""
import librosa
return librosa.load(filepath, sr=None)
def encode_base64(self, media: tuple[npt.NDArray, float]) -> str:
"""
将音频数据和采样率转换为Base64编码的字符串。
参数:
media (tuple[numpy.ndarray, float]): 包含音频数据和采样率的元组其中音频数据是一个numpy数组采样率是一个浮点数。
返回值 (str): Base64编码的字符串表示音频数据和采样率。
"""
audio, sr = media
with BytesIO() as buffer:
import soundfile
soundfile.write(buffer, audio, sr, format="WAV")
data = buffer.getvalue()
return base64.b64encode(data).decode('utf-8')
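# --- Usage sketch (illustrative; requires librosa and soundfile) ---
# Round-trip one second of a 440 Hz sine tone through base64 with AudioMediaIO.
if __name__ == "__main__":
    sample_rate = 16000
    tone = np.sin(2 * np.pi * 440 * np.arange(sample_rate) / sample_rate).astype(np.float32)
    audio_io = AudioMediaIO()
    encoded = audio_io.encode_base64((tone, sample_rate))
    decoded, decoded_sr = audio_io.load_base64("audio/wav", encoded)
    print(decoded.shape, decoded_sr)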

View File

@@ -0,0 +1,69 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import ABC, abstractmethod
from collections import defaultdict
from collections.abc import Sequence
from pathlib import Path
from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple,
Optional, TypeVar, Union)
_T = TypeVar("_T")
class MediaIO(ABC, Generic[_T]):
@abstractmethod
def load_bytes(self, data: bytes) -> _T:
"""
将字节数据加载为对象,并返回该对象。
如果加载失败,则抛出异常。
Args:
data (bytes): 要加载的字节数据。
Raises:
NotImplementedError: 当前类未实现此方法。
Returns:
_T: 加载后的对象。
"""
raise NotImplementedError
@abstractmethod
def load_base64(self, media_type: str, data: str) -> _T:
"""
List of media types:
https://www.iana.org/assignments/media-types/media-types.xhtml
"""
raise NotImplementedError
@abstractmethod
def load_file(self, filepath: Path) -> _T:
"""
加载文件,返回解析后的数据。
Args:
filepath (Path): 文件路径,必须是一个绝对路径。
Raises:
NotImplementedError: 当前方法未被实现。
Returns:
_T: 任意类型,表示解析后的数据。
"""
raise NotImplementedError

View File

@@ -0,0 +1,145 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import base64
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional
import requests
from PIL import Image
from .base import MediaIO
class ImageMediaIO(MediaIO[Image.Image]):
def __init__(self, *, image_mode: str = "RGB") -> None:
"""
Initializes the object.
Args:
image_mode (str, optional): The mode of the image, defaults to "RGB". Should be one of "L", "LA", "P",
"RGB", "RGBA", "CMYK", or "YCbCr".
Raises:
ValueError: If `image_mode` is not a valid mode.
Returns:
None: This method does not return anything. It initializes the object with the given parameters.
"""
super().__init__()
self.image_mode = image_mode
def load_bytes(self, data: bytes) -> Image.Image:
"""
将字节数据转换为图像对象,并返回。
该方法会自动调用Image.open和Image.load方法以及convert方法将图像转换为指定模式默认为RGB
Args:
data (bytes): 包含图像数据的字节对象。
Returns:
Image.Image: 一个包含了原始图像数据的Image对象已经被转换为指定模式。
Raises:
无。
"""
image = Image.open(BytesIO(data))
image.load()
return image.convert(self.image_mode)
def load_base64(self, media_type: str, data: str) -> Image.Image:
"""
将 base64 编码的字符串转换为图片对象。
Args:
media_type (str): 媒体类型,例如 "image/jpeg"
data (str): base64 编码的字符串数据。
Returns:
Image.Image: PIL 中的图片对象。
Raises:
无。
"""
return self.load_bytes(base64.b64decode(data))
def load_file(self, filepath: Path) -> Image.Image:
"""
加载文件,并转换为指定模式。
如果文件不存在或无法打开将抛出FileNotFoundError异常。
Args:
filepath (Path): 文件路径Pathlib.Path对象
Returns:
Image.Image: 返回一个Image.Image对象表示已经加载和转换的图像。
Raises:
FileNotFoundError: 当文件不存在时抛出此异常。
"""
image = Image.open(filepath)
image.load()
return image.convert(self.image_mode)
def load_file_request(self, request: Any) -> Image.Image:
"""
从请求中加载图像文件并返回一个PIL Image对象。
该函数需要传入一个包含图像URL的字符串或者可迭代对象如requests库的Response对象
该函数会自动处理图像的格式和大小并将其转换为指定的模式默认为RGB
Args:
request (Any): 包含图像URL的字符串或者可迭代对象如requests库的Response对象
Returns:
Image.Image: PIL Image对象表示已经加载并转换好的图像。
Raises:
无。
"""
image = Image.open(requests.get(request, stream=True).raw)
image.load()
return image.convert(self.image_mode)
def encode_base64(
self,
media: Image.Image,
*,
image_format: str = "JPEG",
) -> str:
"""
将图像转换为Base64编码的字符串。
Args:
media (Image.Image): 待处理的图像对象支持PIL库中的Image类型。
image_format (str, optional): 指定图像格式,默认为"JPEG"。可选项包括:"PNG", "JPEG", "BMP", "TIFF"等。
PIL库中的所有图片格式都可以使用但是不建议使用"PPM""XBM"格式因为这两种格式在Python3中已经被弃用了。
Returns:
str: Base64编码后的字符串可以直接作为HTML或者JSON数据传输。
Raises:
None
"""
image = media
with BytesIO() as buffer:
image = image.convert(self.image_mode)
image.save(buffer, image_format)
data = buffer.getvalue()
return base64.b64encode(data).decode('utf-8')
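# --- Usage sketch (illustrative) ---
# Round-trip a small solid-color image through base64 with ImageMediaIO.
if __name__ == "__main__":
    image_io = ImageMediaIO(image_mode="RGB")
    img = Image.new("RGB", (8, 8), (255, 0, 0))
    encoded = image_io.encode_base64(img, image_format="JPEG")
    restored = image_io.load_base64("image/jpeg", encoded)
    print(restored.size, restored.mode)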

View File

@@ -0,0 +1,192 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import base64
import io
import os
import random
import socket
from urllib.parse import urlparse
import ipaddress
import requests
from PIL import Image, ImageOps
from fastdeploy.utils import data_processor_logger
import pyheif
from pdf2image import convert_from_path
import cairosvg
import subprocess
import tempfile
import mimetypes
def process_image_data(image_data, mime_type, url):
"""处理不同类型的图像数据并返回 PIL 图像对象"""
if mime_type in ['image/heif', 'image/heic'] or url.lower().endswith('.heif') or url.lower().endswith('.heic'):
heif_file = pyheif.read(image_data)
pil_image = Image.frombytes(
heif_file.mode, heif_file.size, heif_file.data,
"raw", heif_file.mode, heif_file.stride
)
elif mime_type == 'application/pdf' or url.lower().endswith('.pdf'):
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
temp_pdf.write(image_data.getvalue())
temp_pdf_path = temp_pdf.name
images = convert_from_path(temp_pdf_path)
pil_image = images[0]
os.remove(temp_pdf_path)
elif mime_type == 'image/svg+xml' or url.lower().endswith('.svg'):
png_data = cairosvg.svg2png(bytestring=image_data.getvalue())
pil_image = Image.open(io.BytesIO(png_data))
elif mime_type in ['application/postscript', 'application/illustrator'] or url.lower().endswith('.ai'):
with tempfile.NamedTemporaryFile(delete=False, suffix='.ai') as ai_temp, tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as pdf_temp:
ai_temp_path = ai_temp.name
pdf_temp_path = pdf_temp.name
ai_temp.write(image_data.getvalue())
ai_temp.close()
subprocess.run(['inkscape', ai_temp_path, '--export-pdf=' + pdf_temp_path], check=True)
images = convert_from_path(pdf_temp_path)
pil_image = images[0]
os.remove(ai_temp_path)
os.remove(pdf_temp_path)
elif mime_type == 'image/gif' or url.lower().endswith('.gif'):
pil_image = Image.open(image_data)
else:
pil_image = Image.open(image_data)
return pil_image
def http_to_pil_image(url):
"""http_to_pil_image"""
if is_public_url(url) and int(os.getenv("DOWNLOAD_WITH_TP_SERVER", "0")):
return http_to_pil_image_with_tp_server(url)
response = requests.get(url)
if response.status_code != 200:
raise Exception("Failed to download the image from URL.")
image_data = io.BytesIO(response.content)
mime_type = response.headers.get('Content-Type')
if mime_type is None:
mime_type, _ = mimetypes.guess_type(url)
data_processor_logger.info(f"Detected MIME type: {mime_type}") # 调试信息
pil_image = process_image_data(image_data, mime_type, url)
return pil_image
def http_to_pil_image_with_tp_server(url, retry_time=6):
"""cnap平台没有外网访问权限需要使用tp服务下载图片"""
proxies = [{"http": "http://10.229.197.142:8807"}, {"http": "http://10.229.197.161:8804"},
{"http": "http://10.229.198.143:8804"}, {"http": "http://10.122.108.164:8807"},
{"http": "http://10.122.108.165:8807"}, {"http": "http://10.122.108.166:8807"},
{"http": "http://10.122.108.168:8801"}, {"http": "http://10.122.150.146:8802"},
{"http": "http://10.122.150.158:8802"}, {"http": "http://10.122.150.164:8801"},
{"http": "http://10.143.51.38:8813"}, {"http": "http://10.143.103.42:8810"},
{"http": "http://10.143.194.45:8804"}, {"http": "http://10.143.226.25:8801"},
{"http": "http://10.143.236.12:8807"}, {"http": "http://10.143.238.36:8807"},
{"http": "http://10.144.71.30:8807"}, {"http": "http://10.144.73.16:8804"},
{"http": "http://10.144.138.36:8801"}, {"http": "http://10.144.152.40:8810"},
{"http": "http://10.144.199.29:8810"}, {"http": "http://10.144.251.29:8813"},
]
headers = {
"X-Tp-Authorization": "Basic RVJOSUVMaXRlVjpFUk5JRUxpdGVWXzFxYXo0cmZ2M2VkYzV0Z2Iyd3N4LWJmZS10cA==",
"scheme": "https"
}
new_url = url.replace("https://", "http://") if url.startswith("https://") else url
    # proxies may be unstable, so retry a few times
for idx in range(retry_time):
try:
response = requests.get(new_url, headers=headers, proxies=random.choice(proxies))
if response.status_code == 200:
image_data = io.BytesIO(response.content)
mime_type = response.headers.get('Content-Type')
if mime_type is None:
mime_type, _ = mimetypes.guess_type(url)
data_processor_logger.info(f"Detected MIME type: {mime_type}") # 调试信息
pil_image = process_image_data(image_data, mime_type, url)
return pil_image
except Exception as e:
data_processor_logger.error(f"Failed to download the image, idx: {idx}, URL: {url}, error: {e}")
raise Exception(f"Failed to download the image from URL: {url}")
def base64_to_pil_image(base64_string):
"""base64_to_pil_image"""
image_bytes = base64.b64decode(base64_string)
buffer = io.BytesIO(image_bytes)
pil_image = Image.open(buffer)
return pil_image
def is_public_url(url):
"""判断是否公网url"""
try:
# 解析URL
parsed_url = urlparse(url)
hostname = parsed_url.hostname
if hostname is None:
return False
# 尝试将域名解析为IP地址
ip_address = socket.gethostbyname(hostname)
# 转换为IP地址对象
ip_obj = ipaddress.ip_address(ip_address)
# 判断是否为私有IP或保留IP地址
if ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_link_local or ip_obj.is_reserved:
return False
else:
return True
except Exception as e:
print(f"Error checking URL: {e}")
return False
def process_transparency(image):
""" process transparency. """
def _is_transparent(image):
# 检查图片是否有alpha通道
if image.mode in ('RGBA', 'LA') or (image.mode == 'P' and 'transparency' in image.info):
# 获取alpha通道
alpha = image.convert('RGBA').split()[-1]
# 如果alpha通道中存在0说明图片有透明部分
if alpha.getextrema()[0] < 255:
return True
return False
def _convert_transparent_paste(image):
width, height = image.size
new_image = Image.new("RGB", (width, height), (255, 255, 255)) # 生成一张白色底图
new_image.paste(image, (0, 0), image)
return new_image
try:
if _is_transparent(image): # Check and fix transparent images
data_processor_logger.info("Image has transparent background, adding white background.")
image = _convert_transparent_paste(image)
    except Exception:
pass
return ImageOps.exif_transpose(image)
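# --- Usage sketch (illustrative) ---
# Flatten a half-transparent red RGBA image onto a white background and apply
# the EXIF orientation fix with process_transparency.
if __name__ == "__main__":
    rgba = Image.new("RGBA", (16, 16), (255, 0, 0, 128))
    flattened = process_transparency(rgba)
    print(flattened.mode, flattened.getpixel((0, 0)))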

View File

@@ -0,0 +1,241 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
import base64
from functools import partial
from io import BytesIO
from pathlib import Path
from typing import Optional
import numpy as np
import numpy.typing as npt
from PIL import Image
from .base import MediaIO
from .image import ImageMediaIO
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
"""
对视频帧进行缩放,将每一帧的大小调整为指定的高度和宽度。
Args:
frames (npt.NDArray, shape=(N, H, W, C)): 包含N个帧的三维数组其中H是高度W是宽度C是通道数。
所有帧都应该具有相同的通道数。
size (tuple[int, int], required): 一个元组,包含两个整数,分别表示目标高度和宽度。
Returns:
npt.NDArray, shape=(N, new_height, new_width, C): 返回一个新的三维数组,其中每一帧已经被缩放到指定的高度和宽度。
新数组的通道数与输入数组相同。
Raises:
None
"""
num_frames, _, _, channels = frames.shape
new_height, new_width = size
resized_frames = np.empty((num_frames, new_height, new_width, channels),
dtype=frames.dtype)
# lazy import cv2 to avoid bothering users who only use text models
import cv2
for i, frame in enumerate(frames):
resized_frame = cv2.resize(frame, (new_width, new_height))
resized_frames[i] = resized_frame
return resized_frames
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
"""
对视频帧进行缩放,将每个帧的高度和宽度都乘以一个因子。
Args:
frames (npt.NDArray): 形状为THWC的四维numpy数组表示T个帧高度为H宽度为W通道数为C。
size_factor (float): 用于缩放视频帧的因子新的高度和宽度将分别是原来的高度和宽度的size_factor倍。
Returns:
npt.NDArray: 形状为Tnew_Hnew_WC的四维numpy数组表示T个帧高度为new_H宽度为new_W通道数为C。
其中new_H和new_W是根据size_factor计算出来的。
Raises:
None
"""
_, height, width, _ = frames.shape
new_height = int(height * size_factor)
new_width = int(width * size_factor)
return resize_video(frames, (new_height, new_width))
def sample_frames_from_video(frames: npt.NDArray,
num_frames: int) -> npt.NDArray:
"""
从视频中随机选取指定数量的帧并返回一个包含这些帧的numpy数组。
Args:
frames (npt.NDArray): 形状为THWC的ndarray表示视频的所有帧其中T是帧的总数H、W是每个帧的高度和宽度C是通道数。
num_frames (int, optional): 要从视频中选取的帧数。如果设置为-1则将返回所有帧。默认为-1。
Returns:
npt.NDArray: 形状为num_framesHWC的ndarray表示选取的帧。如果num_frames=-1则返回原始的frames。
"""
total_frames = frames.shape[0]
if num_frames == -1:
return frames
frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
sampled_frames = frames[frame_indices, ...]
return sampled_frames
class VideoMediaIO(MediaIO[npt.NDArray]):
def __init__(
self,
image_io: ImageMediaIO,
*,
num_frames: int = 32,
) -> None:
"""
初始化一个 VideoMediaIO 对象。
Args:
image_io (ImageMediaIO): 用于读取和写入图像的 ImageMediaIO 对象。
num_frames (int, optional): 视频中帧数,默认为 32。
ImageMediaIO 对象必须支持指定帧数。
Raises:
TypeError: 如果 image_io 不是 ImageMediaIO 类型。
ValueError: 如果 num_frames 小于等于 0。
Returns:
None: 无返回值,直接初始化并设置属性。
"""
super().__init__()
self.image_io = image_io
self.num_frames = num_frames
def load_bytes(self, data: bytes) -> npt.NDArray:
"""
从字节数据加载视频帧,并返回一个 numpy ndarray。
如果字节数据中的视频帧数量大于指定的 `num_frames`,则将其平均分布到这些帧上;否则,返回所有帧。
Args:
data (bytes): 包含视频帧数据的字节对象。
Returns:
npt.NDArray, shape=(num_frames, height, width, channels): 返回一个 numpy ndarray其中包含了视频帧数据。
如果 `num_frames` 小于视频帧数量,则返回前 `num_frames` 帧;否则,返回所有帧。
Raises:
None.
"""
import decord
vr = decord.VideoReader(BytesIO(data), num_threads=1)
total_frame_num = len(vr)
num_frames = self.num_frames
if total_frame_num > num_frames:
uniform_sampled_frames = np.linspace(0,
total_frame_num - 1,
num_frames,
dtype=int)
frame_idx = uniform_sampled_frames.tolist()
else:
frame_idx = list(range(0, total_frame_num))
return vr.get_batch(frame_idx).asnumpy()
def load_base64(self, media_type: str, data: str) -> npt.NDArray:
"""
加载 base64 编码的数据,并返回 numpy ndarray。
Args:
media_type (str): 媒体类型,目前仅支持 "video/jpeg"
当为 "video/jpeg" 时,将解析每一帧的 base64 编码数据,并转换成 numpy ndarray。
data (str): base64 编码的字符串数据。
Returns:
npt.NDArray, optional: 如果 media_type 为 "video/jpeg",则返回 numpy ndarray 格式的视频数据;否则返回 None。
Raises:
None.
"""
if media_type.lower() == "video/jpeg":
load_frame = partial(
self.image_io.load_base64,
"image/jpeg",
)
return np.stack([
np.array(load_frame(frame_data))
for frame_data in data.split(",")
])
return self.load_bytes(base64.b64decode(data))
def load_file(self, filepath: Path) -> npt.NDArray:
"""
读取文件内容并将其转换为numpy数组。
Args:
filepath (Path): 文件路径对象,表示要读取的文件。
Returns:
npt.NDArray, optional: 返回一个numpy数组包含了文件内容。如果无法解析文件内容则返回None。
Raises:
无。
"""
with filepath.open("rb") as f:
data = f.read()
return self.load_bytes(data)
def encode_base64(
self,
media: npt.NDArray,
*,
video_format: str = "JPEG",
) -> str:
"""
将视频编码为Base64字符串每一帧都是一个Base64字符串。
如果视频格式为"JPEG"则每一帧都会被转换成JPEG图片并进行编码。
Args:
media (npt.NDArray): 要编码的视频形状为HWC或者THWC其中T为时间步长H和W分别为高度和宽度C为通道数。
当前仅支持JPEG格式。
video_format (str, optional, default="JPEG"): 视频格式,只支持"JPEG"。 Default to "JPEG".
Raises:
NotImplementedError: 当前仅支持JPEG格式。
Returns:
str: Base64字符串每一帧都是一个Base64字符串","连接起来。
"""
video = media
if video_format == "JPEG":
encode_frame = partial(
self.image_io.encode_base64,
image_format=video_format,
)
return ",".join(
encode_frame(Image.fromarray(frame)) for frame in video)
msg = "Only JPEG format is supported for now."
raise NotImplementedError(msg)
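# --- Usage sketch (illustrative) ---
# Encode a tiny synthetic 4-frame video as comma-separated base64 JPEG frames,
# then decode it back with load_base64("video/jpeg", ...). JPEG is lossy, so
# pixel values change, but the shape round-trips.
if __name__ == "__main__":
    video_io = VideoMediaIO(ImageMediaIO(), num_frames=4)
    frames = np.random.randint(0, 256, size=(4, 32, 32, 3), dtype=np.uint8)
    encoded = video_io.encode_base64(frames, video_format="JPEG")
    restored = video_io.load_base64("video/jpeg", encoded)
    print(restored.shape)  # (4, 32, 32, 3)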

View File

@@ -0,0 +1,59 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from fastdeploy.engine.config import ModelConfig
class InputPreprocessor:
"""
Args:
model_name_or_path (str):
Model name or path to the pretrained model. If a model name is provided, it should be a
key in the Hugging Face Transformers' model registry (https://huggingface.co/models).
The model will be downloaded from the Hugging Face model hub if necessary.
If a path is provided, the model will be loaded from that path.
enable_mm (bool, optional):
Whether to use the multi-modal model processor. Defaults to False.
Raises:
ValueError:
If the model name is not found in the Hugging Face Transformers' model registry and the path does not
exist.
"""
def __init__(
self,
model_name_or_path: str,
enable_mm: bool = False,
) -> None:
self.model_name_or_path = model_name_or_path
self.enable_mm = enable_mm
def create_processor(self):
"""
创建数据处理器。如果启用了多模态注册表,则使用该表中的模型;否则,使用传递给构造函数的模型名称或路径。
返回值DataProcessor如果不启用多模态注册表或MultiModalRegistry.Processor如果启用多模态注册表
Args:
无参数。
Returns:
DataProcessor or MultiModalRegistry.Processor (Union[DataProcessor, MultiModalRegistry.Processor]): 数据处理器。
"""
architectures = ModelConfig(self.model_name_or_path).architectures
from fastdeploy.input.text_processor import DataProcessor
self.processor = DataProcessor(model_name_or_path=self.model_name_or_path)
return self.processor
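# --- Usage sketch (illustrative; "path/to/model" is a placeholder) ---
# Build a text DataProcessor through InputPreprocessor.
if __name__ == "__main__":
    preprocessor = InputPreprocessor("path/to/model", enable_mm=False)
    processor = preprocessor.create_processor()
    print(type(processor).__name__)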

View File

@@ -0,0 +1,533 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from abc import ABC, abstractmethod
import numpy as np
from paddlenlp.generation import GenerationConfig
from paddlenlp.transformers import Llama3Tokenizer, LlamaTokenizer
from fastdeploy.utils import data_processor_logger
class BaseDataProcessor(ABC):
"""base class for data processor"""
def __init__(self):
"""
Returns:
None
"""
self.tokenizer = self._load_tokenizer()
self.tokenizer.bos_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.bos_token)
self.tokenizer.cls_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.cls_token)
self.tokenizer.sep_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.sep_token)
self.tokenizer.eos_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.eos_token)
self.tokenizer.mask_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.mask_token)
        data_processor_logger.info(
            f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, "
            f"cls_token is {self.tokenizer.cls_token}, {self.tokenizer.cls_token_id}, "
            f"sep_token is {self.tokenizer.sep_token}, {self.tokenizer.sep_token_id}, "
            f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}, "
            f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}"
        )
@abstractmethod
def process_request(self, request, **kwargs):
"""
Preprocess the request
Args:
request (Dict): may contain text and messages fields
**kwargs: others
Returns:
bool: Whether preprocessing is successful
str: error message
"""
raise NotImplementedError
@abstractmethod
def process_response(self, response_dict):
"""
Preprocess the response
Args:
response_dict (Dict): response for engine, contain ids fields
Returns:
Dict: response contain text fields
"""
raise NotImplementedError
def text2ids(self, text, max_model_len=None):
"""
text to token ids
Args:
text (str): text
Returns:
List[int]: token ids list
"""
raise NotImplementedError
def messages2ids(self, messages):
"""
Convert multi-turn messages into ID sequences.
Args:
messages (List[List[Dict[str, Any]]]): multi-turn messages.
Returns:
List[int]: ID sequences
"""
raise NotImplementedError
def ids2tokens(self, token_id, task_id=None):
"""
token ids to strings
Args:
token_id (List[int]): token id
task_id (str): task id
Returns:
List[str]: strings
"""
raise NotImplementedError
@abstractmethod
def _load_tokenizer(self):
"""
load tokenizer
Returns:
tokenizer (AutoTokenizer)
"""
raise NotImplementedError
class DataProcessor(BaseDataProcessor):
def __init__(self, model_name_or_path):
"""
        Initializes the DataProcessor object.
Args:
model_name_or_path (str): The name or path of the pre-trained model to be loaded.
Can also be a path to a directory containing the pre-trained model file.
Returns:
None.
Raises:
None.
"""
self.model_name_or_path = model_name_or_path
self._init_config()
self.decode_status = dict()
self.tokenizer = self._load_tokenizer()
data_processor_logger.info(
f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, \
eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
)
from paddlenlp.trl.llm_utils import get_eos_token_id
self.eos_token_ids = get_eos_token_id(self.tokenizer,
self.generation_config)
self.eos_token_id_len = len(self.eos_token_ids)
self.pad_token_id = self.get_pad_id()
self.tokenizer.pad_token_id = self.pad_token_id
def _init_config(self):
"""
初始化配置包括模型名称、使用Hugging Face Tokenizer等。
Args:
无参数,但是会从环境变量中获取一些配置信息。
Returns:
无返回值,直接修改了类的属性。
Raises:
无异常抛出。
"""
self.use_hf_tokenizer = int(os.getenv("USE_HF_TOKENIZER", "0")) == 1
# Generation config
try:
self.generation_config = GenerationConfig.from_pretrained(
self.model_name_or_path)
except Exception as e:
data_processor_logger.warning(
f"Can't find generation config: {e}, so it will not use generation_config field in the model config"
)
self.generation_config = None
def process_request(self, request, max_model_len=None):
"""
Preprocess the request
Args:
request (Dict): may contain text and messages fields
Returns:
bool: Whether preprocessing is successful
str: error message
"""
if request.get("eos_token_ids") is None or len(
request.eos_token_ids) == 0:
request.eos_token_ids = self.eos_token_ids
stop_sequences = request.get("stop", [])
if stop_sequences is not None and len(stop_sequences) != 0:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
request.set("stop_token_ids", stop_seqs)
request.set("stop_seqs_len", stop_seqs_len)
if request.prompt_token_ids is None or len(
request.prompt_token_ids) == 0:
if request.prompt is not None:
request.prompt_token_ids = self.text2ids(
request.prompt, max_model_len, request.raw_request)
elif request.messages is not None:
if self.tokenizer.chat_template is None:
raise ValueError(
"This model does not support chat_template.")
request.prompt_token_ids = self.messages2ids(request.messages)
else:
raise ValueError(
f"The request should have `input_ids`, `text` or `messages`: {request}."
)
if max_model_len is not None and len(
request.prompt_token_ids) > max_model_len:
            request.prompt_token_ids = request.prompt_token_ids[:max_model_len - 1]
return request
def process_request_dict(self, request, max_model_len=None):
"""
Preprocess the request
Args:
request (Dict): may contain text and messages fields
Returns:
bool: Whether preprocessing is successful
str: error message
"""
if not request.get('eos_token_ids'):
request['eos_token_ids'] = self.eos_token_ids
        # handle stop_sequences
stop_sequences = request.get('stop', [])
if stop_sequences:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
request['stop_token_ids'] = stop_seqs
request['stop_seqs_len'] = stop_seqs_len
        # handle prompt_token_ids
if not request.get('prompt_token_ids'):
if 'prompt' in request:
raw_request = request.get('raw_request', True)
request['prompt_token_ids'] = self.text2ids(
request['prompt'], max_model_len, raw_request).tolist()
elif 'messages' in request:
if self.tokenizer.chat_template is None:
raise ValueError(
"This model does not support chat_template.")
request['prompt_token_ids'] = self.messages2ids(
request['messages']).tolist()
else:
raise ValueError(
f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
)
        # truncate prompts that exceed the length limit
if max_model_len is not None and len(
request['prompt_token_ids']) > max_model_len:
request['prompt_token_ids'] = request[
'prompt_token_ids'][:max_model_len - 1]
return request
def process_response(self, response_dict, **kwargs):
"""
Preprocess the response
Args:
response_dict (Dict): response for engine, contain ids fields
Returns:
Dict: response contain text fields
"""
is_end = response_dict.finished
req_id = response_dict.request_id
token_ids = response_dict.outputs.token_ids
response_dict.outputs.text = self.ids2tokens(token_ids, req_id)
response_dict.usage = {
"completion_tokens": response_dict.outputs.index + 1
}
if is_end:
self.clear_request_status(req_id)
            data_processor_logger.debug(
                "Request id: {} has been completed.".format(req_id))
response_dict.outputs.text = self.ids2tokens(token_ids, req_id)
self.clear_request_status(req_id)
return response_dict
def process_response_dict(self, response_dict, stream=True):
"""
Preprocess the response
Args:
response_dict (Dict): response for engine, contain ids fields
Returns:
Dict: response contain text fields
"""
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
token_ids = response_dict["outputs"]["token_ids"]
if is_end:
            data_processor_logger.debug(
                "Request id: {} has been completed.".format(req_id))
full_text = self.clear_request_status(req_id)
if not stream:
response_dict["outputs"]["text"] = full_text
else:
response_dict["outputs"]["text"] = ""
else:
response_dict["outputs"]["text"] = self.ids2tokens(
token_ids, req_id)
return response_dict
def text2ids(self, text, max_model_len, raw_request=True):
"""
text to token ids
Args:
text (str): text
Returns:
List[int]: token ids list
"""
if self.use_hf_tokenizer:
tokens = self.tokenizer(
text,
return_tensors="np",
padding=True,
truncation=True,
)
else:
if not raw_request or self.tokenizer.chat_template is None:
text = [text] if isinstance(text, str) else text
chat_template = False
elif self.tokenizer.chat_template is not None:
text = [text] if isinstance(text, str) else text
text = [
self.tokenizer.apply_chat_template(sentence,
tokenize=False)
for sentence in text
]
chat_template = True
tokens = self.tokenizer(
text,
return_tensors="np",
padding=True,
truncation=True,
max_length=max_model_len,
add_special_tokens=chat_template,
)
return tokens["input_ids"][0]
def messages2ids(self, messages):
"""
Convert multi-turn messages into ID sequences.
Args:
messages (List[List[Dict[str, Any]]]): multi-turn messages.
Returns:
List[int]: ID sequences
"""
message_result = self.tokenizer.apply_chat_template(
messages, return_tensors="pd")
return np.array(message_result["input_ids"][0])
def ids2tokens(self, token_id, task_id):
"""
token ids to strings
Args:
token_ids (List[int]): token ids
task_id (str): task id
Returns:
List[str]: strings
"""
if self.use_hf_tokenizer:
if task_id not in self.decode_status:
                # history token ids & history token strings & previously decoded str
self.decode_status[task_id] = [[], [], ""]
previous_token_ids = self.decode_status[task_id][0]
decode_str = self.tokenizer.batch_decode(
[previous_token_ids + token_id],
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
if isinstance(decode_str, list) and len(decode_str):
new_str = decode_str[0].replace(self.decode_status[task_id][2],
"", 1)
self.decode_status[task_id][1].append(new_str)
self.decode_status[task_id][2] = decode_str[0]
else:
new_str = ""
self.decode_status[task_id][0] += token_id
return new_str
else:
if task_id not in self.decode_status:
# prefix offset & read offset & history token ids & history token strings
self.decode_status[task_id] = [0, 0, [], []]
prefix_offset = self.decode_status[task_id][0]
read_offset = self.decode_status[task_id][1]
previous_token_ids = self.decode_status[task_id][2]
decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
previous_token_ids + token_id, prefix_offset, read_offset)
self.decode_status[task_id][0] = prefix_offset
self.decode_status[task_id][1] = read_offset
self.decode_status[task_id][2] += token_id
self.decode_status[task_id][3].append(decode_str)
return decode_str
def _load_tokenizer(self):
"""
load tokenizer
Returns:
tokenizer (AutoTokenizer)
"""
if self.use_hf_tokenizer:
from transformers import AutoTokenizer
return AutoTokenizer.from_pretrained(self.model_name_or_path,
use_fast=False)
else:
from paddlenlp.transformers import AutoTokenizer
return AutoTokenizer.from_pretrained(self.model_name_or_path,
padding_side="left",
use_fast=True)
def clear_request_status(self, task_id):
"""
clear request status
Args:
task_id (str): task id
Returns:
results_all (str): all token strings
"""
results_all = ""
if task_id in self.decode_status:
if self.use_hf_tokenizer:
results_all = self.decode_status[task_id][2]
else:
results_all = "".join(self.decode_status[task_id][3])
del self.decode_status[task_id]
return results_all
def get_pad_id(self):
"""
get pad_token_id, if not pad_token_id, use eos_token
Returns:
int: pad_token_id
"""
        if isinstance(self.tokenizer,
                      (LlamaTokenizer,
                       Llama3Tokenizer)) and not self.tokenizer.pad_token_id:
            return self.tokenizer.eos_token_id
        return self.tokenizer.pad_token_id
def pad_batch_data(self,
insts,
pad_id=0,
return_seq_len=False,
return_array=True,
pad_style="right"):
"""Pad the instances to the max sequence length in batch."""
if len(insts) == 0:
padded_insts = np.array([[]],
dtype=np.int64) if return_array else [[]]
if return_seq_len:
seq_len = np.array([], dtype=np.int64) if return_array else []
return padded_insts, seq_len
return padded_insts
max_len = max(map(len, insts))
if pad_style == "left":
padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst)
for inst in insts]
else:
padded_insts = [
list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts
]
if return_array:
padded_insts = np.array(padded_insts,
dtype=np.int64).reshape([-1, max_len])
if return_seq_len:
seq_len = [len(inst) for inst in insts]
if return_array:
seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1)
return padded_insts, seq_len
return padded_insts
def update_stop_seq(self, stop_sequences):
"""
Update stop sequences from request.
"""
stop_seqs = []
for seq in stop_sequences:
if seq != self.tokenizer.eos_token_id:
stop_seqs.append(
self.tokenizer.convert_tokens_to_ids(
self.tokenizer.tokenize(seq)))
stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs,
pad_id=-1,
return_seq_len=True,
return_array=False)
data_processor_logger.debug(
f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
return stop_seqs, stop_seqs_len
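# --- Usage sketch (illustrative; "path/to/model" and the token ids are placeholders) ---
# Incremental decoding with DataProcessor: feed token-id chunks under one task
# id; ids2tokens returns only the newly decoded text on each call, and
# clear_request_status releases the per-task state and returns the full text.
if __name__ == "__main__":
    processor = DataProcessor("path/to/model")
    for chunk in ([101], [102, 103]):
        piece = processor.ids2tokens(chunk, task_id="req-0")
        print(piece, end="")
    print()
    print(processor.clear_request_status("req-0"))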