[LLM] First commit the llm deployment code

jiangjiajun
2025-06-09 19:20:15 +08:00
committed by XieYunshen
parent 8513414112
commit 149c79699d
11814 changed files with 127294 additions and 1293102 deletions

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,269 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from paddlenlp.transformers import AddedToken, PretrainedTokenizer
from paddlenlp.utils import logger
__all__ = ["ErnieBotTokenizer"]
VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {},
"tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
class ErnieBotTokenizer(PretrainedTokenizer):
"""
Construct an ErnieBot tokenizer, based on SentencePiece.
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
vocab_files_names = VOCAB_FILES_NAMES
resource_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
**kwargs,
):
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
verbose=False,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
# for eb35 reader
self.bos_id = self.bos_token_id
self.eos_id = self.eos_token_id
self.sep_id = self.sep_token_id
self.pad_id = self.pad_token_id
self.unk_id = self.unk_token_id
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
def tokenize(self, text):
"""Returns a tokenized string."""
return self._tokenize(text)
def _tokenize(self, text):
"""Returns a tokenized string."""
return self.sp_model.encode(text, out_type=str)
def decode(self,
tokens,
skip_special_tokens=False,
clean_up_tokenization_spaces=False):
"""Returns a tokenized string."""
return self.sp_model.decode(tokens)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special and i != 0:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(
f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") +
VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file, )
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
""" build_inputs_with_special_tokens """
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True)
bos_token_id = [1] if self.add_bos_token else []
eos_token_id = [1] if self.add_eos_token else []
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
def create_token_type_ids_from_sequences(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Creates a mask from the two sequences passed, to be used in a sequence-pair classification task. A
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
if token_ids_1 is not None:
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
return output
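
A minimal usage sketch for ErnieBotTokenizer (illustrative, not part of the commit); "./spm.model" is a placeholder path to a real SentencePiece model file:

tok = ErnieBotTokenizer(vocab_file="./spm.model")  # placeholder path
ids = tok.convert_tokens_to_ids(tok.tokenize("Hello world"))
ids = tok.build_inputs_with_special_tokens(ids)  # add_bos_token=True by default
assert ids[0] == tok.bos_token_id                # bos prefix added, no eos by default
print(tok.decode(ids))                           # round-trip through sp_model.decode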

View File

@@ -0,0 +1,23 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .process import DataProcessor, fancy_print, IDS_TYPE_FLAG
__all__ = [
'DataProcessor',
'fancy_print',
'IDS_TYPE_FLAG',
]

View File

@@ -0,0 +1,20 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .get_image_preprocessor import get_image_preprocessor
from .image_preprocessor_adaptive import AdaptiveImageProcessor
__all__ = ['get_image_preprocessor', 'AdaptiveImageProcessor']

View File

@@ -0,0 +1,33 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""get image preprocessor"""
from .image_preprocessor_adaptive import AdaptiveImageProcessor
from fastdeploy.utils import data_processor_logger
def get_image_preprocessor(args):
"""
get_image_preprocessor from args
"""
if args.vision_model_name_or_path is None:
return None
data_processor_logger.info("use AdaptiveImageProcessor")
image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
return image_preprocess
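
get_image_preprocessor only reads one attribute from args, so any object exposing vision_model_name_or_path works; a minimal sketch with a placeholder checkpoint path:

from types import SimpleNamespace
args = SimpleNamespace(vision_model_name_or_path="./vision_model")  # placeholder dir
preprocessor = get_image_preprocessor(args)  # AdaptiveImageProcessor, or None when the path is None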

View File

@@ -0,0 +1,568 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""image preprocessor adaptive"""
import math
from typing import List, Optional, Union
import numpy as np
import paddle
import PIL
from paddlenlp.transformers.feature_extraction_utils import BatchFeature
from paddlenlp.transformers.image_processing_utils import BaseImageProcessor
from paddlenlp.transformers.image_transforms import (
convert_to_rgb,
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from paddlenlp.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_valid_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddlenlp.transformers.tokenizer_utils_base import (
TensorType,
)
from PIL import Image
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200
VIDEO_MIN_PIXELS = 128 * 28 * 28
VIDEO_MAX_PIXELS = 768 * 28 * 28
VIDEO_TOTAL_PIXELS = 24576 * 28 * 28
FRAME_FACTOR = 2
FPS = 2.0
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768
VideoInput = Union[
List["PIL.Image.Image"],
"np.ndarray",
"paddle.Tensor",
List["np.ndarray"],
List["paddle.Tensor"],
List[List["PIL.Image.Image"]],
List[List["np.ndarrray"]],
List[List["paddle.Tensor"]],
]
__all__ = [
"AdaptiveImageProcessor",
]
def is_scaled_image(image: np.ndarray) -> bool:
"""
Checks to see whether the pixel values have already been rescaled to [0, 1].
"""
if image.dtype == np.uint8:
return False
# It's possible the image has pixel values in [0, 255] but is of floating type
return np.min(image) >= 0 and np.max(image) <= 1
def make_batched_images(images) -> List[List[ImageInput]]:
"""
Accepts images in list or nested list format, and makes a list of images for preprocessing.
Args:
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
The input image.
Returns:
list: A list of images.
"""
if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
return [img for img_list in images for img in img_list]
elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
return images
elif is_valid_image(images):
return [images]
raise ValueError(f"Could not make batched images from {images}")
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
def make_batched_videos(videos) -> List[VideoInput]:
"""dummy"""
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
if isinstance(videos[0], Image.Image):
return [videos]
elif len(videos[0].shape) == 4:
return [list(video) for video in videos]
elif is_valid_image(videos) and len(videos.shape) == 4:
return [list(videos)]
raise ValueError(f"Could not make batched video from {videos}")
class AdaptiveImageProcessor(BaseImageProcessor):
r"""
Constructs an adaptive image processor that dynamically resizes images based on their original size.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
in the image.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
min_pixels (`int`, *optional*, defaults to `56 * 56`):
The min pixels of the image to resize the image.
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
The max pixels of the image to resize the image.
patch_size (`int`, *optional*, defaults to 14):
The spatial patch size of the vision encoder.
temporal_conv_size (`int`, *optional*, defaults to 2):
The temporal conv size in resampler.
merge_size (`int`, *optional*, defaults to 2):
The merge size of the vision encoder to llm encoder.
"""
model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
def __init__(
self,
do_resize: bool = True,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = True,
min_pixels: int = 56 * 56,
max_pixels: int = 28 * 28 * 1280,
patch_size: int = 14,
temporal_conv_size: int = 2,
merge_size: int = 2,
**kwargs,
) -> None:
"""init"""
super().__init__(**kwargs)
self.do_resize = do_resize
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.patch_size = patch_size
self.temporal_conv_size = temporal_conv_size
self.merge_size = merge_size
self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
self.do_convert_rgb = do_convert_rgb
def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
"""设定pixels"""
if min_pixels is not None:
assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be a non-negative int"
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
self.min_pixels = min_pixels
self.size["min_pixels"] = int(min_pixels)
if max_pixels is not None:
assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be a positive int"
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
self.max_pixels = max_pixels
self.size["max_pixels"] = int(max_pixels)
def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
"""dummy"""
actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
resized_height, resized_width = smart_resize(
height,
width,
factor=self.patch_size * self.merge_size,
min_pixels=actual_min_pixels,
max_pixels=actual_max_pixels,
)
return (resized_height, resized_width), (resized_height // self.patch_size, resized_width // self.patch_size)
def _preprocess(
self,
images: Union[ImageInput, VideoInput],
do_resize: bool = True,
resample: PILImageResampling = None,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = False,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
predetermined_grid_thw=None,
):
"""
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
Args:
images (`ImageInput`):
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
If pixel values range from 0 to 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Scale factor to use if rescaling the image.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Mean to use if normalizing the image.
Can be a float or a list of floats corresponding to the number of channels in the image.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Standard deviation to use if normalizing the image.
Can be a float or a list of floats corresponding to the number of channels in the image.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
images = make_list_of_images(images)
if do_convert_rgb:
images = [convert_to_rgb(image) for image in images]
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if is_scaled_image(images[0]) and do_rescale:
data_processor_logger.warning(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
height, width = get_image_size(images[0], channel_dim=input_data_format)
resized_height, resized_width = height, width
processed_images = []
if predetermined_grid_thw is not None:
assert len(predetermined_grid_thw) == len(
images
), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
for img_idx, image in enumerate(images):
if do_resize:
if predetermined_grid_thw is not None:
(resized_height, resized_width) = predetermined_grid_thw[img_idx]
resized_height *= self.patch_size
resized_width *= self.patch_size
else:
resized_height, resized_width = smart_resize(
height,
width,
factor=self.patch_size * self.merge_size,
min_pixels=self.min_pixels,
max_pixels=self.max_pixels,
)
image = image.astype("uint8")  # TODO: cast to uint8 manually; otherwise the image is divided by 255 twice and the result is wrong
# Build with Image.fromarray directly instead of the paddlenlp helper
image = Image.fromarray(image)
image = resize(
image,
size=(resized_height, resized_width),
resample=resample,
data_format=input_data_format,
)
if do_rescale:
image = rescale(image, scale=rescale_factor, data_format=input_data_format)
if do_normalize:
image = normalize(image=image, mean=image_mean, std=image_std, data_format=input_data_format)
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
processed_images.append(image)
patches = np.array(processed_images)
if data_format == ChannelDimension.LAST:
patches = patches.transpose([0, 3, 1, 2])
channel = patches.shape[1] # [time, C, H, W]
grid_t = patches.shape[0]
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
patches = patches.reshape(
[
grid_t,
channel,
grid_h // self.merge_size,
self.merge_size,
self.patch_size,
grid_w // self.merge_size,
self.merge_size,
self.patch_size,
]
)
# [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
flatten_patches = patches.reshape(
[grid_t * grid_h * grid_w, channel * self.patch_size * self.patch_size]
) # [grid_t * grid_h * grid_w, C * psz * psz]
return flatten_patches, (grid_t, grid_h, grid_w)
def preprocess(
self,
images: ImageInput,
videos: VideoInput = None,
do_resize: bool = True,
size: Optional[Union[int, List[int]]] = None,
resample: PILImageResampling = None,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
predetermined_grid_thw=None,
):
"""
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
videos (`VideoInput`):
Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
`True`.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.PADDLE` or `'pd'`: Return a batch of type `paddle.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
if images is not None:
images = make_batched_images(images)
if videos is not None:
videos = make_batched_videos(videos)
if images is not None and not valid_images(images):
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
if images is not None:
pixel_values, vision_grid_thws = [], []
for img_idx, image in enumerate(images):
if predetermined_grid_thw is not None:
predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
else:
predetermined_grid_thw_one = None
patches, image_grid_thw = self._preprocess(
image,
do_resize=do_resize,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
predetermined_grid_thw=predetermined_grid_thw_one,
)
pixel_values.extend(patches)
vision_grid_thws.append(image_grid_thw)
pixel_values = np.array(pixel_values)
vision_grid_thws = np.array(vision_grid_thws)
data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
if videos is not None:
pixel_values, vision_grid_thws = [], []
for images in videos:
patches, video_grid_thw = self._preprocess(
images,
do_resize=do_resize,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
predetermined_grid_thw=predetermined_grid_thw,
)
pixel_values.extend(patches)
vision_grid_thws.append(video_grid_thw)
pixel_values = np.array(pixel_values)
vision_grid_thws = np.array(vision_grid_thws)
data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
return BatchFeature(data=data, tensor_type=return_tensors)
def round_by_factor(number: int, factor: int) -> int:
"""Returns the closest integer to 'number' that is divisible by 'factor'."""
return round(number / factor) * factor
def ceil_by_factor(number: int, factor: int) -> int:
"""Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
return math.ceil(number / factor) * factor
def floor_by_factor(number: int, factor: int) -> int:
"""Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
return math.floor(number / factor) * factor
def smart_resize(
height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
):
"""
Rescales the image so that the following conditions are met:
1. Both dimensions (height and width) are divisible by 'factor'.
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
"""
if max(height, width) / min(height, width) > MAX_RATIO:
if height > width:
new_width = max(factor, round_by_factor(width, factor))
new_height = floor_by_factor(new_width * MAX_RATIO, factor)
else:
new_height = max(factor, round_by_factor(height, factor))
new_width = floor_by_factor(new_height * MAX_RATIO, factor)
data_processor_logger.info(
f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}, "
f"resize to {max(new_height, new_width) / min(new_height, new_width)}"
)
height = new_height
width = new_width
h_bar = max(factor, round_by_factor(height, factor))
w_bar = max(factor, round_by_factor(width, factor))
if h_bar * w_bar > max_pixels:
beta = math.sqrt((height * width) / max_pixels)
h_bar = floor_by_factor(height / beta, factor)
w_bar = floor_by_factor(width / beta, factor)
elif h_bar * w_bar < min_pixels:
beta = math.sqrt(min_pixels / (height * width))
h_bar = ceil_by_factor(height * beta, factor)
w_bar = ceil_by_factor(width * beta, factor)
if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
return h_bar, w_bar
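
A worked example of smart_resize under the module defaults (factor=28 and the MIN_PIXELS/MAX_PIXELS constants above); expected values are computed by hand from the code:

# 1000x650: each side rounds to the nearest multiple of 28 and the area stays in bounds.
assert smart_resize(1000, 650) == (1008, 644)  # 36*28 and 23*28
# 8000x8000 exceeds MAX_PIXELS, so both sides shrink by beta = sqrt(h*w / MAX_PIXELS)
# and are floored to a multiple of 28: 128*28 = 3584, and 3584*3584 == MAX_PIXELS.
assert smart_resize(8000, 8000) == (3584, 3584)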

View File

@@ -0,0 +1,388 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
""" process.py """
import copy
import io
from collections import defaultdict
from typing import Any, Dict, List, Union
import numpy as np
from paddlenlp.transformers.image_utils import ChannelDimension
from PIL import Image
from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
from .tokenizer import ErnieVLTokenizer  # assumed relative path; the tokenizer package exports ErnieVLTokenizer
from .process_video import read_frames_decord, read_video_decord
from .utils.io_utils import RAW_IMAGE_DIR, get_downloadable
from .utils.render_timestamp import render_frame_timestamp
IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
def fancy_print(input_ids, tokenizer, image_patch_id=None):
"""
input_ids: input_ids
tokenizer: the tokenizer of models
"""
i = 0
res = ""
text_ids = []
real_image_token_len = 0
while i < len(input_ids):
if input_ids[i] == image_patch_id:
if len(text_ids) > 0:
res += tokenizer.decode(text_ids)
text_ids = []
real_image_token_len += 1
else:
if real_image_token_len != 0:
res += f"<|IMAGE@{real_image_token_len}|>"
real_image_token_len = 0
text_ids.append(input_ids[i])
i += 1
if len(text_ids) > 0:
res += tokenizer.decode(text_ids)
text_ids = []
return res
class DataProcessor:
"""
Processes multimodal chat messages into model-ready inputs,
handling text, images, and videos with 3D positional embeddings.
"""
CLS_TOKEN = "<|begin_of_sentence|>"
SEP_TOKEN = "<|end_of_sentence|>"
IMG_START = "<|IMAGE_START|>"
IMG_END = "<|IMAGE_END|>"
VID_START = "<|VIDEO_START|>"
VID_END = "<|VIDEO_END|>"
def __init__(
self,
tokenizer_name: str,
image_preprocessor_name: str,
spatial_conv_size: int = 2,
temporal_conv_size: int = 2,
image_min_pixels: int = 4 * 28 * 28,
image_max_pixels: int = 6177 * 28 * 28,
video_min_pixels: int = 299 * 28 * 28,
video_max_pixels: int = 1196 * 28 * 28,
video_target_frames: int = -1,
video_frames_sample: str = "leading",
video_max_frames: int = 180,
video_min_frames: int = 16,
video_fps: int = 2,
) -> None:
# Tokenizer and image preprocessor
self.tokenizer = ErnieVLTokenizer.from_pretrained(tokenizer_name, verbose=False)
self.tokenizer.ignored_index = -100
self.image_preprocessor = AdaptiveImageProcessor.from_pretrained(image_preprocessor_name)
# Convolution sizes for patch aggregation
self.spatial_conv_size = spatial_conv_size
self.temporal_conv_size = temporal_conv_size
# Pixel constraints
self.image_min_pixels = image_min_pixels
self.image_max_pixels = image_max_pixels
self.video_min_pixels = video_min_pixels
self.video_max_pixels = video_max_pixels
# Video sampling parameters
self.target_frames = video_target_frames
self.frames_sample = video_frames_sample
self.max_frames = video_max_frames
self.min_frames = video_min_frames
self.fps = video_fps
# Special tokens and IDs
self.cls_token = self.CLS_TOKEN
self.sep_token = self.SEP_TOKEN
self.image_start = self.IMG_START
self.image_end = self.IMG_END
self.video_start = self.VID_START
self.video_end = self.VID_END
self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")
self.token_type_mapping = self._build_token_type_mapping()
self.is_training = True
self.role_prefixes = {"system": "", "user": "User: ", "bot": "Assistant: ", "assistant": "Assistant: "}
def _build_token_type_mapping(self) -> Dict[Any, int]:
mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
for token in (self.IMG_START, self.IMG_END, self.VID_START, self.VID_END):
mapping[token] = IDS_TYPE_FLAG["image"]
mapping[self.image_patch_id] = IDS_TYPE_FLAG["image"]
return mapping
def train(self) -> None:
"""Enable training mode (produces labels)."""
self.is_training = True
def eval(self) -> None:
"""Enable evaluation mode (doesn't produce labels)."""
self.is_training = False
def process(self, messages: List[Dict[str, Any]]) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
"""
Convert chat messages into model inputs.
Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
"""
outputs = {
"input_ids": [],
"token_type_ids": [],
"position_ids": [],
"images": [],
"grid_thw": [],
"image_type_ids": [],
"labels": [],
"cur_position": 0,
"pic_cnt": 0,
"video_cnt": 0,
}
self._add_special_token(self.cls_token, outputs)
for msg in messages:
role = msg.get("role")
assert role in self.role_prefixes, f"Unsupported role: {role}"
prefix = self.role_prefixes[role]
if prefix:
self._add_text(prefix, outputs)
content_items = msg.get("content")
if not isinstance(content_items, list):
content_items = [content_items]
for item in content_items:
if isinstance(item, str) or item.get("type") == "text":
text = item if isinstance(item, str) else item.get("text", "")
self._add_text(text, outputs)
elif item.get("type") == "image_url" or item.get("type") == "image":
self._add_image(item, outputs)
elif item.get("type") == "video_url" or item.get("type") == "video":
self._add_video(item, outputs)
if role in ("user", "system"):
self._add_text("\n", outputs)
else:
self._add_special_token(self.sep_token, outputs)
if not self.is_training:
# Append assistant prefix in eval
self._add_text(self.role_prefixes["bot"], outputs)
return outputs
def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:
token_id = token if isinstance(token, int) else self.tokenizer.convert_tokens_to_ids(token)
outputs["input_ids"].append(token_id)
outputs["token_type_ids"].append(self.token_type_mapping[token])
pos = outputs["cur_position"]
outputs["position_ids"].append([pos] * 3)
outputs["cur_position"] += 1
def _add_text(self, text: str, outputs: Dict) -> None:
tokens = self.tokenizer.encode(text, add_special_tokens=False)["input_ids"]
outputs["input_ids"].extend(tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))
start = outputs["cur_position"]
for i in range(len(tokens)):
outputs["position_ids"].append([start + i] * 3)
outputs["cur_position"] += len(tokens)
def _add_image(self, item: Dict, outputs: Dict) -> None:
url_info = item.get("image_url", {})
w = url_info.get("image_width", None)
h = url_info.get("image_height", None)
if "image" in item:
img = item["image"]
else:
url = url_info.get("url")
data = get_downloadable(url, download_dir=RAW_IMAGE_DIR, save_to_disk=False)
img = Image.open(io.BytesIO(data) if isinstance(data, bytes) else data)
if w and h:
img = img.resize((w, h))
outputs["pic_cnt"] += 1
self._add_text(f"Picture {outputs['pic_cnt']}:", outputs)
self._add_special_token(self.IMG_START, outputs)
patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
img.height,
img.width,
min_pixels=self.image_min_pixels,
max_pixels=self.image_max_pixels,
)[1]
num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
outputs["position_ids"].extend(pos_ids)
outputs["cur_position"] = np.max(pos_ids) + 1
# Preprocess pixels
ret = self.image_preprocessor.preprocess(
images=[img.convert("RGB")],
do_normalize=False,
do_rescale=False,
predetermined_grid_thw=np.array([[patches_h, patches_w]]),
do_convert_rgb=True,
input_data_format=ChannelDimension.LAST,
)
outputs["images"].append(ret["pixel_values"])
outputs["grid_thw"].append(ret["image_grid_thw"])
outputs["image_type_ids"].append(0)
self._add_special_token(self.IMG_END, outputs)
def _add_video(self, item: Dict, outputs: Dict) -> None:
url_info = item.get("video_url", {})
url = url_info.get("url")
outputs["video_cnt"] += 1
self._add_text(f"Video {outputs['video_cnt']}:", outputs)
self._add_special_token(self.VID_START, outputs)
if "video" in item:
video_path = item["video"]
frames = self._load_and_process_video(video_path, item)
else:
video_path = get_downloadable(url, save_to_disk=False)
frames = self._load_and_process_video(video_path, item)
patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
frames[0].height,
frames[0].width,
min_pixels=self.video_min_pixels,
max_pixels=self.video_max_pixels,
)[1]
num_frames = len(frames)
num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
ret = self.image_preprocessor.preprocess(
images=None,
videos=pixel_stack,
do_normalize=False,
do_rescale=False,
predetermined_grid_thw=np.array([[patches_h, patches_w]] * num_frames),
do_convert_rgb=True,
input_data_format=ChannelDimension.LAST,
)
outputs["images"].append(ret["pixel_values_videos"])
outputs["grid_thw"].append(ret["video_grid_thw"])
outputs["image_type_ids"].extend([1] * num_frames)
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
outputs["position_ids"].extend(pos_ids)
outputs["cur_position"] = np.max(pos_ids) + 1
self._add_special_token(self.VID_END, outputs)
def _load_and_process_video(self, url: str, item: Dict) -> List[Image.Image]:
reader, meta, path = read_video_decord(url, save_to_disk=False)
video_frame_args = dict()
video_frame_args["fps"] = item.get("fps", self.fps)
video_frame_args["min_frames"] = item.get("min_frames", self.min_frames)
video_frame_args["max_frames"] = item.get("max_frames", self.max_frames)
video_frame_args["target_frames"] = item.get("target_frames", self.target_frames)
video_frame_args["frames_sample"] = item.get("frames_sample", self.frames_sample)
video_frame_args = self._set_video_frame_args(video_frame_args, meta)
frames_data, _, timestamps = read_frames_decord(
path,
reader,
meta,
target_frames=video_frame_args["target_frames"],
target_fps=video_frame_args["fps"],
frames_sample=video_frame_args["frames_sample"],
save_to_disk=False,
)
frames: List[Image.Image] = []
for img_array, ts in zip(frames_data, timestamps):
frames.append(render_frame_timestamp(img_array, ts))
# Ensure even number of frames for temporal conv
if len(frames) % 2 != 0:
frames.append(copy.deepcopy(frames[-1]))
return frames
def _set_video_frame_args(self, video_frame_args, video_meta):
"""
根据已知参数和优先级,设定最终的抽帧参数
"""
# 优先级video_target_frames > (video_min_frames, video_max_frames) > video_fps
if video_frame_args["target_frames"] > 0:
if video_frame_args["fps"] >= 0:
raise ValueError("fps must be negative if target_frames is given")
if (
video_frame_args["min_frames"] > 0
and video_frame_args["target_frames"] < video_frame_args["min_frames"]
):
raise ValueError("target_frames must be larger than min_frames")
if (
video_frame_args["max_frames"] > 0
and video_frame_args["target_frames"] > video_frame_args["max_frames"]
):
raise ValueError("target_frames must be smaller than max_frames")
else:
if video_frame_args["fps"] < 0:
raise ValueError("Must provide either positive target_fps or positive target_frames.")
# First compute how many frames would be sampled at video_fps
frames_to_extract = int(video_meta["duration"] * video_frame_args["fps"])
# Check whether that count falls inside the target range; if not, clamp target_frames to the bound
if (
video_frame_args["min_frames"] > 0
and video_frame_args["max_frames"] > 0
and video_frame_args["min_frames"] > video_frame_args["max_frames"]
):
raise ValueError("min_frames must be smaller than max_frames")
if video_frame_args["min_frames"] > 0 and frames_to_extract < video_frame_args["min_frames"]:
video_frame_args["target_frames"] = video_frame_args["min_frames"]
video_frame_args["fps"] = -1
if video_frame_args["max_frames"] > 0 and frames_to_extract > video_frame_args["max_frames"]:
video_frame_args["target_frames"] = video_frame_args["max_frames"]
video_frame_args["fps"] = -1
return video_frame_args
def _compute_3d_positions(self, t: int, h: int, w: int, start_idx: int) -> List[List[int]]:
# Downsample time if needed
t_eff = t // self.temporal_conv_size if t != 1 else 1
gh, gw = h // self.spatial_conv_size, w // self.spatial_conv_size
time_idx = np.repeat(np.arange(t_eff), gh * gw)
h_idx = np.tile(np.repeat(np.arange(gh), gw), t_eff)
w_idx = np.tile(np.arange(gw), t_eff * gh)
coords = list(zip(time_idx, h_idx, w_idx))
return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords]
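
A hedged end-to-end sketch of DataProcessor; the checkpoint directories and image URL are placeholders, and the message schema follows the process method above:

processor = DataProcessor(
tokenizer_name="./ernie_vl_tokenizer",                    # placeholder checkpoint dirs
image_preprocessor_name="./adaptive_image_preprocessor",
)
processor.eval()  # eval mode appends the "Assistant: " prefix instead of emitting labels
messages = [
{"role": "user", "content": [
{"type": "text", "text": "Describe this picture."},
{"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},  # placeholder URL
]},
]
outputs = processor.process(messages)
# input_ids / token_type_ids / position_ids stay aligned; each image contributes
# (patches_h * patches_w) // spatial_conv_size**2 copies of image_patch_id.
assert len(outputs["input_ids"]) == len(outputs["token_type_ids"]) == len(outputs["position_ids"])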

View File

@@ -0,0 +1,201 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import io
import os
import random
import numpy as np
from PIL import Image
from .utils.io_utils import EXTRACTED_FRAME_DIR, get_downloadable, get_filename
from .utils.video_utils import VideoReaderWrapper
from fastdeploy.utils import data_processor_logger
def read_video_decord(video_path, save_to_disk):
"""get reader and meta by decord"""
data_in_mem = False
# video_path = get_downloadable(video_path, save_to_disk=save_to_disk)
if isinstance(video_path, VideoReaderWrapper):
data_in_mem = True
video_reader = video_path
else:
if isinstance(video_path, bytes):
video_path = io.BytesIO(video_path)
video_reader = VideoReaderWrapper(video_path, num_threads=1)
vlen = len(video_reader)
fps = video_reader.get_avg_fps()
duration = vlen / float(fps)
video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen}
return video_reader, video_meta, video_path
def get_frame_indices(
vlen,
target_frames=-1,
target_fps=-1,
frames_sample="middle",
fix_start=None,
input_fps=-1,
):
"""
Compute the frame indices to sample.
"""
assert frames_sample in ["rand", "middle", "leading"]
if target_frames > 0:
assert target_fps <= 0, "target_fps must be negative if target_frames is given."
if target_frames > vlen:
acc_samples = vlen
data_processor_logger.info(
f"target_frames={target_frames} is larger than video length {vlen}, "
f"will sample {acc_samples} frames."
)
else:
acc_samples = target_frames
data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}")
# split the video into `acc_samples` intervals, and sample from each interval.
intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
ranges = []
for idx, interv in enumerate(intervals[:-1]):
ranges.append((interv, intervals[idx + 1] - 1))
if frames_sample == "rand":
try:
frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
except Exception as e:
frame_indices = np.random.permutation(vlen)[:acc_samples]
frame_indices.sort()
frame_indices = list(frame_indices)
elif fix_start is not None:
frame_indices = [x[0] + fix_start for x in ranges]
elif frames_sample == "leading":
frame_indices = [x[0] for x in ranges]
elif frames_sample == "middle":
frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
else:
raise NotImplementedError
elif target_fps > 0:
assert target_frames <= 0, "target_frames must be negative if target_fps is given."
assert input_fps > 0, "input_fps must be provided if target_fps is given."
data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}")
duration = float(vlen) / input_fps
delta = 1 / target_fps # gap between frames, this is also the clip length each frame represents
if frames_sample == "middle":
frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
elif frames_sample == "leading":
frame_seconds = np.arange(0, duration, delta)
if frames_sample == "rand":
frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5
frame_seconds += rand_offset * delta
frame_indices = np.around(frame_seconds * input_fps).astype(int)
frame_indices = [e for e in frame_indices if e < vlen]
else:
raise ValueError("Must provide either positive target_fps or positive target_frames.")
return frame_indices
def read_frames_decord(
video_path,
video_reader,
video_meta,
target_frames=-1,
target_fps=-1,
frames_sample="middle",
fix_start=None,
save_to_disk=False,
cache_dir=EXTRACTED_FRAME_DIR,
frame_indices=None,
tol=10,
):
"""get frames by decord"""
if frame_indices is None:
frame_indices = get_frame_indices(
video_meta["num_of_frame"],
target_frames=target_frames,
target_fps=target_fps,
frames_sample=frames_sample,
fix_start=fix_start,
input_fps=video_meta["fps"],
)
frames = []
for frame_indice_index in range(0, len(frame_indices)):
frame_indice = frame_indices[frame_indice_index]
try:
frames.append(video_reader[frame_indice].asnumpy())  # each decoded frame is (H, W, C)
except Exception as e:
data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}")
previous_counter = 1
later_counter = 1
previous_after_flag = True
if frame_indice == 0 or frame_indice == len(video_reader) - 1:
cur_tol = tol * 2
else:
cur_tol = tol
while previous_counter < cur_tol or later_counter < cur_tol:
if previous_after_flag:
if frame_indice - previous_counter < 0:
previous_counter += 1
previous_after_flag = not previous_after_flag
continue
try:
frames.append(video_reader[frame_indice - previous_counter].asnumpy())
data_processor_logger.info(f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame")
frame_indices[frame_indice_index] = frame_indice - previous_counter
break
except Exception as e:
previous_counter += 1
else:
if frame_indice + later_counter >= len(video_reader):
later_counter += 1
previous_after_flag = not previous_after_flag
continue
try:
frames.append(video_reader[frame_indice + later_counter].asnumpy())
data_processor_logger.info(f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame")
frame_indices[frame_indice_index] = frame_indice + later_counter
break
except Exception as e:
later_counter += 1
previous_after_flag = not previous_after_flag
frames = np.stack(frames, axis=0)
assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}"
ret = []
url_sha1 = get_filename()
for idx, frame in enumerate(frames):
tmp = Image.fromarray(frame, "RGB")
if save_to_disk:
save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png")
if not os.path.exists(os.path.dirname(save_path)):
os.makedirs(os.path.dirname(save_path))
tmp.save(save_path)
tmp = save_path
ret.append(tmp)
time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices]
return ret, frame_indices, time_stamps
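
A quick illustration of the two sampling modes in get_frame_indices for a 300-frame clip at 30 fps; expected values are computed from the code above:

idx = get_frame_indices(vlen=300, target_fps=2, frames_sample="middle", input_fps=30)
assert len(idx) == 20 and idx[0] == 8  # samples at t = 0.25s, 0.75s, ...; round(7.5) -> 8
idx = get_frame_indices(vlen=300, target_frames=4, frames_sample="leading")
assert list(idx) == [0, 75, 150, 225]  # four equal intervals, leading frame of each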

View File

@@ -0,0 +1,19 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .tokenizer_vl import ErnieVLTokenizer
__all__ = ['ErnieVLTokenizer']

View File

@@ -0,0 +1,348 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""
ErnieVLTokenizer
"""
import os
import re
from shutil import copyfile
from typing import Dict, List, Optional, Tuple
import numpy as np
import paddle
import sentencepiece as spm
from paddlenlp.transformers import PretrainedTokenizer
from paddlenlp.transformers.tokenizer_utils_base import (
PaddingStrategy,
TextInput,
)
from paddlenlp.utils.log import logger
class ErnieVLTokenizer(PretrainedTokenizer):
"""doc"""
resource_files_names = {
"vocab_file": "tokenizer.model",
}
pretrained_resource_files_map = {"vocab_file": {"ernie-bot-10b": None}}
pretrained_init_configuration = {
"ernie-bot-10b": {},
}
model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
padding_side = "right"
def __init__(
self,
vocab_file,
bos_token="<s>",
cls_token="<cls>",
eos_token="</s>",
mask_token="<mask:0>",
pad_token="<pad>",
sep_token="<sep>",
unk_token="<unk>",
additional_special_tokens=None,
**kwargs,
):
"""doc"""
if additional_special_tokens is None:
additional_special_tokens = ["<mask:1>", "<mask:7>"]
super().__init__(
bos_token=bos_token,
cls_token=cls_token,
eos_token=eos_token,
mask_token=mask_token,
pad_token=pad_token,
sep_token=sep_token,
unk_token=unk_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
@property
def space_token(self):
"""doc"""
return "<mask:1>"
@property
def space_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:1>")
@property
def gend_token(self):
"""doc"""
return "<mask:7>"
@property
def gend_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:7>")
@property
def im_start_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_start|>")
@property
def im_end_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_end|>")
@property
def vocab_size(self):
"""doc"""
return self.sp_model.vocab_size()
def get_vocab(self):
"""doc"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""doc"""
return self.sp_model.encode_as_pieces(text)
def _convert_token_to_id(self, token):
"""doc"""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, id):
"""doc"""
return self.sp_model.id_to_piece(id)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
# prev_is_special = False
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
# if not prev_is_special:
# out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
# prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
# prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string # .strip()
def prepare_for_model(self, *args, **kwargs):
"""doc"""
if "add_special_tokens" in kwargs:
kwargs.pop("add_special_tokens")
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def tokenize(self, text: TextInput, **kwargs) -> List[str]:
"""
Converts a string in a sequence of tokens, using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
(BPE/SentencePieces/WordPieces). Takes care of added tokens.
Args:
text (`str`):
The sequence to be encoded.
**kwargs (additional keyword arguments):
Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
Returns:
`List[str]`: The list of tokens.
"""
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
# TODO: should this be in the base class?
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
escaped_special_toks = [
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
no_split_token = set(self.unique_no_split_tokens)
tokens = self.tokens_trie.split(text)
tokenized_text = []
for token in tokens:
# Need to skip eventual empty (fully stripped) tokens
if not token:
continue
if token in no_split_token:
tokenized_text.append(token)
else:
tokenized_text.extend(self._tokenize(token))
return tokenized_text
def _decode(self, *args, **kwargs):
"""doc"""
kwargs.pop("clean_up_tokenization_spaces", None)
kwargs.pop("spaces_between_special_tokens", None)
return super()._decode(
*args,
**kwargs,
clean_up_tokenization_spaces=False,
spaces_between_special_tokens=False,
)
def _pad(
self,
encoded_inputs: Dict,
max_length: Optional[int] = None,
padding_strategy=PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""doc"""
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
if return_attention_mask:
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None:
attention_mask = encoded_inputs.pop("attention_mask")
if isinstance(attention_mask, paddle.Tensor):
attention_mask = attention_mask.numpy()
elif isinstance(attention_mask, list):
attention_mask = np.array(attention_mask)
elif not isinstance(attention_mask, np.ndarray):
raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ")
else:
attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64))
attention_mask = np.expand_dims(attention_mask, axis=0)
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
if attention_mask.ndim == 1:
pad_width = [(0, difference)]
else:
pad_width = [(0, 0), (0, difference), (0, difference)]
elif self.padding_side == "left":
if attention_mask.ndim == 1:
pad_width = [(difference, 0)]
else:
pad_width = [(0, 0), (difference, 0), (difference, 0)]
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
attention_mask = np.pad(
attention_mask,
pad_width=pad_width,
mode="constant",
constant_values=0,
)
encoded_inputs = super()._pad(
encoded_inputs,
max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=False,
)
if return_attention_mask:
encoded_inputs["attention_mask"] = attention_mask.tolist()
return encoded_inputs
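# --- Illustrative sketch: the 2D causal mask built by `_pad` above ---
# A minimal numpy-only reproduction of the mask construction, assuming right
# padding from sequence length 4 to max length 6. Not used by the tokenizer;
# kept purely as documentation of the padding behaviour.
def _causal_mask_demo(seq_len: int = 4, max_len: int = 6):
    """Build the padded lower-triangular attention mask the way `_pad` does."""
    mask = np.tril(np.ones((seq_len, seq_len), dtype=np.int64))
    mask = np.expand_dims(mask, axis=0)  # shape (1, seq_len, seq_len)
    difference = max_len - seq_len
    # right padding: pad the last two axes with zeros
    mask = np.pad(mask, pad_width=[(0, 0), (0, difference), (0, difference)],
                  mode="constant", constant_values=0)
    assert mask.shape == (1, max_len, max_len)
    return mask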
def add_special_tokens(
tokenizer,
special_tokens_info,
use_ocr_specialtoken=False,
use_crop_specialtoken=False,
special_token_ids_start=254208,
special_token_ids_end=256256,
):
"""
增加 special token
placeholder [<|IMAGE_PLACEHOLDER|>, <|AUDIO_PLACEHOLDER|>, <|VIDEO_PLACEHOLDER|>] 共3个
模态起始截止 special tokens [<|BOI|> <|EOI|> <|BOA|> <|EOA|> <|BOV|> <|EOV|>]
ocr special tokens [<|LOC_0|> <|LOC_1|> ... <|LOC_1000|>] 共1001个
crop special tokens [<|CROP_COL_SEP|>, <|CROP_ROW_SEP|>, <|CROP_IMAGE_SEP|>] 共3个
<|CROP_COL_SEP|> for col 维度切 图片width替换原明文逗号
<|CROP_ROW_SEP|> for row 维度切 图片height替换原明文回车
<|CROP_IMAGE_SEP|> for 区分原图和crop图 图片width替换原明文两个回车
共2048个 unsed token
Args:
tokenizer (ErnieTokenizer): tokenizer
special_token_ids_start (int, optional): special token 起点 ids. Defaults to 254208.
special_token_ids_end (int, optional): 词表最多支持大小. Defaults to 256256.
"""
special_tokens = [
special_tokens_info["image_placeholder"],
special_tokens_info["audio_placeholder"],
]
if use_ocr_specialtoken:
special_tokens.extend(special_tokens_info["ocr_coor"])
special_tokens.extend(special_tokens_info["ocr_begin_end"])
if use_crop_specialtoken:
special_tokens.extend(special_tokens_info["crop"])
# add special_tokens
additional_special_tokens = {"additional_special_tokens": special_tokens}
tokenizer.add_special_tokens(additional_special_tokens)
# check
first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]
assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
    assert (
        len(tokenizer.get_vocab()) < special_token_ids_end
    ), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end}, too many special tokens added!"

Binary file not shown.

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,253 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import base64
import datetime
import hashlib
import io
import os
import threading
import uuid
from pathlib import Path
import numpy as np
import requests
from PIL import Image
from PIL.ExifTags import TAGS
RAW_VIDEO_DIR = "./download_tmp/raw_video/"
RAW_IMAGE_DIR = "./download_tmp/raw_images/"
EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
TMP_DIR = "./download_tmp/upload_tmp/"
def file_download(url, download_dir, save_to_disk=False, retry=0, retry_interval=3):
"""
Description: 下载url如果url是PIL直接返回
Args:
url(str, PIL): http/本地路径/io.Bytes注意io.Bytes是图片字节流
download_path: 在save_to_disk=True的情况下生效返回保存地址
save_to_disk: 是否保存在本地路径
"""
from .video_utils import VideoReaderWrapper
if isinstance(url, Image.Image):
return url
elif isinstance(url, VideoReaderWrapper):
return url
elif url.startswith("http"):
response = requests.get(url)
bytes_data = response.content
elif os.path.isfile(url):
if save_to_disk:
return url
bytes_data = open(url, "rb").read()
else:
bytes_data = base64.b64decode(url)
if not save_to_disk:
return bytes_data
download_path = os.path.join(download_dir, get_filename(url))
Path(download_path).parent.mkdir(parents=True, exist_ok=True)
with open(download_path, "wb") as f:
f.write(bytes_data)
return download_path
def get_filename(url=None):
"""
Get Filename
"""
if url is None:
return str(uuid.uuid4()).replace("-", "")
t = datetime.datetime.now()
if not isinstance(url, bytes):
url = url.encode("utf-8")
md5_hash = hashlib.md5(url).hexdigest()
pid = os.getpid()
tid = threading.get_ident()
    # drop the extension to avoid errors when saving as jpg
    image_filename = f"{t.year}-{t.month:02d}-{t.day:02d}-{pid}-{tid}-{md5_hash}"
    return image_filename
def get_downloadable(url, download_dir=RAW_VIDEO_DIR, save_to_disk=False, retry=0, retry_interval=3):
"""download video and store it in the disk
return downloaded **path** if save_to_disk is set to true
return downloaded **bytes** if save_to_disk is set to false
"""
if not os.path.exists(download_dir):
os.makedirs(download_dir)
downloaded_path = file_download(
url,
download_dir,
save_to_disk=save_to_disk,
retry=retry,
retry_interval=retry_interval,
)
return downloaded_path
def get_downloadable_image(download_path, need_exif_info, retry_max_time=0, retry_interval=3):
"""
带上exif info和图像处理的get downloadable
"""
def get_image_exif(image):
exif_data = image._getexif()
exif_info = {}
if exif_data is not None:
for tag, value in exif_data.items():
tag_name = TAGS.get(tag, tag)
exif_info[tag_name] = value.strip()
return exif_info
def has_transparent_background(img):
"""判断图片是否有背景"""
if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info):
# Check for any pixel with alpha channel less than 255 (fully opaque)
alpha = img.convert("RGBA").split()[-1]
if alpha.getextrema()[0] < 255:
return True
return False
def add_white_background(img):
"""
给透明背景的图,加个白色背景
"""
if img.mode != "RGBA":
img = img.convert("RGBA")
        # create a white background image with the same size as the original
img_white_background = Image.new("RGBA", img.size, (255, 255, 255))
        # paste the original image onto the white background
img_white_background.paste(img, (0, 0), img)
return img_white_background
def change_I16_to_L(img):
"""
将图片从I;16模式转换为L模式
"""
# 由于I模式的point函数只支持加减乘所以下面的* (1 / 256)不能改成除法
return img.point(lambda i: i * (1 / 256)).convert("L")
image = get_downloadable(download_path, save_to_disk=False, retry=retry_max_time, retry_interval=retry_interval)
if isinstance(image, Image.Image):
pil_image = image
else:
pil_image = Image.open(io.BytesIO(image))
if need_exif_info:
try:
exif_info = get_image_exif(pil_image)
        except Exception:
exif_info = {}
else:
exif_info = {}
try:
if pil_image.mode == "I;16":
pil_image = change_I16_to_L(pil_image)
if has_transparent_background(pil_image):
pil_image = add_white_background(pil_image)
except Exception as e:
pass
return pil_image.convert("RGB"), exif_info
def str2hash(url):
"""
从一个str的到url
"""
return hashlib.sha256(url.encode()).hexdigest()
def pil2hash(pil):
"""
从一个PIL.Image到hash
"""
byte_io = io.BytesIO()
pil.save(byte_io, format="PNG") # 选择无损格式,避免压缩影响
image_bytes = byte_io.getvalue()
return hashlib.sha256(image_bytes).hexdigest()
def imagepath_to_base64(image_path):
"""imagepath_to_base64"""
image = Image.open(image_path).convert("RGB")
buffer = io.BytesIO()
image.save(buffer, format="JPEG")
image_bytes = buffer.getvalue()
base64_encoded = base64.b64encode(image_bytes).decode("utf-8")
return base64_encoded
def pil_image_to_base64(image):
"""pil_image_to_base64"""
buffer = io.BytesIO()
image.save(buffer, format="JPEG")
image_bytes = buffer.getvalue()
base64_encoded = base64.b64encode(image_bytes).decode("utf-8")
return base64_encoded
def http_to_pil_image(url):
"""http_to_pil_image"""
response = requests.get(url)
image_data = io.BytesIO(response.content)
pil_image = Image.open(image_data).convert("RGB")
return pil_image
def http_to_image_base64(url):
"""http_to_image_base64"""
response = requests.get(url)
image_data = io.BytesIO(response.content)
return base64.b64encode(image_data.getvalue()).decode("utf-8")
def base64_to_pil_image(base64_string):
""" " base64_to_pil_image"""
image_bytes = base64.b64decode(base64_string)
buffer = io.BytesIO(image_bytes)
image = Image.open(buffer)
return image
def get_hashable(to_be_hashed):
"""get hashable"""
if isinstance(to_be_hashed, bytes):
return to_be_hashed
elif isinstance(to_be_hashed, Image.Image):
return to_be_hashed.tobytes()
elif isinstance(to_be_hashed, str):
return to_be_hashed.encode("utf-8")
else:
raise ValueError(f"not support type: {type(to_be_hashed)}")
def load_dict_from_npz(npzfile):
"""从npz文件读取数据"""
with np.load(npzfile, allow_pickle=True) as data:
loaded_dict = {key: data[key] for key in data.files}
return loaded_dict
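# --- Usage sketch (illustrative) ---
# Round-trip a dict of arrays through .npz with the helper above.
if __name__ == "__main__":
    os.makedirs(TMP_DIR, exist_ok=True)
    demo_path = os.path.join(TMP_DIR, "demo.npz")
    np.savez(demo_path, a=np.arange(3), b=np.ones((2, 2)))
    restored = load_dict_from_npz(demo_path)
    print({key: value.shape for key, value in restored.items()})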

View File

@@ -0,0 +1,96 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
cur_directory = Path(__file__).parent.absolute()
FONT_PATH = os.path.join(cur_directory, "Roboto-Regular.ttf")
def render_single_image_with_timestamp(image: Image.Image, number: str, rate: float, font_path: str = FONT_PATH):
    """
    Render a timestamp string onto a PIL image.
    The font size is min(width, height) * rate; the text is black with a white
    outline whose width is 10% of the font size.
    Returns an Image object.
    """
    draw = ImageDraw.Draw(image)  # create a drawing handle
    width, height = image.size  # image size
    font_size = int(min(width, height) * rate)  # font size
    outline_size = int(font_size * 0.1)  # outline width
    font = ImageFont.truetype(font_path, font_size)  # load the font file at the chosen size
    x = 0
    y = 0  # x and y coordinates of the text
    # draw the timestamp in black with a white outline
draw.text((x, y), number, font=font, fill=(0, 0, 0), stroke_width=outline_size, stroke_fill=(255, 255, 255))
return image
def timestamp_converting(time_stamp_in_seconds):
    """
    convert timestamp format from seconds to hr:min:sec
    """
    hours, remainder = divmod(time_stamp_in_seconds, 3600)
    mins, secs = divmod(remainder, 60)
    return f"{int(hours):02d}:{int(mins):02d}:{secs:05.2f}"
def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration):
"""
function: get the timestamp of a frame, 在均匀抽帧时用。
num_frames: 总帧数
frameid_list: 被抽帧的帧的索引
duration: 视频的总时长
return: timestamp; xx:xx:xx (str)
"""
time_stamp = duration * 1.0 * frame_id / num_frames
return time_stamp
def render_frame_timestamp(frame, timestamp, font_rate=0.1):
"""
函数功能, 给frame, 按照顺序将 index 渲染上去
逻辑思路: 把index渲染到图片的左上方
frame: 帧PIL.Image object
timestamp: 时间戳,单位是秒
font_rate: 字体大小占 min(wi, hei)的比率
"""
time_stamp = "time: " + timestamp_converting(timestamp)
new_frame = render_single_image_with_timestamp(frame, time_stamp, font_rate)
return new_frame
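# --- Usage sketch (illustrative; assumes the bundled Roboto-Regular.ttf is present) ---
# Render "time: 00:01:05.50" onto a synthetic gray 256x256 frame.
if __name__ == "__main__":
    frame = Image.new("RGB", (256, 256), (128, 128, 128))
    stamped = render_frame_timestamp(frame, timestamp=65.5, font_rate=0.1)
    stamped.save("frame_with_timestamp.jpg")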

View File

@@ -0,0 +1,83 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import io
import os
from tempfile import NamedTemporaryFile as ntf
import decord
try:
    # moviepy 1.x
    import moviepy.editor as mp
except ImportError:
    # moviepy 2.x
    import moviepy as mp
def is_gif(data: bytes) -> bool:
"""
check if a bytes is a gif based on the magic head
"""
return data[:6] in (b"GIF87a", b"GIF89a")
class VideoReaderWrapper(decord.VideoReader):
"""
Solving memory leak bug
https://github.com/dmlc/decord/issues/208
"""
def __init__(self, video_path, *args, **kwargs):
with ntf(delete=True, suffix=".gif") as gif_file:
gif_input = None
self.original_file = None
if isinstance(video_path, str):
self.original_file = video_path
if video_path.lower().endswith(".gif"):
gif_input = video_path
elif isinstance(video_path, bytes):
if is_gif(video_path):
gif_file.write(video_path)
gif_input = gif_file.name
elif isinstance(video_path, io.BytesIO):
video_path.seek(0)
tmp_bytes = video_path.read()
video_path.seek(0)
if is_gif(tmp_bytes):
gif_file.write(tmp_bytes)
gif_input = gif_file.name
if gif_input is not None:
clip = mp.VideoFileClip(gif_input)
mp4_file = ntf(delete=False, suffix=".mp4")
clip.write_videofile(mp4_file.name, verbose=False, logger=None)
clip.close()
video_path = mp4_file.name
self.original_file = video_path
super().__init__(video_path, *args, **kwargs)
self.seek(0)
def __getitem__(self, key):
frames = super().__getitem__(key)
self.seek(0)
return frames
def __del__(self):
if self.original_file and os.path.exists(self.original_file):
os.remove(self.original_file)
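# --- Usage sketch (illustrative; "sample.mp4" is a placeholder path) ---
# VideoReaderWrapper behaves like decord.VideoReader but seeks back to frame 0
# after each read. Note that __del__ above removes `original_file`, so pass a
# disposable copy rather than an original you want to keep.
if __name__ == "__main__":
    vr = VideoReaderWrapper("sample.mp4", num_threads=1)
    print(len(vr), vr[0].shape)  # frame count and (H, W, C) of the first frame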

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,127 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import base64
from io import BytesIO
from pathlib import Path
import numpy as np
import numpy.typing as npt
from .base import MediaIO, MultiModalPlugin
from .inputs import AudioItem, ModalityData, MultiModalKwargs
# TODO: multi-modal data processing
# try:
# import librosa
# except ImportError:
# librosa = PlaceholderModule("librosa") # type: ignore[assignment]
# try:
# import soundfile
# except ImportError:
# soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
def resample_audio(
audio: npt.NDArray[np.floating],
*,
orig_sr: float,
target_sr: float,
) -> npt.NDArray[np.floating]:
"""
将音频数据从原始采样率(`orig_sr`)重采样到目标采样率(`target_sr`)。
Args:
audio (npt.NDArray[np.floating]): 带有单通道浮点型音频数据的 numpy ndarray形状为 `(samples,)`。
orig_sr (float): 音频数据的原始采样率。
target_sr (float): 需要转换到的目标采样率。
Returns:
npt.NDArray[np.floating]: 带有单通道浮点型音频数据的 numpy ndarray形状为 `(samples,)`,已经被重采样到目标采样率。
Raises:
None.
"""
import librosa
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
"""
加载字节数据,返回音频信号和采样率。
参数:
data (bytes) - 字节数据,包含音频文件的内容。
返回值tuple
(array, float) - 第一个元素是一个numpy数组表示音频信号第二个元素是一个浮点数表示采样率。
如果解码失败,则返回 None。
"""
import librosa
return librosa.load(BytesIO(data), sr=None)
def load_base64(
self,
media_type: str,
data: str,
) -> tuple[npt.NDArray, float]:
"""
将 base64 编码的字符串转换为 numpy 数组和尺度。
Args:
media_type (str): 媒体类型,例如 'image/jpeg''image/png' 等。
data (str): base64 编码的字符串,表示图像或其他二进制数据。
Returns:
tuple[npt.NDArray, float]: 包含以下两个元素:
- npt.NDArray: 形状为HWC的 numpy 数组,表示图像或其他二进制数据。
- float: 图像的尺度,单位为像素。
Raises:
ValueError: 当 media_type 不是有效的媒体类型时引发。
"""
return self.load_bytes(base64.b64decode(data))
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
"""
加载音频文件,返回音频数据和采样率。
参数:
filepath (Path): 音频文件路径Path类型
返回值:
tuple[npt.NDArray, float]包含两个元素的元组第一个是音频数据npt.NDArray类型
第二个是采样率float类型
"""
import librosa
return librosa.load(filepath, sr=None)
def encode_base64(self, media: tuple[npt.NDArray, float]) -> str:
"""
将音频数据和采样率转换为Base64编码的字符串。
参数:
media (tuple[numpy.ndarray, float]): 包含音频数据和采样率的元组其中音频数据是一个numpy数组采样率是一个浮点数。
返回值 (str): Base64编码的字符串表示音频数据和采样率。
"""
audio, sr = media
with BytesIO() as buffer:
import soundfile
soundfile.write(buffer, audio, sr, format="WAV")
data = buffer.getvalue()
return base64.b64encode(data).decode('utf-8')
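# --- Usage sketch (illustrative; requires librosa and soundfile) ---
# Round-trip one second of a 440 Hz sine tone through base64 with AudioMediaIO.
if __name__ == "__main__":
    sample_rate = 16000
    tone = np.sin(2 * np.pi * 440 * np.arange(sample_rate) / sample_rate).astype(np.float32)
    audio_io = AudioMediaIO()
    encoded = audio_io.encode_base64((tone, sample_rate))
    decoded, decoded_sr = audio_io.load_base64("audio/wav", encoded)
    print(decoded.shape, decoded_sr)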

View File

@@ -0,0 +1,69 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import ABC, abstractmethod
from collections import defaultdict
from collections.abc import Sequence
from pathlib import Path
from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple,
Optional, TypeVar, Union)
_T = TypeVar("_T")
class MediaIO(ABC, Generic[_T]):
@abstractmethod
def load_bytes(self, data: bytes) -> _T:
"""
将字节数据加载为对象,并返回该对象。
如果加载失败,则抛出异常。
Args:
data (bytes): 要加载的字节数据。
Raises:
NotImplementedError: 当前类未实现此方法。
Returns:
_T: 加载后的对象。
"""
raise NotImplementedError
@abstractmethod
def load_base64(self, media_type: str, data: str) -> _T:
"""
List of media types:
https://www.iana.org/assignments/media-types/media-types.xhtml
"""
raise NotImplementedError
@abstractmethod
def load_file(self, filepath: Path) -> _T:
"""
加载文件,返回解析后的数据。
Args:
filepath (Path): 文件路径,必须是一个绝对路径。
Raises:
NotImplementedError: 当前方法未被实现。
Returns:
_T: 任意类型,表示解析后的数据。
"""
raise NotImplementedError

View File

@@ -0,0 +1,145 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import base64
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional
import requests
from PIL import Image
from .base import MediaIO
class ImageMediaIO(MediaIO[Image.Image]):
def __init__(self, *, image_mode: str = "RGB") -> None:
"""
Initializes the object.
Args:
image_mode (str, optional): The mode of the image, defaults to "RGB". Should be one of "L", "LA", "P",
"RGB", "RGBA", "CMYK", or "YCbCr".
Raises:
ValueError: If `image_mode` is not a valid mode.
Returns:
None: This method does not return anything. It initializes the object with the given parameters.
"""
super().__init__()
self.image_mode = image_mode
def load_bytes(self, data: bytes) -> Image.Image:
"""
将字节数据转换为图像对象,并返回。
该方法会自动调用Image.open和Image.load方法以及convert方法将图像转换为指定模式默认为RGB
Args:
data (bytes): 包含图像数据的字节对象。
Returns:
Image.Image: 一个包含了原始图像数据的Image对象已经被转换为指定模式。
Raises:
无。
"""
image = Image.open(BytesIO(data))
image.load()
return image.convert(self.image_mode)
def load_base64(self, media_type: str, data: str) -> Image.Image:
"""
将 base64 编码的字符串转换为图片对象。
Args:
media_type (str): 媒体类型,例如 "image/jpeg"
data (str): base64 编码的字符串数据。
Returns:
Image.Image: PIL 中的图片对象。
Raises:
无。
"""
return self.load_bytes(base64.b64decode(data))
def load_file(self, filepath: Path) -> Image.Image:
"""
加载文件,并转换为指定模式。
如果文件不存在或无法打开将抛出FileNotFoundError异常。
Args:
filepath (Path): 文件路径Pathlib.Path对象
Returns:
Image.Image: 返回一个Image.Image对象表示已经加载和转换的图像。
Raises:
FileNotFoundError: 当文件不存在时抛出此异常。
"""
image = Image.open(filepath)
image.load()
return image.convert(self.image_mode)
def load_file_request(self, request: Any) -> Image.Image:
"""
从请求中加载图像文件并返回一个PIL Image对象。
该函数需要传入一个包含图像URL的字符串或者可迭代对象如requests库的Response对象
该函数会自动处理图像的格式和大小并将其转换为指定的模式默认为RGB
Args:
request (Any): 包含图像URL的字符串或者可迭代对象如requests库的Response对象
Returns:
Image.Image: PIL Image对象表示已经加载并转换好的图像。
Raises:
无。
"""
image = Image.open(requests.get(request, stream=True).raw)
image.load()
return image.convert(self.image_mode)
def encode_base64(
self,
media: Image.Image,
*,
image_format: str = "JPEG",
) -> str:
"""
将图像转换为Base64编码的字符串。
Args:
media (Image.Image): 待处理的图像对象支持PIL库中的Image类型。
image_format (str, optional): 指定图像格式,默认为"JPEG"。可选项包括:"PNG", "JPEG", "BMP", "TIFF"等。
PIL库中的所有图片格式都可以使用但是不建议使用"PPM""XBM"格式因为这两种格式在Python3中已经被弃用了。
Returns:
str: Base64编码后的字符串可以直接作为HTML或者JSON数据传输。
Raises:
None
"""
image = media
with BytesIO() as buffer:
image = image.convert(self.image_mode)
image.save(buffer, image_format)
data = buffer.getvalue()
return base64.b64encode(data).decode('utf-8')
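# --- Usage sketch (illustrative) ---
# Round-trip a small solid-color image through base64 with ImageMediaIO.
if __name__ == "__main__":
    image_io = ImageMediaIO(image_mode="RGB")
    img = Image.new("RGB", (8, 8), (255, 0, 0))
    encoded = image_io.encode_base64(img, image_format="JPEG")
    restored = image_io.load_base64("image/jpeg", encoded)
    print(restored.size, restored.mode)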

View File

@@ -0,0 +1,192 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import base64
import io
import os
import random
import socket
from urllib.parse import urlparse
import ipaddress
import requests
from PIL import Image, ImageOps
from fastdeploy.utils import data_processor_logger
import pyheif
from pdf2image import convert_from_path
import cairosvg
import subprocess
import tempfile
import mimetypes
def process_image_data(image_data, mime_type, url):
"""处理不同类型的图像数据并返回 PIL 图像对象"""
if mime_type in ['image/heif', 'image/heic'] or url.lower().endswith('.heif') or url.lower().endswith('.heic'):
heif_file = pyheif.read(image_data)
pil_image = Image.frombytes(
heif_file.mode, heif_file.size, heif_file.data,
"raw", heif_file.mode, heif_file.stride
)
elif mime_type == 'application/pdf' or url.lower().endswith('.pdf'):
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
temp_pdf.write(image_data.getvalue())
temp_pdf_path = temp_pdf.name
images = convert_from_path(temp_pdf_path)
pil_image = images[0]
os.remove(temp_pdf_path)
elif mime_type == 'image/svg+xml' or url.lower().endswith('.svg'):
png_data = cairosvg.svg2png(bytestring=image_data.getvalue())
pil_image = Image.open(io.BytesIO(png_data))
elif mime_type in ['application/postscript', 'application/illustrator'] or url.lower().endswith('.ai'):
with tempfile.NamedTemporaryFile(delete=False, suffix='.ai') as ai_temp, tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as pdf_temp:
ai_temp_path = ai_temp.name
pdf_temp_path = pdf_temp.name
ai_temp.write(image_data.getvalue())
ai_temp.close()
subprocess.run(['inkscape', ai_temp_path, '--export-pdf=' + pdf_temp_path], check=True)
images = convert_from_path(pdf_temp_path)
pil_image = images[0]
os.remove(ai_temp_path)
os.remove(pdf_temp_path)
elif mime_type == 'image/gif' or url.lower().endswith('.gif'):
pil_image = Image.open(image_data)
else:
pil_image = Image.open(image_data)
return pil_image
def http_to_pil_image(url):
"""http_to_pil_image"""
if is_public_url(url) and int(os.getenv("DOWNLOAD_WITH_TP_SERVER", "0")):
return http_to_pil_image_with_tp_server(url)
response = requests.get(url)
if response.status_code != 200:
raise Exception("Failed to download the image from URL.")
image_data = io.BytesIO(response.content)
mime_type = response.headers.get('Content-Type')
if mime_type is None:
mime_type, _ = mimetypes.guess_type(url)
data_processor_logger.info(f"Detected MIME type: {mime_type}") # 调试信息
pil_image = process_image_data(image_data, mime_type, url)
return pil_image
def http_to_pil_image_with_tp_server(url, retry_time=6):
"""cnap平台没有外网访问权限需要使用tp服务下载图片"""
proxies = [{"http": "http://10.229.197.142:8807"}, {"http": "http://10.229.197.161:8804"},
{"http": "http://10.229.198.143:8804"}, {"http": "http://10.122.108.164:8807"},
{"http": "http://10.122.108.165:8807"}, {"http": "http://10.122.108.166:8807"},
{"http": "http://10.122.108.168:8801"}, {"http": "http://10.122.150.146:8802"},
{"http": "http://10.122.150.158:8802"}, {"http": "http://10.122.150.164:8801"},
{"http": "http://10.143.51.38:8813"}, {"http": "http://10.143.103.42:8810"},
{"http": "http://10.143.194.45:8804"}, {"http": "http://10.143.226.25:8801"},
{"http": "http://10.143.236.12:8807"}, {"http": "http://10.143.238.36:8807"},
{"http": "http://10.144.71.30:8807"}, {"http": "http://10.144.73.16:8804"},
{"http": "http://10.144.138.36:8801"}, {"http": "http://10.144.152.40:8810"},
{"http": "http://10.144.199.29:8810"}, {"http": "http://10.144.251.29:8813"},
]
headers = {
"X-Tp-Authorization": "Basic RVJOSUVMaXRlVjpFUk5JRUxpdGVWXzFxYXo0cmZ2M2VkYzV0Z2Iyd3N4LWJmZS10cA==",
"scheme": "https"
}
new_url = url.replace("https://", "http://") if url.startswith("https://") else url
    # proxies may be unstable, so retry a few times
for idx in range(retry_time):
try:
response = requests.get(new_url, headers=headers, proxies=random.choice(proxies))
if response.status_code == 200:
image_data = io.BytesIO(response.content)
mime_type = response.headers.get('Content-Type')
if mime_type is None:
mime_type, _ = mimetypes.guess_type(url)
data_processor_logger.info(f"Detected MIME type: {mime_type}") # 调试信息
pil_image = process_image_data(image_data, mime_type, url)
return pil_image
except Exception as e:
data_processor_logger.error(f"Failed to download the image, idx: {idx}, URL: {url}, error: {e}")
raise Exception(f"Failed to download the image from URL: {url}")
def base64_to_pil_image(base64_string):
"""base64_to_pil_image"""
image_bytes = base64.b64decode(base64_string)
buffer = io.BytesIO(image_bytes)
pil_image = Image.open(buffer)
return pil_image
def is_public_url(url):
"""判断是否公网url"""
try:
# 解析URL
parsed_url = urlparse(url)
hostname = parsed_url.hostname
if hostname is None:
return False
# 尝试将域名解析为IP地址
ip_address = socket.gethostbyname(hostname)
# 转换为IP地址对象
ip_obj = ipaddress.ip_address(ip_address)
# 判断是否为私有IP或保留IP地址
if ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_link_local or ip_obj.is_reserved:
return False
else:
return True
except Exception as e:
print(f"Error checking URL: {e}")
return False
def process_transparency(image):
""" process transparency. """
def _is_transparent(image):
# 检查图片是否有alpha通道
if image.mode in ('RGBA', 'LA') or (image.mode == 'P' and 'transparency' in image.info):
# 获取alpha通道
alpha = image.convert('RGBA').split()[-1]
# 如果alpha通道中存在0说明图片有透明部分
if alpha.getextrema()[0] < 255:
return True
return False
def _convert_transparent_paste(image):
width, height = image.size
new_image = Image.new("RGB", (width, height), (255, 255, 255)) # 生成一张白色底图
new_image.paste(image, (0, 0), image)
return new_image
try:
if _is_transparent(image): # Check and fix transparent images
data_processor_logger.info("Image has transparent background, adding white background.")
image = _convert_transparent_paste(image)
    except Exception:
pass
return ImageOps.exif_transpose(image)
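# --- Usage sketch (illustrative) ---
# Flatten a half-transparent red RGBA image onto a white background and apply
# the EXIF orientation fix with process_transparency.
if __name__ == "__main__":
    rgba = Image.new("RGBA", (16, 16), (255, 0, 0, 128))
    flattened = process_transparency(rgba)
    print(flattened.mode, flattened.getpixel((0, 0)))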

View File

@@ -0,0 +1,241 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
import base64
from functools import partial
from io import BytesIO
from pathlib import Path
from typing import Optional
import numpy as np
import numpy.typing as npt
from PIL import Image
from .base import MediaIO
from .image import ImageMediaIO
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
"""
对视频帧进行缩放,将每一帧的大小调整为指定的高度和宽度。
Args:
frames (npt.NDArray, shape=(N, H, W, C)): 包含N个帧的三维数组其中H是高度W是宽度C是通道数。
所有帧都应该具有相同的通道数。
size (tuple[int, int], required): 一个元组,包含两个整数,分别表示目标高度和宽度。
Returns:
npt.NDArray, shape=(N, new_height, new_width, C): 返回一个新的三维数组,其中每一帧已经被缩放到指定的高度和宽度。
新数组的通道数与输入数组相同。
Raises:
None
"""
num_frames, _, _, channels = frames.shape
new_height, new_width = size
resized_frames = np.empty((num_frames, new_height, new_width, channels),
dtype=frames.dtype)
# lazy import cv2 to avoid bothering users who only use text models
import cv2
for i, frame in enumerate(frames):
resized_frame = cv2.resize(frame, (new_width, new_height))
resized_frames[i] = resized_frame
return resized_frames
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
"""
对视频帧进行缩放,将每个帧的高度和宽度都乘以一个因子。
Args:
frames (npt.NDArray): 形状为THWC的四维numpy数组表示T个帧高度为H宽度为W通道数为C。
size_factor (float): 用于缩放视频帧的因子新的高度和宽度将分别是原来的高度和宽度的size_factor倍。
Returns:
npt.NDArray: 形状为Tnew_Hnew_WC的四维numpy数组表示T个帧高度为new_H宽度为new_W通道数为C。
其中new_H和new_W是根据size_factor计算出来的。
Raises:
None
"""
_, height, width, _ = frames.shape
new_height = int(height * size_factor)
new_width = int(width * size_factor)
return resize_video(frames, (new_height, new_width))
def sample_frames_from_video(frames: npt.NDArray,
num_frames: int) -> npt.NDArray:
"""
从视频中随机选取指定数量的帧并返回一个包含这些帧的numpy数组。
Args:
frames (npt.NDArray): 形状为THWC的ndarray表示视频的所有帧其中T是帧的总数H、W是每个帧的高度和宽度C是通道数。
num_frames (int, optional): 要从视频中选取的帧数。如果设置为-1则将返回所有帧。默认为-1。
Returns:
npt.NDArray: 形状为num_framesHWC的ndarray表示选取的帧。如果num_frames=-1则返回原始的frames。
"""
total_frames = frames.shape[0]
if num_frames == -1:
return frames
frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
sampled_frames = frames[frame_indices, ...]
return sampled_frames
class VideoMediaIO(MediaIO[npt.NDArray]):
def __init__(
self,
image_io: ImageMediaIO,
*,
num_frames: int = 32,
) -> None:
"""
初始化一个 VideoMediaIO 对象。
Args:
image_io (ImageMediaIO): 用于读取和写入图像的 ImageMediaIO 对象。
num_frames (int, optional): 视频中帧数,默认为 32。
ImageMediaIO 对象必须支持指定帧数。
Raises:
TypeError: 如果 image_io 不是 ImageMediaIO 类型。
ValueError: 如果 num_frames 小于等于 0。
Returns:
None: 无返回值,直接初始化并设置属性。
"""
super().__init__()
self.image_io = image_io
self.num_frames = num_frames
def load_bytes(self, data: bytes) -> npt.NDArray:
"""
从字节数据加载视频帧,并返回一个 numpy ndarray。
如果字节数据中的视频帧数量大于指定的 `num_frames`,则将其平均分布到这些帧上;否则,返回所有帧。
Args:
data (bytes): 包含视频帧数据的字节对象。
Returns:
npt.NDArray, shape=(num_frames, height, width, channels): 返回一个 numpy ndarray其中包含了视频帧数据。
如果 `num_frames` 小于视频帧数量,则返回前 `num_frames` 帧;否则,返回所有帧。
Raises:
None.
"""
import decord
vr = decord.VideoReader(BytesIO(data), num_threads=1)
total_frame_num = len(vr)
num_frames = self.num_frames
if total_frame_num > num_frames:
uniform_sampled_frames = np.linspace(0,
total_frame_num - 1,
num_frames,
dtype=int)
frame_idx = uniform_sampled_frames.tolist()
else:
frame_idx = list(range(0, total_frame_num))
return vr.get_batch(frame_idx).asnumpy()
def load_base64(self, media_type: str, data: str) -> npt.NDArray:
"""
加载 base64 编码的数据,并返回 numpy ndarray。
Args:
media_type (str): 媒体类型,目前仅支持 "video/jpeg"
当为 "video/jpeg" 时,将解析每一帧的 base64 编码数据,并转换成 numpy ndarray。
data (str): base64 编码的字符串数据。
Returns:
npt.NDArray, optional: 如果 media_type 为 "video/jpeg",则返回 numpy ndarray 格式的视频数据;否则返回 None。
Raises:
None.
"""
if media_type.lower() == "video/jpeg":
load_frame = partial(
self.image_io.load_base64,
"image/jpeg",
)
return np.stack([
np.array(load_frame(frame_data))
for frame_data in data.split(",")
])
return self.load_bytes(base64.b64decode(data))
def load_file(self, filepath: Path) -> npt.NDArray:
"""
读取文件内容并将其转换为numpy数组。
Args:
filepath (Path): 文件路径对象,表示要读取的文件。
Returns:
npt.NDArray, optional: 返回一个numpy数组包含了文件内容。如果无法解析文件内容则返回None。
Raises:
无。
"""
with filepath.open("rb") as f:
data = f.read()
return self.load_bytes(data)
def encode_base64(
self,
media: npt.NDArray,
*,
video_format: str = "JPEG",
) -> str:
"""
将视频编码为Base64字符串每一帧都是一个Base64字符串。
如果视频格式为"JPEG"则每一帧都会被转换成JPEG图片并进行编码。
Args:
media (npt.NDArray): 要编码的视频形状为HWC或者THWC其中T为时间步长H和W分别为高度和宽度C为通道数。
当前仅支持JPEG格式。
video_format (str, optional, default="JPEG"): 视频格式,只支持"JPEG"。 Default to "JPEG".
Raises:
NotImplementedError: 当前仅支持JPEG格式。
Returns:
str: Base64字符串每一帧都是一个Base64字符串","连接起来。
"""
video = media
if video_format == "JPEG":
encode_frame = partial(
self.image_io.encode_base64,
image_format=video_format,
)
return ",".join(
encode_frame(Image.fromarray(frame)) for frame in video)
msg = "Only JPEG format is supported for now."
raise NotImplementedError(msg)
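# --- Usage sketch (illustrative) ---
# Encode a tiny synthetic 4-frame video as comma-separated base64 JPEG frames,
# then decode it back with load_base64("video/jpeg", ...). JPEG is lossy, so
# pixel values change, but the shape round-trips.
if __name__ == "__main__":
    video_io = VideoMediaIO(ImageMediaIO(), num_frames=4)
    frames = np.random.randint(0, 256, size=(4, 32, 32, 3), dtype=np.uint8)
    encoded = video_io.encode_base64(frames, video_format="JPEG")
    restored = video_io.load_base64("video/jpeg", encoded)
    print(restored.shape)  # (4, 32, 32, 3)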

View File

@@ -0,0 +1,59 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from fastdeploy.engine.config import ModelConfig
class InputPreprocessor:
"""
Args:
model_name_or_path (str):
Model name or path to the pretrained model. If a model name is provided, it should be a
key in the Hugging Face Transformers' model registry (https://huggingface.co/models).
The model will be downloaded from the Hugging Face model hub if necessary.
If a path is provided, the model will be loaded from that path.
enable_mm (bool, optional):
Whether to use the multi-modal model processor. Defaults to False.
Raises:
ValueError:
If the model name is not found in the Hugging Face Transformers' model registry and the path does not
exist.
"""
def __init__(
self,
model_name_or_path: str,
enable_mm: bool = False,
) -> None:
self.model_name_or_path = model_name_or_path
self.enable_mm = enable_mm
def create_processor(self):
"""
创建数据处理器。如果启用了多模态注册表,则使用该表中的模型;否则,使用传递给构造函数的模型名称或路径。
返回值DataProcessor如果不启用多模态注册表或MultiModalRegistry.Processor如果启用多模态注册表
Args:
无参数。
Returns:
DataProcessor or MultiModalRegistry.Processor (Union[DataProcessor, MultiModalRegistry.Processor]): 数据处理器。
"""
architectures = ModelConfig(self.model_name_or_path).architectures
from fastdeploy.input.text_processor import DataProcessor
self.processor = DataProcessor(model_name_or_path=self.model_name_or_path)
return self.processor
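# --- Usage sketch (illustrative; "path/to/model" is a placeholder) ---
# Build a text DataProcessor through InputPreprocessor.
if __name__ == "__main__":
    preprocessor = InputPreprocessor("path/to/model", enable_mm=False)
    processor = preprocessor.create_processor()
    print(type(processor).__name__)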

View File

@@ -0,0 +1,533 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from abc import ABC, abstractmethod
import numpy as np
from paddlenlp.generation import GenerationConfig
from paddlenlp.transformers import Llama3Tokenizer, LlamaTokenizer
from fastdeploy.utils import data_processor_logger
class BaseDataProcessor(ABC):
"""base class for data processor"""
def __init__(self):
"""
Returns:
None
"""
self.tokenizer = self._load_tokenizer()
self.tokenizer.bos_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.bos_token)
self.tokenizer.cls_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.cls_token)
self.tokenizer.sep_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.sep_token)
self.tokenizer.eos_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.eos_token)
self.tokenizer.mask_token_id = self.tokenizer._convert_token_to_id(
self.tokenizer.mask_token)
        data_processor_logger.info(
            f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, "
            f"cls_token is {self.tokenizer.cls_token}, {self.tokenizer.cls_token_id}, "
            f"sep_token is {self.tokenizer.sep_token}, {self.tokenizer.sep_token_id}, "
            f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}, "
            f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}"
        )
@abstractmethod
def process_request(self, request, **kwargs):
"""
Preprocess the request
Args:
request (Dict): may contain text and messages fields
**kwargs: others
Returns:
bool: Whether preprocessing is successful
str: error message
"""
raise NotImplementedError
@abstractmethod
def process_response(self, response_dict):
"""
Preprocess the response
Args:
response_dict (Dict): response for engine, contain ids fields
Returns:
Dict: response contain text fields
"""
raise NotImplementedError
def text2ids(self, text, max_model_len=None):
"""
text to token ids
Args:
text (str): text
Returns:
List[int]: token ids list
"""
raise NotImplementedError
def messages2ids(self, messages):
"""
Convert multi-turn messages into ID sequences.
Args:
messages (List[List[Dict[str, Any]]]): multi-turn messages.
Returns:
List[int]: ID sequences
"""
raise NotImplementedError
def ids2tokens(self, token_id, task_id=None):
"""
token ids to strings
Args:
token_id (List[int]): token id
task_id (str): task id
Returns:
List[str]: strings
"""
raise NotImplementedError
@abstractmethod
def _load_tokenizer(self):
"""
load tokenizer
Returns:
tokenizer (AutoTokenizer)
"""
raise NotImplementedError
class DataProcessor(BaseDataProcessor):
def __init__(self, model_name_or_path):
"""
        Initializes the DataProcessor object.
Args:
model_name_or_path (str): The name or path of the pre-trained model to be loaded.
Can also be a path to a directory containing the pre-trained model file.
Returns:
None.
Raises:
None.
"""
self.model_name_or_path = model_name_or_path
self._init_config()
self.decode_status = dict()
self.tokenizer = self._load_tokenizer()
data_processor_logger.info(
f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, \
eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
)
from paddlenlp.trl.llm_utils import get_eos_token_id
self.eos_token_ids = get_eos_token_id(self.tokenizer,
self.generation_config)
self.eos_token_id_len = len(self.eos_token_ids)
self.pad_token_id = self.get_pad_id()
self.tokenizer.pad_token_id = self.pad_token_id
def _init_config(self):
"""
初始化配置包括模型名称、使用Hugging Face Tokenizer等。
Args:
无参数,但是会从环境变量中获取一些配置信息。
Returns:
无返回值,直接修改了类的属性。
Raises:
无异常抛出。
"""
self.use_hf_tokenizer = int(os.getenv("USE_HF_TOKENIZER", "0")) == 1
# Generation config
try:
self.generation_config = GenerationConfig.from_pretrained(
self.model_name_or_path)
except Exception as e:
data_processor_logger.warning(
f"Can't find generation config: {e}, so it will not use generation_config field in the model config"
)
self.generation_config = None
def process_request(self, request, max_model_len=None):
"""
Preprocess the request
Args:
request (Dict): may contain text and messages fields
Returns:
bool: Whether preprocessing is successful
str: error message
"""
if request.get("eos_token_ids") is None or len(
request.eos_token_ids) == 0:
request.eos_token_ids = self.eos_token_ids
stop_sequences = request.get("stop", [])
if stop_sequences is not None and len(stop_sequences) != 0:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
request.set("stop_token_ids", stop_seqs)
request.set("stop_seqs_len", stop_seqs_len)
if request.prompt_token_ids is None or len(
request.prompt_token_ids) == 0:
if request.prompt is not None:
request.prompt_token_ids = self.text2ids(
request.prompt, max_model_len, request.raw_request)
elif request.messages is not None:
if self.tokenizer.chat_template is None:
raise ValueError(
"This model does not support chat_template.")
request.prompt_token_ids = self.messages2ids(request.messages)
else:
raise ValueError(
f"The request should have `input_ids`, `text` or `messages`: {request}."
)
if max_model_len is not None and len(
request.prompt_token_ids) > max_model_len:
            request.prompt_token_ids = request.prompt_token_ids[:max_model_len - 1]
return request
def process_request_dict(self, request, max_model_len=None):
"""
Preprocess the request
Args:
request (Dict): may contain text and messages fields
Returns:
bool: Whether preprocessing is successful
str: error message
"""
if not request.get('eos_token_ids'):
request['eos_token_ids'] = self.eos_token_ids
        # handle stop_sequences
stop_sequences = request.get('stop', [])
if stop_sequences:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
request['stop_token_ids'] = stop_seqs
request['stop_seqs_len'] = stop_seqs_len
        # handle prompt_token_ids
if not request.get('prompt_token_ids'):
if 'prompt' in request:
raw_request = request.get('raw_request', True)
request['prompt_token_ids'] = self.text2ids(
request['prompt'], max_model_len, raw_request).tolist()
elif 'messages' in request:
if self.tokenizer.chat_template is None:
raise ValueError(
"This model does not support chat_template.")
request['prompt_token_ids'] = self.messages2ids(
request['messages']).tolist()
else:
raise ValueError(
f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
)
        # truncate prompts that exceed the length limit
if max_model_len is not None and len(
request['prompt_token_ids']) > max_model_len:
request['prompt_token_ids'] = request[
'prompt_token_ids'][:max_model_len - 1]
return request
def process_response(self, response_dict, **kwargs):
"""
Preprocess the response
Args:
response_dict (Dict): response for engine, contain ids fields
Returns:
Dict: response contain text fields
"""
is_end = response_dict.finished
req_id = response_dict.request_id
token_ids = response_dict.outputs.token_ids
response_dict.outputs.text = self.ids2tokens(token_ids, req_id)
response_dict.usage = {
"completion_tokens": response_dict.outputs.index + 1
}
if is_end:
self.clear_request_status(req_id)
            data_processor_logger.debug(
                "Request id: {} has been completed.".format(req_id))
response_dict.outputs.text = self.ids2tokens(token_ids, req_id)
self.clear_request_status(req_id)
return response_dict
def process_response_dict(self, response_dict, stream=True):
"""
Preprocess the response
Args:
response_dict (Dict): response for engine, contain ids fields
Returns:
Dict: response contain text fields
"""
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
token_ids = response_dict["outputs"]["token_ids"]
if is_end:
            data_processor_logger.debug(
                "Request id: {} has been completed.".format(req_id))
full_text = self.clear_request_status(req_id)
if not stream:
response_dict["outputs"]["text"] = full_text
else:
response_dict["outputs"]["text"] = ""
else:
response_dict["outputs"]["text"] = self.ids2tokens(
token_ids, req_id)
return response_dict
def text2ids(self, text, max_model_len, raw_request=True):
"""
text to token ids
Args:
text (str): text
Returns:
List[int]: token ids list
"""
if self.use_hf_tokenizer:
tokens = self.tokenizer(
text,
return_tensors="np",
padding=True,
truncation=True,
)
else:
if not raw_request or self.tokenizer.chat_template is None:
text = [text] if isinstance(text, str) else text
chat_template = False
elif self.tokenizer.chat_template is not None:
text = [text] if isinstance(text, str) else text
text = [
self.tokenizer.apply_chat_template(sentence,
tokenize=False)
for sentence in text
]
chat_template = True
tokens = self.tokenizer(
text,
return_tensors="np",
padding=True,
truncation=True,
max_length=max_model_len,
add_special_tokens=chat_template,
)
return tokens["input_ids"][0]
def messages2ids(self, messages):
"""
Convert multi-turn messages into ID sequences.
Args:
messages (List[List[Dict[str, Any]]]): multi-turn messages.
Returns:
List[int]: ID sequences
"""
message_result = self.tokenizer.apply_chat_template(
messages, return_tensors="pd")
return np.array(message_result["input_ids"][0])
def ids2tokens(self, token_id, task_id):
"""
token ids to strings
Args:
token_ids (List[int]): token ids
task_id (str): task id
Returns:
List[str]: strings
"""
if self.use_hf_tokenizer:
if task_id not in self.decode_status:
                # history token ids & history token strings & previously decoded str
self.decode_status[task_id] = [[], [], ""]
previous_token_ids = self.decode_status[task_id][0]
decode_str = self.tokenizer.batch_decode(
[previous_token_ids + token_id],
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
if isinstance(decode_str, list) and len(decode_str):
new_str = decode_str[0].replace(self.decode_status[task_id][2],
"", 1)
self.decode_status[task_id][1].append(new_str)
self.decode_status[task_id][2] = decode_str[0]
else:
new_str = ""
self.decode_status[task_id][0] += token_id
return new_str
else:
if task_id not in self.decode_status:
# prefix offset & read offset & history token ids & history token strings
self.decode_status[task_id] = [0, 0, [], []]
prefix_offset = self.decode_status[task_id][0]
read_offset = self.decode_status[task_id][1]
previous_token_ids = self.decode_status[task_id][2]
decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
previous_token_ids + token_id, prefix_offset, read_offset)
self.decode_status[task_id][0] = prefix_offset
self.decode_status[task_id][1] = read_offset
self.decode_status[task_id][2] += token_id
self.decode_status[task_id][3].append(decode_str)
return decode_str
def _load_tokenizer(self):
"""
load tokenizer
Returns:
tokenizer (AutoTokenizer)
"""
if self.use_hf_tokenizer:
from transformers import AutoTokenizer
return AutoTokenizer.from_pretrained(self.model_name_or_path,
use_fast=False)
else:
from paddlenlp.transformers import AutoTokenizer
return AutoTokenizer.from_pretrained(self.model_name_or_path,
padding_side="left",
use_fast=True)
def clear_request_status(self, task_id):
"""
clear request status
Args:
task_id (str): task id
Returns:
results_all (str): all token strings
"""
results_all = ""
if task_id in self.decode_status:
if self.use_hf_tokenizer:
results_all = self.decode_status[task_id][2]
else:
results_all = "".join(self.decode_status[task_id][3])
del self.decode_status[task_id]
return results_all
def get_pad_id(self):
"""
get pad_token_id, if not pad_token_id, use eos_token
Returns:
int: pad_token_id
"""
        if isinstance(self.tokenizer,
                      (LlamaTokenizer,
                       Llama3Tokenizer)) and not self.tokenizer.pad_token_id:
            return self.tokenizer.eos_token_id
        return self.tokenizer.pad_token_id
def pad_batch_data(self,
insts,
pad_id=0,
return_seq_len=False,
return_array=True,
pad_style="right"):
"""Pad the instances to the max sequence length in batch."""
if len(insts) == 0:
padded_insts = np.array([[]],
dtype=np.int64) if return_array else [[]]
if return_seq_len:
seq_len = np.array([], dtype=np.int64) if return_array else []
return padded_insts, seq_len
return padded_insts
max_len = max(map(len, insts))
if pad_style == "left":
padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst)
for inst in insts]
else:
padded_insts = [
list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts
]
if return_array:
padded_insts = np.array(padded_insts,
dtype=np.int64).reshape([-1, max_len])
if return_seq_len:
seq_len = [len(inst) for inst in insts]
if return_array:
seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1)
return padded_insts, seq_len
return padded_insts
def update_stop_seq(self, stop_sequences):
"""
Update stop sequences from request.
"""
stop_seqs = []
for seq in stop_sequences:
if seq != self.tokenizer.eos_token_id:
stop_seqs.append(
self.tokenizer.convert_tokens_to_ids(
self.tokenizer.tokenize(seq)))
stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs,
pad_id=-1,
return_seq_len=True,
return_array=False)
data_processor_logger.debug(
f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
return stop_seqs, stop_seqs_len
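# --- Usage sketch (illustrative; "path/to/model" and the token ids are placeholders) ---
# Incremental decoding with DataProcessor: feed token-id chunks under one task
# id; ids2tokens returns only the newly decoded text on each call, and
# clear_request_status releases the per-task state and returns the full text.
if __name__ == "__main__":
    processor = DataProcessor("path/to/model")
    for chunk in ([101], [102, 103]):
        piece = processor.ids2tokens(chunk, task_id="req-0")
        print(piece, end="")
    print()
    print(processor.clear_request_status("req-0"))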