FastDeploy/tests/input/test_qwen_vl_processor.py

"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import unittest
from unittest.mock import MagicMock, patch

import numpy as np
from PIL import Image

from fastdeploy.engine.request import Request
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor


def mock_pil_image(height, width):
    """
    Generate a mock random RGB image.

    Args:
        height: Image height in pixels
        width: Image width in pixels

    Returns:
        PIL.Image object with random RGB data
    """
    rgb_image = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
    return Image.fromarray(rgb_image)
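
# Illustrative usage (not executed by the suite): mock_pil_image(480, 640)
# yields a PIL.Image with mode "RGB" and size (640, 480), since PIL reports
# size as (width, height).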


def mock_read_frames(height: int, width: int, nums_frame: int, fps: int):
    """
    Generate mock video frames with metadata for testing purposes.

    Creates synthetic video data by generating random RGB frames and
    constructing corresponding metadata to simulate real video processing.

    Args:
        height (int): Height of video frames in pixels
        width (int): Width of video frames in pixels
        nums_frame (int): Number of frames to generate
        fps (int): Frames per second for the mock video

    Returns:
        tuple: A tuple containing:
            frames (numpy.ndarray): Array of shape (nums_frame, height, width, 3)
                containing randomly generated RGB frames
            meta (dict): Dictionary with video metadata:
                - fps (int): Frames per second (same as input)
                - duration (float): Calculated duration in seconds (nums_frame / fps)
                - num_of_frame (int): Number of frames (same as nums_frame input)
    """
    frames = []
    for _ in range(nums_frame):
        frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
        frames.append(frame)
    frames = np.stack(frames, axis=0)

    meta = {
        "fps": fps,
        "duration": nums_frame / fps,
        "num_of_frame": nums_frame,
    }
    return frames, meta
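
# A quick sketch of the mock's contract (illustrative only, not executed by
# the suite): five frames at 2 fps report a duration of 5 / 2 = 2.5 seconds.
#
#   frames, meta = mock_read_frames(480, 640, nums_frame=5, fps=2)
#   frames.shape == (5, 480, 640, 3)
#   meta == {"fps": 2, "duration": 2.5, "num_of_frame": 5}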


class TestQwenVLProcessor(unittest.TestCase):
    """
    Unit tests for Qwen Vision-Language Processor functionality
    """

    def setUp(self):
        """
        Initialize test case with:
        - Mock configuration
        - Patched message parsing and video processing methods
        - QwenVLProcessor instance with test parameters
        """
        config = MagicMock()
        config.vision_config.tokens_per_second = 2

        self.patcher_parse_image = patch(
            "fastdeploy.entrypoints.chat_utils.MultiModalPartParser.parse_image",
            return_value=mock_pil_image(480, 640),
        )
        self.patcher_parse_image.start()

        self.patcher_parse_video = patch(
            "fastdeploy.entrypoints.chat_utils.MultiModalPartParser.parse_video",
            return_value=b"123",
        )
        self.patcher_parse_video.start()

        self.patcher_read_frames = patch(
            "fastdeploy.input.qwen_vl_processor.process.read_frames",
            return_value=mock_read_frames(480, 640, 5, 2),
        )
        self.patcher_read_frames.start()

        mm_processor_kwargs = {
            "video_max_frames": 10,
            "video_min_frames": 1,
        }
        limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
        model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"

        self.processor = QwenVLProcessor(
            config=config,
            model_name_or_path=model_name_or_path,
            limit_mm_per_prompt=limit_mm_per_prompt,
            mm_processor_kwargs=mm_processor_kwargs,
            reasoning_parser_obj=None,
            tool_parser_obj=None,
        )

    def tearDown(self) -> None:
        """Clean up test case by stopping all mock patches"""
        self.patcher_read_frames.stop()
        self.patcher_parse_image.stop()
        self.patcher_parse_video.stop()
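
    # Note: the patches installed in setUp intercept all image and video I/O,
    # so the "file://demo.jpeg" and "file://3_frame_video.mp4" URLs in the
    # tests below are placeholders that never touch the filesystem:
    # parse_image returns a random 480x640 image, parse_video returns dummy
    # bytes, and read_frames returns five synthetic 480x640 frames at 2 fps.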

    def test_process_request(self):
        """
        Test processing of a Request object with multimodal input

        Validates:
        1. Token ID lengths match position_ids and token_type_ids shapes
        2. Image processing produces expected output dimensions
        3. Video processing produces expected output dimensions
        4. Correct counts for images (1) and videos (1)
        """
        prompt = {
            "request_id": "12345",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
                        {"type": "text", "text": "Describe image and video."},
                    ],
                }
            ],
        }
        request = Request.from_dict(prompt)

        result = self.processor.process_request(request, 1024 * 100)

        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
        self.assertEqual(
            result.multimodal_inputs["images"].shape[0],
            sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
        )
        self.assertEqual(
            result.multimodal_inputs["image_type_ids"].shape[0],
            result.multimodal_inputs["grid_thw"][:, 0].sum(),
        )
        self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
        self.assertEqual(result.multimodal_inputs["video_cnt"], 1)

    def test_process_request_dict(self):
        """
        Test processing of a dictionary-format request with multimodal input

        Validates:
        1. Token ID lengths match position_ids and token_type_ids shapes
        2. Image processing produces expected output dimensions
        3. Video processing produces expected output dimensions
        4. Correct counts for images (1) and videos (1)
        """
        num_generated_token_ids = 10
        request = {
            "request_id": "12345",
            "metadata": {
                "generated_token_ids": [1] * num_generated_token_ids,
            },
            "stop": ["stop", "eof"],
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
                        {"type": "text", "text": "Describe image and video."},
                    ],
                }
            ],
        }

        result = self.processor.process_request_dict(request, 1024 * 100)

        self.assertEqual(result["prompt_token_ids_len"], result["multimodal_inputs"]["position_ids"].shape[0])
        self.assertEqual(result["prompt_token_ids_len"], result["multimodal_inputs"]["token_type_ids"].shape[0])
        self.assertEqual(
            result["multimodal_inputs"]["images"].shape[0],
            sum(map(lambda x: x.prod(), result["multimodal_inputs"]["grid_thw"])),
        )
        self.assertEqual(
            result["multimodal_inputs"]["image_type_ids"].shape[0],
            result["multimodal_inputs"]["grid_thw"][:, 0].sum(),
        )
        self.assertEqual(result["multimodal_inputs"]["pic_cnt"], 1)
        self.assertEqual(result["multimodal_inputs"]["video_cnt"], 1)

    def test_prompt(self):
        """
        Test processing of a prompt with image and video placeholders

        Validates:
        1. Token ID lengths match position_ids and token_type_ids shapes
        2. Image processing produces expected output dimensions
        3. Video processing produces expected output dimensions
        4. Correct counts for images (1) and videos (1)
        """
        prompt = {
            "request_id": "12345",
            "prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
            "multimodal_data": {
                "image": [mock_pil_image(10, 2100)],
                "video": [{"video": b"123", "fps": 5}],
            },
        }
        request = Request.from_dict(prompt)

        result = self.processor.process_request(request, 1024 * 100)

        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
        self.assertEqual(
            result.multimodal_inputs["images"].shape[0],
            sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
        )
        self.assertEqual(
            result.multimodal_inputs["image_type_ids"].shape[0],
            result.multimodal_inputs["grid_thw"][:, 0].sum(),
        )
        self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
        self.assertEqual(result.multimodal_inputs["video_cnt"], 1)


if __name__ == "__main__":
    unittest.main()