mirror of
				https://github.com/PaddlePaddle/FastDeploy.git
				synced 2025-10-26 18:10:32 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			249 lines
		
	
	
		
			9.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			249 lines
		
	
	
		
			9.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 | |
| #
 | |
| # Licensed under the Apache License, Version 2.0 (the "License");
 | |
| # you may not use this file except in compliance with the License.
 | |
| # You may obtain a copy of the License at
 | |
| #
 | |
| #     http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| # Unless required by applicable law or agreed to in writing, software
 | |
| # distributed under the License is distributed on an "AS IS" BASIS,
 | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| # See the License for the specific language governing permissions and
 | |
| # limitations under the License.
 | |
| """
 | |
| 
 | |
| import unittest
 | |
| from unittest.mock import MagicMock, patch
 | |
| 
 | |
| import numpy as np
 | |
| from PIL import Image
 | |
| 
 | |
| from fastdeploy.engine.request import Request
 | |
| from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
 | |
| 
 | |
| 
 | |
def mock_pil_image(height, width):
    """
    Build a PIL image filled with random RGB noise

    Args:
        height: Number of pixel rows in the image
        width: Number of pixel columns in the image

    Returns:
        PIL.Image created from a random uint8 array of shape (height, width, 3)
    """
    pixel_shape = (height, width, 3)
    # Upper bound 256 is exclusive, so values cover the full uint8 range 0..255.
    noise = np.random.randint(0, 256, pixel_shape, dtype=np.uint8)
    picture = Image.fromarray(noise)
    return picture
 | |
| 
 | |
| 
 | |
def mock_read_frames(height: int, width: int, nums_frame: int, fps: int):
    """
    Build a synthetic video clip together with its metadata for tests

    Each frame is drawn independently as random uint8 RGB data; the frames are
    then stacked along a new leading axis.

    Args:
        height (int): Frame height in pixels
        width (int): Frame width in pixels
        nums_frame (int): How many frames to produce
        fps (int): Frame rate reported in the metadata

    Returns:
        tuple: ``(frames, meta)`` where
            frames (numpy.ndarray): uint8 array of shape (nums_frame, height, width, 3)
            meta (dict): Video metadata with keys
                - fps: the ``fps`` argument, unchanged
                - duration: ``nums_frame / fps``
                - num_of_frame: the ``nums_frame`` argument, unchanged
    """
    frame_shape = (height, width, 3)
    # One randint call per frame, in order, then a single stack on axis 0.
    clip = np.stack(
        [np.random.randint(0, 256, frame_shape, dtype=np.uint8) for _ in range(nums_frame)],
        axis=0,
    )
    return clip, dict(fps=fps, duration=nums_frame / fps, num_of_frame=nums_frame)
 | |
| 
 | |
| 
 | |
class TestQwenVLProcessor(unittest.TestCase):
    """
    Unit tests for Qwen Vision-Language Processor functionality
    """

    def _start_patch(self, target, **kwargs):
        """
        Start a ``patch`` on ``target`` and make sure it is always undone

        The patcher's ``stop`` is registered with ``addCleanup`` right after it
        is started. Cleanups run even when ``setUp`` fails part-way through,
        whereas ``tearDown`` is skipped in that case, so no patch can leak into
        other tests.

        Args:
            target: Dotted import path of the object to patch
            **kwargs: Extra keyword arguments forwarded to ``patch``

        Returns:
            The started patcher object
        """
        patcher = patch(target, **kwargs)
        patcher.start()
        self.addCleanup(patcher.stop)
        return patcher

    def setUp(self):
        """
        Initialize test case with:
        - Mock configuration
        - Patched message parsing and video processing methods
        - QwenVLProcessor instance with test parameters

        All patches are undone automatically through ``addCleanup``; no
        ``tearDown`` override is needed.
        """
        config = MagicMock()
        config.vision_config.tokens_per_second = 2

        # Image parts of a chat message resolve to one fixed random 480x640 image.
        self.patcher_parse_image = self._start_patch(
            "fastdeploy.entrypoints.chat_utils.MultiModalPartParser.parse_image", return_value=mock_pil_image(480, 640)
        )

        # Video parts resolve to placeholder bytes; the frames come from the read_frames patch below.
        self.patcher_parse_video = self._start_patch(
            "fastdeploy.entrypoints.chat_utils.MultiModalPartParser.parse_video", return_value=b"123"
        )

        # Frame reading returns one fixed synthetic clip: 5 frames of 480x640 at 2 fps.
        self.patcher_read_frames = self._start_patch(
            "fastdeploy.input.qwen_vl_processor.process.read_frames", return_value=mock_read_frames(480, 640, 5, 2)
        )

        mm_processor_kwargs = {
            "video_max_frames": 10,
            "video_min_frames": 1,
        }
        limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}

        model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
        self.processor = QwenVLProcessor(
            config=config,
            model_name_or_path=model_name_or_path,
            limit_mm_per_prompt=limit_mm_per_prompt,
            mm_processor_kwargs=mm_processor_kwargs,
            reasoning_parser_obj=None,
            tool_parser_obj=None,
        )

    def _assert_multimodal_outputs(self, prompt_token_ids_len, multimodal_inputs):
        """
        Check the consistency of a processed request holding one image and one video

        Validates:
        1. prompt_token_ids_len equals the first dimension of position_ids and token_type_ids
        2. The first dimension of images equals the sum of the per-row products of grid_thw
        3. The first dimension of image_type_ids equals the sum of the first column of grid_thw
        4. pic_cnt and video_cnt are both 1

        Args:
            prompt_token_ids_len: Prompt token count reported by the processor
            multimodal_inputs: The ``multimodal_inputs`` mapping produced by the processor
        """
        grid_thw = multimodal_inputs["grid_thw"]

        self.assertEqual(prompt_token_ids_len, multimodal_inputs["position_ids"].shape[0])
        self.assertEqual(prompt_token_ids_len, multimodal_inputs["token_type_ids"].shape[0])
        self.assertEqual(
            multimodal_inputs["images"].shape[0],
            sum(thw.prod() for thw in grid_thw),
        )
        self.assertEqual(multimodal_inputs["image_type_ids"].shape[0], grid_thw[:, 0].sum())
        self.assertEqual(multimodal_inputs["pic_cnt"], 1)
        self.assertEqual(multimodal_inputs["video_cnt"], 1)

    def test_process_request(self):
        """
        Test processing of Request object with multimodal input

        Validates:
        1. Token ID lengths match position_ids and token_type_ids shapes
        2. Image processing produces expected output dimensions
        3. Video processing produces expected output dimensions
        4. Correct counts for images (1) and videos (1)
        """
        prompt = {
            "request_id": "12345",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
                        {"type": "text", "text": "Describe image and video."},
                    ],
                }
            ],
        }

        request = Request.from_dict(prompt)
        result = self.processor.process_request(request, 1024 * 100)

        self._assert_multimodal_outputs(result.prompt_token_ids_len, result.multimodal_inputs)

    def test_process_request_dict(self):
        """
        Test processing of dictionary-format request with multimodal input

        The request also carries ``metadata.generated_token_ids`` and ``stop``
        entries alongside the chat messages.

        Validates:
        1. Token ID lengths match position_ids and token_type_ids shapes
        2. Image processing produces expected output dimensions
        3. Video processing produces expected output dimensions
        4. Correct counts for images (1) and videos (1)
        """
        num_generated_token_ids = 10
        request = {
            "request_id": "12345",
            "metadata": {
                "generated_token_ids": [1] * num_generated_token_ids,
            },
            "stop": ["stop", "eof"],
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
                        {"type": "text", "text": "Describe image and video."},
                    ],
                }
            ],
        }

        result = self.processor.process_request_dict(request, 1024 * 100)

        self._assert_multimodal_outputs(result["prompt_token_ids_len"], result["multimodal_inputs"])

    def test_prompt(self):
        """
        Test processing of prompt with image and video placeholders

        Validates:
        1. Token ID lengths match position_ids and token_type_ids shapes
        2. Image processing produces expected output dimensions
        3. Video processing produces expected output dimensions
        4. Correct counts for images (1) and videos (1)
        """
        prompt = {
            "request_id": "12345",
            "prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
            "multimodal_data": {
                "image": [mock_pil_image(10, 2100)],
                "video": [{"video": b"123", "fps": 5}],
            },
        }

        request = Request.from_dict(prompt)
        result = self.processor.process_request(request, 1024 * 100)

        self._assert_multimodal_outputs(result.prompt_token_ids_len, result.multimodal_inputs)
 | |
| 
 | |
| 
 | |
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
 | 
