mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
[bug fix] Fix the placeholder in qwen prompt and add some unittests (#4065)
* fix the placeholder in qwen prompt * fix the placeholder in qwen prompt * add some unittests for qwen_vl_processor
This commit is contained in:
@@ -111,10 +111,10 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
}
|
||||
limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
|
||||
|
||||
model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
|
||||
self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
|
||||
self.processor = QwenVLProcessor(
|
||||
config=config,
|
||||
model_name_or_path=model_name_or_path,
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
reasoning_parser_obj=None,
|
||||
@@ -137,7 +137,7 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
3. Video processing produces expected output dimensions
|
||||
4. Correct counts for images (1) and videos (1)
|
||||
"""
|
||||
prompt = {
|
||||
message = {
|
||||
"request_id": "12345",
|
||||
"messages": [
|
||||
{
|
||||
@@ -151,7 +151,7 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
],
|
||||
}
|
||||
|
||||
request = Request.from_dict(prompt)
|
||||
request = Request.from_dict(message)
|
||||
result = self.processor.process_request(request, 1024 * 100)
|
||||
|
||||
self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
|
||||
@@ -219,9 +219,11 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
3. Video processing produces expected output dimensions
|
||||
4. Correct counts for images (1) and videos (1)
|
||||
"""
|
||||
IMAGE_PLACEHOLDER = "<|image_pad|>"
|
||||
VIDEO_PLACEHOLDER = "<|video_pad|>"
|
||||
prompt = {
|
||||
"request_id": "12345",
|
||||
"prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
|
||||
"prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
|
||||
"multimodal_data": {
|
||||
"image": [mock_pil_image(10, 2100)],
|
||||
"video": [{"video": b"123", "fps": 5}],
|
||||
@@ -243,6 +245,113 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
|
||||
self.assertEqual(result.multimodal_inputs["video_cnt"], 1)
|
||||
|
||||
def test_message_and_prompt(self):
|
||||
"""
|
||||
Test consistency between message-based and prompt-based processing
|
||||
|
||||
Validates that processing a request through:
|
||||
1. The message format (with image/video URLs)
|
||||
2. The prompt format (with direct image/video data)
|
||||
produces identical tokenization and multimodal input results.
|
||||
|
||||
Checks:
|
||||
1. Prompt token IDs match between both processing methods
|
||||
2. Grid dimensions (THW) match between both methods
|
||||
3. Position IDs match between both methods
|
||||
"""
|
||||
# Create test request in message format
|
||||
request = {
|
||||
"request_id": "12345",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
|
||||
{"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
|
||||
{"type": "text", "text": "Describe image and video."},
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
result = self.processor.process_request_dict(request, 1024 * 100)
|
||||
|
||||
# Create equivalent request in prompt format
|
||||
prompt = {
|
||||
"request_id": "12345",
|
||||
"prompt": request["text_after_process"],
|
||||
"multimodal_data": {
|
||||
"image": [mock_pil_image(480, 640)],
|
||||
"video": [{"video": b"123"}],
|
||||
},
|
||||
}
|
||||
request2 = Request.from_dict(prompt)
|
||||
result2 = self.processor.process_request(request2, 1024 * 100)
|
||||
|
||||
# Verify both processing methods produce identical results
|
||||
self.assertEqual(result["prompt_token_ids"], result2.prompt_token_ids)
|
||||
self.assertTrue(np.equal(result["multimodal_inputs"]["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
|
||||
self.assertTrue(
|
||||
np.equal(result["multimodal_inputs"]["position_ids"], result2.multimodal_inputs["position_ids"]).all()
|
||||
)
|
||||
|
||||
def test_apply_chat_template(self):
|
||||
"""
|
||||
Test the consistency between:
|
||||
1. Directly applying chat template using HuggingFace tokenizer
|
||||
2. Applying chat template through the processor's request processing
|
||||
|
||||
This test verifies that:
|
||||
- The processor correctly handles multimodal messages (image, video, text)
|
||||
- The text_after_process field matches the output from direct tokenizer application
|
||||
- The chat template application preserves the message structure and content
|
||||
|
||||
Test Steps:
|
||||
1. Create sample multimodal messages with image, video and text content
|
||||
2. Apply chat template directly using the tokenizer
|
||||
3. Process the same messages through the processor
|
||||
4. Compare the outputs to ensure consistency
|
||||
"""
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
|
||||
|
||||
# Sample multimodal messages containing image, video and text
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
|
||||
{"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
|
||||
{"type": "text", "text": "Describe image and video."},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
# Apply chat template directly using the tokenizer
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
|
||||
# Create equivalent request dictionary
|
||||
request = {
|
||||
"request_id": "12345",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
|
||||
{"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
|
||||
{"type": "text", "text": "Describe image and video."},
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# Process request through the processor
|
||||
self.processor.process_request_dict(request, 1024 * 100)
|
||||
prompt2 = request["text_after_process"]
|
||||
|
||||
# Verify both methods produce identical prompt strings
|
||||
self.assertEqual(prompt, prompt2)
|
||||
|
||||
|
||||
# Allow running this test module directly (e.g. `python <this_file>.py`)
# in addition to discovery via `pytest` / `python -m unittest`.
if __name__ == "__main__":
    unittest.main()
|
||||
|
Reference in New Issue
Block a user