[bug fix] Fix the placeholder in qwen prompt and add some unittests (#4065)

* fix the placeholder in qwen prompt

* fix the placeholder in qwen prompt

* add some unittests for qwen_vl_processor

Author: lddfym
Date: 2025-09-11 20:00:02 +08:00
Committed by: GitHub
Parent: 850465e8ed
Commit: 2056a428bd
2 changed files with 116 additions and 7 deletions


@@ -166,8 +166,8 @@ class DataProcessor:
     }
     # Define placeholders and their lengths
-    IMAGE_PLACEHOLDER = "<|image@placeholder|>"
-    VIDEO_PLACEHOLDER = "<|video@placeholder|>"
+    IMAGE_PLACEHOLDER = "<|image_pad|>"
+    VIDEO_PLACEHOLDER = "<|video_pad|>"
     IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
     VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
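
For reference, `<|image_pad|>` and `<|video_pad|>` are the visual pad tokens that Qwen2.5-VL's chat template actually emits (wrapped in `<|vision_start|>`/`<|vision_end|>`), which is why prompts built with the old `<|image@placeholder|>` strings were never matched. A minimal sanity-check sketch, assuming the same local checkpoint path the unit tests use, confirms each placeholder encodes to a single token id:

from transformers import AutoTokenizer

# Sketch only: assumes the local checkpoint used by the unit tests is available.
tokenizer = AutoTokenizer.from_pretrained("/ModelData/Qwen2.5-VL-7B-Instruct")

for placeholder in ("<|image_pad|>", "<|video_pad|>"):
    token_ids = tokenizer.encode(placeholder, add_special_tokens=False)
    # Each placeholder is an added special token, so it should map to exactly one id.
    print(placeholder, token_ids)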


@@ -111,10 +111,10 @@ class TestQwenVLProcessor(unittest.TestCase):
         }
         limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
-        model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
+        self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
         self.processor = QwenVLProcessor(
             config=config,
-            model_name_or_path=model_name_or_path,
+            model_name_or_path=self.model_name_or_path,
             limit_mm_per_prompt=limit_mm_per_prompt,
             mm_processor_kwargs=mm_processor_kwargs,
             reasoning_parser_obj=None,
@@ -137,7 +137,7 @@ class TestQwenVLProcessor(unittest.TestCase):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
-        prompt = {
+        message = {
             "request_id": "12345",
             "messages": [
                 {
@@ -151,7 +151,7 @@ class TestQwenVLProcessor(unittest.TestCase):
             ],
         }
-        request = Request.from_dict(prompt)
+        request = Request.from_dict(message)
         result = self.processor.process_request(request, 1024 * 100)
         self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
@@ -219,9 +219,11 @@ class TestQwenVLProcessor(unittest.TestCase):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
+        IMAGE_PLACEHOLDER = "<|image_pad|>"
+        VIDEO_PLACEHOLDER = "<|video_pad|>"
         prompt = {
             "request_id": "12345",
-            "prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
+            "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
             "multimodal_data": {
                 "image": [mock_pil_image(10, 2100)],
                 "video": [{"video": b"123", "fps": 5}],
@@ -243,6 +245,113 @@ class TestQwenVLProcessor(unittest.TestCase):
         self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
         self.assertEqual(result.multimodal_inputs["video_cnt"], 1)

+    def test_message_and_prompt(self):
+        """
+        Test consistency between message-based and prompt-based processing
+
+        Validates that processing a request through:
+        1. The message format (with image/video URLs)
+        2. The prompt format (with direct image/video data)
+        produces identical tokenization and multimodal input results.
+
+        Checks:
+        1. Prompt token IDs match between both processing methods
+        2. Grid dimensions (THW) match between both methods
+        3. Position IDs match between both methods
+        """
+        # Create test request in message format
+        request = {
+            "request_id": "12345",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+        result = self.processor.process_request_dict(request, 1024 * 100)
+
+        # Create equivalent request in prompt format
+        prompt = {
+            "request_id": "12345",
+            "prompt": request["text_after_process"],
+            "multimodal_data": {
+                "image": [mock_pil_image(480, 640)],
+                "video": [{"video": b"123"}],
+            },
+        }
+        request2 = Request.from_dict(prompt)
+        result2 = self.processor.process_request(request2, 1024 * 100)
+
+        # Verify both processing methods produce identical results
+        self.assertEqual(result["prompt_token_ids"], result2.prompt_token_ids)
+        self.assertTrue(np.equal(result["multimodal_inputs"]["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
+        self.assertTrue(
+            np.equal(result["multimodal_inputs"]["position_ids"], result2.multimodal_inputs["position_ids"]).all()
+        )
+
+    def test_apply_chat_template(self):
+        """
+        Test the consistency between:
+        1. Directly applying chat template using HuggingFace tokenizer
+        2. Applying chat template through the processor's request processing
+
+        This test verifies that:
+        - The processor correctly handles multimodal messages (image, video, text)
+        - The text_after_process field matches the output from direct tokenizer application
+        - The chat template application preserves the message structure and content
+
+        Test Steps:
+        1. Create sample multimodal messages with image, video and text content
+        2. Apply chat template directly using the tokenizer
+        3. Process the same messages through the processor
+        4. Compare the outputs to ensure consistency
+        """
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
+
+        # Sample multimodal messages containing image, video and text
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                    {"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
+                    {"type": "text", "text": "Describe image and video."},
+                ],
+            }
+        ]
+        # Apply chat template directly using the tokenizer
+        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        # Create equivalent request dictionary
+        request = {
+            "request_id": "12345",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+        # Process request through the processor
+        self.processor.process_request_dict(request, 1024 * 100)
+        prompt2 = request["text_after_process"]
+
+        # Verify both methods produce identical prompt strings
+        self.assertEqual(prompt, prompt2)
+

 if __name__ == "__main__":
     unittest.main()
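
The new cases run like the rest of the module (it ends in `unittest.main()`), but both depend on the local `/ModelData/Qwen2.5-VL-7B-Instruct` checkpoint and on the `demo.jpeg` and `3_frame_video.mp4` assets referenced through `file://` URLs. A minimal runner sketch, with the module name assumed from the test class in the diff:

import unittest

# Sketch only: the module name "test_qwen_vl_processor" is an assumption;
# the tests also need the local checkpoint and the demo media files on disk.
suite = unittest.defaultTestLoader.loadTestsFromNames(
    [
        "test_qwen_vl_processor.TestQwenVLProcessor.test_message_and_prompt",
        "test_qwen_vl_processor.TestQwenVLProcessor.test_apply_chat_template",
    ]
)
unittest.TextTestRunner(verbosity=2).run(suite)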