diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py
index 4e81306e7..9b480737a 100644
--- a/fastdeploy/input/qwen_vl_processor/process.py
+++ b/fastdeploy/input/qwen_vl_processor/process.py
@@ -166,8 +166,8 @@ class DataProcessor:
         }
 
         # Define placeholders and their lengths
-        IMAGE_PLACEHOLDER = "<|image@placeholder|>"
-        VIDEO_PLACEHOLDER = "<|video@placeholder|>"
+        IMAGE_PLACEHOLDER = "<|image_pad|>"
+        VIDEO_PLACEHOLDER = "<|video_pad|>"
         IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
         VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
 
diff --git a/tests/input/test_qwen_vl_processor.py b/tests/input/test_qwen_vl_processor.py
index 0dc547ac7..4936bc7a5 100644
--- a/tests/input/test_qwen_vl_processor.py
+++ b/tests/input/test_qwen_vl_processor.py
@@ -111,10 +111,10 @@ class TestQwenVLProcessor(unittest.TestCase):
         }
         limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
 
-        model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
+        self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
         self.processor = QwenVLProcessor(
             config=config,
-            model_name_or_path=model_name_or_path,
+            model_name_or_path=self.model_name_or_path,
             limit_mm_per_prompt=limit_mm_per_prompt,
             mm_processor_kwargs=mm_processor_kwargs,
             reasoning_parser_obj=None,
@@ -137,7 +137,7 @@ class TestQwenVLProcessor(unittest.TestCase):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
-        prompt = {
+        message = {
            "request_id": "12345",
            "messages": [
                {
@@ -151,7 +151,7 @@ class TestQwenVLProcessor(unittest.TestCase):
             ],
         }
 
-        request = Request.from_dict(prompt)
+        request = Request.from_dict(message)
         result = self.processor.process_request(request, 1024 * 100)
 
         self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
@@ -219,9 +219,11 @@ class TestQwenVLProcessor(unittest.TestCase):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
+        IMAGE_PLACEHOLDER = "<|image_pad|>"
+        VIDEO_PLACEHOLDER = "<|video_pad|>"
         prompt = {
             "request_id": "12345",
-            "prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
+            "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
             "multimodal_data": {
                 "image": [mock_pil_image(10, 2100)],
                 "video": [{"video": b"123", "fps": 5}],
@@ -243,6 +245,113 @@ class TestQwenVLProcessor(unittest.TestCase):
         self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
         self.assertEqual(result.multimodal_inputs["video_cnt"], 1)
 
+    def test_message_and_prompt(self):
+        """
+        Test consistency between message-based and prompt-based processing
+
+        Validates that processing a request through:
+        1. The message format (with image/video URLs)
+        2. The prompt format (with direct image/video data)
+        produces identical tokenization and multimodal input results.
+
+        Checks:
+        1. Prompt token IDs match between both processing methods
+        2. Grid dimensions (THW) match between both methods
+        3. Position IDs match between both methods
+        """
+        # Create test request in message format
+        request = {
+            "request_id": "12345",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+        result = self.processor.process_request_dict(request, 1024 * 100)
+
+        # Create equivalent request in prompt format
+        prompt = {
+            "request_id": "12345",
+            "prompt": request["text_after_process"],
+            "multimodal_data": {
+                "image": [mock_pil_image(480, 640)],
+                "video": [{"video": b"123"}],
+            },
+        }
+        request2 = Request.from_dict(prompt)
+        result2 = self.processor.process_request(request2, 1024 * 100)
+
+        # Verify both processing methods produce identical results
+        self.assertEqual(result["prompt_token_ids"], result2.prompt_token_ids)
+        self.assertTrue(np.equal(result["multimodal_inputs"]["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
+        self.assertTrue(
+            np.equal(result["multimodal_inputs"]["position_ids"], result2.multimodal_inputs["position_ids"]).all()
+        )
+
+    def test_apply_chat_template(self):
+        """
+        Test the consistency between:
+        1. Directly applying chat template using HuggingFace tokenizer
+        2. Applying chat template through the processor's request processing
+
+        This test verifies that:
+        - The processor correctly handles multimodal messages (image, video, text)
+        - The text_after_process field matches the output from direct tokenizer application
+        - The chat template application preserves the message structure and content
+
+        Test Steps:
+        1. Create sample multimodal messages with image, video and text content
+        2. Apply chat template directly using the tokenizer
+        3. Process the same messages through the processor
+        4. Compare the outputs to ensure consistency
+        """
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
+
+        # Sample multimodal messages containing image, video and text
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                    {"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
+                    {"type": "text", "text": "Describe image and video."},
+                ],
+            }
+        ]
+
+        # Apply chat template directly using the tokenizer
+        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        # Create equivalent request dictionary
+        request = {
+            "request_id": "12345",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+
+        # Process request through the processor
+        self.processor.process_request_dict(request, 1024 * 100)
+        prompt2 = request["text_after_process"]
+
+        # Verify both methods produce identical prompt strings
+        self.assertEqual(prompt, prompt2)
+
 
 if __name__ == "__main__":
     unittest.main()