[bug fix] Fix the placeholder in qwen prompt and add some unittests (#4065)

* fix the placeholder in qwen prompt

* fix the placeholder in qwen prompt

* add some unittests for qwen_vl_processor

Author: lddfym
Date: 2025-09-11 20:00:02 +08:00
Committed by: GitHub
Parent: 850465e8ed
Commit: 2056a428bd
2 changed files with 116 additions and 7 deletions


@@ -166,8 +166,8 @@ class DataProcessor:
     }
     # Define placeholders and their lengths
-    IMAGE_PLACEHOLDER = "<|image@placeholder|>"
-    VIDEO_PLACEHOLDER = "<|video@placeholder|>"
+    IMAGE_PLACEHOLDER = "<|image_pad|>"
+    VIDEO_PLACEHOLDER = "<|video_pad|>"
     IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
     VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
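
For reference, `<|image_pad|>` and `<|video_pad|>` are the visual pad tokens that Qwen2.5-VL's chat template actually emits (wrapped in `<|vision_start|>`/`<|vision_end|>`), which is why prompts built with the old `<|image@placeholder|>` strings were never matched. A minimal sanity-check sketch, assuming the same local checkpoint path the unit tests use, confirms each placeholder encodes to a single token id:

from transformers import AutoTokenizer

# Sketch only: assumes the local checkpoint used by the unit tests is available.
tokenizer = AutoTokenizer.from_pretrained("/ModelData/Qwen2.5-VL-7B-Instruct")

for placeholder in ("<|image_pad|>", "<|video_pad|>"):
    token_ids = tokenizer.encode(placeholder, add_special_tokens=False)
    # Each placeholder is an added special token, so it should map to exactly one id.
    print(placeholder, token_ids)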


@@ -111,10 +111,10 @@ class TestQwenVLProcessor(unittest.TestCase):
         }
         limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
-        model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
+        self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
         self.processor = QwenVLProcessor(
             config=config,
-            model_name_or_path=model_name_or_path,
+            model_name_or_path=self.model_name_or_path,
             limit_mm_per_prompt=limit_mm_per_prompt,
             mm_processor_kwargs=mm_processor_kwargs,
             reasoning_parser_obj=None,
@@ -137,7 +137,7 @@ class TestQwenVLProcessor(unittest.TestCase):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
-        prompt = {
+        message = {
             "request_id": "12345",
             "messages": [
                 {
@@ -151,7 +151,7 @@ class TestQwenVLProcessor(unittest.TestCase):
             ],
         }
-        request = Request.from_dict(prompt)
+        request = Request.from_dict(message)
         result = self.processor.process_request(request, 1024 * 100)
         self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
@@ -219,9 +219,11 @@ class TestQwenVLProcessor(unittest.TestCase):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
+        IMAGE_PLACEHOLDER = "<|image_pad|>"
+        VIDEO_PLACEHOLDER = "<|video_pad|>"
         prompt = {
             "request_id": "12345",
-            "prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
+            "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
             "multimodal_data": {
                 "image": [mock_pil_image(10, 2100)],
                 "video": [{"video": b"123", "fps": 5}],
@@ -243,6 +245,113 @@ class TestQwenVLProcessor(unittest.TestCase):
         self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
         self.assertEqual(result.multimodal_inputs["video_cnt"], 1)

+    def test_message_and_prompt(self):
+        """
+        Test consistency between message-based and prompt-based processing
+
+        Validates that processing a request through:
+        1. The message format (with image/video URLs)
+        2. The prompt format (with direct image/video data)
+        produces identical tokenization and multimodal input results.
+
+        Checks:
+        1. Prompt token IDs match between both processing methods
+        2. Grid dimensions (THW) match between both methods
+        3. Position IDs match between both methods
+        """
+        # Create test request in message format
+        request = {
+            "request_id": "12345",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+        result = self.processor.process_request_dict(request, 1024 * 100)
+
+        # Create equivalent request in prompt format
+        prompt = {
+            "request_id": "12345",
+            "prompt": request["text_after_process"],
+            "multimodal_data": {
+                "image": [mock_pil_image(480, 640)],
+                "video": [{"video": b"123"}],
+            },
+        }
+        request2 = Request.from_dict(prompt)
+        result2 = self.processor.process_request(request2, 1024 * 100)
+
+        # Verify both processing methods produce identical results
+        self.assertEqual(result["prompt_token_ids"], result2.prompt_token_ids)
+        self.assertTrue(np.equal(result["multimodal_inputs"]["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
+        self.assertTrue(
+            np.equal(result["multimodal_inputs"]["position_ids"], result2.multimodal_inputs["position_ids"]).all()
+        )
+
+    def test_apply_chat_template(self):
+        """
+        Test the consistency between:
+        1. Directly applying chat template using HuggingFace tokenizer
+        2. Applying chat template through the processor's request processing
+
+        This test verifies that:
+        - The processor correctly handles multimodal messages (image, video, text)
+        - The text_after_process field matches the output from direct tokenizer application
+        - The chat template application preserves the message structure and content
+
+        Test Steps:
+        1. Create sample multimodal messages with image, video and text content
+        2. Apply chat template directly using the tokenizer
+        3. Process the same messages through the processor
+        4. Compare the outputs to ensure consistency
+        """
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
+
+        # Sample multimodal messages containing image, video and text
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                    {"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
+                    {"type": "text", "text": "Describe image and video."},
+                ],
+            }
+        ]
+        # Apply chat template directly using the tokenizer
+        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        # Create equivalent request dictionary
+        request = {
+            "request_id": "12345",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+        # Process request through the processor
+        self.processor.process_request_dict(request, 1024 * 100)
+        prompt2 = request["text_after_process"]
+
+        # Verify both methods produce identical prompt strings
+        self.assertEqual(prompt, prompt2)
+

 if __name__ == "__main__":
     unittest.main()
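
The new cases run like the rest of the module (it ends in `unittest.main()`), but both depend on the local `/ModelData/Qwen2.5-VL-7B-Instruct` checkpoint and on the `demo.jpeg` and `3_frame_video.mp4` assets referenced through `file://` URLs. A minimal runner sketch, with the module name assumed from the test class in the diff:

import unittest

# Sketch only: the module name "test_qwen_vl_processor" is an assumption;
# the tests also need the local checkpoint and the demo media files on disk.
suite = unittest.defaultTestLoader.loadTestsFromNames(
    [
        "test_qwen_vl_processor.TestQwenVLProcessor.test_message_and_prompt",
        "test_qwen_vl_processor.TestQwenVLProcessor.test_apply_chat_template",
    ]
)
unittest.TextTestRunner(verbosity=2).run(suite)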