mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
[bug fix] Fix the placeholder in qwen prompt and add some unittests (#4065)
* fix the placeholder in qwen prompt * fix the placeholder in qwen prompt * add some unittests for qwen_vl_processor
This commit is contained in:
@@ -166,8 +166,8 @@ class DataProcessor:
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Define placeholders and their lengths
|
# Define placeholders and their lengths
|
||||||
IMAGE_PLACEHOLDER = "<|image@placeholder|>"
|
IMAGE_PLACEHOLDER = "<|image_pad|>"
|
||||||
VIDEO_PLACEHOLDER = "<|video@placeholder|>"
|
VIDEO_PLACEHOLDER = "<|video_pad|>"
|
||||||
IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
|
IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
|
||||||
VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
|
VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
|
||||||
|
|
||||||
|
@@ -111,10 +111,10 @@ class TestQwenVLProcessor(unittest.TestCase):
|
|||||||
}
|
}
|
||||||
limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
|
limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
|
||||||
|
|
||||||
model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
|
self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
|
||||||
self.processor = QwenVLProcessor(
|
self.processor = QwenVLProcessor(
|
||||||
config=config,
|
config=config,
|
||||||
model_name_or_path=model_name_or_path,
|
model_name_or_path=self.model_name_or_path,
|
||||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||||
mm_processor_kwargs=mm_processor_kwargs,
|
mm_processor_kwargs=mm_processor_kwargs,
|
||||||
reasoning_parser_obj=None,
|
reasoning_parser_obj=None,
|
||||||
@@ -137,7 +137,7 @@ class TestQwenVLProcessor(unittest.TestCase):
|
|||||||
3. Video processing produces expected output dimensions
|
3. Video processing produces expected output dimensions
|
||||||
4. Correct counts for images (1) and videos (1)
|
4. Correct counts for images (1) and videos (1)
|
||||||
"""
|
"""
|
||||||
prompt = {
|
message = {
|
||||||
"request_id": "12345",
|
"request_id": "12345",
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{
|
||||||
@@ -151,7 +151,7 @@ class TestQwenVLProcessor(unittest.TestCase):
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
request = Request.from_dict(prompt)
|
request = Request.from_dict(message)
|
||||||
result = self.processor.process_request(request, 1024 * 100)
|
result = self.processor.process_request(request, 1024 * 100)
|
||||||
|
|
||||||
self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
|
self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
|
||||||
@@ -219,9 +219,11 @@ class TestQwenVLProcessor(unittest.TestCase):
|
|||||||
3. Video processing produces expected output dimensions
|
3. Video processing produces expected output dimensions
|
||||||
4. Correct counts for images (1) and videos (1)
|
4. Correct counts for images (1) and videos (1)
|
||||||
"""
|
"""
|
||||||
|
IMAGE_PLACEHOLDER = "<|image_pad|>"
|
||||||
|
VIDEO_PLACEHOLDER = "<|video_pad|>"
|
||||||
prompt = {
|
prompt = {
|
||||||
"request_id": "12345",
|
"request_id": "12345",
|
||||||
"prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
|
"prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
|
||||||
"multimodal_data": {
|
"multimodal_data": {
|
||||||
"image": [mock_pil_image(10, 2100)],
|
"image": [mock_pil_image(10, 2100)],
|
||||||
"video": [{"video": b"123", "fps": 5}],
|
"video": [{"video": b"123", "fps": 5}],
|
||||||
@@ -243,6 +245,113 @@ class TestQwenVLProcessor(unittest.TestCase):
|
|||||||
self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
|
self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
|
||||||
self.assertEqual(result.multimodal_inputs["video_cnt"], 1)
|
self.assertEqual(result.multimodal_inputs["video_cnt"], 1)
|
||||||
|
|
||||||
|
def test_message_and_prompt(self):
|
||||||
|
"""
|
||||||
|
Test consistency between message-based and prompt-based processing
|
||||||
|
|
||||||
|
Validates that processing a request through:
|
||||||
|
1. The message format (with image/video URLs)
|
||||||
|
2. The prompt format (with direct image/video data)
|
||||||
|
produces identical tokenization and multimodal input results.
|
||||||
|
|
||||||
|
Checks:
|
||||||
|
1. Prompt token IDs match between both processing methods
|
||||||
|
2. Grid dimensions (THW) match between both methods
|
||||||
|
3. Position IDs match between both methods
|
||||||
|
"""
|
||||||
|
# Create test request in message format
|
||||||
|
request = {
|
||||||
|
"request_id": "12345",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
|
||||||
|
{"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
|
||||||
|
{"type": "text", "text": "Describe image and video."},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
result = self.processor.process_request_dict(request, 1024 * 100)
|
||||||
|
|
||||||
|
# Create equivalent request in prompt format
|
||||||
|
prompt = {
|
||||||
|
"request_id": "12345",
|
||||||
|
"prompt": request["text_after_process"],
|
||||||
|
"multimodal_data": {
|
||||||
|
"image": [mock_pil_image(480, 640)],
|
||||||
|
"video": [{"video": b"123"}],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
request2 = Request.from_dict(prompt)
|
||||||
|
result2 = self.processor.process_request(request2, 1024 * 100)
|
||||||
|
|
||||||
|
# Verify both processing methods produce identical results
|
||||||
|
self.assertEqual(result["prompt_token_ids"], result2.prompt_token_ids)
|
||||||
|
self.assertTrue(np.equal(result["multimodal_inputs"]["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
|
||||||
|
self.assertTrue(
|
||||||
|
np.equal(result["multimodal_inputs"]["position_ids"], result2.multimodal_inputs["position_ids"]).all()
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_apply_chat_template(self):
    """
    Check that the processor's chat-template rendering matches HuggingFace.

    The same multimodal conversation (image + video + text) is rendered:
    1. directly via the tokenizer's ``apply_chat_template``, and
    2. through ``process_request_dict``, which records its rendered
       prompt in the request's ``text_after_process`` field.

    The two prompt strings must be identical.
    """
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)

    # Conversation expressed in the tokenizer's native content schema.
    tokenizer_messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
                {"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
                {"type": "text", "text": "Describe image and video."},
            ],
        }
    ]
    expected_prompt = tokenizer.apply_chat_template(
        tokenizer_messages, tokenize=False, add_generation_prompt=True
    )

    # The same conversation expressed in the processor's request schema.
    request = {
        "request_id": "12345",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
                    {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
                    {"type": "text", "text": "Describe image and video."},
                ],
            }
        ],
    }
    self.processor.process_request_dict(request, 1024 * 100)

    # The processor must have rendered exactly the same prompt string.
    self.assertEqual(expected_prompt, request["text_after_process"])
# Allow the test module to be executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
Reference in New Issue
Block a user