mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
[bug fix] Fix the placeholder in qwen prompt and add some unittests (#4065)
* fix the placeholder in qwen prompt * fix the placeholder in qwen prompt * add some unittests for qwen_vl_processor
This commit is contained in:
@@ -111,10 +111,10 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
}
|
||||
limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
|
||||
|
||||
model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
|
||||
self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
|
||||
self.processor = QwenVLProcessor(
|
||||
config=config,
|
||||
model_name_or_path=model_name_or_path,
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
reasoning_parser_obj=None,
|
||||
@@ -137,7 +137,7 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
3. Video processing produces expected output dimensions
|
||||
4. Correct counts for images (1) and videos (1)
|
||||
"""
|
||||
prompt = {
|
||||
message = {
|
||||
"request_id": "12345",
|
||||
"messages": [
|
||||
{
|
||||
@@ -151,7 +151,7 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
],
|
||||
}
|
||||
|
||||
request = Request.from_dict(prompt)
|
||||
request = Request.from_dict(message)
|
||||
result = self.processor.process_request(request, 1024 * 100)
|
||||
|
||||
self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
|
||||
@@ -219,9 +219,11 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
3. Video processing produces expected output dimensions
|
||||
4. Correct counts for images (1) and videos (1)
|
||||
"""
|
||||
IMAGE_PLACEHOLDER = "<|image_pad|>"
|
||||
VIDEO_PLACEHOLDER = "<|video_pad|>"
|
||||
prompt = {
|
||||
"request_id": "12345",
|
||||
"prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
|
||||
"prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
|
||||
"multimodal_data": {
|
||||
"image": [mock_pil_image(10, 2100)],
|
||||
"video": [{"video": b"123", "fps": 5}],
|
||||
@@ -243,6 +245,113 @@ class TestQwenVLProcessor(unittest.TestCase):
|
||||
self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
|
||||
self.assertEqual(result.multimodal_inputs["video_cnt"], 1)
|
||||
|
||||
def test_message_and_prompt(self):
|
||||
"""
|
||||
Test consistency between message-based and prompt-based processing
|
||||
|
||||
Validates that processing a request through:
|
||||
1. The message format (with image/video URLs)
|
||||
2. The prompt format (with direct image/video data)
|
||||
produces identical tokenization and multimodal input results.
|
||||
|
||||
Checks:
|
||||
1. Prompt token IDs match between both processing methods
|
||||
2. Grid dimensions (THW) match between both methods
|
||||
3. Position IDs match between both methods
|
||||
"""
|
||||
# Create test request in message format
|
||||
request = {
|
||||
"request_id": "12345",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
|
||||
{"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
|
||||
{"type": "text", "text": "Describe image and video."},
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
result = self.processor.process_request_dict(request, 1024 * 100)
|
||||
|
||||
# Create equivalent request in prompt format
|
||||
prompt = {
|
||||
"request_id": "12345",
|
||||
"prompt": request["text_after_process"],
|
||||
"multimodal_data": {
|
||||
"image": [mock_pil_image(480, 640)],
|
||||
"video": [{"video": b"123"}],
|
||||
},
|
||||
}
|
||||
request2 = Request.from_dict(prompt)
|
||||
result2 = self.processor.process_request(request2, 1024 * 100)
|
||||
|
||||
# Verify both processing methods produce identical results
|
||||
self.assertEqual(result["prompt_token_ids"], result2.prompt_token_ids)
|
||||
self.assertTrue(np.equal(result["multimodal_inputs"]["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
|
||||
self.assertTrue(
|
||||
np.equal(result["multimodal_inputs"]["position_ids"], result2.multimodal_inputs["position_ids"]).all()
|
||||
)
|
||||
|
||||
def test_apply_chat_template(self):
|
||||
"""
|
||||
Test the consistency between:
|
||||
1. Directly applying chat template using HuggingFace tokenizer
|
||||
2. Applying chat template through the processor's request processing
|
||||
|
||||
This test verifies that:
|
||||
- The processor correctly handles multimodal messages (image, video, text)
|
||||
- The text_after_process field matches the output from direct tokenizer application
|
||||
- The chat template application preserves the message structure and content
|
||||
|
||||
Test Steps:
|
||||
1. Create sample multimodal messages with image, video and text content
|
||||
2. Apply chat template directly using the tokenizer
|
||||
3. Process the same messages through the processor
|
||||
4. Compare the outputs to ensure consistency
|
||||
"""
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
|
||||
|
||||
# Sample multimodal messages containing image, video and text
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
|
||||
{"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
|
||||
{"type": "text", "text": "Describe image and video."},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
# Apply chat template directly using the tokenizer
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
|
||||
# Create equivalent request dictionary
|
||||
request = {
|
||||
"request_id": "12345",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
|
||||
{"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
|
||||
{"type": "text", "text": "Describe image and video."},
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# Process request through the processor
|
||||
self.processor.process_request_dict(request, 1024 * 100)
|
||||
prompt2 = request["text_after_process"]
|
||||
|
||||
# Verify both methods produce identical prompt strings
|
||||
self.assertEqual(prompt, prompt2)
|
||||
|
||||
|
||||
# Allow running this test module directly (e.g. `python <this_file>.py`)
# in addition to discovery via `pytest` / `python -m unittest`.
if __name__ == "__main__":
    unittest.main()
|
||||
|
Reference in New Issue
Block a user