diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py
index 4e81306e7..9b480737a 100644
--- a/fastdeploy/input/qwen_vl_processor/process.py
+++ b/fastdeploy/input/qwen_vl_processor/process.py
@@ -166,8 +166,8 @@ class DataProcessor:
         }
 
         # Define placeholders and their lengths
-        IMAGE_PLACEHOLDER = "<|image@placeholder|>"
-        VIDEO_PLACEHOLDER = "<|video@placeholder|>"
+        IMAGE_PLACEHOLDER = "<|image_pad|>"
+        VIDEO_PLACEHOLDER = "<|video_pad|>"
         IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
         VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
 
diff --git a/tests/input/test_qwen_vl_processor.py b/tests/input/test_qwen_vl_processor.py
index 0dc547ac7..4936bc7a5 100644
--- a/tests/input/test_qwen_vl_processor.py
+++ b/tests/input/test_qwen_vl_processor.py
@@ -111,10 +111,10 @@ class TestQwenVLProcessor(unittest.TestCase):
         }
         limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
 
-        model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
+        self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
         self.processor = QwenVLProcessor(
             config=config,
-            model_name_or_path=model_name_or_path,
+            model_name_or_path=self.model_name_or_path,
             limit_mm_per_prompt=limit_mm_per_prompt,
             mm_processor_kwargs=mm_processor_kwargs,
             reasoning_parser_obj=None,
@@ -137,7 +137,7 @@ class TestQwenVLProcessor(unittest.TestCase):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
-        prompt = {
+        message = {
            "request_id": "12345",
            "messages": [
                {
@@ -151,7 +151,7 @@ class TestQwenVLProcessor(unittest.TestCase):
             ],
         }
 
-        request = Request.from_dict(prompt)
+        request = Request.from_dict(message)
         result = self.processor.process_request(request, 1024 * 100)
 
         self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
@@ -219,9 +219,11 @@ class TestQwenVLProcessor(unittest.TestCase):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
+        IMAGE_PLACEHOLDER = "<|image_pad|>"
+        VIDEO_PLACEHOLDER = "<|video_pad|>"
         prompt = {
             "request_id": "12345",
-            "prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
+            "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
             "multimodal_data": {
                 "image": [mock_pil_image(10, 2100)],
                 "video": [{"video": b"123", "fps": 5}],
@@ -243,6 +245,113 @@ class TestQwenVLProcessor(unittest.TestCase):
         self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
         self.assertEqual(result.multimodal_inputs["video_cnt"], 1)
 
+    def test_message_and_prompt(self):
+        """
+        Test consistency between message-based and prompt-based processing
+
+        Validates that processing a request through:
+        1. The message format (with image/video URLs)
+        2. The prompt format (with direct image/video data)
+        produces identical tokenization and multimodal input results.
+
+        Checks:
+        1. Prompt token IDs match between both processing methods
+        2. Grid dimensions (THW) match between both methods
+        3. Position IDs match between both methods
+        """
+        # Create test request in message format
+        request = {
+            "request_id": "12345",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+        result = self.processor.process_request_dict(request, 1024 * 100)
+
+        # Create equivalent request in prompt format
+        prompt = {
+            "request_id": "12345",
+            "prompt": request["text_after_process"],
+            "multimodal_data": {
+                "image": [mock_pil_image(480, 640)],
+                "video": [{"video": b"123"}],
+            },
+        }
+        request2 = Request.from_dict(prompt)
+        result2 = self.processor.process_request(request2, 1024 * 100)
+
+        # Verify both processing methods produce identical results
+        self.assertEqual(result["prompt_token_ids"], result2.prompt_token_ids)
+        self.assertTrue(np.equal(result["multimodal_inputs"]["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
+        self.assertTrue(
+            np.equal(result["multimodal_inputs"]["position_ids"], result2.multimodal_inputs["position_ids"]).all()
+        )
+
+    def test_apply_chat_template(self):
+        """
+        Test the consistency between:
+        1. Directly applying chat template using HuggingFace tokenizer
+        2. Applying chat template through the processor's request processing
+
+        This test verifies that:
+        - The processor correctly handles multimodal messages (image, video, text)
+        - The text_after_process field matches the output from direct tokenizer application
+        - The chat template application preserves the message structure and content
+
+        Test Steps:
+        1. Create sample multimodal messages with image, video and text content
+        2. Apply chat template directly using the tokenizer
+        3. Process the same messages through the processor
+        4. Compare the outputs to ensure consistency
+        """
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
+
+        # Sample multimodal messages containing image, video and text
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                    {"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
+                    {"type": "text", "text": "Describe image and video."},
+                ],
+            }
+        ]
+
+        # Apply chat template directly using the tokenizer
+        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        # Create equivalent request dictionary
+        request = {
+            "request_id": "12345",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+
+        # Process request through the processor
+        self.processor.process_request_dict(request, 1024 * 100)
+        prompt2 = request["text_after_process"]
+
+        # Verify both methods produce identical prompt strings
+        self.assertEqual(prompt, prompt2)
+
 
 if __name__ == "__main__":
     unittest.main()