Update Unit Test for PaddleOCR-VL (#4802)

* fix paddleocr prefix cache bug

* add test for paddleocr_vl

* disable prefix-caching in ocr

* Fix top_p for rejection sampling

* add test for ocr processor; fix top_p for rejection sampling

---------

Co-authored-by: ming1753 <ideaminghp@163.com>
Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com>
Author: Haonan Luo
Date: 2025-11-04 22:40:15 +08:00
Committed by: GitHub
Parent: 1b61d62ecf
Commit: 2c281e617c
4 changed files with 1160 additions and 14 deletions


@@ -26,6 +26,8 @@ from fastdeploy.utils import data_processor_logger
 
 from .process import DataProcessor
 
+_SAMPLING_EPS = 1e-5
+
 
 class Ernie4_5_VLProcessor(Ernie4_5Processor):
     """The processor class for ERNIE MoE VL models."""
@@ -268,6 +270,9 @@ class Ernie4_5_VLProcessor(Ernie4_5Processor):
             request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
         data_processor_logger.info(f"Processed request {request}")
 
+        if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
+            request["top_p"] = _SAMPLING_EPS
+
         return request
 
     def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
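Both processors receive the same guard (the PaddleOCRVLProcessor hunks follow below): a client-supplied top_p of 0, previously forwarded as-is, can leave the rejection sampler with an empty nucleus, so anything below 1e-5 is clamped up to the epsilon. A minimal standalone sketch of the clamp, assuming nothing beyond the hunks above; the helper name is illustrative:

_SAMPLING_EPS = 1e-5

def clamp_top_p(request: dict) -> dict:
    # A top_p below the epsilon leaves (almost) no probability mass inside
    # the nucleus, which the rejection sampler cannot handle; clamp it up.
    if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
        request["top_p"] = _SAMPLING_EPS
    return request

print(clamp_top_p({"top_p": 0.0}))  # {'top_p': 1e-05}
print(clamp_top_p({"top_p": 0.9}))  # {'top_p': 0.9} -- left untouched
print(clamp_top_p({}))              # {} -- unset top_p stays unset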


@@ -22,6 +22,8 @@ from fastdeploy.utils import data_processor_logger
 
 from .process import DataProcessor
 
+_SAMPLING_EPS = 1e-5
+
 
 class PaddleOCRVLProcessor(TextProcessor):
     """
@@ -61,7 +63,6 @@ class PaddleOCRVLProcessor(TextProcessor):
             tool_parser_obj: Tool parser instance
         """
         super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)
-        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
         processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
 
         self.processor = DataProcessor(
@@ -252,6 +253,9 @@ class PaddleOCRVLProcessor(TextProcessor):
         if request.get("max_tokens") is None:
             request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))  # Ensure at least 1 token
 
+        if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
+            request["top_p"] = _SAMPLING_EPS
+
         return request
 
     def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
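The max_tokens fallback in the last hunk is a plain length budget: whatever part of max_model_len the prompt has not consumed, floored at 1 so even a full-context prompt keeps a token of generation budget. A sketch with illustrative numbers:

max_model_len = 16384            # illustrative; matches the new test config
prompt_token_ids = [0] * 16000   # a prompt that nearly fills the context

request = {"prompt_token_ids": prompt_token_ids, "max_tokens": None}
if request.get("max_tokens") is None:
    # Ensure at least 1 token of output, even when the prompt already
    # fills (or overflows) the context window.
    request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
print(request["max_tokens"])  # 384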


@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import signal
@@ -99,7 +98,6 @@ def setup_and_run_server():
model_path = "./PaddleOCR-VL-0.9B"
log_path = "server.log"
limit_mm_str = json.dumps({"image": 100, "video": 100})
cmd = [
sys.executable,
@@ -109,8 +107,6 @@ def setup_and_run_server():
model_path,
"--port",
str(FD_API_PORT),
"--tensor-parallel-size",
"2",
"--engine-worker-queue-port",
str(FD_ENGINE_QUEUE_PORT),
"--metrics-port",
@@ -119,18 +115,13 @@ def setup_and_run_server():
str(FD_CACHE_QUEUE_PORT),
"--enable-mm",
"--max-model-len",
"32768",
"16384",
"--max-num-batched-tokens",
"384",
"16384",
"--max-num-seqs",
"128",
"--limit-mm-per-prompt",
limit_mm_str,
"--enable-chunked-prefill",
"--kv-cache-ratio",
"0.71",
"--quantization",
"wint4",
"--gpu-memory-utilization",
"0.9",
"--graph-optimization-config",
'{"graph_opt_level":0, "use_cudagraph":true}',
]
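The cmd list above is what the test's setup_and_run_server fixture launches. Below is a plausible sketch of the launch and teardown around it; only model_path, log_path, and the flags come from the diff, while the Popen and process-group handling are assumptions consistent with the test's os and signal imports:

import os
import signal
import subprocess

def run_server(cmd, log_path="server.log"):
    # Assumed launch logic: redirect stdout/stderr to the log file and start
    # the server in its own process group (start_new_session=True) so the
    # whole process tree can be torn down at once.
    with open(log_path, "w") as logfile:
        return subprocess.Popen(cmd, stdout=logfile, stderr=subprocess.STDOUT, start_new_session=True)

def stop_server(proc):
    # SIGTERM the whole group, not just the parent, to reap engine workers.
    os.killpg(os.getpgid(proc.pid), signal.SIGTERM)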

File diff suppressed because it is too large (the new PaddleOCR-VL test file, which accounts for the bulk of the 1160 added lines).