[Iluvatar] Support V1_KVCACHE_SCHEDULER and paddleocr-vl rope mode (#5555)

2025-12-24 13:28:13 +08:00 · 2025-12-18 18:14:25 +08:00
parent 48f3e9797e
commit ac013803f3
24 changed files with 1212 additions and 1090 deletions
--- a/tests/ci_use/iluvatar_UT/bench_gsm8k.py
+++ b/tests/ci_use/iluvatar_UT/bench_gsm8k.py
@@ -0,0 +1,235 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Fastdeploy + ERNIE-4.5-Turbo 的指标评估"""
+# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
+import argparse
+import ast
+import json
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import numpy as np
+import requests
+from tqdm import tqdm
+
+INVALID = -9999999
+
+
+def call_generate(prompt, **kwargs):
+    """
+    Generates response based on the input prompt.
+
+    Args:
+        prompt (str): The input prompt text.
+        **kwargs: Keyword arguments, including server IP address and port number.
+
+    Returns:
+        str: The response generated based on the prompt.
+
+    """
+    url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "messages": [
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
+        "temperature": 0.6,
+        "max_tokens": 2047,
+        "top_p": 0.95,
+        "do_sample": True,
+    }
+
+    response = requests.post(url, headers=headers, data=json.dumps(data))
+    out = response.json()
+    return out["choices"][0]["message"]["content"]
+
+
+def get_one_example(lines, i, include_answer):
+    """
+    Retrieves a question-answer example from the given list of text lines.
+
+    Args:
+        lines (list of dict): A list of question-answer pairs.
+        i (int): The index of the question-answer pair to retrieve from lines.
+        include_answer (bool): Whether to include the answer in the returned string.
+
+    Returns:
+        str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".
+
+    """
+    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
+    if include_answer:
+        ret += " " + lines[i]["answer"]
+    return ret
+
+
+def get_few_shot_examples(lines, k):
+    """
+    Selects k examples from the given list of text lines and concatenates them into a single string.
+
+    Args:
+        lines (list): A list containing text lines.
+        k (int): The number of examples to select.
+
+    Returns:
+        str: A string composed of k examples, separated by two newline characters.
+    """
+    ret = ""
+    for i in range(k):
+        ret += get_one_example(lines, i, True) + "\n\n"
+    return ret
+
+
+def get_answer_value(answer_str):
+    """
+    Extracts numerical values from an answer string and returns them.
+
+    Args:
+        answer_str (str): The string containing the answer.
+
+    Returns:
+        The extracted numerical value; returns "INVALID" if extraction fails.
+    """
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def read_jsonl(filename: str):
+    """
+    Reads a JSONL file.
+
+    Args:
+        filename (str): Path to the JSONL file.
+
+    Yields:
+        dict: A dictionary object corresponding to each line in the JSONL file.
+    """
+    with open(filename) as fin:
+        for line in fin:
+            if line.startswith("#"):
+                continue
+            yield json.loads(line)
+
+
+def main(args):
+    """
+    Process inputs and generate answers by calling the model in parallel using a thread pool.
+
+    Args:
+        args (argparse.Namespace):
+            - num_questions (int): Number of questions to process.
+            - num_shots (int): Number of few-shot learning examples.
+            - ip (str): IP address of the model service.
+            - port (int): Port number of the model service.
+            - parallel (int): Number of questions to process in parallel.
+            - result_file (str): File path to store the results.
+
+    Returns:
+        None
+
+    """
+    # Read data
+    filename = "test.jsonl"
+
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+
+    states = [None] * len(labels)
+
+    # Use thread pool
+    def get_one_answer(i):
+        answer = call_generate(
+            prompt=few_shot_examples + questions[i],
+            # stop=["Question", "Assistant:", "<|separator|>"],
+            ip=args.ip,
+            port=args.port,
+        )
+        states[i] = answer
+
+    tic = time.time()
+    if args.parallel == 1:
+        for i in tqdm(range(len(questions))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(questions)))),
+                    total=len(questions),
+                )
+            )
+
+    latency = time.time() - tic
+    preds = []
+    for i in range(len(states)):
+        preds.append(get_answer_value(states[i]))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Latency: {latency:.3f} s")
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "gsm8k",
+            "backend": "paddlepaddle",
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ip", type=str, default="127.0.0.1")
+    parser.add_argument("--port", type=str, default="8188")
+    parser.add_argument("--num-shots", type=int, default=10)
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=1319)
+    parser.add_argument("--result-file", type=str, default="result.jsonl")
+    parser.add_argument("--parallel", type=int, default=1)
+    args = parser.parse_args()
+    main(args)
--- a/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py
+++ b/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py
@@ -12,43 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import functools
+import os
 import sys
-import threading

 from fastdeploy import LLM, SamplingParams
 from fastdeploy.utils import set_random_seed

+tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+sys.path.insert(0, tests_dir)

-def timeout(seconds):
-    def decorator(func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            result = [None]
-            exception = [None]
-
-            def target():
-                try:
-                    result[0] = func(*args, **kwargs)
-                except Exception as e:
-                    exception[0] = e
-
-            thread = threading.Thread(target=target)
-            thread.daemon = True
-            thread.start()
-            thread.join(seconds)
-
-            if thread.is_alive():
-                raise TimeoutError(f"Function timed out after {seconds} seconds")
-
-            if exception[0]:
-                raise exception[0]
-
-            return result[0]
-
-        return wrapper
-
-    return decorator
+from ci_use.iluvatar_UT.utils import TIMEOUT_MSG, timeout


@timeout(80)
@@ -75,15 +48,15 @@ def offline_infer_check():
        59335,
        68170,
        183,
-        49080,
-        94717,
-        82966,
-        99140,
-        31615,
-        51497,
-        94851,
-        60764,
-        10889,
+        97404,
+        100088,
+        36310,
+        95633,
+        95913,
+        41459,
+        95049,
+        94970,
+        96840,
        2,
    ], f"{outputs[0].outputs.token_ids}"
    print("PASSED")
@@ -94,10 +67,7 @@ if __name__ == "__main__":
        result = offline_infer_check()
        sys.exit(0)
    except TimeoutError:
-        print(
-            "The timeout exit may be due to multiple processes sharing the "
-            "same gpu card. You can check this using ixsmi on the device."
-        )
+        print(TIMEOUT_MSG)
        sys.exit(124)
    except Exception:
        sys.exit(1)
--- a/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py
+++ b/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import functools
 import io
+import os
 import sys
-import threading

 import requests
 from PIL import Image
@@ -24,39 +23,13 @@ from fastdeploy import LLM, SamplingParams
 from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
 from fastdeploy.utils import set_random_seed

+tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+sys.path.insert(0, tests_dir)

-def timeout(seconds):
-    def decorator(func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            result = [None]
-            exception = [None]
-
-            def target():
-                try:
-                    result[0] = func(*args, **kwargs)
-                except Exception as e:
-                    exception[0] = e
-
-            thread = threading.Thread(target=target)
-            thread.daemon = True
-            thread.start()
-            thread.join(seconds)
-
-            if thread.is_alive():
-                raise TimeoutError(f"Function timed out after {seconds} seconds")
-
-            if exception[0]:
-                raise exception[0]
-
-            return result[0]
-
-        return wrapper
-
-    return decorator
+from ci_use.iluvatar_UT.utils import TIMEOUT_MSG, timeout


-@timeout(180)
+@timeout(210)
 def offline_infer_check():
    set_random_seed(123)

@@ -122,9 +95,9 @@ def offline_infer_check():
        5119,
        93956,
        68725,
-        14449,
-        4356,
-        38225,
+        100282,
+        23,
+        23,
        2,
    ], f"{outputs[0].outputs.token_ids}"
    print("PASSED")
@@ -135,10 +108,7 @@ if __name__ == "__main__":
        result = offline_infer_check()
        sys.exit(0)
    except TimeoutError:
-        print(
-            "The timeout exit may be due to multiple processes sharing the "
-            "same gpu card. You can check this using ixsmi on the device."
-        )
+        print(TIMEOUT_MSG)
        sys.exit(124)
    except Exception:
        sys.exit(1)
--- a/tests/ci_use/iluvatar_UT/utils.py
+++ b/tests/ci_use/iluvatar_UT/utils.py
@@ -0,0 +1,28 @@
+import functools
+import signal
+
+
+def timeout(seconds):
+    def decorator(func):
+        def _handle_timeout(signum, frame):
+            raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds")
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            original_handler = signal.signal(signal.SIGALRM, _handle_timeout)
+            signal.alarm(seconds)
+
+            try:
+                result = func(*args, **kwargs)
+                signal.alarm(0)
+                return result
+            finally:
+                signal.signal(signal.SIGALRM, original_handler)
+                signal.alarm(0)
+
+        return wrapper
+
+    return decorator
+
+
+TIMEOUT_MSG = "The timeout exit may be due to multiple processes sharing the same gpu card. You can check this using ixsmi on the device."