mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Iluvatar] Support V1_KVCACHE_SCHEDULER and paddleocr-vl rope mode (#5555)
This commit is contained in:
235
tests/ci_use/iluvatar_UT/bench_gsm8k.py
Normal file
235
tests/ci_use/iluvatar_UT/bench_gsm8k.py
Normal file
@@ -0,0 +1,235 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Fastdeploy + ERNIE-4.5-Turbo 的指标评估"""
|
||||
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
|
||||
import argparse
|
||||
import ast
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
INVALID = -9999999
|
||||
|
||||
|
||||
def call_generate(prompt, **kwargs):
|
||||
"""
|
||||
Generates response based on the input prompt.
|
||||
|
||||
Args:
|
||||
prompt (str): The input prompt text.
|
||||
**kwargs: Keyword arguments, including server IP address and port number.
|
||||
|
||||
Returns:
|
||||
str: The response generated based on the prompt.
|
||||
|
||||
"""
|
||||
url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
"temperature": 0.6,
|
||||
"max_tokens": 2047,
|
||||
"top_p": 0.95,
|
||||
"do_sample": True,
|
||||
}
|
||||
|
||||
response = requests.post(url, headers=headers, data=json.dumps(data))
|
||||
out = response.json()
|
||||
return out["choices"][0]["message"]["content"]
|
||||
|
||||
|
||||
def get_one_example(lines, i, include_answer):
|
||||
"""
|
||||
Retrieves a question-answer example from the given list of text lines.
|
||||
|
||||
Args:
|
||||
lines (list of dict): A list of question-answer pairs.
|
||||
i (int): The index of the question-answer pair to retrieve from lines.
|
||||
include_answer (bool): Whether to include the answer in the returned string.
|
||||
|
||||
Returns:
|
||||
str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".
|
||||
|
||||
"""
|
||||
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
|
||||
if include_answer:
|
||||
ret += " " + lines[i]["answer"]
|
||||
return ret
|
||||
|
||||
|
||||
def get_few_shot_examples(lines, k):
|
||||
"""
|
||||
Selects k examples from the given list of text lines and concatenates them into a single string.
|
||||
|
||||
Args:
|
||||
lines (list): A list containing text lines.
|
||||
k (int): The number of examples to select.
|
||||
|
||||
Returns:
|
||||
str: A string composed of k examples, separated by two newline characters.
|
||||
"""
|
||||
ret = ""
|
||||
for i in range(k):
|
||||
ret += get_one_example(lines, i, True) + "\n\n"
|
||||
return ret
|
||||
|
||||
|
||||
def get_answer_value(answer_str):
|
||||
"""
|
||||
Extracts numerical values from an answer string and returns them.
|
||||
|
||||
Args:
|
||||
answer_str (str): The string containing the answer.
|
||||
|
||||
Returns:
|
||||
The extracted numerical value; returns "INVALID" if extraction fails.
|
||||
"""
|
||||
answer_str = answer_str.replace(",", "")
|
||||
numbers = re.findall(r"\d+", answer_str)
|
||||
if len(numbers) < 1:
|
||||
return INVALID
|
||||
try:
|
||||
return ast.literal_eval(numbers[-1])
|
||||
except SyntaxError:
|
||||
return INVALID
|
||||
|
||||
|
||||
def read_jsonl(filename: str):
|
||||
"""
|
||||
Reads a JSONL file.
|
||||
|
||||
Args:
|
||||
filename (str): Path to the JSONL file.
|
||||
|
||||
Yields:
|
||||
dict: A dictionary object corresponding to each line in the JSONL file.
|
||||
"""
|
||||
with open(filename) as fin:
|
||||
for line in fin:
|
||||
if line.startswith("#"):
|
||||
continue
|
||||
yield json.loads(line)
|
||||
|
||||
|
||||
def main(args):
|
||||
"""
|
||||
Process inputs and generate answers by calling the model in parallel using a thread pool.
|
||||
|
||||
Args:
|
||||
args (argparse.Namespace):
|
||||
- num_questions (int): Number of questions to process.
|
||||
- num_shots (int): Number of few-shot learning examples.
|
||||
- ip (str): IP address of the model service.
|
||||
- port (int): Port number of the model service.
|
||||
- parallel (int): Number of questions to process in parallel.
|
||||
- result_file (str): File path to store the results.
|
||||
|
||||
Returns:
|
||||
None
|
||||
|
||||
"""
|
||||
# Read data
|
||||
filename = "test.jsonl"
|
||||
|
||||
lines = list(read_jsonl(filename))
|
||||
|
||||
# Construct prompts
|
||||
num_questions = args.num_questions
|
||||
num_shots = args.num_shots
|
||||
few_shot_examples = get_few_shot_examples(lines, num_shots)
|
||||
|
||||
questions = []
|
||||
labels = []
|
||||
for i in range(len(lines[:num_questions])):
|
||||
questions.append(get_one_example(lines, i, False))
|
||||
labels.append(get_answer_value(lines[i]["answer"]))
|
||||
assert all(l != INVALID for l in labels)
|
||||
|
||||
states = [None] * len(labels)
|
||||
|
||||
# Use thread pool
|
||||
def get_one_answer(i):
|
||||
answer = call_generate(
|
||||
prompt=few_shot_examples + questions[i],
|
||||
# stop=["Question", "Assistant:", "<|separator|>"],
|
||||
ip=args.ip,
|
||||
port=args.port,
|
||||
)
|
||||
states[i] = answer
|
||||
|
||||
tic = time.time()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(questions))):
|
||||
get_one_answer(i)
|
||||
else:
|
||||
with ThreadPoolExecutor(args.parallel) as executor:
|
||||
list(
|
||||
tqdm(
|
||||
executor.map(get_one_answer, list(range(len(questions)))),
|
||||
total=len(questions),
|
||||
)
|
||||
)
|
||||
|
||||
latency = time.time() - tic
|
||||
preds = []
|
||||
for i in range(len(states)):
|
||||
preds.append(get_answer_value(states[i]))
|
||||
|
||||
# Compute accuracy
|
||||
acc = np.mean(np.array(preds) == np.array(labels))
|
||||
invalid = np.mean(np.array(preds) == INVALID)
|
||||
|
||||
# Print results
|
||||
print(f"Accuracy: {acc:.3f}")
|
||||
print(f"Invalid: {invalid:.3f}")
|
||||
print(f"Latency: {latency:.3f} s")
|
||||
|
||||
with open(args.result_file, "a") as fout:
|
||||
value = {
|
||||
"task": "gsm8k",
|
||||
"backend": "paddlepaddle",
|
||||
"num_gpus": 1,
|
||||
"latency": round(latency, 3),
|
||||
"accuracy": round(acc, 3),
|
||||
"num_requests": args.num_questions,
|
||||
"other": {
|
||||
"num_questions": args.num_questions,
|
||||
"parallel": args.parallel,
|
||||
},
|
||||
}
|
||||
fout.write(json.dumps(value) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--ip", type=str, default="127.0.0.1")
|
||||
parser.add_argument("--port", type=str, default="8188")
|
||||
parser.add_argument("--num-shots", type=int, default=10)
|
||||
parser.add_argument("--data-path", type=str, default="test.jsonl")
|
||||
parser.add_argument("--num-questions", type=int, default=1319)
|
||||
parser.add_argument("--result-file", type=str, default="result.jsonl")
|
||||
parser.add_argument("--parallel", type=int, default=1)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@@ -12,43 +12,16 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
|
||||
from fastdeploy import LLM, SamplingParams
|
||||
from fastdeploy.utils import set_random_seed
|
||||
|
||||
tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
sys.path.insert(0, tests_dir)
|
||||
|
||||
def timeout(seconds):
|
||||
def decorator(func):
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
result = [None]
|
||||
exception = [None]
|
||||
|
||||
def target():
|
||||
try:
|
||||
result[0] = func(*args, **kwargs)
|
||||
except Exception as e:
|
||||
exception[0] = e
|
||||
|
||||
thread = threading.Thread(target=target)
|
||||
thread.daemon = True
|
||||
thread.start()
|
||||
thread.join(seconds)
|
||||
|
||||
if thread.is_alive():
|
||||
raise TimeoutError(f"Function timed out after {seconds} seconds")
|
||||
|
||||
if exception[0]:
|
||||
raise exception[0]
|
||||
|
||||
return result[0]
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
from ci_use.iluvatar_UT.utils import TIMEOUT_MSG, timeout
|
||||
|
||||
|
||||
@timeout(80)
|
||||
@@ -75,15 +48,15 @@ def offline_infer_check():
|
||||
59335,
|
||||
68170,
|
||||
183,
|
||||
49080,
|
||||
94717,
|
||||
82966,
|
||||
99140,
|
||||
31615,
|
||||
51497,
|
||||
94851,
|
||||
60764,
|
||||
10889,
|
||||
97404,
|
||||
100088,
|
||||
36310,
|
||||
95633,
|
||||
95913,
|
||||
41459,
|
||||
95049,
|
||||
94970,
|
||||
96840,
|
||||
2,
|
||||
], f"{outputs[0].outputs.token_ids}"
|
||||
print("PASSED")
|
||||
@@ -94,10 +67,7 @@ if __name__ == "__main__":
|
||||
result = offline_infer_check()
|
||||
sys.exit(0)
|
||||
except TimeoutError:
|
||||
print(
|
||||
"The timeout exit may be due to multiple processes sharing the "
|
||||
"same gpu card. You can check this using ixsmi on the device."
|
||||
)
|
||||
print(TIMEOUT_MSG)
|
||||
sys.exit(124)
|
||||
except Exception:
|
||||
sys.exit(1)
|
||||
|
||||
@@ -12,10 +12,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
import io
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
@@ -24,39 +23,13 @@ from fastdeploy import LLM, SamplingParams
|
||||
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
|
||||
from fastdeploy.utils import set_random_seed
|
||||
|
||||
tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
sys.path.insert(0, tests_dir)
|
||||
|
||||
def timeout(seconds):
|
||||
def decorator(func):
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
result = [None]
|
||||
exception = [None]
|
||||
|
||||
def target():
|
||||
try:
|
||||
result[0] = func(*args, **kwargs)
|
||||
except Exception as e:
|
||||
exception[0] = e
|
||||
|
||||
thread = threading.Thread(target=target)
|
||||
thread.daemon = True
|
||||
thread.start()
|
||||
thread.join(seconds)
|
||||
|
||||
if thread.is_alive():
|
||||
raise TimeoutError(f"Function timed out after {seconds} seconds")
|
||||
|
||||
if exception[0]:
|
||||
raise exception[0]
|
||||
|
||||
return result[0]
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
from ci_use.iluvatar_UT.utils import TIMEOUT_MSG, timeout
|
||||
|
||||
|
||||
@timeout(180)
|
||||
@timeout(210)
|
||||
def offline_infer_check():
|
||||
set_random_seed(123)
|
||||
|
||||
@@ -122,9 +95,9 @@ def offline_infer_check():
|
||||
5119,
|
||||
93956,
|
||||
68725,
|
||||
14449,
|
||||
4356,
|
||||
38225,
|
||||
100282,
|
||||
23,
|
||||
23,
|
||||
2,
|
||||
], f"{outputs[0].outputs.token_ids}"
|
||||
print("PASSED")
|
||||
@@ -135,10 +108,7 @@ if __name__ == "__main__":
|
||||
result = offline_infer_check()
|
||||
sys.exit(0)
|
||||
except TimeoutError:
|
||||
print(
|
||||
"The timeout exit may be due to multiple processes sharing the "
|
||||
"same gpu card. You can check this using ixsmi on the device."
|
||||
)
|
||||
print(TIMEOUT_MSG)
|
||||
sys.exit(124)
|
||||
except Exception:
|
||||
sys.exit(1)
|
||||
|
||||
28
tests/ci_use/iluvatar_UT/utils.py
Normal file
28
tests/ci_use/iluvatar_UT/utils.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import functools
|
||||
import signal
|
||||
|
||||
|
||||
def timeout(seconds):
|
||||
def decorator(func):
|
||||
def _handle_timeout(signum, frame):
|
||||
raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds")
|
||||
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
original_handler = signal.signal(signal.SIGALRM, _handle_timeout)
|
||||
signal.alarm(seconds)
|
||||
|
||||
try:
|
||||
result = func(*args, **kwargs)
|
||||
signal.alarm(0)
|
||||
return result
|
||||
finally:
|
||||
signal.signal(signal.SIGALRM, original_handler)
|
||||
signal.alarm(0)
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
TIMEOUT_MSG = "The timeout exit may be due to multiple processes sharing the same gpu card. You can check this using ixsmi on the device."
|
||||
Reference in New Issue
Block a user