[Iluvatar] Support V1_KVCACHE_SCHEDULER and paddleocr-vl rope mode (#5555)

This commit is contained in:
yzwu
2025-12-18 18:14:25 +08:00
committed by GitHub
parent 48f3e9797e
commit ac013803f3
24 changed files with 1212 additions and 1090 deletions

View File

@@ -0,0 +1,235 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fastdeploy + ERNIE-4.5-Turbo 的指标评估"""
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
import argparse
import ast
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import requests
from tqdm import tqdm
INVALID = -9999999
def call_generate(prompt, **kwargs):
"""
Generates response based on the input prompt.
Args:
prompt (str): The input prompt text.
**kwargs: Keyword arguments, including server IP address and port number.
Returns:
str: The response generated based on the prompt.
"""
url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
"messages": [
{
"role": "user",
"content": prompt,
}
],
"temperature": 0.6,
"max_tokens": 2047,
"top_p": 0.95,
"do_sample": True,
}
response = requests.post(url, headers=headers, data=json.dumps(data))
out = response.json()
return out["choices"][0]["message"]["content"]
def get_one_example(lines, i, include_answer):
"""
Retrieves a question-answer example from the given list of text lines.
Args:
lines (list of dict): A list of question-answer pairs.
i (int): The index of the question-answer pair to retrieve from lines.
include_answer (bool): Whether to include the answer in the returned string.
Returns:
str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".
"""
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
if include_answer:
ret += " " + lines[i]["answer"]
return ret
def get_few_shot_examples(lines, k):
"""
Selects k examples from the given list of text lines and concatenates them into a single string.
Args:
lines (list): A list containing text lines.
k (int): The number of examples to select.
Returns:
str: A string composed of k examples, separated by two newline characters.
"""
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def get_answer_value(answer_str):
"""
Extracts numerical values from an answer string and returns them.
Args:
answer_str (str): The string containing the answer.
Returns:
The extracted numerical value; returns "INVALID" if extraction fails.
"""
answer_str = answer_str.replace(",", "")
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
return ast.literal_eval(numbers[-1])
except SyntaxError:
return INVALID
def read_jsonl(filename: str):
"""
Reads a JSONL file.
Args:
filename (str): Path to the JSONL file.
Yields:
dict: A dictionary object corresponding to each line in the JSONL file.
"""
with open(filename) as fin:
for line in fin:
if line.startswith("#"):
continue
yield json.loads(line)
def main(args):
"""
Process inputs and generate answers by calling the model in parallel using a thread pool.
Args:
args (argparse.Namespace):
- num_questions (int): Number of questions to process.
- num_shots (int): Number of few-shot learning examples.
- ip (str): IP address of the model service.
- port (int): Port number of the model service.
- parallel (int): Number of questions to process in parallel.
- result_file (str): File path to store the results.
Returns:
None
"""
# Read data
filename = "test.jsonl"
lines = list(read_jsonl(filename))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
states = [None] * len(labels)
# Use thread pool
def get_one_answer(i):
answer = call_generate(
prompt=few_shot_examples + questions[i],
# stop=["Question", "Assistant:", "<|separator|>"],
ip=args.ip,
port=args.port,
)
states[i] = answer
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
latency = time.time() - tic
preds = []
for i in range(len(states)):
preds.append(get_answer_value(states[i]))
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")
with open(args.result_file, "a") as fout:
value = {
"task": "gsm8k",
"backend": "paddlepaddle",
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--ip", type=str, default="127.0.0.1")
parser.add_argument("--port", type=str, default="8188")
parser.add_argument("--num-shots", type=int, default=10)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=1319)
parser.add_argument("--result-file", type=str, default="result.jsonl")
parser.add_argument("--parallel", type=int, default=1)
args = parser.parse_args()
main(args)

View File

@@ -12,43 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import os
import sys
import threading
from fastdeploy import LLM, SamplingParams
from fastdeploy.utils import set_random_seed
tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, tests_dir)
def timeout(seconds):
def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
result = [None]
exception = [None]
def target():
try:
result[0] = func(*args, **kwargs)
except Exception as e:
exception[0] = e
thread = threading.Thread(target=target)
thread.daemon = True
thread.start()
thread.join(seconds)
if thread.is_alive():
raise TimeoutError(f"Function timed out after {seconds} seconds")
if exception[0]:
raise exception[0]
return result[0]
return wrapper
return decorator
from ci_use.iluvatar_UT.utils import TIMEOUT_MSG, timeout
@timeout(80)
@@ -75,15 +48,15 @@ def offline_infer_check():
59335,
68170,
183,
49080,
94717,
82966,
99140,
31615,
51497,
94851,
60764,
10889,
97404,
100088,
36310,
95633,
95913,
41459,
95049,
94970,
96840,
2,
], f"{outputs[0].outputs.token_ids}"
print("PASSED")
@@ -94,10 +67,7 @@ if __name__ == "__main__":
result = offline_infer_check()
sys.exit(0)
except TimeoutError:
print(
"The timeout exit may be due to multiple processes sharing the "
"same gpu card. You can check this using ixsmi on the device."
)
print(TIMEOUT_MSG)
sys.exit(124)
except Exception:
sys.exit(1)

View File

@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import io
import os
import sys
import threading
import requests
from PIL import Image
@@ -24,39 +23,13 @@ from fastdeploy import LLM, SamplingParams
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.utils import set_random_seed
tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, tests_dir)
def timeout(seconds):
def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
result = [None]
exception = [None]
def target():
try:
result[0] = func(*args, **kwargs)
except Exception as e:
exception[0] = e
thread = threading.Thread(target=target)
thread.daemon = True
thread.start()
thread.join(seconds)
if thread.is_alive():
raise TimeoutError(f"Function timed out after {seconds} seconds")
if exception[0]:
raise exception[0]
return result[0]
return wrapper
return decorator
from ci_use.iluvatar_UT.utils import TIMEOUT_MSG, timeout
@timeout(180)
@timeout(210)
def offline_infer_check():
set_random_seed(123)
@@ -122,9 +95,9 @@ def offline_infer_check():
5119,
93956,
68725,
14449,
4356,
38225,
100282,
23,
23,
2,
], f"{outputs[0].outputs.token_ids}"
print("PASSED")
@@ -135,10 +108,7 @@ if __name__ == "__main__":
result = offline_infer_check()
sys.exit(0)
except TimeoutError:
print(
"The timeout exit may be due to multiple processes sharing the "
"same gpu card. You can check this using ixsmi on the device."
)
print(TIMEOUT_MSG)
sys.exit(124)
except Exception:
sys.exit(1)

View File

@@ -0,0 +1,28 @@
import functools
import signal
def timeout(seconds):
def decorator(func):
def _handle_timeout(signum, frame):
raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds")
@functools.wraps(func)
def wrapper(*args, **kwargs):
original_handler = signal.signal(signal.SIGALRM, _handle_timeout)
signal.alarm(seconds)
try:
result = func(*args, **kwargs)
signal.alarm(0)
return result
finally:
signal.signal(signal.SIGALRM, original_handler)
signal.alarm(0)
return wrapper
return decorator
TIMEOUT_MSG = "The timeout exit may be due to multiple processes sharing the same gpu card. You can check this using ixsmi on the device."