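"""Benchmark ERNIE 3.0 sequence classification inference with FastDeploy Runtime.

The script tokenizes the CLUE AFQMC dev set with the ernie-3.0-medium-zh
tokenizer, runs batched inference through a FastDeploy Runtime (Paddle
Inference, ONNX Runtime or TensorRT backend), and reports per-stage latency
percentiles, accuracy, CPU/GPU memory usage and GPU utilization.
"""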
import paddlenlp
import numpy as np
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.datasets import load_dataset
import fastdeploy as fd
import os
import time
import distutils.util
import sys
import pynvml
import psutil
import GPUtil
from prettytable import PrettyTable
import multiprocessing


def parse_arguments():
    import argparse
    import ast
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        required=True,
        help="The directory of model and tokenizer.")
    parser.add_argument(
        "--device",
        type=str,
        default='gpu',
        choices=['gpu', 'cpu'],
        help="Type of inference device, support 'cpu' or 'gpu'.")
    parser.add_argument(
        "--backend",
        type=str,
        default='pp',
        choices=['ort', 'pp', 'trt', 'pp-trt'],
        help="The inference runtime backend.")
    parser.add_argument(
        "--device_id", type=int, default=0, help="The device (gpu) id.")
    parser.add_argument(
        "--batch_size", type=int, default=32, help="The batch size of data.")
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help="The max length of sequence.")
    parser.add_argument(
        "--log_interval",
        type=int,
        default=10,
        help="The interval of logging.")
    parser.add_argument(
        "--cpu_num_threads",
        type=int,
        default=1,
        help="The number of threads when inferring on cpu.")
    parser.add_argument(
        "--use_fp16",
        type=distutils.util.strtobool,
        default=False,
        help="Whether to use FP16 mode.")
    parser.add_argument(
        "--use_fast",
        type=distutils.util.strtobool,
        default=True,
        help="Whether to use fast_tokenizer to accelerate the tokenization.")
    return parser.parse_args()


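# Backend selection below: 'pp' -> Paddle Inference, 'ort' -> ONNX Runtime,
# 'trt' -> TensorRT, 'pp-trt' -> Paddle Inference with its TensorRT subgraph
# engine. The TensorRT paths require the input shape ranges to be declared up
# front, which is what the set_trt_input_shape() calls do.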
def create_fd_runtime(args):
    option = fd.RuntimeOption()
    model_path = os.path.join(args.model_dir, "infer.pdmodel")
    params_path = os.path.join(args.model_dir, "infer.pdiparams")
    option.set_model_path(model_path, params_path)
    if args.device == 'cpu':
        option.use_cpu()
        option.set_cpu_thread_num(args.cpu_num_threads)
    else:
        option.use_gpu(args.device_id)
    if args.backend == 'pp':
        option.use_paddle_backend()
    elif args.backend == 'ort':
        option.use_ort_backend()
    else:
        option.use_trt_backend()
        if args.backend == 'pp-trt':
            option.enable_paddle_to_trt()
            option.enable_paddle_trt_collect_shape()
        trt_file = os.path.join(args.model_dir, "infer.trt")
        option.set_trt_input_shape(
            'input_ids',
            min_shape=[1, args.max_length],
            opt_shape=[args.batch_size, args.max_length],
            max_shape=[args.batch_size, args.max_length])
        option.set_trt_input_shape(
            'token_type_ids',
            min_shape=[1, args.max_length],
            opt_shape=[args.batch_size, args.max_length],
            max_shape=[args.batch_size, args.max_length])
        if args.use_fp16:
            option.enable_trt_fp16()
            trt_file = trt_file + ".fp16"
        option.set_trt_cache_file(trt_file)
    return fd.Runtime(option)


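# Group the dataset into fixed-size batches. Trailing examples that do not
# fill a complete batch are dropped, so the reported accuracy covers only the
# full batches.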
def convert_examples_to_data(dataset, batch_size):
    texts, text_pairs, labels = [], [], []
    batch_text, batch_text_pair, batch_label = [], [], []

    for i, item in enumerate(dataset):
        batch_text.append(item['sentence1'])
        batch_text_pair.append(item['sentence2'])
        batch_label.append(item['label'])
        if (i + 1) % batch_size == 0:
            texts.append(batch_text)
            text_pairs.append(batch_text_pair)
            labels.append(batch_label)
            batch_text, batch_text_pair, batch_label = [], [], []
    return texts, text_pairs, labels


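# Numerically stable softmax over the logits: the row-wise max is subtracted
# before exponentiation; the predicted label and its probability are returned.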
def postprocess(logits):
    max_value = np.max(logits, axis=1, keepdims=True)
    exp_data = np.exp(logits - max_value)
    probs = exp_data / np.sum(exp_data, axis=1, keepdims=True)
    out_dict = {
        "label": probs.argmax(axis=-1),
        "confidence": probs.max(axis=-1)
    }
    return out_dict


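# All latency values handed to this table are per-batch timings in milliseconds
# (run_inference below multiplies the elapsed seconds by 1000).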
def get_statistics_table(tokenizer_time_costs, runtime_time_costs,
                         postprocess_time_costs):
    x = PrettyTable()
    x.field_names = [
        "Stage", "Mean latency", "P50 latency", "P90 latency", "P95 latency"
    ]
    x.add_row([
        "Tokenization", f"{np.mean(tokenizer_time_costs):.4f}",
        f"{np.percentile(tokenizer_time_costs, 50):.4f}",
        f"{np.percentile(tokenizer_time_costs, 90):.4f}",
        f"{np.percentile(tokenizer_time_costs, 95):.4f}"
    ])
    x.add_row([
        "Runtime", f"{np.mean(runtime_time_costs):.4f}",
        f"{np.percentile(runtime_time_costs, 50):.4f}",
        f"{np.percentile(runtime_time_costs, 90):.4f}",
        f"{np.percentile(runtime_time_costs, 95):.4f}"
    ])
    x.add_row([
        "Postprocessing", f"{np.mean(postprocess_time_costs):.4f}",
        f"{np.percentile(postprocess_time_costs, 50):.4f}",
        f"{np.percentile(postprocess_time_costs, 90):.4f}",
        f"{np.percentile(postprocess_time_costs, 95):.4f}"
    ])
    return x


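# CPU memory is the USS (unique set size) of this benchmark process in MB;
# GPU memory is the used memory reported by pynvml for the whole device, so
# other processes sharing the GPU are included in that number.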
def get_current_memory_mb(gpu_id=None):
    pid = os.getpid()
    p = psutil.Process(pid)
    info = p.memory_full_info()
    cpu_mem = info.uss / 1024. / 1024.
    gpu_mem = 0
    if gpu_id is not None:
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem = meminfo.used / 1024. / 1024.
    return cpu_mem, gpu_mem


def get_current_gputil(gpu_id):
    GPUs = GPUtil.getGPUs()
    gpu_load = GPUs[gpu_id].load
    return gpu_load


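# Runs in a separate process (started in run_inference) and polls GPU
# utilization roughly every 10 ms; the shared manager list makes the samples
# visible to the parent process, which stops the sampler with terminate().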
def sample_gpuutil(gpu_id, gpu_utilization=[]):
    while True:
        gpu_utilization.append(get_current_gputil(gpu_id))
        time.sleep(0.01)


def show_statistics(tokenizer_time_costs,
                    runtime_time_costs,
                    postprocess_time_costs,
                    correct_num,
                    total_num,
                    cpu_mem,
                    gpu_mem,
                    gpu_util,
                    prefix=""):
    print(
        f"{prefix}Acc = {correct_num/total_num*100:.2f} ({correct_num}/{total_num})."
        f" CPU memory: {np.mean(cpu_mem):.2f} MB, GPU memory: {np.mean(gpu_mem):.2f} MB,"
        f" GPU utilization {np.max(gpu_util) * 100:.2f}%.")
    print(
        get_statistics_table(tokenizer_time_costs, runtime_time_costs,
                             postprocess_time_costs))


if __name__ == "__main__":
    args = parse_arguments()

    tokenizer = AutoTokenizer.from_pretrained(
        "ernie-3.0-medium-zh", use_faster=args.use_fast)
    runtime = create_fd_runtime(args)
    input_ids_name = runtime.get_input_info(0).name
    token_type_ids_name = runtime.get_input_info(1).name

    test_ds = load_dataset("clue", "afqmc", splits=['dev'])
    texts, text_pairs, labels = convert_examples_to_data(test_ds,
                                                         args.batch_size)
    gpu_id = args.device_id

    def run_inference(warmup_steps=None):
        tokenizer_time_costs = []
        runtime_time_costs = []
        postprocess_time_costs = []
        cpu_mem = []
        gpu_mem = []

        total_num = 0
        correct_num = 0

        # Start the process to sample gpu utilization
        manager = multiprocessing.Manager()
        gpu_util = manager.list()
        p = multiprocessing.Process(
            target=sample_gpuutil, args=(gpu_id, gpu_util))
        p.start()
        for i, (text, text_pair,
                label) in enumerate(zip(texts, text_pairs, labels)):
            start = time.time()
            encoded_inputs = tokenizer(
                text=text,
                text_pair=text_pair,
                max_length=args.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='np')
            tokenizer_time_costs += [(time.time() - start) * 1000]

            start = time.time()
            input_map = {
                input_ids_name: encoded_inputs["input_ids"].astype('int64'),
                token_type_ids_name:
                encoded_inputs["token_type_ids"].astype('int64'),
            }
            results = runtime.infer(input_map)
            runtime_time_costs += [(time.time() - start) * 1000]

            start = time.time()
            output = postprocess(results[0])
            postprocess_time_costs += [(time.time() - start) * 1000]

            cm, gm = get_current_memory_mb(gpu_id)
            cpu_mem.append(cm)
            gpu_mem.append(gm)

            total_num += len(label)
            correct_num += (label == output["label"]).sum()
            if warmup_steps is not None and i >= warmup_steps:
                break
            if (i + 1) % args.log_interval == 0:
                show_statistics(tokenizer_time_costs, runtime_time_costs,
                                postprocess_time_costs, correct_num, total_num,
                                cpu_mem, gpu_mem, gpu_util,
                                f"Step {i + 1: 6d}: ")
        show_statistics(tokenizer_time_costs, runtime_time_costs,
                        postprocess_time_costs, correct_num, total_num,
                        cpu_mem, gpu_mem, gpu_util, "Final statistics: ")
        p.terminate()

    # Warm up
    print("Warm up")
    run_inference(10)
    print("Start to test the benchmark")
    run_inference()
    print("Finish")
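# Example invocation (illustrative, not taken from the repository docs):
# assuming the exported ERNIE 3.0 classification model (infer.pdmodel /
# infer.pdiparams) lives under ./ernie-3.0-medium-zh-afqmc and this file is
# saved as benchmark_seq_cls.py, a GPU Paddle-TRT FP16 run would look like:
#
#   python benchmark_seq_cls.py \
#       --model_dir ./ernie-3.0-medium-zh-afqmc \
#       --device gpu \
#       --backend pp-trt \
#       --use_fp16 True \
#       --batch_size 32 \
#       --max_length 128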