Files
FastDeploy/benchmark/benchmark_ernie_seq_cls.py
Jack Zhou c2533f1f75 [Benchmark] Add benchmark for ernie sequence classification (#444)
* Add benchmark for ernie sequence classification

* Add pretty print

* Update benchmark of ernie

* get_table -> get_statistics_table

* add comments

* Update the output

* Add cpu gpu memory statistics

* Add gpu utilization sampling

Co-authored-by: Jason <jiangjiajun@baidu.com>
2022-11-15 11:25:32 +08:00

283 lines
9.3 KiB
Python

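# Benchmark of ERNIE sequence classification with FastDeploy: measures
# tokenization / runtime / postprocessing latency, accuracy on the CLUE AFQMC
# dev set, process CPU/GPU memory, and GPU utilization.
# Example invocation (the model directory is a hypothetical path; the flags
# are the ones defined in parse_arguments below):
#   python benchmark_ernie_seq_cls.py --model_dir ./ernie_seq_cls_model \
#       --device gpu --backend pp --batch_size 32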
import paddlenlp
import numpy as np
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.datasets import load_dataset
import fastdeploy as fd
import os
import time
import distutils.util
import sys
import pynvml
import psutil
import GPUtil
from prettytable import PrettyTable
import multiprocessing
def parse_arguments():
    import argparse
    import ast
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        required=True,
        help="The directory of model and tokenizer.")
    parser.add_argument(
        "--device",
        type=str,
        default='gpu',
        choices=['gpu', 'cpu'],
        help="Type of inference device, support 'cpu' or 'gpu'.")
    parser.add_argument(
        "--backend",
        type=str,
        default='pp',
        choices=['ort', 'pp', 'trt', 'pp-trt'],
        help="The inference runtime backend.")
    parser.add_argument(
        "--device_id", type=int, default=0, help="device(gpu) id")
    parser.add_argument(
        "--batch_size", type=int, default=32, help="The batch size of data.")
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help="The max length of sequence.")
    parser.add_argument(
        "--log_interval",
        type=int,
        default=10,
        help="The interval of logging.")
    parser.add_argument(
        "--cpu_num_threads",
        type=int,
        default=1,
        help="The number of threads when inferring on cpu.")
    parser.add_argument(
        "--use_fp16",
        type=distutils.util.strtobool,
        default=False,
        help="Use FP16 mode")
    parser.add_argument(
        "--use_fast",
        type=distutils.util.strtobool,
        default=True,
        help="Whether to use fast_tokenizer to accelerate the tokenization.")
    return parser.parse_args()
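# Build a FastDeploy Runtime from the exported Paddle inference model,
# selecting device, backend, FP16 and TensorRT dynamic shapes from the CLI
# arguments.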
def create_fd_runtime(args):
    option = fd.RuntimeOption()
    model_path = os.path.join(args.model_dir, "infer.pdmodel")
    params_path = os.path.join(args.model_dir, "infer.pdiparams")
    option.set_model_path(model_path, params_path)
    if args.device == 'cpu':
        option.use_cpu()
        option.set_cpu_thread_num(args.cpu_num_threads)
    else:
        option.use_gpu(args.device_id)
    if args.backend == 'pp':
        option.use_paddle_backend()
    elif args.backend == 'ort':
        option.use_ort_backend()
    else:
        option.use_trt_backend()
        if args.backend == 'pp-trt':
            option.enable_paddle_to_trt()
            option.enable_paddle_trt_collect_shape()
        trt_file = os.path.join(args.model_dir, "infer.trt")
        option.set_trt_input_shape(
            'input_ids',
            min_shape=[1, args.max_length],
            opt_shape=[args.batch_size, args.max_length],
            max_shape=[args.batch_size, args.max_length])
        option.set_trt_input_shape(
            'token_type_ids',
            min_shape=[1, args.max_length],
            opt_shape=[args.batch_size, args.max_length],
            max_shape=[args.batch_size, args.max_length])
        if args.use_fp16:
            option.enable_trt_fp16()
            trt_file = trt_file + ".fp16"
        option.set_trt_cache_file(trt_file)
    return fd.Runtime(option)
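# Group the dataset into fixed-size batches of sentence pairs and labels;
# a trailing partial batch is dropped.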
def convert_examples_to_data(dataset, batch_size):
    texts, text_pairs, labels = [], [], []
    batch_text, batch_text_pair, batch_label = [], [], []
    for i, item in enumerate(dataset):
        batch_text.append(item['sentence1'])
        batch_text_pair.append(item['sentence2'])
        batch_label.append(item['label'])
        if (i + 1) % batch_size == 0:
            texts.append(batch_text)
            text_pairs.append(batch_text_pair)
            labels.append(batch_label)
            batch_text, batch_text_pair, batch_label = [], [], []
    return texts, text_pairs, labels
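# Numerically stable softmax over the logits, returning the predicted label
# and its confidence for each example.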
def postprocess(logits):
    max_value = np.max(logits, axis=1, keepdims=True)
    exp_data = np.exp(logits - max_value)
    probs = exp_data / np.sum(exp_data, axis=1, keepdims=True)
    out_dict = {
        "label": probs.argmax(axis=-1),
        "confidence": probs.max(axis=-1)
    }
    return out_dict
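# Render mean/P50/P90/P95 latency (in milliseconds) per stage as a PrettyTable.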
def get_statistics_table(tokenizer_time_costs, runtime_time_costs,
                         postprocess_time_costs):
    x = PrettyTable()
    x.field_names = [
        "Stage", "Mean latency", "P50 latency", "P90 latency", "P95 latency"
    ]
    x.add_row([
        "Tokenization", f"{np.mean(tokenizer_time_costs):.4f}",
        f"{np.percentile(tokenizer_time_costs, 50):.4f}",
        f"{np.percentile(tokenizer_time_costs, 90):.4f}",
        f"{np.percentile(tokenizer_time_costs, 95):.4f}"
    ])
    x.add_row([
        "Runtime", f"{np.mean(runtime_time_costs):.4f}",
        f"{np.percentile(runtime_time_costs, 50):.4f}",
        f"{np.percentile(runtime_time_costs, 90):.4f}",
        f"{np.percentile(runtime_time_costs, 95):.4f}"
    ])
    x.add_row([
        "Postprocessing", f"{np.mean(postprocess_time_costs):.4f}",
        f"{np.percentile(postprocess_time_costs, 50):.4f}",
        f"{np.percentile(postprocess_time_costs, 90):.4f}",
        f"{np.percentile(postprocess_time_costs, 95):.4f}"
    ])
    return x
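# Current memory footprint in MB: process USS for CPU and, if a GPU id is
# given, the device memory in use reported by NVML.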
def get_current_memory_mb(gpu_id=None):
    pid = os.getpid()
    p = psutil.Process(pid)
    info = p.memory_full_info()
    cpu_mem = info.uss / 1024. / 1024.
    gpu_mem = 0
    if gpu_id is not None:
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem = meminfo.used / 1024. / 1024.
    return cpu_mem, gpu_mem
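# Instantaneous GPU utilization (0-1) of the selected device via GPUtil.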
def get_current_gputil(gpu_id):
    GPUs = GPUtil.getGPUs()
    gpu_load = GPUs[gpu_id].load
    return gpu_load
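# Poll GPU utilization roughly every 10 ms; intended to run in a separate
# process that is terminated by the caller.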
def sample_gpuutil(gpu_id, gpu_utilization=[]):
    while True:
        gpu_utilization.append(get_current_gputil(gpu_id))
        time.sleep(0.01)
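# Print accuracy, average CPU/GPU memory, peak GPU utilization, and the
# per-stage latency table.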
def show_statistics(tokenizer_time_costs,
                    runtime_time_costs,
                    postprocess_time_costs,
                    correct_num,
                    total_num,
                    cpu_mem,
                    gpu_mem,
                    gpu_util,
                    prefix=""):
    print(
        f"{prefix}Acc = {correct_num/total_num*100:.2f} ({correct_num}/{total_num})."
        f" CPU memory: {np.mean(cpu_mem):.2f} MB, GPU memory: {np.mean(gpu_mem):.2f} MB,"
        f" GPU utilization {np.max(gpu_util) * 100:.2f}%.")
    print(
        get_statistics_table(tokenizer_time_costs, runtime_time_costs,
                             postprocess_time_costs))
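# Benchmark entry point: build the tokenizer and runtime, batch the AFQMC dev
# split, warm up for a few steps, then run the full timed pass.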
if __name__ == "__main__":
    args = parse_arguments()
    tokenizer = AutoTokenizer.from_pretrained(
        "ernie-3.0-medium-zh", use_faster=args.use_fast)
    runtime = create_fd_runtime(args)
    input_ids_name = runtime.get_input_info(0).name
    token_type_ids_name = runtime.get_input_info(1).name
    test_ds = load_dataset("clue", "afqmc", splits=['dev'])
    texts, text_pairs, labels = convert_examples_to_data(test_ds,
                                                         args.batch_size)
    gpu_id = args.device_id

    def run_inference(warmup_steps=None):
        tokenizer_time_costs = []
        runtime_time_costs = []
        postprocess_time_costs = []
        cpu_mem = []
        gpu_mem = []
        total_num = 0
        correct_num = 0
        # Start the process to sample gpu utilization
        manager = multiprocessing.Manager()
        gpu_util = manager.list()
        p = multiprocessing.Process(
            target=sample_gpuutil, args=(gpu_id, gpu_util))
        p.start()
        for i, (text, text_pair,
                label) in enumerate(zip(texts, text_pairs, labels)):
            start = time.time()
            encoded_inputs = tokenizer(
                text=text,
                text_pair=text_pair,
                max_length=args.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='np')
            tokenizer_time_costs += [(time.time() - start) * 1000]

            start = time.time()
            input_map = {
                input_ids_name: encoded_inputs["input_ids"].astype('int64'),
                token_type_ids_name:
                encoded_inputs["token_type_ids"].astype('int64'),
            }
            results = runtime.infer(input_map)
            runtime_time_costs += [(time.time() - start) * 1000]

            start = time.time()
            output = postprocess(results[0])
            postprocess_time_costs += [(time.time() - start) * 1000]

            cm, gm = get_current_memory_mb(gpu_id)
            cpu_mem.append(cm)
            gpu_mem.append(gm)

            total_num += len(label)
            correct_num += (label == output["label"]).sum()
            if warmup_steps is not None and i >= warmup_steps:
                break
            if (i + 1) % args.log_interval == 0:
                show_statistics(tokenizer_time_costs, runtime_time_costs,
                                postprocess_time_costs, correct_num, total_num,
                                cpu_mem, gpu_mem, gpu_util,
                                f"Step {i + 1: 6d}: ")
        show_statistics(tokenizer_time_costs, runtime_time_costs,
                        postprocess_time_costs, correct_num, total_num,
                        cpu_mem, gpu_mem, gpu_util, f"Final statistics: ")
        p.terminate()

    # Warm up
    print("Warm up")
    run_inference(10)

    print("Start to test the benchmark")
    run_inference()
    print("Finish")