# FastDeploy/benchmark/benchmark_uie.py
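# Benchmarks FastDeploy's UIE (Universal Information Extraction) model: loads an
# exported inference model, runs a warmup pass, then times per-sample prediction
# over a JSONL dataset on the selected device and runtime backend.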
import numpy as np
import os
import time
import distutils.util
import sys
import json

import fastdeploy as fd
from fastdeploy.text import UIEModel, SchemaLanguage
import pynvml
import psutil
import GPUtil
import multiprocessing


def parse_arguments():
    import argparse
    import ast
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        required=True,
        help="The directory of model and tokenizer.")
    parser.add_argument(
        "--data_path", required=True, help="The path of uie data.")
    parser.add_argument(
        "--device",
        type=str,
        default='cpu',
        choices=['gpu', 'cpu'],
        help="Type of inference device, support 'cpu' or 'gpu'.")
    parser.add_argument(
        "--backend",
        type=str,
        default='pp',
        choices=['ort', 'pp', 'trt', 'pp-trt', 'openvino'],
        help="The inference runtime backend.")
    parser.add_argument(
        "--device_id", type=int, default=0, help="device(gpu) id")
    parser.add_argument(
        "--batch_size", type=int, default=1, help="The batch size of data.")
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help="The max length of sequence.")
    parser.add_argument(
        "--log_interval",
        type=int,
        default=10,
        help="The interval of logging.")
    parser.add_argument(
        "--cpu_num_threads",
        type=int,
        default=1,
        help="The number of threads when inferring on cpu.")
    parser.add_argument(
        "--use_fp16",
        type=distutils.util.strtobool,
        default=False,
        help="Use FP16 mode")
    parser.add_argument(
        "--epoch", type=int, default=1, help="The epoch of test")
    return parser.parse_args()
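
# Example invocation (illustrative paths; adjust to your exported UIE model and data):
#   python benchmark_uie.py --model_dir uie-base --data_path uie_data.txt \
#       --device gpu --backend pp-trt --use_fp16 True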


def build_option(args):
    option = fd.RuntimeOption()
    # Device selection.
    if args.device == 'cpu':
        option.use_cpu()
        option.set_cpu_thread_num(args.cpu_num_threads)
    else:
        option.use_gpu(args.device_id)
    # Backend selection.
    if args.backend == 'pp':
        option.use_paddle_backend()
    elif args.backend == 'ort':
        option.use_ort_backend()
    elif args.backend == 'openvino':
        option.use_openvino_backend()
    else:
        # Both 'trt' and 'pp-trt' run on the TensorRT backend.
        option.use_trt_backend()
        if args.backend == 'pp-trt':
            option.enable_paddle_to_trt()
            option.enable_paddle_trt_collect_shape()
        trt_file = os.path.join(args.model_dir, "infer.trt")
        option.set_trt_input_shape(
            'input_ids',
            min_shape=[1, 1],
            opt_shape=[args.batch_size, args.max_length // 2],
            max_shape=[args.batch_size, args.max_length])
        option.set_trt_input_shape(
            'token_type_ids',
            min_shape=[1, 1],
            opt_shape=[args.batch_size, args.max_length // 2],
            max_shape=[args.batch_size, args.max_length])
        option.set_trt_input_shape(
            'pos_ids',
            min_shape=[1, 1],
            opt_shape=[args.batch_size, args.max_length // 2],
            max_shape=[args.batch_size, args.max_length])
        option.set_trt_input_shape(
            'att_mask',
            min_shape=[1, 1],
            opt_shape=[args.batch_size, args.max_length // 2],
            max_shape=[args.batch_size, args.max_length])
        if args.use_fp16:
            option.enable_trt_fp16()
            trt_file = trt_file + ".fp16"
        option.set_trt_cache_file(trt_file)
    return option


def get_current_memory_mb(gpu_id=None):
    pid = os.getpid()
    p = psutil.Process(pid)
    info = p.memory_full_info()
    cpu_mem = info.uss / 1024. / 1024.
    gpu_mem = 0
    if gpu_id is not None:
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem = meminfo.used / 1024. / 1024.
    return cpu_mem, gpu_mem


def get_current_gputil(gpu_id):
    GPUs = GPUtil.getGPUs()
    gpu_load = GPUs[gpu_id].load
    return gpu_load


def sample_gpuutil(gpu_id, gpu_utilization=[]):
    while True:
        gpu_utilization.append(get_current_gputil(gpu_id))
        time.sleep(0.01)
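
# Note: sample_gpuutil loops forever, so it is meant to be launched in a separate
# process (e.g. via multiprocessing.Process) to sample GPU utilization while
# inference runs elsewhere; this script imports multiprocessing but does not
# start the sampler itself.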


def get_dataset(data_path, max_seq_len=512):
    json_lines = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            json_line = json.loads(line)
            content = json_line['content'].strip()
            prompt = json_line['prompt']
            # The model input looks like: [CLS] Prompt [SEP] Content [SEP],
            # which adds three special tokens on top of the prompt.
            if max_seq_len <= len(prompt) + 3:
                raise ValueError(
                    "The value of max_seq_len is too small, please set a larger value"
                )
            json_lines.append(json_line)
    return json_lines
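
# Each line of --data_path is expected to be a JSON object with "content" and
# "prompt" fields, for example (illustrative sample):
#   {"content": "会议定于2022年12月29日上午召开", "prompt": "时间"}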


def run_inference(ds, uie, epoch=1, warmup_steps=10):
    # Warm up before timing.
    for j, sample in enumerate(ds):
        if j > warmup_steps:
            break
        uie.set_schema([sample['prompt']])
        result = uie.predict([sample['content']])
    print(f"Run {warmup_steps} steps to warm up")

    start = time.time()
    for ep in range(epoch):
        curr_start = time.time()
        for i, sample in enumerate(ds):
            uie.set_schema([sample['prompt']])
            result = uie.predict([sample['content']])
        print(
            f"Epoch {ep} average time = {(time.time() - curr_start) * 1000.0 / (len(ds)):.4f} ms"
        )
    end = time.time()

    runtime_statis = uie.print_statis_info_of_runtime()
    print("Final:")
    print(runtime_statis)
    print(
        f"Total average time = {(end - start) * 1000.0 / (len(ds) * epoch):.4f} ms"
    )
    print()


if __name__ == '__main__':
    args = parse_arguments()
    runtime_option = build_option(args)

    model_path = os.path.join(args.model_dir, "inference.pdmodel")
    param_path = os.path.join(args.model_dir, "inference.pdiparams")
    vocab_path = os.path.join(args.model_dir, "vocab.txt")
    ds = get_dataset(args.data_path)
    # Initial schema ("时间" means "time"); run_inference resets the schema to
    # each sample's prompt before prediction.
    schema = ["时间"]
    uie = UIEModel(
        model_path,
        param_path,
        vocab_path,
        position_prob=0.5,
        max_length=args.max_length,
        schema=schema,
        runtime_option=runtime_option,
        schema_language=SchemaLanguage.ZH)
    uie.enable_record_time_of_runtime()
    run_inference(ds, uie, args.epoch)