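"""Benchmark the FastDeploy UIE model.

Measures per-sample inference latency of UIEModel across several runtime
backends (Paddle Inference, ONNX Runtime, OpenVINO, TensorRT and
Paddle-TRT), with helpers for sampling CPU/GPU memory and GPU utilization.
"""
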
import distutils.util
import json
import multiprocessing
import os
import sys
import time

import fastdeploy as fd
import GPUtil
import numpy as np
import psutil
import pynvml
from fastdeploy.text import UIEModel, SchemaLanguage


def parse_arguments():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        required=True,
        help="The directory of the model and tokenizer.")
    parser.add_argument(
        "--data_path", required=True, help="The path of the UIE data file.")
    parser.add_argument(
        "--device",
        type=str,
        default='cpu',
        choices=['gpu', 'cpu'],
        help="Type of inference device, support 'cpu' or 'gpu'.")
    parser.add_argument(
        "--backend",
        type=str,
        default='pp',
        choices=['ort', 'pp', 'trt', 'pp-trt', 'openvino'],
        help="The inference runtime backend.")
    parser.add_argument(
        "--device_id", type=int, default=0, help="The id of the GPU device.")
    parser.add_argument(
        "--batch_size", type=int, default=1, help="The batch size of data.")
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help="The max length of the input sequence.")
    parser.add_argument(
        "--log_interval",
        type=int,
        default=10,
        help="The interval of logging.")
    parser.add_argument(
        "--cpu_num_threads",
        type=int,
        default=1,
        help="The number of threads when inferring on CPU.")
    parser.add_argument(
        "--use_fp16",
        type=distutils.util.strtobool,
        default=False,
        help="Whether to use FP16 mode.")
    parser.add_argument(
        "--epoch", type=int, default=1, help="The number of test epochs.")
    return parser.parse_args()
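

# Example invocation (illustrative only; the script name, paths and flag
# values below are assumptions, not taken from the repository):
#
#   python benchmark_uie.py --model_dir uie-base --data_path data.txt \
#       --device gpu --backend pp-trt --use_fp16 True
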
def build_option(args):
    option = fd.RuntimeOption()
    # Device selection.
    if args.device == 'cpu':
        option.use_cpu()
        option.set_cpu_thread_num(args.cpu_num_threads)
    else:
        option.use_gpu(args.device_id)
    # Backend selection; both 'trt' and 'pp-trt' run on TensorRT.
    if args.backend == 'pp':
        option.use_paddle_backend()
    elif args.backend == 'ort':
        option.use_ort_backend()
    elif args.backend == 'openvino':
        option.use_openvino_backend()
    else:
        option.use_trt_backend()
        if args.backend == 'pp-trt':
            option.enable_paddle_to_trt()
            option.enable_paddle_trt_collect_shape()
        trt_file = os.path.join(args.model_dir, "infer.trt")
        # Dynamic-shape ranges for the four UIE input tensors.
        option.set_trt_input_shape(
            'input_ids',
            min_shape=[1, 1],
            opt_shape=[args.batch_size, args.max_length // 2],
            max_shape=[args.batch_size, args.max_length])
        option.set_trt_input_shape(
            'token_type_ids',
            min_shape=[1, 1],
            opt_shape=[args.batch_size, args.max_length // 2],
            max_shape=[args.batch_size, args.max_length])
        option.set_trt_input_shape(
            'pos_ids',
            min_shape=[1, 1],
            opt_shape=[args.batch_size, args.max_length // 2],
            max_shape=[args.batch_size, args.max_length])
        option.set_trt_input_shape(
            'att_mask',
            min_shape=[1, 1],
            opt_shape=[args.batch_size, args.max_length // 2],
            max_shape=[args.batch_size, args.max_length])
        if args.use_fp16:
            option.enable_trt_fp16()
            trt_file = trt_file + ".fp16"
        option.set_trt_cache_file(trt_file)
    return option
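

# A minimal sketch (not part of the original script) of the (min, opt, max)
# dynamic-shape ranges that build_option registers for every UIE input
# tensor; the default values mirror the CLI defaults and are assumptions.
def _example_trt_shapes(batch_size=1, max_length=128):
    """Return the TensorRT shape ranges keyed by input tensor name."""
    return {
        name: {
            'min_shape': [1, 1],
            'opt_shape': [batch_size, max_length // 2],
            'max_shape': [batch_size, max_length],
        }
        for name in ('input_ids', 'token_type_ids', 'pos_ids', 'att_mask')
    }
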
def get_current_memory_mb(gpu_id=None):
    # CPU memory: the unique set size (USS) of this process, in MB.
    pid = os.getpid()
    p = psutil.Process(pid)
    info = p.memory_full_info()
    cpu_mem = info.uss / 1024. / 1024.
    gpu_mem = 0
    if gpu_id is not None:
        # GPU memory: total used memory on the device, in MB.
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem = meminfo.used / 1024. / 1024.
    return cpu_mem, gpu_mem
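

# Illustrative usage (an assumption, not part of the original script):
# sample memory before and after a run to estimate the model's footprint.
#
#   cpu_before, gpu_before = get_current_memory_mb(gpu_id=0)
#   ...run inference...
#   cpu_after, gpu_after = get_current_memory_mb(gpu_id=0)
#   print(f"CPU +{cpu_after - cpu_before:.1f} MB, "
#         f"GPU +{gpu_after - gpu_before:.1f} MB")
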
def get_current_gputil(gpu_id):
    # Fractional GPU utilization (0.0-1.0) as reported by GPUtil.
    GPUs = GPUtil.getGPUs()
    gpu_load = GPUs[gpu_id].load
    return gpu_load


def sample_gpuutil(gpu_id, gpu_utilization=[]):
    # Poll GPU utilization every 10 ms until the caller kills this process.
    # The mutable default is only safe because callers are expected to pass
    # in a shared list (e.g. a multiprocessing.Manager().list()).
    while True:
        gpu_utilization.append(get_current_gputil(gpu_id))
        time.sleep(0.01)
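

# A minimal sketch of driving sample_gpuutil from a separate process so it
# can poll while the main process runs inference. This usage is an
# assumption based on the multiprocessing import above, not code from the
# original script.
def _example_gpu_sampling(gpu_id):
    gpu_util = multiprocessing.Manager().list()
    sampler = multiprocessing.Process(
        target=sample_gpuutil, args=(gpu_id, gpu_util))
    sampler.start()
    # ...run inference here...
    sampler.terminate()
    return max(gpu_util) if gpu_util else 0
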
def get_dataset(data_path, max_seq_len=512):
    json_lines = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            json_line = json.loads(line)
            content = json_line['content'].strip()
            prompt = json_line['prompt']
            # The model input looks like: [CLS] Prompt [SEP] Content [SEP],
            # so it includes three special tokens.
            if max_seq_len <= len(prompt) + 3:
                raise ValueError(
                    "The value of max_seq_len is too small, please set a larger value"
                )
            json_lines.append(json_line)
    return json_lines
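

# Each line of the data file is expected to be a JSON object with at least
# "content" and "prompt" keys, e.g. (format only; the value is a
# hypothetical placeholder, not actual dataset content):
#
#   {"content": "<text to extract from>", "prompt": "时间"}
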
def run_inference(ds, uie, epoch=1, warmup_steps=10):
    # Warm up on a few samples so one-time costs (e.g. TensorRT engine
    # building) do not distort the timing below.
    for j, sample in enumerate(ds):
        if j >= warmup_steps:
            break
        uie.set_schema([sample['prompt']])
        result = uie.predict([sample['content']])
    print(f"Run {warmup_steps} steps to warm up")
    start = time.time()
    for ep in range(epoch):
        curr_start = time.time()
        for i, sample in enumerate(ds):
            uie.set_schema([sample['prompt']])
            result = uie.predict([sample['content']])
        print(
            f"Epoch {ep} average time = {(time.time() - curr_start) * 1000.0 / (len(ds)):.4f} ms"
        )
    end = time.time()
    runtime_statis = uie.print_statis_info_of_runtime()
    print("Final:")
    print(runtime_statis)
    print(
        f"Total average time = {(end - start) * 1000.0 / (len(ds) * epoch):.4f} ms"
    )
    print()


if __name__ == '__main__':
    args = parse_arguments()
    runtime_option = build_option(args)
    model_path = os.path.join(args.model_dir, "inference.pdmodel")
    param_path = os.path.join(args.model_dir, "inference.pdiparams")
    vocab_path = os.path.join(args.model_dir, "vocab.txt")

    ds = get_dataset(args.data_path)
    # Initial extraction schema ("时间" means "time"); run_inference
    # replaces it with each sample's own prompt.
    schema = ["时间"]
    uie = UIEModel(
        model_path,
        param_path,
        vocab_path,
        position_prob=0.5,
        max_length=args.max_length,
        schema=schema,
        runtime_option=runtime_option,
        schema_language=SchemaLanguage.ZH)

    uie.enable_record_time_of_runtime()
    run_inference(ds, uie, args.epoch)