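# Benchmark script for an ERNIE 3.0 text classification model served through
# the FastDeploy Runtime. It evaluates the CLUE AFQMC dev split and reports
# per-stage latency (tokenization, runtime, postprocessing), accuracy, CPU/GPU
# memory usage, and GPU utilization.
#
# A typical invocation might look like the following (the script and model
# directory names are illustrative, not taken from this file):
#   python benchmark_ernie.py --model_dir ./ernie-3.0-medium-zh-afqmc \
#       --device gpu --backend pp-trt --batch_size 32 --use_fp16 True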
import paddlenlp
import numpy as np
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.datasets import load_dataset
import fastdeploy as fd
import os
import time
import distutils.util
import sys
import pynvml
import psutil
import GPUtil
from prettytable import PrettyTable
import multiprocessing


def parse_arguments():
    import argparse
    import ast
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        required=True,
        help="The directory of model and tokenizer.")
    parser.add_argument(
        "--device",
        type=str,
        default='gpu',
        choices=['gpu', 'cpu'],
        help="Type of inference device, support 'cpu' or 'gpu'.")
    parser.add_argument(
        "--backend",
        type=str,
        default='pp',
        choices=['ort', 'pp', 'trt', 'pp-trt'],
        help="The inference runtime backend.")
    parser.add_argument(
        "--device_id", type=int, default=0, help="The id of the GPU device.")
    parser.add_argument(
        "--batch_size", type=int, default=32, help="The batch size of data.")
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help="The max length of sequence.")
    parser.add_argument(
        "--log_interval",
        type=int,
        default=10,
        help="The interval of logging.")
    parser.add_argument(
        "--cpu_num_threads",
        type=int,
        default=1,
        help="The number of threads when inferring on cpu.")
    parser.add_argument(
        "--use_fp16",
        type=distutils.util.strtobool,
        default=False,
        help="Use FP16 mode.")
    parser.add_argument(
        "--use_fast",
        type=distutils.util.strtobool,
        default=True,
        help="Whether to use fast_tokenizer to accelerate the tokenization.")
    return parser.parse_args()


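# Build a FastDeploy Runtime from the parsed arguments. The --backend flag
# selects Paddle Inference ('pp'), ONNX Runtime ('ort'), TensorRT ('trt'), or
# Paddle Inference with its TensorRT subgraph engine ('pp-trt'). For the
# TensorRT paths, both text inputs are given a dynamic-shape range from batch
# size 1 up to --batch_size at the fixed --max_length, and the serialized
# engine is cached next to the model (with a ".fp16" suffix when FP16 is on).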
def create_fd_runtime(args):
    option = fd.RuntimeOption()
    model_path = os.path.join(args.model_dir, "infer.pdmodel")
    params_path = os.path.join(args.model_dir, "infer.pdiparams")
    option.set_model_path(model_path, params_path)
    if args.device == 'cpu':
        option.use_cpu()
        option.set_cpu_thread_num(args.cpu_num_threads)
    else:
        option.use_gpu(args.device_id)
    if args.backend == 'pp':
        option.use_paddle_backend()
    elif args.backend == 'ort':
        option.use_ort_backend()
    else:
        option.use_trt_backend()
        if args.backend == 'pp-trt':
            option.enable_paddle_to_trt()
            option.enable_paddle_trt_collect_shape()
        trt_file = os.path.join(args.model_dir, "infer.trt")
        option.set_trt_input_shape(
            'input_ids',
            min_shape=[1, args.max_length],
            opt_shape=[args.batch_size, args.max_length],
            max_shape=[args.batch_size, args.max_length])
        option.set_trt_input_shape(
            'token_type_ids',
            min_shape=[1, args.max_length],
            opt_shape=[args.batch_size, args.max_length],
            max_shape=[args.batch_size, args.max_length])
        if args.use_fp16:
            option.enable_trt_fp16()
            trt_file = trt_file + ".fp16"
        option.set_trt_cache_file(trt_file)
    return fd.Runtime(option)


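# Group the dataset into batches of sentence pairs and labels. Only full
# batches are kept: a trailing partial batch is dropped, so the number of
# evaluated examples is a multiple of --batch_size.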
def convert_examples_to_data(dataset, batch_size):
    texts, text_pairs, labels = [], [], []
    batch_text, batch_text_pair, batch_label = [], [], []

    for i, item in enumerate(dataset):
        batch_text.append(item['sentence1'])
        batch_text_pair.append(item['sentence2'])
        batch_label.append(item['label'])
        if (i + 1) % batch_size == 0:
            texts.append(batch_text)
            text_pairs.append(batch_text_pair)
            labels.append(batch_label)
            batch_text, batch_text_pair, batch_label = [], [], []
    return texts, text_pairs, labels


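# Turn raw logits into probabilities with a numerically stable softmax
# (subtracting the per-row max before exponentiating), then return the argmax
# label and its confidence for each example in the batch.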
def postprocess(logits):
    max_value = np.max(logits, axis=1, keepdims=True)
    exp_data = np.exp(logits - max_value)
    probs = exp_data / np.sum(exp_data, axis=1, keepdims=True)
    out_dict = {
        "label": probs.argmax(axis=-1),
        "confidence": probs.max(axis=-1)
    }
    return out_dict


def get_statistics_table(tokenizer_time_costs, runtime_time_costs,
                         postprocess_time_costs):
    x = PrettyTable()
    x.field_names = [
        "Stage", "Mean latency", "P50 latency", "P90 latency", "P95 latency"
    ]
    x.add_row([
        "Tokenization", f"{np.mean(tokenizer_time_costs):.4f}",
        f"{np.percentile(tokenizer_time_costs, 50):.4f}",
        f"{np.percentile(tokenizer_time_costs, 90):.4f}",
        f"{np.percentile(tokenizer_time_costs, 95):.4f}"
    ])
    x.add_row([
        "Runtime", f"{np.mean(runtime_time_costs):.4f}",
        f"{np.percentile(runtime_time_costs, 50):.4f}",
        f"{np.percentile(runtime_time_costs, 90):.4f}",
        f"{np.percentile(runtime_time_costs, 95):.4f}"
    ])
    x.add_row([
        "Postprocessing", f"{np.mean(postprocess_time_costs):.4f}",
        f"{np.percentile(postprocess_time_costs, 50):.4f}",
        f"{np.percentile(postprocess_time_costs, 90):.4f}",
        f"{np.percentile(postprocess_time_costs, 95):.4f}"
    ])
    return x


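# Sample the current memory footprint: the USS of this process (via psutil)
# for CPU memory and, when a GPU id is given, the used device memory reported
# by NVML. Note that NVML reports device-wide usage, not just this process.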
def get_current_memory_mb(gpu_id=None):
    pid = os.getpid()
    p = psutil.Process(pid)
    info = p.memory_full_info()
    cpu_mem = info.uss / 1024. / 1024.
    gpu_mem = 0
    if gpu_id is not None:
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem = meminfo.used / 1024. / 1024.
    return cpu_mem, gpu_mem


def get_current_gputil(gpu_id):
    GPUs = GPUtil.getGPUs()
    gpu_load = GPUs[gpu_id].load
    return gpu_load


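# Poll the GPU load roughly every 10 ms. This is meant to run in a separate
# process (see run_inference below), appending into a multiprocessing manager
# list so the parent process can read the samples; the sampler is stopped with
# terminate() once the benchmark loop finishes.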
def sample_gpuutil(gpu_id, gpu_utilization=[]):
    while True:
        gpu_utilization.append(get_current_gputil(gpu_id))
        time.sleep(0.01)


def show_statistics(tokenizer_time_costs,
                    runtime_time_costs,
                    postprocess_time_costs,
                    correct_num,
                    total_num,
                    cpu_mem,
                    gpu_mem,
                    gpu_util,
                    prefix=""):
    print(
        f"{prefix}Acc = {correct_num / total_num * 100:.2f} ({correct_num}/{total_num})."
        f" CPU memory: {np.mean(cpu_mem):.2f} MB, GPU memory: {np.mean(gpu_mem):.2f} MB,"
        f" GPU utilization {np.max(gpu_util) * 100:.2f}%.")
    print(
        get_statistics_table(tokenizer_time_costs, runtime_time_costs,
                             postprocess_time_costs))


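# Main benchmark flow: load the tokenizer and the FastDeploy runtime, read the
# CLUE AFQMC dev set, run a short warm-up pass, then run the full pass and
# print per-stage latency percentiles along with accuracy and resource usage.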
if __name__ == "__main__":
    args = parse_arguments()

    tokenizer = AutoTokenizer.from_pretrained(
        "ernie-3.0-medium-zh", use_faster=args.use_fast)
    runtime = create_fd_runtime(args)
    input_ids_name = runtime.get_input_info(0).name
    token_type_ids_name = runtime.get_input_info(1).name

    test_ds = load_dataset("clue", "afqmc", splits=['dev'])
    texts, text_pairs, labels = convert_examples_to_data(test_ds,
                                                         args.batch_size)
    gpu_id = args.device_id

    def run_inference(warmup_steps=None):
        tokenizer_time_costs = []
        runtime_time_costs = []
        postprocess_time_costs = []
        cpu_mem = []
        gpu_mem = []

        total_num = 0
        correct_num = 0

        # Start the process to sample gpu utilization
        manager = multiprocessing.Manager()
        gpu_util = manager.list()
        p = multiprocessing.Process(
            target=sample_gpuutil, args=(gpu_id, gpu_util))
        p.start()
        for i, (text, text_pair,
                label) in enumerate(zip(texts, text_pairs, labels)):
            start = time.time()
            encoded_inputs = tokenizer(
                text=text,
                text_pair=text_pair,
                max_length=args.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='np')
            tokenizer_time_costs += [(time.time() - start) * 1000]

            start = time.time()
            input_map = {
                input_ids_name: encoded_inputs["input_ids"].astype('int64'),
                token_type_ids_name:
                encoded_inputs["token_type_ids"].astype('int64'),
            }
            results = runtime.infer(input_map)
            runtime_time_costs += [(time.time() - start) * 1000]

            start = time.time()
            output = postprocess(results[0])
            postprocess_time_costs += [(time.time() - start) * 1000]

            cm, gm = get_current_memory_mb(gpu_id)
            cpu_mem.append(cm)
            gpu_mem.append(gm)

            total_num += len(label)
            correct_num += (label == output["label"]).sum()
            if warmup_steps is not None and i >= warmup_steps:
                break
            if (i + 1) % args.log_interval == 0:
                show_statistics(tokenizer_time_costs, runtime_time_costs,
                                postprocess_time_costs, correct_num, total_num,
                                cpu_mem, gpu_mem, gpu_util,
                                f"Step {i + 1: 6d}: ")
        show_statistics(tokenizer_time_costs, runtime_time_costs,
                        postprocess_time_costs, correct_num, total_num,
                        cpu_mem, gpu_mem, gpu_util, "Final statistics: ")
        p.terminate()

    # Warm up
    print("Warm up")
    run_inference(10)
    print("Start to test the benchmark")
    run_inference()
    print("Finish")