From 4d2fbcb030fb7d9e90f453c3e6a6710f82531d12 Mon Sep 17 00:00:00 2001
From: WJJ1995
Date: Thu, 15 Sep 2022 21:36:10 +0800
Subject: [PATCH] Add Benchmark readme (#236)

* add ppcls benchmark
* add ppcls benchmark
* add ppcls benchmark
* add ppcls benchmark
* fixed txt path
* resolve conflict
* resolve conflict
* deal with comments
* Add enable_trt_fp16 option
* Add OV backend for seg and det
* fixed valid backends in ppdet
* fixed valid backends in yolo
* add copyright and rm Chinese Notes
* add ppdet&ppseg&yolo benchmark
* add cpu/gpu mem info
* Add benchmark readme
* fixed bug

Co-authored-by: Jason
---
 benchmark/README.md          | 91 ++++++++++++++++++++++++++++++++++++
 benchmark/benchmark_ppcls.py | 23 +++++----
 benchmark/benchmark_ppdet.py | 23 +++++----
 benchmark/benchmark_ppseg.py | 23 +++++----
 benchmark/benchmark_yolo.py  | 23 +++++----
 5 files changed, 147 insertions(+), 36 deletions(-)
 create mode 100644 benchmark/README.md

diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 000000000..532e07c55
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,91 @@
+# FastDeploy Benchmarks
+
+Before running the benchmarks, confirm the following two steps:
+
+* 1. The hardware and software environment meets the requirements, see [FastDeploy Environment Requirements](../docs/environment.md)
+* 2. The FastDeploy Python wheel package is installed, see [FastDeploy Python Installation](../docs/quick_start)
+
+FastDeploy currently supports multiple inference backends. The commands below take PaddleClas MobileNetV1 as an example and produce benchmark data for each backend on CPU and GPU.
+
+```bash
+# Download the MobileNetV1 model
+wget https://bj.bcebos.com/paddlehub/fastdeploy/MobileNetV1_x0_25_infer.tgz
+tar -xvf MobileNetV1_x0_25_infer.tgz
+
+# Download the test image
+wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg
+
+# CPU
+# Paddle Inference
+python benchmark_ppcls.py --model MobileNetV1_x0_25_infer --image ILSVRC2012_val_00000010.jpeg --cpu_num_thread 8 --iter_num 2000 --backend paddle
+
+# ONNX Runtime
+python benchmark_ppcls.py --model MobileNetV1_x0_25_infer --image ILSVRC2012_val_00000010.jpeg --cpu_num_thread 8 --iter_num 2000 --backend ort
+
+# OpenVINO
+python benchmark_ppcls.py --model MobileNetV1_x0_25_infer --image ILSVRC2012_val_00000010.jpeg --cpu_num_thread 8 --iter_num 2000 --backend ov
+
+# GPU
+# Paddle Inference
+python benchmark_ppcls.py --model MobileNetV1_x0_25_infer --image ILSVRC2012_val_00000010.jpeg --device gpu --iter_num 2000 --backend paddle
+
+# ONNX Runtime
+python benchmark_ppcls.py --model MobileNetV1_x0_25_infer --image ILSVRC2012_val_00000010.jpeg --device gpu --iter_num 2000 --backend ort
+
+# TensorRT
+python benchmark_ppcls.py --model MobileNetV1_x0_25_infer --image ILSVRC2012_val_00000010.jpeg --device gpu --iter_num 2000 --backend trt
+
+# TensorRT fp16
+python benchmark_ppcls.py --model MobileNetV1_x0_25_infer --image ILSVRC2012_val_00000010.jpeg --device gpu --iter_num 2000 --backend trt --enable_trt_fp16 True
+
+```
+
+**Parameter description**
+
+| Parameter | Description |
+| -------------------- | ------------------------------------------ |
+| --model | Path to the model directory |
+| --image | Path to the test image |
+| --device | Whether to run on CPU or GPU, defaults to CPU |
+| --cpu_num_thread | Number of CPU threads |
+| --device_id | GPU card id |
+| --iter_num | Number of benchmark iterations |
+| --backend | Backend type, one of ort, ov, trt, paddle |
+| --enable_trt_fp16 | Whether to enable fp16 when the backend is trt |
+
+**Final txt results**
+
+To aggregate all the txt files in the current directory and structure the results, run the following commands:
+
+```bash
+# Aggregate
+cat *.txt >> ./result_ppcls.txt
+
+# Structure the information
+python convert_info.py --txt_path result_ppcls.txt --domain ppcls
+```
+
+This produces the CPU results `struct_cpu_ppcls.txt` and the GPU results `struct_gpu_ppcls.txt`, as shown below:
+
+```bash
+# struct_cpu_ppcls.txt
+model_name thread_nums ort_run ort_end2end cpu_rss_mb ov_run ov_end2end cpu_rss_mb paddle_run paddle_end2end cpu_rss_mb
+MobileNetV1_x0_25 8 1.18 3.27 270.43 0.87 1.98 272.26 3.13 5.29 899.57
+
+# struct_gpu_ppcls.txt
+model_name ort_run ort_end2end gpu_rss_mb paddle_run paddle_end2end gpu_rss_mb trt_run trt_end2end gpu_rss_mb trt_fp16_run trt_fp16_end2end gpu_rss_mb
+MobileNetV1_x0_25 1.25 3.24 677.06 2.00 3.77 945.06 0.67 2.66 851.06 0.53 2.46 839.06
+```
+
+**Notes on the results**
+
+* The `_run` suffix denotes the time of a single infer call, including H2D and D2H transfers; the `_end2end` suffix denotes the time including pre- and post-processing.
+* `cpu_rss_mb` denotes host memory usage; `gpu_rss_mb` denotes GPU memory usage.
+
+If you have multiple PaddleClas models, create a ppcls_model directory in the current directory, put all the models into it, and run the following command:
+
+```bash
+sh run_benchmark_ppcls.sh
+```
+
+This produces benchmark data for all the models on both CPU and GPU in one go.
diff --git a/benchmark/benchmark_ppcls.py b/benchmark/benchmark_ppcls.py
index 75381ceeb..410f20e41 100644
--- a/benchmark/benchmark_ppcls.py
+++ b/benchmark/benchmark_ppcls.py
@@ -117,7 +117,9 @@ if __name__ == '__main__':
     gpu_id = args.device_id

     end2end_statis = list()
-    cpu_mem, gpu_mem, gpu_util = 0, 0, 0
+    cpu_mem = list()
+    gpu_mem = list()
+    gpu_util = list()
     if args.device == "cpu":
         file_path = args.model + "_model_" + args.backend + "_" + \
             args.device + "_" + str(args.cpu_num_thread) + ".txt"
@@ -139,23 +141,26 @@
             start = time.time()
             result = model.predict(im)
             end2end_statis.append(time.time() - start)
-            gpu_util += get_current_gputil(gpu_id)
+            gpu_util.append(get_current_gputil(gpu_id))
             cm, gm = get_current_memory_mb(gpu_id)
-            cpu_mem += cm
-            gpu_mem += gm
+            cpu_mem.append(cm)
+            gpu_mem.append(gm)

         runtime_statis = model.print_statis_info_of_runtime()

         warmup_iter = args.iter_num // 5
         repeat_iter = args.iter_num - warmup_iter
-        end2end_statis = end2end_statis[warmup_iter:]
+        end2end_statis_repeat = end2end_statis[warmup_iter:]
+        cpu_mem_repeat = cpu_mem[warmup_iter:]
+        gpu_mem_repeat = gpu_mem[warmup_iter:]
+        gpu_util_repeat = gpu_util[warmup_iter:]

         dump_result = dict()
         dump_result["runtime"] = runtime_statis["avg_time"] * 1000
-        dump_result["end2end"] = np.mean(end2end_statis) * 1000
-        dump_result["cpu_rss_mb"] = cpu_mem / repeat_iter
-        dump_result["gpu_rss_mb"] = gpu_mem / repeat_iter
-        dump_result["gpu_util"] = gpu_util / repeat_iter
+        dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
+        dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
+        dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
+        dump_result["gpu_util"] = np.mean(gpu_util_repeat)

         f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
         f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
diff --git a/benchmark/benchmark_ppdet.py b/benchmark/benchmark_ppdet.py
index 71fb665c4..6b2f946f9 100644
--- a/benchmark/benchmark_ppdet.py
+++ b/benchmark/benchmark_ppdet.py
@@ -119,7 +119,9 @@ if __name__ == '__main__':
     gpu_id = args.device_id

     end2end_statis = list()
-    cpu_mem, gpu_mem, gpu_util = 0, 0, 0
+    cpu_mem = list()
+    gpu_mem = list()
+    gpu_util = list()
     if args.device == "cpu":
         file_path = args.model + "_model_" + args.backend + "_" + \
             args.device + "_" + str(args.cpu_num_thread) + ".txt"
@@ -159,23 +161,26 @@
             start = time.time()
             result = model.predict(im)
             end2end_statis.append(time.time() - start)
-            gpu_util += get_current_gputil(gpu_id)
+            gpu_util.append(get_current_gputil(gpu_id))
             cm, gm = get_current_memory_mb(gpu_id)
-            cpu_mem += cm
-            gpu_mem += gm
+            cpu_mem.append(cm)
+            gpu_mem.append(gm)

         runtime_statis = model.print_statis_info_of_runtime()

         warmup_iter = args.iter_num // 5
         repeat_iter = args.iter_num - warmup_iter
-        end2end_statis = end2end_statis[warmup_iter:]
+        end2end_statis_repeat = end2end_statis[warmup_iter:]
+        cpu_mem_repeat = cpu_mem[warmup_iter:]
+        gpu_mem_repeat = gpu_mem[warmup_iter:]
+        gpu_util_repeat = gpu_util[warmup_iter:]

         dump_result = dict()
         dump_result["runtime"] = runtime_statis["avg_time"] * 1000
-        dump_result["end2end"] = np.mean(end2end_statis) * 1000
-        dump_result["cpu_rss_mb"] = cpu_mem / repeat_iter
-        dump_result["gpu_rss_mb"] = gpu_mem / repeat_iter
-        dump_result["gpu_util"] = gpu_util / repeat_iter
+        dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
+        dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
+        dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
+        dump_result["gpu_util"] = np.mean(gpu_util_repeat)

         f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
         f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
diff --git a/benchmark/benchmark_ppseg.py b/benchmark/benchmark_ppseg.py
index 889264c9f..7c118cec5 100644
--- a/benchmark/benchmark_ppseg.py
+++ b/benchmark/benchmark_ppseg.py
@@ -117,7 +117,9 @@ if __name__ == '__main__':
     gpu_id = args.device_id

     end2end_statis = list()
-    cpu_mem, gpu_mem, gpu_util = 0, 0, 0
+    cpu_mem = list()
+    gpu_mem = list()
+    gpu_util = list()
     if args.device == "cpu":
         file_path = args.model + "_model_" + args.backend + "_" + \
             args.device + "_" + str(args.cpu_num_thread) + ".txt"
@@ -138,23 +140,26 @@
             start = time.time()
             result = model.predict(im)
             end2end_statis.append(time.time() - start)
-            gpu_util += get_current_gputil(gpu_id)
+            gpu_util.append(get_current_gputil(gpu_id))
             cm, gm = get_current_memory_mb(gpu_id)
-            cpu_mem += cm
-            gpu_mem += gm
+            cpu_mem.append(cm)
+            gpu_mem.append(gm)

         runtime_statis = model.print_statis_info_of_runtime()

         warmup_iter = args.iter_num // 5
         repeat_iter = args.iter_num - warmup_iter
-        end2end_statis = end2end_statis[warmup_iter:]
+        end2end_statis_repeat = end2end_statis[warmup_iter:]
+        cpu_mem_repeat = cpu_mem[warmup_iter:]
+        gpu_mem_repeat = gpu_mem[warmup_iter:]
+        gpu_util_repeat = gpu_util[warmup_iter:]

         dump_result = dict()
         dump_result["runtime"] = runtime_statis["avg_time"] * 1000
-        dump_result["end2end"] = np.mean(end2end_statis) * 1000
-        dump_result["cpu_rss_mb"] = cpu_mem / repeat_iter
-        dump_result["gpu_rss_mb"] = gpu_mem / repeat_iter
-        dump_result["gpu_util"] = gpu_util / repeat_iter
+        dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
+        dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
+        dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
+        dump_result["gpu_util"] = np.mean(gpu_util_repeat)

         f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
         f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
diff --git a/benchmark/benchmark_yolo.py b/benchmark/benchmark_yolo.py
index 6c7fec6e4..81a87323c 100644
--- a/benchmark/benchmark_yolo.py
+++ b/benchmark/benchmark_yolo.py
@@ -117,7 +117,9 @@ if __name__ == '__main__':
     gpu_id = args.device_id

     end2end_statis = list()
-    cpu_mem, gpu_mem, gpu_util = 0, 0, 0
+    cpu_mem = list()
+    gpu_mem = list()
+    gpu_util = list()
     if args.device == "cpu":
         file_path = args.model + "_model_" + args.backend + "_" + \
             args.device + "_" + str(args.cpu_num_thread) + ".txt"
@@ -152,23 +154,26 @@
             start = time.time()
             result = model.predict(im)
             end2end_statis.append(time.time() - start)
-            gpu_util += get_current_gputil(gpu_id)
+            gpu_util.append(get_current_gputil(gpu_id))
             cm, gm = get_current_memory_mb(gpu_id)
-            cpu_mem += cm
-            gpu_mem += gm
+            cpu_mem.append(cm)
+            gpu_mem.append(gm)

         runtime_statis = model.print_statis_info_of_runtime()

         warmup_iter = args.iter_num // 5
         repeat_iter = args.iter_num - warmup_iter
-        end2end_statis = end2end_statis[warmup_iter:]
+        end2end_statis_repeat = end2end_statis[warmup_iter:]
+        cpu_mem_repeat = cpu_mem[warmup_iter:]
+        gpu_mem_repeat = gpu_mem[warmup_iter:]
+        gpu_util_repeat = gpu_util[warmup_iter:]

         dump_result = dict()
         dump_result["runtime"] = runtime_statis["avg_time"] * 1000
-        dump_result["end2end"] = np.mean(end2end_statis) * 1000
-        dump_result["cpu_rss_mb"] = cpu_mem / repeat_iter
-        dump_result["gpu_rss_mb"] = gpu_mem / repeat_iter
-        dump_result["gpu_util"] = gpu_util / repeat_iter
+        dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
+        dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
+        dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
+        dump_result["gpu_util"] = np.mean(gpu_util_repeat)

         f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
         f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
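All four benchmark scripts receive the same change: per-iteration samples are appended to lists, the first `iter_num // 5` warmup iterations are dropped, and the remaining samples are averaged with `np.mean`. Below is a minimal, self-contained sketch of that pattern, not code from the patch; `summarize_benchmark` and its three callable parameters are hypothetical stand-ins for `model.predict` and the `get_current_memory_mb` / `get_current_gputil` helpers imported by the real scripts.

```python
import time

import numpy as np


def summarize_benchmark(predict, sample_memory_mb, sample_gpu_util, iter_num=2000):
    """Sketch of the sampling/averaging pattern used by the benchmark scripts.

    predict, sample_memory_mb and sample_gpu_util are caller-supplied stand-ins
    (hypothetical) for model.predict and the get_current_memory_mb /
    get_current_gputil helpers used in the actual scripts.
    """
    end2end, cpu_mem, gpu_mem, gpu_util = [], [], [], []

    for _ in range(iter_num):
        start = time.time()
        predict()
        end2end.append(time.time() - start)   # per-iteration end-to-end latency
        gpu_util.append(sample_gpu_util())    # per-iteration GPU utilization
        cm, gm = sample_memory_mb()           # (cpu_mb, gpu_mb) snapshot
        cpu_mem.append(cm)
        gpu_mem.append(gm)

    warmup_iter = iter_num // 5               # first 20% of iterations are warmup
    return {
        "end2end_ms": np.mean(end2end[warmup_iter:]) * 1000,
        "cpu_rss_mb": np.mean(cpu_mem[warmup_iter:]),
        "gpu_rss_mb": np.mean(gpu_mem[warmup_iter:]),
        "gpu_util": np.mean(gpu_util[warmup_iter:]),
    }


if __name__ == "__main__":
    # Dummy callables just to exercise the sketch.
    stats = summarize_benchmark(lambda: None, lambda: (0.0, 0.0), lambda: 0.0, iter_num=10)
    print(stats)
```

Averaging the post-warmup slices replaces the previous running sums divided by `repeat_iter`, which mixed warmup samples into the memory and utilization figures.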