diff --git a/docs/cn/quantize.md b/docs/cn/quantize.md
index 6277c8385..7717176f6 100644
--- a/docs/cn/quantize.md
+++ b/docs/cn/quantize.md
@@ -2,22 +2,22 @@
 # 量化加速
 量化是一种流行的模型压缩方法,量化后的模型拥有更小的体积和更快的推理速度.
-FastDeploy基于PaddleSlim, 集成了一键模型量化的工具, 同时, FastDeploy支持推理部署量化后的模型, 帮助用户实现推理加速.
+FastDeploy基于PaddleSlim的Auto Compression Toolkit(ACT), 给用户提供了一键模型自动化压缩的工具. FastDeploy一键模型自动化压缩可包含多种策略, 目前主要采用离线量化和量化蒸馏训练. 同时, FastDeploy支持部署压缩后的模型, 帮助用户实现推理加速.
 本文主要描述量化模型在FastDeploy上的部署.
 
 ## FastDeploy 多个引擎和硬件支持量化模型部署
 当前,FastDeploy中多个推理后端可以在不同硬件上支持量化模型的部署. 支持情况如下:
 
-| 硬件/推理后端 | ONNX Runtime | Paddle Inference | TensorRT |
-| :-----------| :-------- | :--------------- | :------- |
-| CPU | 支持 | 支持 | |
-| GPU | | | 支持 |
+| 硬件/推理后端 | ONNX Runtime | Paddle Inference | TensorRT | Paddle-TensorRT |
+| :-----------| :-------- | :--------------- | :------- | :------- |
+| CPU | 支持 | 支持 | | |
+| GPU | | | 支持 | 支持 |
 
 ## 模型量化
 
 ### 量化方法
-基于PaddleSlim,目前FastDeploy提供的的量化方法有量化蒸馏训练和离线量化,量化蒸馏训练通过模型训练来获得量化模型,离线量化不需要模型训练即可完成模型的量化。 FastDeploy 对两种方式产出的量化模型均能部署。
+基于PaddleSlim,目前FastDeploy一键模型自动化压缩提供的量化方法有量化蒸馏训练和离线量化,量化蒸馏训练通过模型训练来获得量化模型,离线量化不需要模型训练即可完成模型的量化。 FastDeploy 对两种方式产出的量化模型均能部署。
 
 两种方法的主要对比如下表所示:
 | 量化方法 | 量化过程耗时 | 量化模型精度 | 模型体积 | 推理速度 |
@@ -25,44 +25,115 @@ FastDeploy基于PaddleSlim, 集成了一键模型量化的工具, 同时, FastDe
 | 离线量化 | 无需训练,耗时短 | 比量化蒸馏训练稍低 | 两者一致 | 两者一致 |
 | 量化蒸馏训练 | 需要训练,耗时稍高 | 较未量化模型有少量损失 | 两者一致 |两者一致 |
 
-### 使用FastDeploy一键模型量化工具来量化模型
-Fastdeploy基于PaddleSlim, 为用户提供了一键模型量化的工具,请参考如下文档进行模型量化。
-- [FastDeploy 一键模型量化](../../tools/quantization/)
-当用户获得产出的量化模型之后,即可以使用FastDeploy来部署量化模型。
+### 使用FastDeploy一键模型自动化压缩工具来量化模型
+FastDeploy基于PaddleSlim的Auto Compression Toolkit(ACT), 给用户提供了一键模型自动化压缩的工具,请参考如下文档进行一键模型自动化压缩。
+- [FastDeploy 一键模型自动化压缩](../../tools/auto_compression/)
+当用户获得产出的压缩模型之后,即可以使用FastDeploy来部署压缩模型。
 
-## 量化benchmark
+## 量化模型 Benchmark
 
-目前, FastDeploy已支持的模型量化如下表所示:
+目前, FastDeploy中已支持自动化压缩并完成部署测试的模型, 其Runtime Benchmark和端到端Benchmark如下所示.
+
+Benchmark表格说明:
+- Runtime时延为模型在各种Runtime上的推理时延, 包含CPU->GPU数据拷贝、GPU推理、GPU->CPU数据拷贝时间, 不包含模型各自的前后处理时间.
+- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理.
+- 所测时延均为推理1000次后求得的平均值, 单位是毫秒.
+- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime开启FP16推理选项.
+- INT8 + FP16 + PM 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项, 可加速GPU->CPU数据拷贝的速度, 两个选项的开启方式可参考下方示例.
+- 最大加速比, 为FP32时延除以INT8推理的最快时延.
+- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, 因此表中INT8精度并不代表最高的INT8精度.
+- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15.
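+
+下面给出一个最小的Python配置示意, 对应上述说明中 INT8 + FP16 与 Pinned Memory 选项的开启方式(仅为示意代码, `enable_trt_fp16`、`enable_pinned_memory` 等接口是否可用以实际安装的FastDeploy版本为准):
+
+```python
+import fastdeploy as fd
+
+# GPU + TensorRT后端, 用于推理INT8量化模型
+option = fd.RuntimeOption()
+option.use_gpu(0)
+option.use_trt_backend()
+
+# 对应表中 "INT8 + FP16": 在推理INT8量化模型的同时开启FP16
+option.enable_trt_fp16()
+
+# 对应表中 "+ PM": 开启Pinned Memory, 加速GPU->CPU数据拷贝(假设该接口在所用版本中可用)
+option.enable_pinned_memory()
+
+# CPU测试则改用 option.use_cpu(), 并配合 option.set_cpu_thread_num(1) 固定线程数
+# 随后将option传入具体模型(检测/分类/分割)即可, 见下方各系列对应的示例目录
+```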
### YOLO 系列 -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP | 量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 14.13 | 11.22 | 1.26 | 37.6 | 36.6 | 量化蒸馏训练 | -| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 183.68 | 100.39 | 1.83 | 37.6 | 33.1 |量化蒸馏训练 | -| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference | CPU | 226.36 | 152.27 | 1.48 |37.6 | 36.8 | 量化蒸馏训练 | -| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 12.89 | 8.92 | 1.45 | 42.5 | 40.6|量化蒸馏训练 | -| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 345.85 | 131.81 | 2.60 |42.5| 36.1|量化蒸馏训练 | -| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 366.41 | 131.70 | 2.78 |42.5| 41.2|量化蒸馏训练 | -| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 30.43 | 15.40 | 1.98 | 51.1| 50.8|量化蒸馏训练 | -| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 971.27 | 471.88 | 2.06 | 51.1 | 42.5|量化蒸馏训练 | -| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 1015.70 | 562.41 | 1.82 |51.1 | 46.3|量化蒸馏训练 | +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 7.87 | 4.51 | 4.31 | 3.17 | 2.48 | 37.6 | 36.7 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle-TensorRT | GPU | 7.99 | None | 4.46 | 3.31 | 2.41 | 37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 176.41 | 91.90 | None | None | 1.90 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference| CPU | 213.73 | 130.19 | None | None | 1.64 |37.6 | 35.2 | 量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 9.47 | 3.23 | 4.09 |2.81 | 3.37 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle-TensorRT | GPU | 9.31 | None| 4.17 | 2.95 | 3.16 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 334.65 | 126.38 | None | None| 2.65 |42.5| 36.8|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 352.87 | 123.12 |None | None| 2.87 |42.5| 40.8|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 27.47 | 6.52 | 6.74| 5.19| 5.29 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle-TensorRT | GPU | 27.87|None|6.91|5.86 | 4.76 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 996.65 | 467.15 |None|None | 2.13 | 51.1 | 43.3|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 995.85 | 477.93|None|None | 2.08 |51.1 | 46.2|量化蒸馏训练 | + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| 
------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 197.323 | 110.99 | None | None | 1.78 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference| CPU | 235.73 | 144.82 | None | None | 1.63 |37.6 | 35.2 | 量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 15.66 | 11.30 | 10.25 |9.59 | 1.63 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle-TensorRT | GPU | 15.03 | None| 11.36 | 9.32 | 1.61 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 348.21 | 126.38 | None | None| 2.82 |42.5| 36.8|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 352.87 | 121.64 |None | None| 3.04 |42.5| 40.8|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 36.47 | 18.81 | 20.33| 17.58| 2.07 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle-TensorRT | GPU | 37.06|None|20.26|17.53 | 2.11 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 988.85 | 478.08 |None|None | 2.07 | 51.1 | 43.3|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 1031.73 | 500.12|None|None | 2.06 |51.1 | 46.2|量化蒸馏训练 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试数据为COCO2017验证集中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. 
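+
+以上表中的YOLOv5s为例, 量化模型在Python端的部署方式大致如下(示意代码, 模型文件名与接口参数以[YOLOv5s量化示例](../../examples/vision/detection/yolov5/quantize/)中的实际实现为准):
+
+```python
+import cv2
+import fastdeploy as fd
+
+option = fd.RuntimeOption()
+option.use_gpu()
+option.use_trt_backend()  # 对应表中TensorRT/GPU一行, 也可换成 use_ort_backend()/use_paddle_backend()
+
+# 量化后的YOLOv5s为Paddle格式模型, 目录内包含model.pdmodel/model.pdiparams
+model = fd.vision.detection.YOLOv5(
+    "yolov5s_quant/model.pdmodel",
+    "yolov5s_quant/model.pdiparams",
+    runtime_option=option,
+    model_format=fd.ModelFormat.PADDLE)
+
+im = cv2.imread("000000014439.jpg")
+result = model.predict(im)  # 端到端Benchmark统计的即是这一步, 包含前后处理与Runtime推理
+print(result)
+```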
### PaddleClas系列 -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 Top1 | INT8 Top1 |量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 86.87 | 59 .32 | 1.46 | 79.12 | 78.87| 离线量化| -| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 7.85 | 5.42 | 1.45 | 79.12 | 79.06 | 离线量化 | -| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 40.32 | 16.87 | 2.39 |77.89 | 75.09 |离线量化 | -| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 5.10 | 3.35 | 1.52 |77.89 | 76.86 | 离线量化 | +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 3.55 | 0.99|0.98|1.06 | 3.62 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 3.46 |None |0.87|1.03 | 3.98 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 76.14 | 35.43 |None|None | 2.15 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 76.21 | 24.01 |None|None | 3.17 | 79.12 | 78.55 | 离线量化| +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 0.91 | 0.43 |0.49 | 0.54 | 2.12 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 0.88| None| 0.49|0.51 | 1.80 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 30.53 | 9.59|None|None | 3.18 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 12.29 | 4.68 | None|None|2.62 |77.89 | 71.36 |离线量化 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试数据为ImageNet-2012验证集中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. 
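+
+以上表中ResNet50_vd在CPU上使用ONNX Runtime的一行为例, 对应的Python部署示意如下(示意代码, 文件名与接口以[PaddleClas量化示例](../../examples/vision/classification/paddleclas/quantize/)中的实际实现为准; 量化后的分类模型仍需FP32模型目录中的inference_cls.yaml配置文件):
+
+```python
+import cv2
+import fastdeploy as fd
+
+option = fd.RuntimeOption()
+option.use_cpu()
+option.use_ort_backend()      # 对应表中 ONNX Runtime / CPU
+option.set_cpu_thread_num(1)  # 与Benchmark设置一致, 固定CPU线程数为1
+
+model = fd.vision.classification.PaddleClasModel(
+    "resnet50_vd_ptq/model.pdmodel",
+    "resnet50_vd_ptq/model.pdiparams",
+    "resnet50_vd_ptq/inference_cls.yaml",  # 需从FP32模型目录复制
+    runtime_option=option)
+
+im = cv2.imread("ILSVRC2012_val_00000010.jpeg")
+print(model.predict(im, topk=1))
+```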
+#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 4.92| 2.28|2.24|2.23 | 2.21 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 4.48|None |2.09|2.10 | 2.14 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 77.43 | 41.90 |None|None | 1.85 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 80.60 | 27.75 |None|None | 2.90 | 79.12 | 78.55 | 离线量化| +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 2.19 | 1.48|1.57| 1.57 | 1.48 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 2.04| None| 1.47|1.45 | 1.41 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 34.02 | 12.97|None|None | 2.62 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 16.31 | 7.42 | None|None| 2.20 |77.89 | 71.36 |离线量化 | + + + +### PaddleDetection系列 +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | TensorRT | GPU | 27.90 | 6.39 |6.44|5.95 | 4.67 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | Paddle-TensorRT | GPU | 30.89 |None | 13.78 |14.01 | 2.24 | 51.4 | 50.5| 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | ONNX Runtime | CPU | 1057.82 | 449.52 |None|None | 2.35 |51.4 | 50.0 |量化蒸馏训练 | + +NOTE: +- TensorRT比Paddle-TensorRT快的原因是在runtime移除了multiclass_nms3算子 + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | TensorRT | GPU | 35.75 | 15.42 |20.70|20.85 | 2.32 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | Paddle-TensorRT | GPU | 33.48 |None | 18.47 |18.03 | 1.81 | 51.4 | 50.5| 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | ONNX Runtime | CPU | 1067.17 | 461.037 |None|None | 2.31 |51.4 | 50.0 |量化蒸馏训练 | + + + +### PaddleSeg系列 +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- 
| +| [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 1138.04| 602.62 |None|None | 1.89 |77.37 | 71.62 |量化蒸馏训练 | + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 4726.65| 4134.91|None|None | 1.14 |77.37 | 71.62 |量化蒸馏训练 | diff --git a/docs/en/quantize.md b/docs/en/quantize.md index eb626c6e6..effce0700 100644 --- a/docs/en/quantize.md +++ b/docs/en/quantize.md @@ -1,11 +1,79 @@ [English](../en/quantize.md) | 简体中文 # 量化加速 +量化是一种流行的模型压缩方法,量化后的模型拥有更小的体积和更快的推理速度. +FastDeploy基于PaddleSlim, 集成了一键模型量化的工具, 同时, FastDeploy支持部署量化后的模型, 帮助用户实现推理加速. -简要介绍量化加速的原理。 -目前量化支持在哪些硬件及后端的使用 +## FastDeploy 多个引擎和硬件支持量化模型部署 +当前,FastDeploy中多个推理后端可以在不同硬件上支持量化模型的部署. 支持情况如下: + +| 硬件/推理后端 | ONNX Runtime | Paddle Inference | TensorRT | +| :-----------| :-------- | :--------------- | :------- | +| CPU | 支持 | 支持 | | +| GPU | | | 支持 | + + +## 模型量化 + +### 量化方法 +基于PaddleSlim, 目前FastDeploy提供的的量化方法有量化蒸馏训练和离线量化, 量化蒸馏训练通过模型训练来获得量化模型, 离线量化不需要模型训练即可完成模型的量化. FastDeploy 对两种方式产出的量化模型均能部署. + +两种方法的主要对比如下表所示: +| 量化方法 | 量化过程耗时 | 量化模型精度 | 模型体积 | 推理速度 | +| :-----------| :--------| :-------| :------- | :------- | +| 离线量化 | 无需训练,耗时短 | 比量化蒸馏训练稍低 | 两者一致 | 两者一致 | +| 量化蒸馏训练 | 需要训练,耗时稍高 | 较未量化模型有少量损失 | 两者一致 |两者一致 | + +### 用户使用FastDeploy一键模型量化工具来量化模型 +Fastdeploy基于PaddleSlim, 为用户提供了一键模型量化的工具,请参考如下文档进行模型量化. +- [FastDeploy 一键模型量化](../../tools/quantization/) +当用户获得产出的量化模型之后,即可以使用FastDeploy来部署量化模型. + ## 量化示例 +目前, FastDeploy已支持的模型量化如下表所示: -这里一个表格,展示目前支持的量化列表(跳转到相应的example下去),精度、性能 +### YOLO 系列 +| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 8.79 | 5.17 | 1.70 | 37.6 | 36.6 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 176.34 | 92.95 | 1.90 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference | CPU | 217.05 | 133.31 | 1.63 |37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 8.60 | 5.16 | 1.67 | 42.5 | 40.6|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 338.60 | 128.58 | 2.60 |42.5| 36.1|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 356.62 | 125.72 | 2.84 |42.5| 41.2|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 24.57 | 9.40 | 2.61 | 51.1| 50.8|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 976.88 | 462.69 | 2.11 | 51.1 | 42.5|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 1022.55 | 490.87 | 2.08 |51.1 | 46.3|量化蒸馏训练 | + +上表中的数据, 为模型量化前后,在FastDeploy部署的Runtime推理性能. +- 测试数据为COCO2017验证集中的图片. +- 推理时延为在不同Runtime上推理的时延, 单位是毫秒. +- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. 
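+
+表中平均推理时延的统计方式可参考下面的计时示意(示意代码, model为任一已创建好的FastDeploy模型对象, im为读入的测试图片, 预热与循环次数为假设值):
+
+```python
+import time
+
+def avg_latency_ms(model, im, repeats=1000, warmup=50):
+    # 预热若干次, 排除TensorRT建图等一次性开销
+    for _ in range(warmup):
+        model.predict(im)
+    start = time.time()
+    for _ in range(repeats):
+        model.predict(im)  # predict包含前后处理, 统计的是端到端时延; 仅统计Runtime时延需在后端内部计时
+    return (time.time() - start) * 1000.0 / repeats
+```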
+ + +### PaddleDetection系列 +| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP |量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | TensorRT | GPU | 24.52 | 11.53 | 2.13 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | ONNX Runtime | CPU | 1085.62 | 457.56 | 2.37 |51.4 | 50.0 |量化蒸馏训练 | + +上表中的数据, 为模型量化前后,在FastDeploy部署的Runtime推理性能. +- 测试图片为COCO val2017中的图片. +- 推理时延为在不同Runtime上推理的时延, 单位是毫秒. +- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. + + +### PaddleClas系列 +| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 Top1 | INT8 Top1 |量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 77.20 | 40.08 | 1.93 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 3.70 | 1.80 | 2.06 | 79.12 | 79.06 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 30.99 | 10.24 | 3.03 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 1.80 | 0.58 | 3.10 |77.89 | 76.86 | 离线量化 | + +上表中的数据, 为模型量化前后,在FastDeploy部署的Runtime推理性能. +- 测试数据为ImageNet-2012验证集中的图片. +- 推理时延为在不同Runtime上推理的时延, 单位是毫秒. +- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. diff --git a/examples/vision/classification/paddleclas/quantize/README.md b/examples/vision/classification/paddleclas/quantize/README.md index 3a100a823..6e3f78b4d 100644 --- a/examples/vision/classification/paddleclas/quantize/README.md +++ b/examples/vision/classification/paddleclas/quantize/README.md @@ -1,25 +1,48 @@ # PaddleClas 量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) 注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可。 ## 下载量化完成的PaddleClas模型 用户也可以直接下载下表中的量化模型进行部署. -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 Top1 | INT8 Top1 |量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | ONNX Runtime | CPU | 86.87 | 59 .32 | 1.46 | 79.12 | 78.87| 离线量化| -| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 7.85 | 5.42 | 1.45 | 79.12 | 79.06 | 离线量化 | -| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | ONNX Runtime | CPU | 40.32 | 16.87 | 2.39 |77.89 | 75.09 |离线量化 | -| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | TensorRT | GPU | 5.10 | 3.35 | 1.52 |77.89 | 76.86 | 离线量化 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为ImageNet-2012验证集中的图片. 
-- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. +Benchmark表格说明: +- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延,得到最大加速比. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. + +### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 3.55 | 0.99|0.98|1.06 | 3.62 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle-TensorRT | GPU | 3.46 |None |0.87|1.03 | 3.98 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | ONNX Runtime | CPU | 76.14 | 35.43 |None|None | 2.15 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle Inference | CPU | 76.21 | 24.01 |None|None | 3.17 | 79.12 | 78.55 | 离线量化| +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | TensorRT | GPU | 0.91 | 0.43 |0.49 | 0.54 | 2.12 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle-TensorRT | GPU | 0.88| None| 0.49|0.51 | 1.80 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | ONNX Runtime | CPU | 30.53 | 9.59|None|None | 3.18 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle Inference | CPU | 12.29 | 4.68 | None|None|2.62 |77.89 | 71.36 |离线量化 | + +### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 4.92| 2.28|2.24|2.23 | 2.21 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle-TensorRT | GPU | 4.48|None |2.09|2.10 | 2.14 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | ONNX Runtime | CPU | 77.43 | 41.90 |None|None | 1.85 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle Inference | CPU | 80.60 | 27.75 |None|None | 2.90 | 79.12 | 78.55 | 离线量化| +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | TensorRT | GPU | 2.19 | 1.48|1.57| 1.57 | 1.48 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle-TensorRT | GPU | 2.04| None| 1.47|1.45 | 1.41 |77.89 | 76.86 | 离线量化 | +| 
[MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | ONNX Runtime | CPU | 34.02 | 12.97|None|None | 2.62 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle Inference | CPU | 16.31 | 7.42 | None|None| 2.20 |77.89 | 71.36 |离线量化 | ## 详细部署文档 diff --git a/examples/vision/classification/paddleclas/quantize/cpp/README.md b/examples/vision/classification/paddleclas/quantize/cpp/README.md index 0f9bfc8f3..e2e625dbd 100644 --- a/examples/vision/classification/paddleclas/quantize/cpp/README.md +++ b/examples/vision/classification/paddleclas/quantize/cpp/README.md @@ -1,4 +1,4 @@ -# PaddleClas 量化模型 Python部署示例 +# PaddleClas 量化模型 C++部署示例 本目录下提供的`infer.cc`,可以帮助用户快速完成PaddleClas量化模型在CPU/GPU上的部署推理加速. ## 部署准备 @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) ## 以量化后的ResNet50_Vd模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署. @@ -26,8 +26,10 @@ tar -xvf resnet50_vd_ptq.tar wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg -# 在CPU上使用Paddle-Inference推理量化模型 +# 在CPU上使用ONNX Runtime推理量化模型 ./infer_demo resnet50_vd_ptq ILSVRC2012_val_00000010.jpeg 0 # 在GPU上使用TensorRT推理量化模型 ./infer_demo resnet50_vd_ptq ILSVRC2012_val_00000010.jpeg 1 +# 在GPU上使用Paddle-TensorRT推理量化模型 +./infer_demo resnet50_vd_ptq ILSVRC2012_val_00000010.jpeg 2 ``` diff --git a/examples/vision/classification/paddleclas/quantize/cpp/infer.cc b/examples/vision/classification/paddleclas/quantize/cpp/infer.cc index ed4f05a24..b0a774feb 100644 --- a/examples/vision/classification/paddleclas/quantize/cpp/infer.cc +++ b/examples/vision/classification/paddleclas/quantize/cpp/infer.cc @@ -21,8 +21,8 @@ const char sep = '/'; void InitAndInfer(const std::string& model_dir, const std::string& image_file, const fastdeploy::RuntimeOption& option) { - auto model_file = model_dir + sep + "inference.pdmodel"; - auto params_file = model_dir + sep + "inference.pdiparams"; + auto model_file = model_dir + sep + "model.pdmodel"; + auto params_file = model_dir + sep + "model.pdiparams"; auto config_file = model_dir + sep + "inference_cls.yaml"; auto model = fastdeploy::vision::classification::PaddleClasModel( @@ -67,7 +67,11 @@ int main(int argc, char* argv[]) { option.UseGpu(); option.UseTrtBackend(); option.SetTrtInputShape("inputs",{1, 3, 224, 224}); - } + } else if (flag == 2) { + option.UseGpu(); + option.UseTrtBackend(); + option.EnablePaddleToTrt(); + } std::string model_dir = argv[1]; std::string test_image = argv[2]; diff --git a/examples/vision/classification/paddleclas/quantize/cpp/ocr.sh b/examples/vision/classification/paddleclas/quantize/cpp/ocr.sh new file mode 100644 index 000000000..90ad6a9e3 --- /dev/null +++ b/examples/vision/classification/paddleclas/quantize/cpp/ocr.sh @@ -0,0 +1,10 @@ +rm -rf build +mkdir build + +cd build + +#/xieyunyao/project/FastDeploy + +cmake .. 
-DFASTDEPLOY_INSTALL_DIR=/xieyunyao/project/FastDeploy + +make -j diff --git a/examples/vision/classification/paddleclas/quantize/python/README.md b/examples/vision/classification/paddleclas/quantize/python/README.md index 5da97d48a..00fd7bef9 100644 --- a/examples/vision/classification/paddleclas/quantize/python/README.md +++ b/examples/vision/classification/paddleclas/quantize/python/README.md @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) ## 以量化后的ResNet50_Vd模型为例, 进行部署 @@ -22,8 +22,10 @@ wget https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar tar -xvf resnet50_vd_ptq.tar wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg -# 在CPU上使用Paddle-Inference推理量化模型 +# 在CPU上使用ONNX Runtime推理量化模型 python infer.py --model resnet50_vd_ptq --image ILSVRC2012_val_00000010.jpeg --device cpu --backend ort # 在GPU上使用TensorRT推理量化模型 python infer.py --model resnet50_vd_ptq --image ILSVRC2012_val_00000010.jpeg --device gpu --backend trt +# 在GPU上使用Paddle-TensorRT推理量化模型 +python infer.py --model resnet50_vd_ptq --image ILSVRC2012_val_00000010.jpeg --device gpu --backend pptrt ``` diff --git a/examples/vision/classification/paddleclas/quantize/python/infer.py b/examples/vision/classification/paddleclas/quantize/python/infer.py index 0a4df1768..e981744bd 100644 --- a/examples/vision/classification/paddleclas/quantize/python/infer.py +++ b/examples/vision/classification/paddleclas/quantize/python/infer.py @@ -48,6 +48,11 @@ def build_option(args): ) == "gpu", "TensorRT backend require inferences on device GPU." option.use_trt_backend() option.set_trt_input_shape("inputs", min_shape=[1, 3, 224, 224]) + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "TensorRT backend require inference on device GPU." + option.use_trt_backend() + option.enable_paddle_to_trt() elif args.backend.lower() == "ort": option.use_ort_backend() elif args.backend.lower() == "paddle": diff --git a/examples/vision/detection/paddledetection/quantize/README.md b/examples/vision/detection/paddledetection/quantize/README.md index f3e87e70d..8c6f1feee 100644 --- a/examples/vision/detection/paddledetection/quantize/README.md +++ b/examples/vision/detection/paddledetection/quantize/README.md @@ -1,22 +1,43 @@ # PaddleDetection 量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) ## 下载量化完成的PP-YOLOE-l模型 -用户也可以直接下载下表中的量化模型进行部署. 
-| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP |量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | TensorRT | GPU | 43.83 | 31.57 | 1.39 | 51.4 | 50.7 | 量化蒸馏训练 | -| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | ONNX Runtime | CPU | 1085.18 | 475.55 | 2.29 |51.4 | 50.0 |量化蒸馏训练 | +用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) + + +Benchmark表格说明: +- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延,得到最大加速比. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. + + +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | TensorRT | GPU | 27.90 | 6.39 |6.44|5.95 | 4.67 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | Paddle-TensorRT | GPU | 30.89 |None | 13.78 |14.01 | 2.24 | 51.4 | 50.5 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | ONNX Runtime | CPU | 1057.82 | 449.52 |None|None | 2.35 |51.4 | 50.0 |量化蒸馏训练 | + +NOTE: +- TensorRT比Paddle-TensorRT快的原因是在runtime移除了multiclass_nms3算子 + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | TensorRT | GPU | 35.75 | 15.42 |20.70|20.85 | 2.32 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | Paddle-TensorRT | GPU | 33.48 |None | 18.47 |18.03 | 1.81 | 51.4 | 50.5 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | ONNX Runtime | CPU | 1067.17 | 461.037 |None|None | 2.31 |51.4 | 50.0 |量化蒸馏训练 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为COCO val2017中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. ## 详细部署文档 diff --git a/examples/vision/detection/paddledetection/quantize/cpp/README.md b/examples/vision/detection/paddledetection/quantize/cpp/README.md index 034957ffd..42bf40acb 100644 --- a/examples/vision/detection/paddledetection/quantize/cpp/README.md +++ b/examples/vision/detection/paddledetection/quantize/cpp/README.md @@ -9,7 +9,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 
用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) ## 以量化后的PP-YOLOE-l模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署. @@ -30,4 +30,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 ./infer_ppyoloe_demo ppyoloe_crn_l_300e_coco_qat 000000014439.jpg 0 # 在GPU上使用TensorRT推理量化模型 ./infer_ppyoloe_demo ppyoloe_crn_l_300e_coco_qat 000000014439.jpg 1 +# 在GPU上使用Paddle-TensorRT推理量化模型 +./infer_ppyoloe_demo ppyoloe_crn_l_300e_coco_qat 000000014439.jpg 2 ``` diff --git a/examples/vision/detection/paddledetection/quantize/cpp/infer_ppyoloe.cc b/examples/vision/detection/paddledetection/quantize/cpp/infer_ppyoloe.cc index 9ed06b575..4d2abd3fc 100644 --- a/examples/vision/detection/paddledetection/quantize/cpp/infer_ppyoloe.cc +++ b/examples/vision/detection/paddledetection/quantize/cpp/infer_ppyoloe.cc @@ -71,7 +71,15 @@ int main(int argc, char* argv[]) { option.UseTrtBackend(); option.SetTrtInputShape("inputs",{1, 3, 640, 640}); option.SetTrtInputShape("scale_factor",{1,2}); + } else if (flag == 2) { + option.UseGpu(); + option.UseTrtBackend(); + option.EnablePaddleToTrt(); } + else if (flag == 3) { + option.UseCpu(); + option.UsePaddleBackend(); + } std::string model_dir = argv[1]; std::string test_image = argv[2]; diff --git a/examples/vision/detection/paddledetection/quantize/python/README.md b/examples/vision/detection/paddledetection/quantize/python/README.md index 9535df5c3..cecb5a140 100644 --- a/examples/vision/detection/paddledetection/quantize/python/README.md +++ b/examples/vision/detection/paddledetection/quantize/python/README.md @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) 
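+
+下方命令中的 `--backend pptrt` 选项, 在infer_ppyoloe.py内部大致对应如下的RuntimeOption配置(示意, 以脚本实际实现为准):
+
+```python
+import fastdeploy as fd
+
+option = fd.RuntimeOption()
+option.use_gpu()
+option.use_trt_backend()
+option.enable_paddle_to_trt()  # pptrt: 使用Paddle Inference集成的TensorRT(Paddle-TensorRT)
+
+# --backend trt 仅需 use_trt_backend(); --backend ort 对应 use_ort_backend()
+```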
## 以量化后的PP-YOLOE-l模型为例, 进行部署 @@ -26,4 +26,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 python infer_ppyoloe.py --model ppyoloe_crn_l_300e_coco_qat --image 000000014439.jpg --device cpu --backend ort # 在GPU上使用TensorRT推理量化模型 python infer_ppyoloe.py --model ppyoloe_crn_l_300e_coco_qat --image 000000014439.jpg --device gpu --backend trt +# 在GPU上使用Paddle-TensorRT推理量化模型 +python infer_ppyoloe.py --model ppyoloe_crn_l_300e_coco_qat --image 000000014439.jpg --device gpu --backend pptrt ``` diff --git a/examples/vision/detection/paddledetection/quantize/python/infer_ppyoloe.py b/examples/vision/detection/paddledetection/quantize/python/infer_ppyoloe.py index 85f3c9d55..59e602f6e 100644 --- a/examples/vision/detection/paddledetection/quantize/python/infer_ppyoloe.py +++ b/examples/vision/detection/paddledetection/quantize/python/infer_ppyoloe.py @@ -49,6 +49,11 @@ def build_option(args): option.set_trt_cache_file(os.path.join(args.model, "model.trt")) option.set_trt_input_shape("image", min_shape=[1, 3, 640, 640]) option.set_trt_input_shape("scale_factor", min_shape=[1, 2]) + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "TensorRT backend require inference on device GPU." + option.use_trt_backend() + option.enable_paddle_to_trt() elif args.backend.lower() == "ort": option.use_ort_backend() elif args.backend.lower() == "paddle": diff --git a/examples/vision/detection/yolov5/quantize/README.md b/examples/vision/detection/yolov5/quantize/README.md index 16dff9e84..853718381 100644 --- a/examples/vision/detection/yolov5/quantize/README.md +++ b/examples/vision/detection/yolov5/quantize/README.md @@ -1,22 +1,42 @@ # YOLOv5量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) ## 下载量化完成的YOLOv5s模型 -用户也可以直接下载下表中的量化模型进行部署. -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP |量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 14.13 | 11.22 | 1.26 | 37.6 | 36.6 | 量化蒸馏训练 | -| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference | CPU | 226.36 | 152.27 | 1.48 |37.6 | 36.8 |量化蒸馏训练 | +用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) + +Benchmark表格说明: +- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延,得到最大加速比. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. 
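+
+TensorRT后端首次加载模型时需要构建引擎, 耗时较长; 复现表中TensorRT/Paddle-TensorRT数据前, 可参考下面的示意开启引擎缓存, 避免每次运行重复构建(示意代码, 缓存文件路径为假设):
+
+```python
+import fastdeploy as fd
+
+option = fd.RuntimeOption()
+option.use_gpu()
+option.use_trt_backend()
+# 将构建好的TensorRT引擎序列化到该文件, 后续加载时直接复用
+option.set_trt_cache_file("yolov5s_quant/model.trt")
+```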
+ + +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 7.87 | 4.51 | 4.31 | 3.17 | 2.48 | 37.6 | 36.7 | 量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle-TensorRT | GPU | 7.99 | None | 4.46 | 3.31 | 2.41 | 37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | ONNX Runtime | CPU | 176.41 | 91.90 | None | None | 1.90 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference| CPU | 213.73 | 130.19 | None | None | 1.64 |37.6 | 35.2 | 量化蒸馏训练 | + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | 量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | ONNX Runtime | CPU | 197.323 | 110.99 | None | None | 1.78 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference| CPU | 235.73 | 144.82 | None | None | 1.63 |37.6 | 35.2 | 量化蒸馏训练 | + -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为COCO val2017中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. ## 详细部署文档 diff --git a/examples/vision/detection/yolov5/quantize/cpp/README.md b/examples/vision/detection/yolov5/quantize/cpp/README.md index 21f351a0e..7d76bad51 100644 --- a/examples/vision/detection/yolov5/quantize/cpp/README.md +++ b/examples/vision/detection/yolov5/quantize/cpp/README.md @@ -9,7 +9,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv5s模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署. 
@@ -31,4 +31,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 ./infer_demo yolov5s_quant 000000014439.jpg 0 # 在GPU上使用TensorRT推理量化模型 ./infer_demo yolov5s_quant 000000014439.jpg 1 +# 在GPU上使用Paddle-TensorRT推理量化模型 +./infer_demo yolov5s_quant 000000014439.jpg 2 ``` diff --git a/examples/vision/detection/yolov5/quantize/cpp/infer.cc b/examples/vision/detection/yolov5/quantize/cpp/infer.cc index 88a9e15fc..54e9d6dc1 100644 --- a/examples/vision/detection/yolov5/quantize/cpp/infer.cc +++ b/examples/vision/detection/yolov5/quantize/cpp/infer.cc @@ -68,7 +68,11 @@ int main(int argc, char* argv[]) { } else if (flag == 1) { option.UseGpu(); option.UseTrtBackend(); - } + } else if (flag == 2) { + option.UseGpu(); + option.UseTrtBackend(); + option.EnablePaddleToTrt(); + } std::string model_dir = argv[1]; std::string test_image = argv[2]; diff --git a/examples/vision/detection/yolov5/quantize/python/README.md b/examples/vision/detection/yolov5/quantize/python/README.md index 00c92dc84..9aa03a8cc 100644 --- a/examples/vision/detection/yolov5/quantize/python/README.md +++ b/examples/vision/detection/yolov5/quantize/python/README.md @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv5s模型为例, 进行部署 @@ -26,4 +26,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 python infer.py --model yolov5s_quant --image 000000014439.jpg --device cpu --backend paddle # 在GPU上使用TensorRT推理量化模型 python infer.py --model yolov5s_quant --image 000000014439.jpg --device gpu --backend trt +# 在GPU上使用Paddle-TensorRT推理量化模型 +python infer.py --model yolov5s_quant --image 000000014439.jpg --device gpu --backend pptrt ``` diff --git a/examples/vision/detection/yolov5/quantize/python/infer.py b/examples/vision/detection/yolov5/quantize/python/infer.py index aa56ef18b..2e420c360 100644 --- a/examples/vision/detection/yolov5/quantize/python/infer.py +++ b/examples/vision/detection/yolov5/quantize/python/infer.py @@ -47,6 +47,11 @@ def build_option(args): assert args.device.lower( ) == "gpu", "TensorRT backend require inference on device GPU." option.use_trt_backend() + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "TensorRT backend require inference on device GPU." + option.use_trt_backend() + option.enable_paddle_to_trt() elif args.backend.lower() == "ort": option.use_ort_backend() elif args.backend.lower() == "paddle": diff --git a/examples/vision/detection/yolov6/quantize/README.md b/examples/vision/detection/yolov6/quantize/README.md index 594d59e5c..04af3f689 100644 --- a/examples/vision/detection/yolov6/quantize/README.md +++ b/examples/vision/detection/yolov6/quantize/README.md @@ -1,23 +1,42 @@ # YOLOv6量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. - -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) ## 下载量化完成的YOLOv6s模型 -用户也可以直接下载下表中的量化模型进行部署. 
+用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) + +Benchmark表格说明: +- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延,得到最大加速比. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. + +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | TensorRT | GPU | 9.47 | 3.23 | 4.09 |2.81 | 3.37 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle-TensorRT | GPU | 9.31 | None| 4.17 | 2.95 | 3.16 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | ONNX Runtime | CPU | 334.65 | 126.38 | None | None| 2.65 |42.5| 36.8|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle Inference | CPU | 352.87 | 123.12 |None | None| 2.87 |42.5| 40.8|量化蒸馏训练 | + + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | TensorRT | GPU | 15.66 | 11.30 | 10.25 |9.59 | 1.63 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle-TensorRT | GPU | 15.03 | None| 11.36 | 9.32 | 1.61 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | ONNX Runtime | CPU | 348.21 | 126.38 | None | None| 2.82 |42.5| 36.8|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle Inference | CPU | 352.87 | 121.64 |None | None| 3.04 |42.5| 40.8|量化蒸馏训练 | + -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP | 量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- | ------ | -| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar) | TensorRT | GPU | 12.89 | 8.92 | 1.45 | 42.5 | 40.6| 量化蒸馏训练 | -| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar) | Paddle Inference | CPU | 366.41 | 131.70 | 2.78 |42.5| 41.2|量化蒸馏训练 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为COCO val2017中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. ## 详细部署文档 diff --git a/examples/vision/detection/yolov6/quantize/cpp/README.md b/examples/vision/detection/yolov6/quantize/cpp/README.md index 14a2a94e7..bf2208fab 100644 --- a/examples/vision/detection/yolov6/quantize/cpp/README.md +++ b/examples/vision/detection/yolov6/quantize/cpp/README.md @@ -9,7 +9,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 
用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv6s模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署. @@ -22,13 +22,15 @@ cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-0.4.0 make -j #下载FastDeloy提供的yolov6s量化模型文件和测试图片 -wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar -tar -xvf yolov6s_quant.tar +wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_qat_model.tar +tar -xvf yolov6s_qat_model.tar wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg # 在CPU上使用Paddle-Inference推理量化模型 -./infer_demo yolov6s_quant 000000014439.jpg 0 +./infer_demo yolov6s_qat_model 000000014439.jpg 0 # 在GPU上使用TensorRT推理量化模型 -./infer_demo yolov6s_quant 000000014439.jpg 1 +./infer_demo yolov6s_qat_model 000000014439.jpg 1 +# 在GPU上使用Paddle-TensorRT推理量化模型 +./infer_demo yolov6s_qat_model 000000014439.jpg 2 ``` diff --git a/examples/vision/detection/yolov6/quantize/cpp/infer.cc b/examples/vision/detection/yolov6/quantize/cpp/infer.cc index f7a9d2c16..64f4d9f22 100644 --- a/examples/vision/detection/yolov6/quantize/cpp/infer.cc +++ b/examples/vision/detection/yolov6/quantize/cpp/infer.cc @@ -68,7 +68,11 @@ int main(int argc, char* argv[]) { } else if (flag == 1) { option.UseGpu(); option.UseTrtBackend(); - } + } else if (flag == 2) { + option.UseGpu(); + option.UseTrtBackend(); + option.EnablePaddleToTrt(); + } std::string model_dir = argv[1]; std::string test_image = argv[2]; diff --git a/examples/vision/detection/yolov6/quantize/python/README.md b/examples/vision/detection/yolov6/quantize/python/README.md index 03208f46d..5f70a02c8 100644 --- a/examples/vision/detection/yolov6/quantize/python/README.md +++ b/examples/vision/detection/yolov6/quantize/python/README.md @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv6s模型为例, 进行部署 ```bash @@ -17,12 +17,14 @@ git clone https://github.com/PaddlePaddle/FastDeploy.git cd examples/slim/yolov6/python #下载FastDeloy提供的yolov6s量化模型文件和测试图片 -wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar -tar -xvf yolov6s_quant.tar +wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_qat_model.tar +tar -xvf yolov6s_qat_model.tar wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg # 在CPU上使用Paddle-Inference推理量化模型 -python infer.py --model yolov6s_quant --image 000000014439.jpg --device cpu --backend paddle +python infer.py --model yolov6s_qat_model --image 000000014439.jpg --device cpu --backend paddle # 在GPU上使用TensorRT推理量化模型 -python infer.py --model yolov6s_quant --image 000000014439.jpg --device gpu --backend trt +python infer.py --model yolov6s_qat_model --image 000000014439.jpg --device gpu --backend trt +# 在GPU上使用Paddle-TensorRT推理量化模型 +python infer.py --model yolov6s_qat_model --image 000000014439.jpg --device gpu --backend pptrt ``` diff --git a/examples/vision/detection/yolov6/quantize/python/infer.py b/examples/vision/detection/yolov6/quantize/python/infer.py index ec0602272..d34c7cd59 100644 --- a/examples/vision/detection/yolov6/quantize/python/infer.py +++ b/examples/vision/detection/yolov6/quantize/python/infer.py @@ -47,6 +47,11 @@ def build_option(args): assert args.device.lower( ) == "gpu", "TensorRT backend require inference on device GPU." 
option.use_trt_backend() + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "TensorRT backend require inference on device GPU." + option.use_trt_backend() + option.enable_paddle_to_trt() elif args.backend.lower() == "ort": option.use_ort_backend() elif args.backend.lower() == "paddle": diff --git a/examples/vision/detection/yolov7/quantize/README.md b/examples/vision/detection/yolov7/quantize/README.md index 6d29ea3f3..579532568 100644 --- a/examples/vision/detection/yolov7/quantize/README.md +++ b/examples/vision/detection/yolov7/quantize/README.md @@ -1,23 +1,40 @@ # YOLOv7量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) ## 下载量化完成的YOLOv7模型 -用户也可以直接下载下表中的量化模型进行部署. +用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP | 量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 30.43 | 15.40 | 1.98 | 51.1| 50.8| 量化蒸馏训练 | -| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 1015.70 | 562.41 | 1.82 |51.1 | 46.3| 量化蒸馏训练 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为COCO val2017中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. +Benchmark表格说明: +- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延,得到最大加速比. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. 
+ +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 27.47 | 6.52 | 6.74| 5.19| 5.29 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle-TensorRT | GPU | 27.87|None|6.91|5.86 | 4.76 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | ONNX Runtime | CPU | 996.65 | 467.15 |None|None | 2.13 | 51.1 | 43.3|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 995.85 | 477.93|None|None | 2.08 |51.1 | 46.2|量化蒸馏训练 | + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 36.47 | 18.81 | 20.33| 17.58| 2.07 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle-TensorRT | GPU | 37.06|None|20.26|17.53 | 2.11 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | ONNX Runtime | CPU | 988.85 | 478.08 |None|None | 2.07 | 51.1 | 43.3|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 1031.73 | 500.12|None|None | 2.06 |51.1 | 46.2|量化蒸馏训练 | ## 详细部署文档 diff --git a/examples/vision/detection/yolov7/quantize/cpp/README.md b/examples/vision/detection/yolov7/quantize/cpp/README.md index 705edda0e..53110591e 100644 --- a/examples/vision/detection/yolov7/quantize/cpp/README.md +++ b/examples/vision/detection/yolov7/quantize/cpp/README.md @@ -9,7 +9,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv7模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署. 
 
 ## 以量化后的YOLOv7模型为例, 进行部署
 在本目录执行如下命令即可完成编译,以及量化模型部署.
@@ -31,4 +31,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000
 ./infer_demo yolov7_quant 000000014439.jpg 0
 # 在GPU上使用TensorRT推理量化模型
 ./infer_demo yolov7_quant 000000014439.jpg 1
+# 在GPU上使用Paddle-TensorRT推理量化模型
+./infer_demo yolov7_quant 000000014439.jpg 2
 ```
diff --git a/examples/vision/detection/yolov7/quantize/cpp/infer.cc b/examples/vision/detection/yolov7/quantize/cpp/infer.cc
index 45cba4b29..8a656adee 100644
--- a/examples/vision/detection/yolov7/quantize/cpp/infer.cc
+++ b/examples/vision/detection/yolov7/quantize/cpp/infer.cc
@@ -68,7 +68,11 @@ int main(int argc, char* argv[]) {
   } else if (flag == 1) {
     option.UseGpu();
     option.UseTrtBackend();
-  }
+  } else if (flag == 2) {
+    option.UseGpu();
+    option.UseTrtBackend();
+    option.EnablePaddleToTrt();
+  }
 
   std::string model_dir = argv[1];
   std::string test_image = argv[2];
diff --git a/examples/vision/detection/yolov7/quantize/python/README.md b/examples/vision/detection/yolov7/quantize/python/README.md
index 1ccc026fd..ac1c44889 100644
--- a/examples/vision/detection/yolov7/quantize/python/README.md
+++ b/examples/vision/detection/yolov7/quantize/python/README.md
@@ -8,7 +8,7 @@
 
 ### 量化模型准备
 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署.
-- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署.
+- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.
 
 ## 以量化后的YOLOv7模型为例, 进行部署
 ```bash
@@ -25,4 +25,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000
 python infer.py --model yolov7_quant --image 000000014439.jpg --device cpu --backend paddle
 # 在GPU上使用TensorRT推理量化模型
 python infer.py --model yolov7_quant --image 000000014439.jpg --device gpu --backend trt
+# 在GPU上使用Paddle-TensorRT推理量化模型
+python infer.py --model yolov7_quant --image 000000014439.jpg --device gpu --backend pptrt
 ```
diff --git a/examples/vision/detection/yolov7/quantize/python/infer.py b/examples/vision/detection/yolov7/quantize/python/infer.py
index 3c42679e7..4790a4d94 100644
--- a/examples/vision/detection/yolov7/quantize/python/infer.py
+++ b/examples/vision/detection/yolov7/quantize/python/infer.py
@@ -47,6 +47,11 @@ def build_option(args):
         assert args.device.lower(
         ) == "gpu", "TensorRT backend require inference on device GPU."
         option.use_trt_backend()
+    elif args.backend.lower() == "pptrt":
+        assert args.device.lower(
+        ) == "gpu", "TensorRT backend require inference on device GPU."
+        option.use_trt_backend()
+        option.enable_paddle_to_trt()
     elif args.backend.lower() == "ort":
         option.use_ort_backend()
     elif args.backend.lower() == "paddle":
diff --git a/examples/vision/segmentation/paddleseg/quantize/README.md b/examples/vision/segmentation/paddleseg/quantize/README.md
new file mode 100644
index 000000000..6199c653a
--- /dev/null
+++ b/examples/vision/segmentation/paddleseg/quantize/README.md
@@ -0,0 +1,36 @@
+# PaddleSeg 量化模型部署
+FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具.
+用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署.
+
+## FastDeploy一键模型自动化压缩工具
+FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化.
+详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/)
+注意: 推理量化后的分割模型仍然需要FP32模型文件夹下的deploy.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可。
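+
+例如, 可参考如下复制命令(目录名仅为示意, 请替换为实际的FP32模型目录与自行量化产出的模型目录):
+```bash
+# 将FP32模型目录下的deploy.yaml复制到量化后的模型目录中
+cp PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer/deploy.yaml \
+   PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ/
+```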
+
+## 下载量化完成的PaddleSeg模型
+用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载)
+
+Benchmark表格说明:
+- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间.
+- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理.
+- 所测时延均为推理1000次后求得的平均值, 单位是毫秒.
+- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项
+- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度
+- 最大加速比, 为FP32时延除以INT8推理的最快时延,得到最大加速比.
+- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度.
+- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15.
+
+#### Runtime Benchmark
+| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 |
+| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
+| [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_QAT_new.tar) | Paddle Inference | CPU | 1138.04 | 602.62 | None | None | 1.89 | 77.37 | 71.62 | 量化蒸馏训练 |
+
+#### 端到端 Benchmark
+| 模型 |推理后端 |部署硬件 | FP32 端到端时延 | INT8 端到端时延 | INT8 + FP16 端到端时延 | INT8+FP16+PM 端到端时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 |
+| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
+| [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_QAT_new.tar) | Paddle Inference | CPU | 4726.65 | 4134.91 | None | None | 1.14 | 77.37 | 71.62 | 量化蒸馏训练 |
+
+## 详细部署文档
+
+- [Python部署](python)
+- [C++部署](cpp)
diff --git a/examples/vision/segmentation/paddleseg/quantize/cpp/CMakeLists.txt b/examples/vision/segmentation/paddleseg/quantize/cpp/CMakeLists.txt
new file mode 100644
index 000000000..fea1a2888
--- /dev/null
+++ b/examples/vision/segmentation/paddleseg/quantize/cpp/CMakeLists.txt
@@ -0,0 +1,14 @@
+PROJECT(infer_demo C CXX)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.12)
+
+# 指定下载解压后的fastdeploy库路径
+option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.")
+
+include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
+
+# 添加FastDeploy依赖头文件
+include_directories(${FASTDEPLOY_INCS})
+
+add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc)
+# 添加FastDeploy库依赖
+target_link_libraries(infer_demo ${FASTDEPLOY_LIBS})
diff --git a/examples/vision/segmentation/paddleseg/quantize/cpp/README.md b/examples/vision/segmentation/paddleseg/quantize/cpp/README.md
new file mode 100644
index 000000000..fa334fba4
--- /dev/null
+++ b/examples/vision/segmentation/paddleseg/quantize/cpp/README.md
@@ -0,0 +1,30 @@
+# PaddleSeg 量化模型 C++部署示例
+本目录下提供的`infer.cc`,可以帮助用户快速完成PaddleSeg量化模型在CPU/GPU上的部署推理加速.
+
+## 部署准备
+### FastDeploy环境准备
+- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
+- 2. FastDeploy C++预编译库下载安装,参考[FastDeploy预编译库下载](../../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
+
+### 量化模型准备
+- 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署.
+- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分割模型仍然需要FP32模型文件夹下的deploy.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.)
+
+## 以量化后的PP_LiteSeg_T_STDC1_cityscapes模型为例, 进行部署
+在本目录执行如下命令即可完成编译,以及量化模型部署.
+```bash
+mkdir build
+cd build
+wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-0.3.0.tgz
+tar xvf fastdeploy-linux-x64-0.3.0.tgz
+cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-0.3.0
+make -j
+
+# 下载FastDeploy提供的PP_LiteSeg_T_STDC1_cityscapes量化模型文件和测试图片
+wget https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ.tar
+tar -xvf PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ.tar
+wget https://paddleseg.bj.bcebos.com/dygraph/demo/cityscapes_demo.png
+
+# 在CPU上使用Paddle-Inference推理量化模型
+./infer_demo PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ cityscapes_demo.png 1
+```
diff --git a/examples/vision/segmentation/paddleseg/quantize/cpp/infer.cc b/examples/vision/segmentation/paddleseg/quantize/cpp/infer.cc
new file mode 100644
index 000000000..3e7240dd8
--- /dev/null
+++ b/examples/vision/segmentation/paddleseg/quantize/cpp/infer.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision.h"
+#ifdef WIN32
+const char sep = '\\';
+#else
+const char sep = '/';
+#endif
+
+void InitAndInfer(const std::string& model_dir, const std::string& image_file,
+                  const fastdeploy::RuntimeOption& option) {
+  auto model_file = model_dir + sep + "model.pdmodel";
+  auto params_file = model_dir + sep + "model.pdiparams";
+  auto config_file = model_dir + sep + "deploy.yaml";
+
+  auto model = fastdeploy::vision::segmentation::PaddleSegModel(
+      model_file, params_file, config_file, option);
+
+  assert(model.Initialized());
+
+  auto im = cv::imread(image_file);
+  auto im_bak = im.clone();
+
+  fastdeploy::vision::SegmentationResult res;
+  if (!model.Predict(&im, &res)) {
+    std::cerr << "Failed to predict." << std::endl;
+    return;
+  }
+
+  std::cout << res.Str() << std::endl;
+}
+
+int main(int argc, char* argv[]) {
+  if (argc < 4) {
+    std::cout << "Usage: infer_demo path/to/quant_model "
+                 "path/to/image "
+                 "run_option, "
+                 "e.g ./infer_demo ./PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ ./cityscapes_demo.png 0"
+              << std::endl;
+    std::cout << "The data type of run_option is int, 0: run on cpu with ORT "
+                 "backend; 1: run "
+                 "on gpu with TensorRT backend. 
" + << std::endl; + return -1; + } + + fastdeploy::RuntimeOption option; + int flag = std::atoi(argv[3]); + + if (flag == 0) { + option.UseCpu(); + option.UseOrtBackend(); + std::cout<<"Use ORT!"< target_size_max: + im_scale = float(target_size_max) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale + + return im_scale_y, im_scale_x + + +def yolo_image_preprocess(img, target_shape=[640, 640]): + # Resize image + im_scale_y, im_scale_x = generate_scale(img, target_shape) + img = cv2.resize( + img, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=cv2.INTER_LINEAR) + # Pad + im_h, im_w = img.shape[:2] + h, w = target_shape[:] + if h != im_h or w != im_w: + canvas = np.ones((h, w, 3), dtype=np.float32) + canvas *= np.array([114.0, 114.0, 114.0], dtype=np.float32) + canvas[0:im_h, 0:im_w, :] = img.astype(np.float32) + img = canvas + img = np.transpose(img / 255, [2, 0, 1]) + + return img.astype(np.float32) + + +""" +Preprocess for PaddleClas model +""" + + +def cls_resize_short(img, target_size): + + img_h, img_w = img.shape[:2] + percent = float(target_size) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + + return cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) + + +def crop_image(img, target_size, center): + + height, width = img.shape[:2] + size = target_size + + if center == True: + w_start = (width - size) // 2 + h_start = (height - size) // 2 + else: + w_start = np.random.randint(0, width - size + 1) + h_start = np.random.randint(0, height - size + 1) + w_end = w_start + size + h_end = h_start + size + + return img[h_start:h_end, w_start:w_end, :] + + +def cls_image_preprocess(img): + + # resize + img = cls_resize_short(img, target_size=256) + # crop + img = crop_image(img, target_size=224, center=True) + + #ToCHWImage & Normalize + img = np.transpose(img / 255, [2, 0, 1]) + + img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) + img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + + return img.astype(np.float32) + + +""" +Preprocess for PPYOLOE +""" + + +def ppdet_resize_no_keepratio(img, target_shape=[640, 640]): + im_shape = img.shape + + resize_h, resize_w = target_shape + im_scale_y = resize_h / im_shape[0] + im_scale_x = resize_w / im_shape[1] + + scale_factor = np.asarray([im_scale_y, im_scale_x], dtype=np.float32) + return cv2.resize( + img, None, None, fx=im_scale_x, fy=im_scale_y, + interpolation=2), scale_factor + + +def ppyoloe_withNMS_image_preprocess(img): + + img, scale_factor = ppdet_resize_no_keepratio(img, target_shape=[640, 640]) + + img = np.transpose(img / 255, [2, 0, 1]) + + img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) + img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + + return img.astype(np.float32), scale_factor + + +def ppyoloe_plus_withNMS_image_preprocess(img): + + img, scale_factor = ppdet_resize_no_keepratio(img, target_shape=[640, 640]) + + img = np.transpose(img / 255, [2, 0, 1]) + + return img.astype(np.float32), scale_factor + + +""" +Preprocess for PP_LiteSeg + +""" + + +def ppseg_cityscapes_ptq_preprocess(img): + + #ToCHWImage & Normalize + img = np.transpose(img / 255.0, [2, 0, 1]) + + img_mean = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) + img_std = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + + return img.astype(np.float32) + + +def ResizeStepScaling(img, + min_scale_factor=0.75, + max_scale_factor=1.25, + 
scale_step_size=0.25): + # refer form ppseg + if min_scale_factor == max_scale_factor: + scale_factor = min_scale_factor + elif scale_step_size == 0: + scale_factor = np.random.uniform(min_scale_factor, max_scale_factor) + else: + num_steps = int((max_scale_factor - min_scale_factor) / scale_step_size + + 1) + scale_factors = np.linspace(min_scale_factor, max_scale_factor, + num_steps).tolist() + np.random.shuffle(scale_factors) + scale_factor = scale_factors[0] + + w = int(round(scale_factor * img.shape[1])) + h = int(round(scale_factor * img.shape[0])) + + img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) + + return img + + +def RandomPaddingCrop(img, + crop_size=(512, 512), + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + + if isinstance(crop_size, list) or isinstance(crop_size, tuple): + if len(crop_size) != 2: + raise ValueError( + 'Type of `crop_size` is list or tuple. It should include 2 elements, but it is {}' + .format(crop_size)) + else: + raise TypeError( + "The type of `crop_size` is invalid. It should be list or tuple, but it is {}" + .format(type(crop_size))) + + if isinstance(crop_size, int): + crop_width = crop_size + crop_height = crop_size + else: + crop_width = crop_size[0] + crop_height = crop_size[1] + + img_height = img.shape[0] + img_width = img.shape[1] + + if img_height == crop_height and img_width == crop_width: + return img + else: + pad_height = max(crop_height - img_height, 0) + pad_width = max(crop_width - img_width, 0) + if (pad_height > 0 or pad_width > 0): + img = cv2.copyMakeBorder( + img, + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=im_padding_value) + + img_height = img.shape[0] + img_width = img.shape[1] + + if crop_height > 0 and crop_width > 0: + h_off = np.random.randint(img_height - crop_height + 1) + w_off = np.random.randint(img_width - crop_width + 1) + + img = img[h_off:(crop_height + h_off), w_off:(w_off + crop_width + ), :] + + return img + + +def RandomHorizontalFlip(img, prob=0.5): + if random.random() < prob: + + if len(img.shape) == 3: + img = img[:, ::-1, :] + elif len(img.shape) == 2: + img = img[:, ::-1] + + return img + else: + return img + + +def brightness(im, brightness_lower, brightness_upper): + brightness_delta = np.random.uniform(brightness_lower, brightness_upper) + im = ImageEnhance.Brightness(im).enhance(brightness_delta) + return im + + +def contrast(im, contrast_lower, contrast_upper): + contrast_delta = np.random.uniform(contrast_lower, contrast_upper) + im = ImageEnhance.Contrast(im).enhance(contrast_delta) + return im + + +def saturation(im, saturation_lower, saturation_upper): + saturation_delta = np.random.uniform(saturation_lower, saturation_upper) + im = ImageEnhance.Color(im).enhance(saturation_delta) + return im + + +def hue(im, hue_lower, hue_upper): + hue_delta = np.random.uniform(hue_lower, hue_upper) + im = np.array(im.convert('HSV')) + im[:, :, 0] = im[:, :, 0] + hue_delta + im = Image.fromarray(im, mode='HSV').convert('RGB') + return im + + +def sharpness(im, sharpness_lower, sharpness_upper): + sharpness_delta = np.random.uniform(sharpness_lower, sharpness_upper) + im = ImageEnhance.Sharpness(im).enhance(sharpness_delta) + return im + + +def RandomDistort(img, + brightness_range=0.5, + brightness_prob=0.5, + contrast_range=0.5, + contrast_prob=0.5, + saturation_range=0.5, + saturation_prob=0.5, + hue_range=18, + hue_prob=0.5, + sharpness_range=0.5, + sharpness_prob=0): + + brightness_lower = 1 - brightness_range + brightness_upper = 1 + 
brightness_range + contrast_lower = 1 - contrast_range + contrast_upper = 1 + contrast_range + saturation_lower = 1 - saturation_range + saturation_upper = 1 + saturation_range + hue_lower = -hue_range + hue_upper = hue_range + sharpness_lower = 1 - sharpness_range + sharpness_upper = 1 + sharpness_range + ops = [brightness, contrast, saturation, hue, sharpness] + random.shuffle(ops) + params_dict = { + 'brightness': { + 'brightness_lower': brightness_lower, + 'brightness_upper': brightness_upper + }, + 'contrast': { + 'contrast_lower': contrast_lower, + 'contrast_upper': contrast_upper + }, + 'saturation': { + 'saturation_lower': saturation_lower, + 'saturation_upper': saturation_upper + }, + 'hue': { + 'hue_lower': hue_lower, + 'hue_upper': hue_upper + }, + 'sharpness': { + 'sharpness_lower': sharpness_lower, + 'sharpness_upper': sharpness_upper, + } + } + prob_dict = { + 'brightness': brightness_prob, + 'contrast': contrast_prob, + 'saturation': saturation_prob, + 'hue': hue_prob, + 'sharpness': sharpness_prob + } + + img = img.astype('uint8') + img = Image.fromarray(img) + + for id in range(len(ops)): + params = params_dict[ops[id].__name__] + prob = prob_dict[ops[id].__name__] + params['im'] = img + if np.random.uniform(0, 1) < prob: + img = ops[id](**params) + img = np.asarray(img).astype('float32') + return img + + +def ppseg_cityscapes_qat_preprocess(img): + + min_scale_factor = 0.5 + max_scale_factor = 2.0 + scale_step_size = 0.25 + + crop_size = (1024, 512) + + brightness_range = 0.5 + contrast_range = 0.5 + saturation_range = 0.5 + + img = ResizeStepScaling( + img, min_scale_factor=0.5, max_scale_factor=2.0, scale_step_size=0.25) + img = RandomPaddingCrop(img, crop_size=(1024, 512)) + img = RandomHorizontalFlip(img) + img = RandomDistort( + img, brightness_range=0.5, contrast_range=0.5, saturation_range=0.5) + + img = np.transpose(img / 255.0, [2, 0, 1]) + img_mean = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) + img_std = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + return img.astype(np.float32) diff --git a/tools/quantization/fdquant/fdquant.py b/tools/auto_compression/fd_auto_compress/fd_auto_compress.py similarity index 59% rename from tools/quantization/fdquant/fdquant.py rename to tools/auto_compression/fd_auto_compress/fd_auto_compress.py index 4d2bb511e..145f4a468 100644 --- a/tools/quantization/fdquant/fdquant.py +++ b/tools/auto_compression/fd_auto_compress/fd_auto_compress.py @@ -22,7 +22,7 @@ import paddle from paddleslim.common import load_config, load_onnx_model from paddleslim.auto_compression import AutoCompression from paddleslim.quant import quant_post_static -from fdquant.dataset import * +from fd_auto_compress.dataset import * def argsparser(): @@ -53,16 +53,33 @@ def argsparser(): return parser -def reader_wrapper(reader, input_list=None): - def gen(): - for data_list in reader: +def reader_wrapper(reader, input_list): + + if isinstance(input_list, list) and len(input_list) == 1: + input_name = input_list[0] + + def gen(): in_dict = {} - for data in data_list: - for i, input_name in enumerate(input_list): - in_dict[input_name] = data[i] + for i, data in enumerate(reader()): + imgs = np.array(data[0]) + in_dict[input_name] = imgs yield in_dict - return gen + return gen + + if isinstance(input_list, list) and len(input_list) > 1: + + def gen(): + for idx, data in enumerate(reader()): + in_dict = {} + for i in range(len(input_list)): + intput_name = input_list[i] + feed_data = np.array(data[0][i]) + in_dict[intput_name] = 
feed_data + + yield in_dict + + return gen def main(): @@ -75,31 +92,32 @@ def main(): assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu'] paddle.set_device(FLAGS.devices) - global global_config - all_config = load_config(FLAGS.config_path) - assert "Global" in all_config, f"Key 'Global' not found in config file. \n{all_config}" - global_config = all_config["Global"] - input_list = global_config['input_list'] - assert os.path.exists(global_config[ - 'image_path']), "image_path does not exist!" - paddle.vision.image.set_image_backend('cv2') - # transform could be customized. - train_dataset = paddle.vision.datasets.ImageFolder( - global_config['image_path'], - transform=eval(global_config['preprocess'])) - train_loader = paddle.io.DataLoader( - train_dataset, - batch_size=1, - shuffle=True, - drop_last=True, - num_workers=0) - train_loader = reader_wrapper(train_loader, input_list=input_list) - eval_func = None - - # ACT compression if FLAGS.method == 'QAT': + + all_config = load_config(FLAGS.config_path) + assert "Global" in all_config, f"Key 'Global' not found in config file. \n{all_config}" + global_config = all_config["Global"] + input_list = global_config['input_list'] + + assert os.path.exists(global_config[ + 'qat_image_path']), "image_path does not exist!" + paddle.vision.image.set_image_backend('cv2') + # transform could be customized. + train_dataset = paddle.vision.datasets.ImageFolder( + global_config['qat_image_path'], + transform=eval(global_config['qat_preprocess'])) + train_loader = paddle.io.DataLoader( + train_dataset, + batch_size=global_config['qat_batch_size'], + shuffle=True, + drop_last=True, + num_workers=0) + train_loader = reader_wrapper(train_loader, input_list=input_list) + eval_func = None + + # ACT compression ac = AutoCompression( model_dir=global_config['model_dir'], model_filename=global_config['model_filename'], @@ -113,6 +131,28 @@ def main(): # PTQ compression if FLAGS.method == 'PTQ': + # Read Global config and prepare dataset + all_config = load_config(FLAGS.config_path) + assert "Global" in all_config, f"Key 'Global' not found in config file. \n{all_config}" + global_config = all_config["Global"] + input_list = global_config['input_list'] + + assert os.path.exists(global_config[ + 'ptq_image_path']), "image_path does not exist!" + + paddle.vision.image.set_image_backend('cv2') + # transform could be customized. + val_dataset = paddle.vision.datasets.ImageFolder( + global_config['ptq_image_path'], + transform=eval(global_config['ptq_preprocess'])) + val_loader = paddle.io.DataLoader( + val_dataset, + batch_size=1, + shuffle=True, + drop_last=True, + num_workers=0) + val_loader = reader_wrapper(val_loader, input_list=input_list) + # Read PTQ config assert "PTQ" in all_config, f"Key 'PTQ' not found in config file. 
\n{all_config}" ptq_config = all_config["PTQ"] @@ -134,7 +174,7 @@ def main(): executor=exe, model_dir=inference_model_path, quantize_model_path=FLAGS.save_dir, - data_loader=train_loader, + data_loader=val_loader, model_filename=global_config["model_filename"], params_filename=global_config["params_filename"], batch_size=32, diff --git a/tools/quantization/requirements.txt b/tools/auto_compression/requirements.txt similarity index 100% rename from tools/quantization/requirements.txt rename to tools/auto_compression/requirements.txt diff --git a/tools/auto_compression/setup.py b/tools/auto_compression/setup.py new file mode 100644 index 000000000..9ee3ce28f --- /dev/null +++ b/tools/auto_compression/setup.py @@ -0,0 +1,26 @@ +import setuptools +import fd_auto_compress + +long_description = "fastdeploy-auto-compression is a toolkit for model auto compression of FastDeploy.\n\n" +long_description += "Usage: fastdeploy_auto_compress --config_path=./yolov7_tiny_qat_dis.yaml --method='QAT' --save_dir='../v7_qat_outmodel/' \n" + +with open("requirements.txt") as fin: + REQUIRED_PACKAGES = fin.read() + +setuptools.setup( + name="fastdeploy-auto-compression", # name of package + description="A toolkit for model auto compression of FastDeploy.", + long_description=long_description, + long_description_content_type="text/plain", + packages=setuptools.find_packages(), + install_requires=REQUIRED_PACKAGES, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ], + license='Apache 2.0', + entry_points={ + 'console_scripts': + ['fastdeploy_auto_compress=fd_auto_compress.fd_auto_compress:main', ] + }) diff --git a/tools/quantization/README.md b/tools/quantization/README.md deleted file mode 100644 index 4459d526d..000000000 --- a/tools/quantization/README.md +++ /dev/null @@ -1,108 +0,0 @@ -# FastDeploy 一键模型量化 -FastDeploy基于PaddleSlim, 给用户提供了一键模型量化的工具, 支持离线量化和量化蒸馏训练. -本文档以Yolov5s为例, 供用户参考如何安装并执行FastDeploy的一键模型量化. - -## 1.安装 - -### 环境依赖 - -1.用户参考PaddlePaddle官网, 安装develop版本 -``` -https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html -``` - -2.安装paddleslim-develop版本 -```bash -git clone https://github.com/PaddlePaddle/PaddleSlim.git & cd PaddleSlim -python setup.py install -``` - -### FastDeploy-Quantization 安装方式 -用户在当前目录下,运行如下命令: -``` -python setup.py install -``` - -## 2.使用方式 - -### 一键量化示例 - -#### 离线量化 - -##### 1. 准备模型和Calibration数据集 -用户需要自行准备待量化模型与Calibration数据集. -本例中用户可执行以下命令, 下载待量化的yolov5s.onnx模型和我们为用户准备的Calibration数据集示例. - -```shell -# 下载yolov5.onnx -wget https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx - -# 下载数据集, 此Calibration数据集为COCO val2017中的前320张图片 -wget https://bj.bcebos.com/paddlehub/fastdeploy/COCO_val_320.tar.gz -tar -xvf COCO_val_320.tar.gz -``` - -##### 2.使用fastdeploy_quant命令,执行一键模型量化: -以下命令是对yolov5s模型进行量化, 用户若想量化其他模型, 替换config_path为configs文件夹下的其他模型配置文件即可. -```shell -fastdeploy_quant --config_path=./configs/detection/yolov5s_quant.yaml --method='PTQ' --save_dir='./yolov5s_ptq_model/' -``` -【说明】离线量化(训练后量化):post-training quantization,缩写是PTQ - -##### 3.参数说明 - -目前用户只需要提供一个定制的模型config文件,并指定量化方法和量化后的模型保存路径即可完成量化. 
- -| 参数 | 作用 | -| -------------------- | ------------------------------------------------------------ | -| --config_path | 一键量化所需要的量化配置文件.[详解](./configs/README.md) | -| --method | 量化方式选择, 离线量化选PTQ,量化蒸馏训练选QAT | -| --save_dir | 产出的量化后模型路径, 该模型可直接在FastDeploy部署 | - - - -#### 量化蒸馏训练 - -##### 1.准备待量化模型和训练数据集 -FastDeploy目前的量化蒸馏训练,只支持无标注图片训练,训练过程中不支持评估模型精度. -数据集为真实预测场景下的图片,图片数量依据数据集大小来定,尽量覆盖所有部署场景. 此例中,我们为用户准备了COCO2017验证集中的前320张图片. -注: 如果用户想通过量化蒸馏训练的方法,获得精度更高的量化模型, 可以自行准备更多的数据, 以及训练更多的轮数. - -```shell -# 下载yolov5.onnx -wget https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx - -# 下载数据集, 此Calibration数据集为COCO2017验证集中的前320张图片 -wget https://bj.bcebos.com/paddlehub/fastdeploy/COCO_val_320.tar.gz -tar -xvf COCO_val_320.tar.gz -``` - -##### 2.使用fastdeploy_quant命令,执行一键模型量化: -以下命令是对yolov5s模型进行量化, 用户若想量化其他模型, 替换config_path为configs文件夹下的其他模型配置文件即可. -```shell -# 执行命令默认为单卡训练,训练前请指定单卡GPU, 否则在训练过程中可能会卡住. -export CUDA_VISIBLE_DEVICES=0 -fastdeploy_quant --config_path=./configs/detection/yolov5s_quant.yaml --method='QAT' --save_dir='./yolov5s_qat_model/' -``` - -##### 3.参数说明 - -目前用户只需要提供一个定制的模型config文件,并指定量化方法和量化后的模型保存路径即可完成量化. - -| 参数 | 作用 | -| -------------------- | ------------------------------------------------------------ | -| --config_path | 一键量化所需要的量化配置文件.[详解](./configs/README.md)| -| --method | 量化方式选择, 离线量化选PTQ,量化蒸馏训练选QAT | -| --save_dir | 产出的量化后模型路径, 该模型可直接在FastDeploy部署 | - - -## 3. FastDeploy 部署量化模型 -用户在获得量化模型之后,即可以使用FastDeploy进行部署, 部署文档请参考: -具体请用户参考示例文档: -- [YOLOv5 量化模型部署](../../examples/vision/detection/yolov5/quantize/) - -- [YOLOv6 量化模型部署](../../examples/vision/detection/yolov6/quantize/) - -- [YOLOv7 量化模型部署](../../examples/vision/detection/yolov7/quantize/) - -- [PadddleClas 量化模型部署](../../examples/vision/classification/paddleclas/quantize/) diff --git a/tools/quantization/configs/README.md b/tools/quantization/configs/README.md deleted file mode 100644 index 7bab2de34..000000000 --- a/tools/quantization/configs/README.md +++ /dev/null @@ -1,51 +0,0 @@ -# FastDeploy 量化配置文件说明 -FastDeploy 量化配置文件中,包含了全局配置,量化蒸馏训练配置,离线量化配置和训练配置. 
-用户除了直接使用FastDeploy提供在本目录的配置文件外,可以按需求自行修改相关配置文件 - -## 实例解读 - -``` -# 全局配置 -Global: - model_dir: ./yolov5s.onnx #输入模型的路径 - format: 'onnx' #输入模型的格式, paddle模型请选择'paddle' - model_filename: model.pdmodel #量化后转为paddle格式模型的模型名字 - params_filename: model.pdiparams #量化后转为paddle格式模型的参数名字 - image_path: ./COCO_val_320 #离线量化或者量化蒸馏训练使用的数据集路径 - arch: YOLOv5 #模型结构 - input_list: ['x2paddle_images'] #待量化的模型的输入名字 - preprocess: yolo_image_preprocess #模型量化时,对数据做的预处理函数, 用户可以在 ../fdquant/dataset.py 中修改或自行编写新的预处理函数 - -#量化蒸馏训练配置 -Distillation: - alpha: 1.0 #蒸馏loss所占权重 - loss: soft_label #蒸馏loss算法 - -Quantization: - onnx_format: true #是否采用ONNX量化标准格式, 要在FastDeploy上部署, 必须选true - use_pact: true #量化训练是否使用PACT方法 - activation_quantize_type: 'moving_average_abs_max' #激活量化方式 - quantize_op_types: #需要进行量化的OP - - conv2d - - depthwise_conv2d - -#离线量化配置 -PTQ: - calibration_method: 'avg' #离线量化的激活校准算法, 可选: avg, abs_max, hist, KL, mse, emd - skip_tensor_list: None #用户可指定跳过某些conv层,不进行量化 - -#训练参数配置 -TrainConfig: - train_iter: 3000 - learning_rate: 0.00001 - optimizer_builder: - optimizer: - type: SGD - weight_decay: 4.0e-05 - target_metric: 0.365 - -``` -## 更多详细配置方法 - -FastDeploy一键量化功能由PaddeSlim助力, 更详细的量化配置方法请参考: -[自动化压缩超参详细教程](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/hyperparameter_tutorial.md) diff --git a/tools/quantization/fdquant/dataset.py b/tools/quantization/fdquant/dataset.py deleted file mode 100644 index a373d973d..000000000 --- a/tools/quantization/fdquant/dataset.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import cv2 -import os -import numpy as np -import paddle - - -def generate_scale(im, target_shape): - origin_shape = im.shape[:2] - im_size_min = np.min(origin_shape) - im_size_max = np.max(origin_shape) - target_size_min = np.min(target_shape) - target_size_max = np.max(target_shape) - im_scale = float(target_size_min) / float(im_size_min) - if np.round(im_scale * im_size_max) > target_size_max: - im_scale = float(target_size_max) / float(im_size_max) - im_scale_x = im_scale - im_scale_y = im_scale - - return im_scale_y, im_scale_x - - -def yolo_image_preprocess(img, target_shape=[640, 640]): - # Resize image - im_scale_y, im_scale_x = generate_scale(img, target_shape) - img = cv2.resize( - img, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=cv2.INTER_LINEAR) - # Pad - im_h, im_w = img.shape[:2] - h, w = target_shape[:] - if h != im_h or w != im_w: - canvas = np.ones((h, w, 3), dtype=np.float32) - canvas *= np.array([114.0, 114.0, 114.0], dtype=np.float32) - canvas[0:im_h, 0:im_w, :] = img.astype(np.float32) - img = canvas - img = np.transpose(img / 255, [2, 0, 1]) - - return img.astype(np.float32) - - -def cls_resize_short(img, target_size): - - img_h, img_w = img.shape[:2] - percent = float(target_size) / min(img_w, img_h) - w = int(round(img_w * percent)) - h = int(round(img_h * percent)) - - return cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) - - -def crop_image(img, target_size, center): - - height, width = img.shape[:2] - size = target_size - - if center == True: - w_start = (width - size) // 2 - h_start = (height - size) // 2 - else: - w_start = np.random.randint(0, width - size + 1) - h_start = np.random.randint(0, height - size + 1) - w_end = w_start + size - h_end = h_start + size - - return img[h_start:h_end, w_start:w_end, :] - - -def cls_image_preprocess(img): - - # resize - img = cls_resize_short(img, target_size=256) - # crop - img = crop_image(img, target_size=224, center=True) - - #ToCHWImage & Normalize - img = np.transpose(img / 255, [2, 0, 1]) - - img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) - img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) - img -= img_mean - img /= img_std - - return img.astype(np.float32) - - -def ppdet_resize_no_keepratio(img, target_shape=[640, 640]): - im_shape = img.shape - - resize_h, resize_w = target_shape - im_scale_y = resize_h / im_shape[0] - im_scale_x = resize_w / im_shape[1] - - scale_factor = np.asarray([im_scale_y, im_scale_x], dtype=np.float32) - return cv2.resize( - img, None, None, fx=im_scale_x, fy=im_scale_y, - interpolation=2), scale_factor - - -def ppdet_normliaze(img, is_scale=True): - - mean = [0.485, 0.456, 0.406] - std = [0.229, 0.224, 0.225] - img = img.astype(np.float32, copy=False) - - if is_scale: - scale = 1.0 / 255.0 - img *= scale - - mean = np.array(mean)[np.newaxis, np.newaxis, :] - std = np.array(std)[np.newaxis, np.newaxis, :] - img -= mean - img /= std - return img - - -def hwc_to_chw(img): - img = img.transpose((2, 0, 1)) - return img - - -def ppdet_image_preprocess(img): - - img, scale_factor = ppdet_resize_no_keepratio(img, target_shape=[640, 640]) - - img = np.transpose(img / 255, [2, 0, 1]) - - img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) - img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) - img -= img_mean - img /= img_std - - return img.astype(np.float32), scale_factor diff --git a/tools/quantization/setup.py b/tools/quantization/setup.py deleted file mode 100644 index a0c0c2fc0..000000000 --- 
a/tools/quantization/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -import setuptools -import fdquant - -long_description = "FDQuant is a toolkit for model quantization of FastDeploy.\n\n" -long_description += "Usage: fastdeploy_quant --config_path=./yolov7_tiny_qat_dis.yaml --method='QAT' --save_dir='../v7_qat_outmodel/' \n" - -with open("requirements.txt") as fin: - REQUIRED_PACKAGES = fin.read() - -setuptools.setup( - name="fastdeploy-quantization", # name of package - description="A toolkit for model quantization of FastDeploy.", - long_description=long_description, - long_description_content_type="text/plain", - packages=setuptools.find_packages(), - install_requires=REQUIRED_PACKAGES, - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - ], - license='Apache 2.0', - entry_points={ - 'console_scripts': ['fastdeploy_quant=fdquant.fdquant:main', ] - })