From dc5b4be7a90dd308c80522126e0d56bc041bc4bd Mon Sep 17 00:00:00 2001
From: Jason <jiangjiajun@baidu.com>
Date: Mon, 20 Feb 2023 17:24:12 +0800
Subject: [PATCH 01/20] [Other] Enable to hidden INFO/WARNING log informations
 (#1368)

* Enable to hidden INFO/WARNING log informations

* Fix build error

---------

Co-authored-by: root <root@bjyz-sys-gpu-kongming3.bjyz.baidu.com>
---
 fastdeploy/function/gather_scatter_along_axis.h |  3 ++-
 fastdeploy/pybind/main.cc.in                    |  2 ++
 fastdeploy/utils/utils.cc                       | 13 +++++++++++--
 fastdeploy/utils/utils.h                        | 12 ++++++++++--
 python/fastdeploy/__init__.py                   | 11 +++++++++++
 5 files changed, 36 insertions(+), 5 deletions(-)
diff --git a/fastdeploy/function/gather_scatter_along_axis.h b/fastdeploy/function/gather_scatter_along_axis.h
index bd1093af1..fa627a411 100644
--- a/fastdeploy/function/gather_scatter_along_axis.h
+++ b/fastdeploy/function/gather_scatter_along_axis.h
@@ -26,7 +26,8 @@ namespace function {
     @param out The output tensor which stores the result.
     @param axis Axis which will be gathered.
 */
-void GatherAlongAxis(const FDTensor& x, const FDTensor& index, FDTensor* result,
+FASTDEPLOY_DECL void GatherAlongAxis(const FDTensor& x,
+                    const FDTensor& index, FDTensor* result,
                      int axis);
 
 }  // namespace function
diff --git a/fastdeploy/pybind/main.cc.in b/fastdeploy/pybind/main.cc.in
index 5da3ef9fc..5e11ee808 100755
--- a/fastdeploy/pybind/main.cc.in
+++ b/fastdeploy/pybind/main.cc.in
@@ -156,6 +156,8 @@ PYBIND11_MODULE(@PY_LIBRARY_NAME@, m) {
       "Make programer easier to deploy deeplearning model, save time to save "
       "the world!";
 
+  m.def("set_logger", &SetLogger);
+
   BindFDTensor(m);
   BindRuntime(m);
   BindFDModel(m);
diff --git a/fastdeploy/utils/utils.cc b/fastdeploy/utils/utils.cc
index 760c10406..c39b6adab 100644
--- a/fastdeploy/utils/utils.cc
+++ b/fastdeploy/utils/utils.cc
@@ -13,18 +13,27 @@
 // limitations under the License.
 
 #include "fastdeploy/utils/utils.h"
+
 #include <sstream>
 
 namespace fastdeploy {
 
+bool FDLogger::enable_info = true;
+bool FDLogger::enable_warning = true;
+
+void SetLogger(bool enable_info, bool enable_warning) {
+  FDLogger::enable_info = enable_info;
+  FDLogger::enable_warning = enable_warning;
+}
+
 FDLogger::FDLogger(bool verbose, const std::string& prefix) {
   verbose_ = verbose;
   line_ = "";
-#ifdef __ANDROID__  
+#ifdef __ANDROID__
   prefix_ = std::string("[FastDeploy]") + prefix;
 #else
   prefix_ = prefix;
-#endif  
+#endif
 }
 
 FDLogger& FDLogger::operator<<(std::ostream& (*os)(std::ostream&)) {
diff --git a/fastdeploy/utils/utils.h b/fastdeploy/utils/utils.h
index d44b7f187..cfc666bb2 100644
--- a/fastdeploy/utils/utils.h
+++ b/fastdeploy/utils/utils.h
@@ -43,6 +43,9 @@ namespace fastdeploy {
 
 class FASTDEPLOY_DECL FDLogger {
  public:
+  static bool enable_info;
+  static bool enable_warning;
+
   FDLogger() {
     line_ = "";
     prefix_ = "[FastDeploy]";
@@ -90,11 +93,12 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
       << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
 
 #define FDWARNING                                                              \
-  FDLogger(true, "[WARNING]")                                                  \
+  FDLogger(fastdeploy::FDLogger::enable_warning, "[WARNING]")                  \
       << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
 
 #define FDINFO                                                                 \
-  FDLogger(true, "[INFO]") << __REL_FILE__ << "(" << __LINE__                  \
+  FDLogger(fastdeploy::FDLogger::enable_info, "[INFO]")                        \
+                           << __REL_FILE__ << "(" << __LINE__                  \
                            << ")::" << __FUNCTION__ << "\t"
 
 #define FDASSERT(condition, format, ...)                                       \
@@ -214,6 +218,10 @@ std::string Str(const std::vector<T>& shape) {
   return oss.str();
 }
 
+/// Enable or disable INFO and WARNING log output (both enabled by default)
+FASTDEPLOY_DECL void SetLogger(bool enable_info = true,
+                               bool enable_warning = true);
+
 template <typename T>
 void CalculateStatisInfo(const void* src_ptr, int size, double* mean,
                          double* max, double* min) {
diff --git a/python/fastdeploy/__init__.py b/python/fastdeploy/__init__.py
index 1d9640c7b..ac982fcfb 100755
--- a/python/fastdeploy/__init__.py
+++ b/python/fastdeploy/__init__.py
@@ -30,6 +30,17 @@ from .c_lib_wrap import (
     is_built_with_trt,
     get_default_cuda_directory, )
 
+
+def set_logger(enable_info=True, enable_warning=True):
+    """Set behaviour of logger while using FastDeploy
+
+    :param enable_info: (bool) Whether to print INFO-level logs
+    :param enable_warning: (bool) Whether to print WARNING-level logs; keeping this True is recommended
+    """
+    from .c_lib_wrap import set_logger
+    set_logger(enable_info, enable_warning)
+
+
 from .runtime import Runtime, RuntimeOption
 from .model import FastDeployModel
 from . import c_lib_wrap as C

From 9593c15ad895fa69dd2a926dbd8027ae8ab26d55 Mon Sep 17 00:00:00 2001
From: Zheng-Bicheng <58363586+Zheng-Bicheng@users.noreply.github.com>
Date: Mon, 20 Feb 2023 18:19:11 +0800
Subject: [PATCH 02/20] [Doc]Rknn docs (#1376)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 按照要求更新

* update rknn docs
---
 docs/cn/faq/rknpu2/build.md | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/docs/cn/faq/rknpu2/build.md b/docs/cn/faq/rknpu2/build.md
index 3334387b7..bd6a636a8 100644
--- a/docs/cn/faq/rknpu2/build.md
+++ b/docs/cn/faq/rknpu2/build.md
@@ -11,9 +11,11 @@ FastDeploy当前在RK平台上支持后端引擎如下:
 
 ## 编译FastDeploy SDK
 
+针对RK356X和RK3588的性能差异，我们提供了两种编译FastDeploy的方式。
+
 ### 板端编译FastDeploy C++ SDK
 
-RKNPU2暂时仅支持linux系统, 以下教程在RK3568(debian 10)、RK3588(debian 11) 环境下完成。
+针对RK3588，其CPU性能较强，板端编译的速度还是可以接受的，我们推荐在板端上进行编译。以下教程在RK356X(debian10),RK3588(debian 11) 环境下完成。
 
 ```bash
 git clone https://github.com/PaddlePaddle/FastDeploy.git
@@ -33,6 +35,9 @@ make install
 ```
 
 ### 交叉编译FastDeploy C++ SDK
+
+针对RK356X，其CPU性能较弱，我们推荐使用交叉编译进行编译。以下教程在Ubuntu 22.04环境下完成。
+
 ```bash
 git clone https://github.com/PaddlePaddle/FastDeploy.git
 cd FastDeploy
@@ -54,9 +59,11 @@ make -j8
 make install
 ```
 
+如果你找不到编译工具，可以点击[交叉编译工具](https://bj.bcebos.com/paddle2onnx/libs/gcc-linaro-6.3.1-2017.zip)进行下载。
+
 ### 板端编译Python SDK
 
-RKNPU2暂时仅支持linux系统, 以下教程在RK3568(debian 10)、RK3588(debian 11) 环境下完成。Python打包依赖`wheel`，编译前请先执行`pip install wheel`
+Python SDK的编译暂时仅支持板端编译, 以下教程在RK3568(debian 10)、RK3588(debian 11) 环境下完成。Python打包依赖`wheel`，编译前请先执行`pip install wheel`
 
 ```bash
 git clone https://github.com/PaddlePaddle/FastDeploy.git
@@ -69,8 +76,15 @@ cd python
 export ENABLE_ORT_BACKEND=ON
 export ENABLE_RKNPU2_BACKEND=ON
 export ENABLE_VISION=ON
+
+# 请根据你的开发板的不同，选择RK3588或RK356X
 export RKNN2_TARGET_SOC=RK3588
+
+# 如果你的核心板的运行内存大于等于8G，我们建议您执行以下命令进行编译。
 python3 setup.py build
+# 值得注意的是，如果你的核心板的运行内存小于8G，我们建议您执行以下命令进行编译。
+python3 setup.py build -j1
+
 python3 setup.py bdist_wheel
 cd dist
 pip3 install fastdeploy_python-0.0.0-cp39-cp39-linux_aarch64.whl

From 7dc15e4a7788e64fa9f67bcd90317459025cc606 Mon Sep 17 00:00:00 2001
From: yunyaoXYY <109218879+yunyaoXYY@users.noreply.github.com>
Date: Mon, 20 Feb 2023 18:32:49 +0800
Subject: [PATCH 03/20] [Other] Remove useless OpenCV usage for PP-OCR. (#1375)

* Add Huawei Ascend NPU deploy through PaddleLite CANN

* Add NNAdapter interface for paddlelite

* Modify Huawei Ascend Cmake

* Update way for compiling Huawei Ascend NPU deployment

* remove UseLiteBackend in UseCANN

* Support compile python whlee

* Change names of nnadapter API

* Add nnadapter pybind and remove useless API

* Support Python deployment on Huawei Ascend NPU

* Fix links in readme

* Fix links in readme

* Update PPOCRv2/v3 examples

* Update auto compression configs

* Add neww quantization  support for paddleclas model

* Update quantized Yolov6s model download link

* Improve PPOCR comments

* Add models suppor for ascend

* Add PPOCR rec reszie for ascend

* fix conflict for ascend

* Rename CANN to Ascend

* Rename CANN to Ascend

* Improve ascend

* fix ascend bug

* improve ascend docs

* improve ascend docs

* improve ascend docs

* Add English doc for quantization

* Improve Ascend

* Improve Ascend

* Move ascend python demo

* Imporve ascend

* Fix PPOCR rec model bug

* Improve ascend

* Improve ascend

* Improve ascend

* Improve ascend

* Add  new paddleseg quantization support

* Add  new paddleseg quantization support

* Add  new paddleseg quantization support

* Add  new paddleseg quantization support

* Imporve ascend

* Imporve ascend

* Improve ascend

* acc eval script

* acc eval

* remove acc_eval from branch huawei

* Add detection and segmentation examples for Ascend deployment

* Add detection and segmentation examples for Ascend deployment

* Add Ascend model list

* Add ascend model list

* Add ascend model list

* Add ascend model list

* Add ascend model list

* Add ascend model list

* Add ascend model list

* Add PPOCR example for ascend deploy

* Imporve paddle lite compiliation

* Add FlyCV doc

* Add FlyCV doc

* Add FlyCV doc

* Imporve Ascend docs

* Imporve Ascend docs

* Improve PPOCR example

* Support Ascend deployment on X86 platform

* Improve Ascend docs

* Improve ascend

* Improve ascend

* Change Paddle Lite Ascend URL

* fix ascend docs

* fix ascend docs

* Fix Paddle Lite Ascend Lib

* Imporve compile of Ascend

* Imporve compile of Ascend

* fix paddle lite compile

* Remove useless opencv code for ocr
---
 fastdeploy/vision/ocr/ppocr/utils/ocr_postprocess_op.h | 1 -
 fastdeploy/vision/ocr/ppocr/utils/ocr_utils.h          | 1 -
 2 files changed, 2 deletions(-)

diff --git a/fastdeploy/vision/ocr/ppocr/utils/ocr_postprocess_op.h b/fastdeploy/vision/ocr/ppocr/utils/ocr_postprocess_op.h
index 5900daea2..778f618ed 100644
--- a/fastdeploy/vision/ocr/ppocr/utils/ocr_postprocess_op.h
+++ b/fastdeploy/vision/ocr/ppocr/utils/ocr_postprocess_op.h
@@ -20,7 +20,6 @@
 #include <ostream>
 #include <vector>
 #include "opencv2/core.hpp"
-#include "opencv2/imgcodecs.hpp"
 #include "opencv2/imgproc.hpp"
 
 #include <cstring>
diff --git a/fastdeploy/vision/ocr/ppocr/utils/ocr_utils.h b/fastdeploy/vision/ocr/ppocr/utils/ocr_utils.h
index f12f40f71..101926cb5 100755
--- a/fastdeploy/vision/ocr/ppocr/utils/ocr_utils.h
+++ b/fastdeploy/vision/ocr/ppocr/utils/ocr_utils.h
@@ -21,7 +21,6 @@
 #include "fastdeploy/vision/common/result.h"
 
 #include "opencv2/core.hpp"
-#include "opencv2/imgcodecs.hpp"
 #include "opencv2/imgproc.hpp"
 
 namespace fastdeploy {

From c3aee4f7f040d050e27ddb02c55fb677ff5bd9f7 Mon Sep 17 00:00:00 2001
From: Jason <jiangjiajun@baidu.com>
Date: Mon, 20 Feb 2023 18:52:50 +0800
Subject: [PATCH 04/20] [Doc] Add cmake version notice in Jetson doc (#1379)

* Update jetson.md

* Update jetson.md
---
 docs/cn/build_and_install/jetson.md | 2 ++
 docs/en/build_and_install/jetson.md | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/docs/cn/build_and_install/jetson.md b/docs/cn/build_and_install/jetson.md
index f2579be5b..7fa221727 100644
--- a/docs/cn/build_and_install/jetson.md
+++ b/docs/cn/build_and_install/jetson.md
@@ -4,6 +4,8 @@
 
 FastDeploy当前在Jetson仅支持ONNX Runtime CPU和TensorRT GPU/Paddle Inference三种后端推理
 
+- 如果编译过程中出现错误提示`Could not find a package configuration file provided by "Python" with any of the following names: PythonConfig.cmake python-config.cmake`，请尝试将[cmake升级至3.25或更新版本](https://cmake.org/download/)来解决。
+
 ## C++ SDK编译安装
 
 编译需满足
diff --git a/docs/en/build_and_install/jetson.md b/docs/en/build_and_install/jetson.md
index ddbb67f22..0826ecc03 100644
--- a/docs/en/build_and_install/jetson.md
+++ b/docs/en/build_and_install/jetson.md
@@ -4,6 +4,8 @@ English | [中文](../../cn/build_and_install/jetson.md)
 
 FastDeploy supports CPU inference with ONNX Runtime and GPU inference with Nvidia TensorRT/Paddle Inference on Nvidia Jetson platform
 
+- If the build fails with the error `Could not find a package configuration file provided by "Python" with any of the following names: PythonConfig.cmake python-config.cmake`, please try [upgrading cmake to 3.25 or a newer version](https://cmake.org/download/) to solve the problem.
+
 ## How to Build and Install FastDeploy C++ Library
 
 Prerequisite for Compiling on NVIDIA Jetson:

From 08aa209ea4f4275710482df491e99fed40bc5932 Mon Sep 17 00:00:00 2001
From: huangjianhui <852142024@qq.com>
Date: Mon, 20 Feb 2023 19:44:51 +0800
Subject: [PATCH 05/20] [Doc] Fix wrong word (#1380)

* Update README_CN.md

Wrong word

* Update README_EN.md

Wrong word
---
 README_CN.md | 2 +-
 README_EN.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README_CN.md b/README_CN.md
index c03ee7b54..c5a7b2633 100755
--- a/README_CN.md
+++ b/README_CN.md
@@ -44,7 +44,7 @@
 </div>
 
 
- **⚡️FastDeploy**是一款**全场景**、**易用灵活**、**极致高效**的AI推理部署工具， 支持**云边端**部署。提供超过 🔥160+ **Text**，**Vision**， **Speech**和**跨模态**模型📦**开箱即用**的部署体验，并实现🔚**端到端**的推理性能优化。包括 [物体检测](./examples/vision/detection)、[字符识别（OCR）](./examples/vision/ocr)、[人脸](./examples/vision/facedet)、[人像扣图](./examples/vision/matting)、[多目标跟踪系统](./examples/vision/tracking/pptracking)、[NLP](./examples/text)、[Stable Difussion文图生成](./examples/multimodal/stable_diffusion)、[TTS](./examples/audio/pp-tts) 等几十种任务场景，满足开发者**多场景、多硬件、多平台**的产业部署需求。
+ **⚡️FastDeploy**是一款**全场景**、**易用灵活**、**极致高效**的AI推理部署工具， 支持**云边端**部署。提供超过 🔥160+ **Text**，**Vision**， **Speech**和**跨模态**模型📦**开箱即用**的部署体验，并实现🔚**端到端**的推理性能优化。包括 [物体检测](./examples/vision/detection)、[字符识别（OCR）](./examples/vision/ocr)、[人脸](./examples/vision/facedet)、[人像扣图](./examples/vision/matting)、[多目标跟踪系统](./examples/vision/tracking/pptracking)、[NLP](./examples/text)、[Stable Diffusion文图生成](./examples/multimodal/stable_diffusion)、[TTS](./examples/audio/pp-tts) 等几十种任务场景，满足开发者**多场景、多硬件、多平台**的产业部署需求。
 
 <div align="center">
     
diff --git a/README_EN.md b/README_EN.md
index 48e66e506..7761fd895 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -41,7 +41,7 @@ English | [简体中文](README_CN.md) | [हिन्दी](./docs/docs_i18n/R
 </div>
 
 **⚡️FastDeploy** is an **Easy-to-use** and **High Performance** AI model deployment toolkit for Cloud, Mobile and Edge with 📦**out-of-the-box and unified experience**, 🔚**end-to-end optimization** for over **🔥160+ Text, Vision, Speech and Cross-modal AI models**.
-Including [image classification](examples/vision/classification), [object detection](examples/vision/detection), [OCR](./examples/vision/ocr), [face detection](./examples/vision/facedet), [matting](./examples/vision/matting), [pp-tracking](./examples/vision/tracking/pptracking), [NLP](./examples/text), [stable difussion](./examples/multimodal/stable_diffusion), [TTS](./examples/audio/pp-tts) and other tasks to meet developers' industrial deployment needs for **multi-scenario**, **multi-hardware** and **multi-platform**.
+Including [image classification](examples/vision/classification), [object detection](examples/vision/detection), [OCR](./examples/vision/ocr), [face detection](./examples/vision/facedet), [matting](./examples/vision/matting), [pp-tracking](./examples/vision/tracking/pptracking), [NLP](./examples/text), [stable diffusion](./examples/multimodal/stable_diffusion), [TTS](./examples/audio/pp-tts) and other tasks to meet developers' industrial deployment needs for **multi-scenario**, **multi-hardware** and **multi-platform**.
 
 <div align="center">
     

From f212f5c5926c0fcacef2e0c1a4a687c1045c9c2e Mon Sep 17 00:00:00 2001
From: tianshaoqing <tianshaoqing@baidu.com>
Date: Mon, 20 Feb 2023 21:49:58 +0800
Subject: [PATCH 06/20] [Backend] update poros

---
 .gitmodules                                   |  6 ++
 poros/CMakeLists.txt                          | 56 ++++++-----
 poros/README.md                               |  2 +-
 poros/{src => }/poros/compile/compile.cpp     | 12 +--
 poros/{src => }/poros/compile/compile.h       |  0
 .../{src => }/poros/compile/graph_prewarm.cpp |  0
 poros/{src => }/poros/compile/graph_prewarm.h |  0
 .../{src => }/poros/compile/graph_segment.cpp | 18 ++++
 poros/{src => }/poros/compile/graph_segment.h |  0
 .../poros/compile/ivalues_analysis.cpp        |  0
 .../poros/compile/ivalues_analysis.h          |  0
 poros/{src => }/poros/compile/partition.cpp   |  0
 poros/{src => }/poros/compile/partition.h     |  0
 .../{src => }/poros/compile/poros_module.cpp  |  0
 poros/{src => }/poros/compile/poros_module.h  |  0
 poros/{src => }/poros/compile/segment.cpp     |  0
 poros/{src => }/poros/compile/segment.h       |  0
 .../{src => }/poros/context/poros_global.cpp  |  0
 poros/{src => }/poros/context/poros_global.h  |  0
 .../poros/converter/gpu/activation.cpp        |  0
 .../poros/converter/gpu/activation.h          |  0
 poros/{src => }/poros/converter/gpu/add.cpp   |  0
 poros/{src => }/poros/converter/gpu/add.h     |  0
 .../poros/converter/gpu/aten_eval.cpp         |  4 +
 .../{src => }/poros/converter/gpu/aten_eval.h |  0
 .../poros/converter/gpu/aten_trt_util.cpp     |  0
 .../poros/converter/gpu/aten_trt_util.h       |  0
 .../poros/converter/gpu/batch_norm.cpp        |  0
 .../poros/converter/gpu/batch_norm.h          |  0
 poros/{src => }/poros/converter/gpu/clone.cpp |  0
 poros/{src => }/poros/converter/gpu/clone.h   |  0
 .../poros/converter/gpu/coercion.cpp          |  0
 .../{src => }/poros/converter/gpu/coercion.h  |  0
 .../{src => }/poros/converter/gpu/concat.cpp  |  0
 poros/{src => }/poros/converter/gpu/concat.h  |  0
 .../poros/converter/gpu/constant.cpp          |  0
 .../{src => }/poros/converter/gpu/constant.h  |  0
 .../poros/converter/gpu/constant_pad_nd.cpp   |  0
 .../poros/converter/gpu/constant_pad_nd.h     |  0
 .../poros/converter/gpu/converter_util.cpp    |  0
 .../poros/converter/gpu/converter_util.h      |  0
 .../poros/converter/gpu/convolution.cpp       |  0
 .../poros/converter/gpu/convolution.h         |  0
 .../{src => }/poros/converter/gpu/einsum.cpp  |  0
 poros/{src => }/poros/converter/gpu/einsum.h  |  0
 .../poros/converter/gpu/element_wise.cpp      |  0
 .../poros/converter/gpu/element_wise.h        |  0
 .../{src => }/poros/converter/gpu/expand.cpp  |  0
 poros/{src => }/poros/converter/gpu/expand.h  |  0
 .../poros/converter/gpu/generate.cpp          |  0
 .../{src => }/poros/converter/gpu/generate.h  |  0
 .../poros/converter/gpu/gpu_converter.cpp     |  0
 .../poros/converter/gpu/gpu_converter.h       |  0
 .../poros/converter/gpu/group_norm.cpp        |  0
 .../poros/converter/gpu/group_norm.h          |  0
 .../poros/converter/gpu/interpolate.cpp       |  0
 .../poros/converter/gpu/interpolate.h         |  0
 .../poros/converter/gpu/layer_norm.cpp        |  0
 .../poros/converter/gpu/layer_norm.h          |  0
 .../{src => }/poros/converter/gpu/linear.cpp  |  0
 poros/{src => }/poros/converter/gpu/linear.h  |  0
 poros/{src => }/poros/converter/gpu/list.cpp  |  0
 poros/{src => }/poros/converter/gpu/list.h    |  0
 .../{src => }/poros/converter/gpu/logical.cpp |  0
 poros/{src => }/poros/converter/gpu/logical.h |  0
 poros/{src => }/poros/converter/gpu/lstm.cpp  |  0
 poros/{src => }/poros/converter/gpu/lstm.h    |  0
 .../poros/converter/gpu/lstm_cell.cpp         |  0
 .../{src => }/poros/converter/gpu/lstm_cell.h |  0
 .../poros/converter/gpu/matrix_multiply.cpp   |  0
 .../poros/converter/gpu/matrix_multiply.h     |  0
 .../poros/converter/gpu/meshgrid.cpp          |  0
 .../{src => }/poros/converter/gpu/meshgrid.h  |  0
 .../{src => }/poros/converter/gpu/mul_div.cpp |  0
 poros/{src => }/poros/converter/gpu/mul_div.h |  0
 .../poros/converter/gpu/non_converterable.cpp |  0
 .../poros/converter/gpu/non_converterable.h   |  0
 poros/{src => }/poros/converter/gpu/norm.cpp  |  0
 poros/{src => }/poros/converter/gpu/norm.h    |  0
 .../gpu/plugins/interpolate_plugin.cpp        |  0
 .../gpu/plugins/interpolate_plugin.h          |  0
 .../{src => }/poros/converter/gpu/pooling.cpp |  0
 poros/{src => }/poros/converter/gpu/pooling.h |  0
 .../{src => }/poros/converter/gpu/reduce.cpp  | 65 +++++++++++++
 poros/{src => }/poros/converter/gpu/reduce.h  | 20 ++++
 .../poros/converter/gpu/reflection_pad.cpp    |  0
 .../poros/converter/gpu/reflection_pad.h      |  0
 .../poros/converter/gpu/replication_pad.cpp   |  0
 .../poros/converter/gpu/replication_pad.h     |  0
 poros/{src => }/poros/converter/gpu/roll.cpp  |  0
 poros/{src => }/poros/converter/gpu/roll.h    |  0
 .../{src => }/poros/converter/gpu/select.cpp  |  0
 poros/{src => }/poros/converter/gpu/select.h  |  0
 .../poros/converter/gpu/shape_handle.cpp      |  0
 .../poros/converter/gpu/shape_handle.h        |  0
 .../{src => }/poros/converter/gpu/shuffle.cpp |  0
 poros/{src => }/poros/converter/gpu/shuffle.h |  0
 .../{src => }/poros/converter/gpu/softmax.cpp |  0
 poros/{src => }/poros/converter/gpu/softmax.h |  0
 .../{src => }/poros/converter/gpu/squeeze.cpp |  0
 poros/{src => }/poros/converter/gpu/squeeze.h |  0
 poros/{src => }/poros/converter/gpu/stack.cpp |  0
 poros/{src => }/poros/converter/gpu/stack.h   |  0
 poros/{src => }/poros/converter/gpu/to.cpp    |  0
 poros/{src => }/poros/converter/gpu/to.h      |  0
 poros/{src => }/poros/converter/gpu/topk.cpp  |  0
 poros/{src => }/poros/converter/gpu/topk.h    |  0
 poros/{src => }/poros/converter/gpu/unary.cpp |  0
 poros/{src => }/poros/converter/gpu/unary.h   |  0
 .../{src => }/poros/converter/gpu/weight.cpp  |  0
 poros/{src => }/poros/converter/gpu/weight.h  |  0
 poros/{src => }/poros/converter/iconverter.h  |  3 +
 poros/{src => }/poros/engine/engine.cpp       |  0
 poros/{src => }/poros/engine/engine_context.h |  0
 poros/{src => }/poros/engine/iengine.h        |  0
 .../poros/engine/tensorrt_engine.cpp          |  0
 .../{src => }/poros/engine/tensorrt_engine.h  |  0
 .../{src => }/poros/engine/trtengine_util.cpp |  0
 poros/{src => }/poros/engine/trtengine_util.h |  0
 .../{src => }/poros/iplugin/plugin_create.cpp |  0
 poros/{src => }/poros/iplugin/plugin_create.h |  0
 poros/{src => }/poros/log/poros_logging.h     |  0
 .../{src => }/poros/log/tensorrt_logging.cpp  |  0
 poros/{src => }/poros/log/tensorrt_logging.h  |  0
 .../lowering/eliminate_exception_pass.cpp     |  0
 .../eliminate_maxpoll_with_indices.cpp        |  0
 .../eliminate_simple_useless_nodes.cpp        |  0
 .../poros/lowering/eliminate_some_dict.cpp    |  0
 .../poros/lowering/eliminate_some_list.cpp    |  0
 .../eliminate_subgraph_uesless_nodes.cpp      |  0
 .../poros/lowering/eliminate_useless_copy.cpp |  0
 poros/{src => }/poros/lowering/fuse_clip.cpp  |  0
 poros/{src => }/poros/lowering/fuse_clip.h    |  0
 .../{src => }/poros/lowering/fuse_conv_bn.cpp |  0
 poros/{src => }/poros/lowering/fuse_conv_bn.h |  0
 .../poros/lowering/fuse_conv_mul.cpp          |  0
 .../{src => }/poros/lowering/fuse_conv_mul.h  |  0
 poros/{src => }/poros/lowering/fuse_copy.cpp  |  0
 poros/{src => }/poros/lowering/fuse_copy.h    |  0
 poros/{src => }/poros/lowering/fuse_gelu.cpp  |  0
 poros/{src => }/poros/lowering/fuse_gelu.h    |  0
 .../poros/lowering/fuse_hard_swish.cpp        |  0
 .../poros/lowering/fuse_hard_swish.h          |  0
 .../poros/lowering/fuse_meshgrid.cpp          |  0
 .../{src => }/poros/lowering/fuse_meshgrid.h  |  0
 .../poros/lowering/input_param_propagate.cpp  |  0
 .../poros/lowering/link_mutable_list_pass.cpp |  0
 .../{src => }/poros/lowering/lowering_pass.h  |  0
 .../{src => }/poros/lowering/op_fuse_pass.cpp |  0
 poros/{src => }/poros/lowering/op_fuse_pass.h |  0
 .../remove_simple_type_profile_nodes.cpp      |  0
 .../lowering/replace_illegal_constant.cpp     |  0
 .../{src => }/poros/lowering/replace_pad.cpp  |  0
 .../lowering/segment_post_processing.cpp      |  0
 .../poros/lowering/segment_post_processing.h  |  0
 .../poros/lowering/try_to_freeze_aten_dim.cpp |  0
 .../poros/lowering/try_to_freeze_aten_len.cpp |  0
 .../lowering/try_to_freeze_aten_size.cpp      |  0
 .../lowering/try_to_freeze_list_construct.cpp |  0
 .../lowering/try_to_freeze_percentformat.cpp  |  0
 .../poros/lowering/unpack_certain_ops.cpp     |  0
 .../poros/lowering/unrolling_loop.cpp         |  0
 .../poros/util/graph_test_helper.cpp          |  0
 .../{src => }/poros/util/graph_test_helper.h  |  0
 poros/{src => }/poros/util/macros.h           |  0
 poros/{src => }/poros/util/poros_util.cpp     |  0
 poros/{src => }/poros/util/poros_util.h       |  0
 poros/{src => }/poros/util/test_util.cpp      |  0
 poros/{src => }/poros/util/test_util.h        |  0
 poros/python/poros/_compile.py                |  2 +-
 poros/python/poros/_input_convert.py          | 35 +------
 poros/third_party/gflags                      |  1 +
 poros/third_party/googletest                  |  1 +
 poros/unittest/CMakeLists.txt                 | 10 +-
 poros/unittest/converter/reduce_test.cpp      | 95 ++++++++++++++++++-
 175 files changed, 257 insertions(+), 73 deletions(-)
 create mode 100644 .gitmodules
 rename poros/{src => }/poros/compile/compile.cpp (99%)
 rename poros/{src => }/poros/compile/compile.h (100%)
 rename poros/{src => }/poros/compile/graph_prewarm.cpp (100%)
 rename poros/{src => }/poros/compile/graph_prewarm.h (100%)
 rename poros/{src => }/poros/compile/graph_segment.cpp (98%)
 rename poros/{src => }/poros/compile/graph_segment.h (100%)
 rename poros/{src => }/poros/compile/ivalues_analysis.cpp (100%)
 rename poros/{src => }/poros/compile/ivalues_analysis.h (100%)
 rename poros/{src => }/poros/compile/partition.cpp (100%)
 rename poros/{src => }/poros/compile/partition.h (100%)
 rename poros/{src => }/poros/compile/poros_module.cpp (100%)
 rename poros/{src => }/poros/compile/poros_module.h (100%)
 rename poros/{src => }/poros/compile/segment.cpp (100%)
 rename poros/{src => }/poros/compile/segment.h (100%)
 rename poros/{src => }/poros/context/poros_global.cpp (100%)
 rename poros/{src => }/poros/context/poros_global.h (100%)
 rename poros/{src => }/poros/converter/gpu/activation.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/activation.h (100%)
 rename poros/{src => }/poros/converter/gpu/add.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/add.h (100%)
 rename poros/{src => }/poros/converter/gpu/aten_eval.cpp (95%)
 rename poros/{src => }/poros/converter/gpu/aten_eval.h (100%)
 rename poros/{src => }/poros/converter/gpu/aten_trt_util.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/aten_trt_util.h (100%)
 rename poros/{src => }/poros/converter/gpu/batch_norm.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/batch_norm.h (100%)
 rename poros/{src => }/poros/converter/gpu/clone.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/clone.h (100%)
 rename poros/{src => }/poros/converter/gpu/coercion.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/coercion.h (100%)
 rename poros/{src => }/poros/converter/gpu/concat.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/concat.h (100%)
 rename poros/{src => }/poros/converter/gpu/constant.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/constant.h (100%)
 rename poros/{src => }/poros/converter/gpu/constant_pad_nd.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/constant_pad_nd.h (100%)
 rename poros/{src => }/poros/converter/gpu/converter_util.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/converter_util.h (100%)
 rename poros/{src => }/poros/converter/gpu/convolution.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/convolution.h (100%)
 rename poros/{src => }/poros/converter/gpu/einsum.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/einsum.h (100%)
 rename poros/{src => }/poros/converter/gpu/element_wise.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/element_wise.h (100%)
 rename poros/{src => }/poros/converter/gpu/expand.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/expand.h (100%)
 rename poros/{src => }/poros/converter/gpu/generate.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/generate.h (100%)
 rename poros/{src => }/poros/converter/gpu/gpu_converter.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/gpu_converter.h (100%)
 rename poros/{src => }/poros/converter/gpu/group_norm.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/group_norm.h (100%)
 rename poros/{src => }/poros/converter/gpu/interpolate.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/interpolate.h (100%)
 rename poros/{src => }/poros/converter/gpu/layer_norm.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/layer_norm.h (100%)
 rename poros/{src => }/poros/converter/gpu/linear.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/linear.h (100%)
 rename poros/{src => }/poros/converter/gpu/list.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/list.h (100%)
 rename poros/{src => }/poros/converter/gpu/logical.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/logical.h (100%)
 rename poros/{src => }/poros/converter/gpu/lstm.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/lstm.h (100%)
 rename poros/{src => }/poros/converter/gpu/lstm_cell.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/lstm_cell.h (100%)
 rename poros/{src => }/poros/converter/gpu/matrix_multiply.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/matrix_multiply.h (100%)
 rename poros/{src => }/poros/converter/gpu/meshgrid.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/meshgrid.h (100%)
 rename poros/{src => }/poros/converter/gpu/mul_div.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/mul_div.h (100%)
 rename poros/{src => }/poros/converter/gpu/non_converterable.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/non_converterable.h (100%)
 rename poros/{src => }/poros/converter/gpu/norm.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/norm.h (100%)
 rename poros/{src => }/poros/converter/gpu/plugins/interpolate_plugin.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/plugins/interpolate_plugin.h (100%)
 rename poros/{src => }/poros/converter/gpu/pooling.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/pooling.h (100%)
 rename poros/{src => }/poros/converter/gpu/reduce.cpp (80%)
 rename poros/{src => }/poros/converter/gpu/reduce.h (89%)
 rename poros/{src => }/poros/converter/gpu/reflection_pad.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/reflection_pad.h (100%)
 rename poros/{src => }/poros/converter/gpu/replication_pad.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/replication_pad.h (100%)
 rename poros/{src => }/poros/converter/gpu/roll.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/roll.h (100%)
 rename poros/{src => }/poros/converter/gpu/select.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/select.h (100%)
 rename poros/{src => }/poros/converter/gpu/shape_handle.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/shape_handle.h (100%)
 rename poros/{src => }/poros/converter/gpu/shuffle.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/shuffle.h (100%)
 rename poros/{src => }/poros/converter/gpu/softmax.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/softmax.h (100%)
 rename poros/{src => }/poros/converter/gpu/squeeze.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/squeeze.h (100%)
 rename poros/{src => }/poros/converter/gpu/stack.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/stack.h (100%)
 rename poros/{src => }/poros/converter/gpu/to.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/to.h (100%)
 rename poros/{src => }/poros/converter/gpu/topk.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/topk.h (100%)
 rename poros/{src => }/poros/converter/gpu/unary.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/unary.h (100%)
 rename poros/{src => }/poros/converter/gpu/weight.cpp (100%)
 rename poros/{src => }/poros/converter/gpu/weight.h (100%)
 rename poros/{src => }/poros/converter/iconverter.h (98%)
 rename poros/{src => }/poros/engine/engine.cpp (100%)
 rename poros/{src => }/poros/engine/engine_context.h (100%)
 rename poros/{src => }/poros/engine/iengine.h (100%)
 rename poros/{src => }/poros/engine/tensorrt_engine.cpp (100%)
 rename poros/{src => }/poros/engine/tensorrt_engine.h (100%)
 rename poros/{src => }/poros/engine/trtengine_util.cpp (100%)
 rename poros/{src => }/poros/engine/trtengine_util.h (100%)
 rename poros/{src => }/poros/iplugin/plugin_create.cpp (100%)
 rename poros/{src => }/poros/iplugin/plugin_create.h (100%)
 rename poros/{src => }/poros/log/poros_logging.h (100%)
 rename poros/{src => }/poros/log/tensorrt_logging.cpp (100%)
 rename poros/{src => }/poros/log/tensorrt_logging.h (100%)
 rename poros/{src => }/poros/lowering/eliminate_exception_pass.cpp (100%)
 rename poros/{src => }/poros/lowering/eliminate_maxpoll_with_indices.cpp (100%)
 rename poros/{src => }/poros/lowering/eliminate_simple_useless_nodes.cpp (100%)
 rename poros/{src => }/poros/lowering/eliminate_some_dict.cpp (100%)
 rename poros/{src => }/poros/lowering/eliminate_some_list.cpp (100%)
 rename poros/{src => }/poros/lowering/eliminate_subgraph_uesless_nodes.cpp (100%)
 rename poros/{src => }/poros/lowering/eliminate_useless_copy.cpp (100%)
 rename poros/{src => }/poros/lowering/fuse_clip.cpp (100%)
 rename poros/{src => }/poros/lowering/fuse_clip.h (100%)
 rename poros/{src => }/poros/lowering/fuse_conv_bn.cpp (100%)
 rename poros/{src => }/poros/lowering/fuse_conv_bn.h (100%)
 rename poros/{src => }/poros/lowering/fuse_conv_mul.cpp (100%)
 rename poros/{src => }/poros/lowering/fuse_conv_mul.h (100%)
 rename poros/{src => }/poros/lowering/fuse_copy.cpp (100%)
 rename poros/{src => }/poros/lowering/fuse_copy.h (100%)
 rename poros/{src => }/poros/lowering/fuse_gelu.cpp (100%)
 rename poros/{src => }/poros/lowering/fuse_gelu.h (100%)
 rename poros/{src => }/poros/lowering/fuse_hard_swish.cpp (100%)
 rename poros/{src => }/poros/lowering/fuse_hard_swish.h (100%)
 rename poros/{src => }/poros/lowering/fuse_meshgrid.cpp (100%)
 rename poros/{src => }/poros/lowering/fuse_meshgrid.h (100%)
 rename poros/{src => }/poros/lowering/input_param_propagate.cpp (100%)
 rename poros/{src => }/poros/lowering/link_mutable_list_pass.cpp (100%)
 rename poros/{src => }/poros/lowering/lowering_pass.h (100%)
 rename poros/{src => }/poros/lowering/op_fuse_pass.cpp (100%)
 rename poros/{src => }/poros/lowering/op_fuse_pass.h (100%)
 rename poros/{src => }/poros/lowering/remove_simple_type_profile_nodes.cpp (100%)
 rename poros/{src => }/poros/lowering/replace_illegal_constant.cpp (100%)
 rename poros/{src => }/poros/lowering/replace_pad.cpp (100%)
 rename poros/{src => }/poros/lowering/segment_post_processing.cpp (100%)
 rename poros/{src => }/poros/lowering/segment_post_processing.h (100%)
 rename poros/{src => }/poros/lowering/try_to_freeze_aten_dim.cpp (100%)
 rename poros/{src => }/poros/lowering/try_to_freeze_aten_len.cpp (100%)
 rename poros/{src => }/poros/lowering/try_to_freeze_aten_size.cpp (100%)
 rename poros/{src => }/poros/lowering/try_to_freeze_list_construct.cpp (100%)
 rename poros/{src => }/poros/lowering/try_to_freeze_percentformat.cpp (100%)
 rename poros/{src => }/poros/lowering/unpack_certain_ops.cpp (100%)
 rename poros/{src => }/poros/lowering/unrolling_loop.cpp (100%)
 rename poros/{src => }/poros/util/graph_test_helper.cpp (100%)
 rename poros/{src => }/poros/util/graph_test_helper.h (100%)
 rename poros/{src => }/poros/util/macros.h (100%)
 rename poros/{src => }/poros/util/poros_util.cpp (100%)
 rename poros/{src => }/poros/util/poros_util.h (100%)
 rename poros/{src => }/poros/util/test_util.cpp (100%)
 rename poros/{src => }/poros/util/test_util.h (100%)
 create mode 160000 poros/third_party/gflags
 create mode 160000 poros/third_party/googletest

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..b9019f3f0
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "poros/third_party/googletest"]
+	path = poros/third_party/googletest
+	url = https://github.com/google/googletest.git
+[submodule "poros/third_party/gflags"]
+	path = poros/third_party/gflags
+	url = https://github.com/gflags/gflags.git
diff --git a/poros/CMakeLists.txt b/poros/CMakeLists.txt
index 0ed93f256..b332f85c7 100755
--- a/poros/CMakeLists.txt
+++ b/poros/CMakeLists.txt
@@ -1,8 +1,6 @@
 cmake_minimum_required(VERSION 3.21)
 project(poros)
 set(CMAKE_CXX_STANDARD 14)
-add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
-
 
 option(BUILD_STATIC "build lib${PROJECT_NAME}.a static lib" OFF)
 option(BUILD_KERNEL "build lib${PROJECT_NAME}-kernel.so shared lib" OFF)
@@ -11,7 +9,17 @@ option(BUILD_TOOL "build ${PROJECT_NAME}-tool, an executable binary output" OFF)
 option(TEST "build for test. copy '.so' to site-packages automatically after compile" OFF)
 option(DEBUG "build for debug. add '-g' flag to gcc for detailed debug information" ON)
 option(UT "build for unit test" OFF)
+option(ABI "build ${PROJECT_NAME} with application binary interface (ABI) is on" OFF)
 
+# ABI configuration: define _GLIBCXX_USE_CXX11_ABI=0 unless ABI is ON; ABI=ON cannot be combined with BUILD_TOOL or UT
+if (NOT ABI)
+    add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
+else ()
+    if (BUILD_TOOL OR UT)
+        message(FATAL_ERROR "${PROJECT_NAME}-tool or unit test are not supported when abi is on.")
+    endif()
+    add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
+endif ()
 
 # minimum requirements
 set(PYTHON_MINIMUM_VERSION 3.6)
@@ -49,33 +57,33 @@ find_package(CUDNN ${CUDNN_MINIMUM_VERSION} REQUIRED)
 
 ## release headers
 # engine
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/engine/iengine.h" "${PROJECT_SOURCE_DIR}/src/poros/engine/engine_context.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/engine/iengine.h" "${PROJECT_SOURCE_DIR}/poros/engine/engine_context.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/engine")
 # compile
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/compile/poros_module.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/compile/poros_module.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/compile")
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/compile/compile.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/compile/compile.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/compile")
 # converter
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/converter/iconverter.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/converter/iconverter.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/converter")
 # iplugin
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/iplugin/*.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/iplugin/*.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/iplugin")
 ## context
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/context/*.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/context/*.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/context")
 ## context
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/context/*.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/context/*.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/context")
 ## lowering
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/lowering/*.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/lowering/*.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/lowering")
 ## util
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/util/*.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/util/*.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/util")
 ## log
-file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/log/*.h")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/poros/log/*.h")
 file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/log")
 
 
@@ -83,8 +91,8 @@ include_directories(${TORCH_INCLUDE_DIRS})
 include_directories(${TensorRT_INCLUDE_DIRS})
 include_directories(${CUDA_INCLUDE_DIRS})
 include_directories(${CUDNN_INCLUDE_PATH})
-include_directories(src)
-include_directories(src/poros/compile)
+include_directories(${PROJECT_SOURCE_DIR})
+include_directories(poros/compile)
 
 
 add_compile_options(-D__const__= -D_GNU_SOURCE)
@@ -109,9 +117,9 @@ add_compile_options(
 
 file(
         GLOB POROS_CPP_FILES
-        "./src/poros/*/*.cpp"
-        "./src/poros/converter/*/*.cpp"
-        "./src/poros/converter/gpu/plugins/*.cpp"
+        "./poros/*/*.cpp"
+        "./poros/converter/*/*.cpp"
+        "./poros/converter/gpu/plugins/*.cpp"
 )
 
 
@@ -183,13 +191,13 @@ endif ()
 # kernel
 file(
         GLOB POROS_KERNEL_CPP_FILES
-        ./src/poros/compile/*.cpp
-        ./src/poros/context/*.cpp
-        ./src/poros/iplugin/*.cpp
-        ./src/poros/log/*.cpp
-        ./src/poros/lowering/*.cpp
-        ./src/poros/util/*.cpp
-        ./src/poros/engine/engine.cpp
+        ./poros/compile/*.cpp
+        ./poros/context/*.cpp
+        ./poros/iplugin/*.cpp
+        ./poros/log/*.cpp
+        ./poros/lowering/*.cpp
+        ./poros/util/*.cpp
+        ./poros/engine/engine.cpp
 )
 
 # kernel SHARED
diff --git a/poros/README.md b/poros/README.md
index 038ecdb04..75b36b623 100644
--- a/poros/README.md
+++ b/poros/README.md
@@ -43,7 +43,7 @@ get Poros source code:
 ```shell
 git clone https://github.com/PaddlePaddle/FastDeploy.git
 cd poros
-git submodule update --init --recursive --jobs 0 -f
+git submodule update --init --recursive
 ```
 
 We strongly recommend you to prepare the building environment with anaconda3:
diff --git a/poros/src/poros/compile/compile.cpp b/poros/poros/compile/compile.cpp
similarity index 99%
rename from poros/src/poros/compile/compile.cpp
rename to poros/poros/compile/compile.cpp
index 347adf7ca..d3515dc58 100644
--- a/poros/src/poros/compile/compile.cpp
+++ b/poros/poros/compile/compile.cpp
@@ -150,15 +150,15 @@ int Compiler::compile(const torch::jit::Module& origin_module,
         opt_graph = graph_and_ivalues.first;
     }
 
-    //cpu的话 过了预处理就返回
-    if (_options.device == Device::CPU) {
-        merge_graph_to_module(opt_graph, *optimized_module, true);
-        return 0;
-    }
-
     std::shared_ptr<torch::jit::Graph> prewarm_graph = graph_prewarm(opt_graph, prewarm_datas);
     GRAPH_DUMP("prewarmed_module graph:", prewarm_graph);
 
+    // for CPU, return right after the prewarm step
+    if (_options.device == Device::CPU) {
+        merge_graph_to_module(prewarm_graph, *optimized_module, true);
+        return 0;
+    }
+
     //step2: try to find segments in unfold module
     //划分完子图的模型
     int ret = segment_graph(prewarm_graph);
diff --git a/poros/src/poros/compile/compile.h b/poros/poros/compile/compile.h
similarity index 100%
rename from poros/src/poros/compile/compile.h
rename to poros/poros/compile/compile.h
diff --git a/poros/src/poros/compile/graph_prewarm.cpp b/poros/poros/compile/graph_prewarm.cpp
similarity index 100%
rename from poros/src/poros/compile/graph_prewarm.cpp
rename to poros/poros/compile/graph_prewarm.cpp
diff --git a/poros/src/poros/compile/graph_prewarm.h b/poros/poros/compile/graph_prewarm.h
similarity index 100%
rename from poros/src/poros/compile/graph_prewarm.h
rename to poros/poros/compile/graph_prewarm.h
diff --git a/poros/src/poros/compile/graph_segment.cpp b/poros/poros/compile/graph_segment.cpp
similarity index 98%
rename from poros/src/poros/compile/graph_segment.cpp
rename to poros/poros/compile/graph_segment.cpp
index 472ef1111..4efc77f76 100644
--- a/poros/src/poros/compile/graph_segment.cpp
+++ b/poros/poros/compile/graph_segment.cpp
@@ -102,6 +102,24 @@ struct PorosGraphSegment {
             }
         }
 
+        // aten::__getitem__: a non-constant idx argument is not supported
+        if (node->kind() == torch::jit::aten::__getitem__) {
+            if (node->inputs().size() == 2 && 
+                node->input(1)->node()->kind() != torch::jit::prim::Constant) {
+                LOG(WARNING) << "The index input of aten::__getitem__ is not supported as non-constant type.";
+                return false;
+            }
+        }
+
+        // aten::_set_item: a non-constant idx argument is not supported
+        if (node->kind() == torch::jit::aten::_set_item) {
+            if (node->inputs().size() == 3 && 
+                node->input(1)->node()->kind() != torch::jit::prim::Constant) {
+                LOG(WARNING) << "The index input of aten::_set_item is not supported as non-constant type.";
+                return false;
+            }
+        }
+
         if (node->kind() == kind_ || engine_->is_node_supported(node)) {
             return true;
         }
diff --git a/poros/src/poros/compile/graph_segment.h b/poros/poros/compile/graph_segment.h
similarity index 100%
rename from poros/src/poros/compile/graph_segment.h
rename to poros/poros/compile/graph_segment.h
diff --git a/poros/src/poros/compile/ivalues_analysis.cpp b/poros/poros/compile/ivalues_analysis.cpp
similarity index 100%
rename from poros/src/poros/compile/ivalues_analysis.cpp
rename to poros/poros/compile/ivalues_analysis.cpp
diff --git a/poros/src/poros/compile/ivalues_analysis.h b/poros/poros/compile/ivalues_analysis.h
similarity index 100%
rename from poros/src/poros/compile/ivalues_analysis.h
rename to poros/poros/compile/ivalues_analysis.h
diff --git a/poros/src/poros/compile/partition.cpp b/poros/poros/compile/partition.cpp
similarity index 100%
rename from poros/src/poros/compile/partition.cpp
rename to poros/poros/compile/partition.cpp
diff --git a/poros/src/poros/compile/partition.h b/poros/poros/compile/partition.h
similarity index 100%
rename from poros/src/poros/compile/partition.h
rename to poros/poros/compile/partition.h
diff --git a/poros/src/poros/compile/poros_module.cpp b/poros/poros/compile/poros_module.cpp
similarity index 100%
rename from poros/src/poros/compile/poros_module.cpp
rename to poros/poros/compile/poros_module.cpp
diff --git a/poros/src/poros/compile/poros_module.h b/poros/poros/compile/poros_module.h
similarity index 100%
rename from poros/src/poros/compile/poros_module.h
rename to poros/poros/compile/poros_module.h
diff --git a/poros/src/poros/compile/segment.cpp b/poros/poros/compile/segment.cpp
similarity index 100%
rename from poros/src/poros/compile/segment.cpp
rename to poros/poros/compile/segment.cpp
diff --git a/poros/src/poros/compile/segment.h b/poros/poros/compile/segment.h
similarity index 100%
rename from poros/src/poros/compile/segment.h
rename to poros/poros/compile/segment.h
diff --git a/poros/src/poros/context/poros_global.cpp b/poros/poros/context/poros_global.cpp
similarity index 100%
rename from poros/src/poros/context/poros_global.cpp
rename to poros/poros/context/poros_global.cpp
diff --git a/poros/src/poros/context/poros_global.h b/poros/poros/context/poros_global.h
similarity index 100%
rename from poros/src/poros/context/poros_global.h
rename to poros/poros/context/poros_global.h
diff --git a/poros/src/poros/converter/gpu/activation.cpp b/poros/poros/converter/gpu/activation.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/activation.cpp
rename to poros/poros/converter/gpu/activation.cpp
diff --git a/poros/src/poros/converter/gpu/activation.h b/poros/poros/converter/gpu/activation.h
similarity index 100%
rename from poros/src/poros/converter/gpu/activation.h
rename to poros/poros/converter/gpu/activation.h
diff --git a/poros/src/poros/converter/gpu/add.cpp b/poros/poros/converter/gpu/add.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/add.cpp
rename to poros/poros/converter/gpu/add.cpp
diff --git a/poros/src/poros/converter/gpu/add.h b/poros/poros/converter/gpu/add.h
similarity index 100%
rename from poros/src/poros/converter/gpu/add.h
rename to poros/poros/converter/gpu/add.h
diff --git a/poros/src/poros/converter/gpu/aten_eval.cpp b/poros/poros/converter/gpu/aten_eval.cpp
similarity index 95%
rename from poros/src/poros/converter/gpu/aten_eval.cpp
rename to poros/poros/converter/gpu/aten_eval.cpp
index 8dd93d9d3..c61a16400 100644
--- a/poros/src/poros/converter/gpu/aten_eval.cpp
+++ b/poros/poros/converter/gpu/aten_eval.cpp
@@ -66,6 +66,8 @@ bool AppendConverter::converter(TensorrtEngine* engine, const torch::jit::Node *
 bool GetitemConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
     at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
     POROS_CHECK_TRUE((inputs.size() == 2), "invaid inputs size for GetitemConverter");
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "inputs[1] for GetitemConverter is not come from prim::Constant as expected");
 
     if (node->outputs()[0]->type()->str() == "Tensor") {
         //extract list
@@ -126,6 +128,8 @@ bool GetitemConverter::converter(TensorrtEngine* engine, const torch::jit::Node
 bool SetitemConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
     at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
     POROS_CHECK_TRUE((inputs.size() == 3), "invaid inputs size for SetitemConverter");
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "inputs[1] for SetitemConverter is not come from prim::Constant as expected");
 
     size_t idx = engine->context().get_constant(inputs[1]).toInt();
 
diff --git a/poros/src/poros/converter/gpu/aten_eval.h b/poros/poros/converter/gpu/aten_eval.h
similarity index 100%
rename from poros/src/poros/converter/gpu/aten_eval.h
rename to poros/poros/converter/gpu/aten_eval.h
diff --git a/poros/src/poros/converter/gpu/aten_trt_util.cpp b/poros/poros/converter/gpu/aten_trt_util.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/aten_trt_util.cpp
rename to poros/poros/converter/gpu/aten_trt_util.cpp
diff --git a/poros/src/poros/converter/gpu/aten_trt_util.h b/poros/poros/converter/gpu/aten_trt_util.h
similarity index 100%
rename from poros/src/poros/converter/gpu/aten_trt_util.h
rename to poros/poros/converter/gpu/aten_trt_util.h
diff --git a/poros/src/poros/converter/gpu/batch_norm.cpp b/poros/poros/converter/gpu/batch_norm.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/batch_norm.cpp
rename to poros/poros/converter/gpu/batch_norm.cpp
diff --git a/poros/src/poros/converter/gpu/batch_norm.h b/poros/poros/converter/gpu/batch_norm.h
similarity index 100%
rename from poros/src/poros/converter/gpu/batch_norm.h
rename to poros/poros/converter/gpu/batch_norm.h
diff --git a/poros/src/poros/converter/gpu/clone.cpp b/poros/poros/converter/gpu/clone.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/clone.cpp
rename to poros/poros/converter/gpu/clone.cpp
diff --git a/poros/src/poros/converter/gpu/clone.h b/poros/poros/converter/gpu/clone.h
similarity index 100%
rename from poros/src/poros/converter/gpu/clone.h
rename to poros/poros/converter/gpu/clone.h
diff --git a/poros/src/poros/converter/gpu/coercion.cpp b/poros/poros/converter/gpu/coercion.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/coercion.cpp
rename to poros/poros/converter/gpu/coercion.cpp
diff --git a/poros/src/poros/converter/gpu/coercion.h b/poros/poros/converter/gpu/coercion.h
similarity index 100%
rename from poros/src/poros/converter/gpu/coercion.h
rename to poros/poros/converter/gpu/coercion.h
diff --git a/poros/src/poros/converter/gpu/concat.cpp b/poros/poros/converter/gpu/concat.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/concat.cpp
rename to poros/poros/converter/gpu/concat.cpp
diff --git a/poros/src/poros/converter/gpu/concat.h b/poros/poros/converter/gpu/concat.h
similarity index 100%
rename from poros/src/poros/converter/gpu/concat.h
rename to poros/poros/converter/gpu/concat.h
diff --git a/poros/src/poros/converter/gpu/constant.cpp b/poros/poros/converter/gpu/constant.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/constant.cpp
rename to poros/poros/converter/gpu/constant.cpp
diff --git a/poros/src/poros/converter/gpu/constant.h b/poros/poros/converter/gpu/constant.h
similarity index 100%
rename from poros/src/poros/converter/gpu/constant.h
rename to poros/poros/converter/gpu/constant.h
diff --git a/poros/src/poros/converter/gpu/constant_pad_nd.cpp b/poros/poros/converter/gpu/constant_pad_nd.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/constant_pad_nd.cpp
rename to poros/poros/converter/gpu/constant_pad_nd.cpp
diff --git a/poros/src/poros/converter/gpu/constant_pad_nd.h b/poros/poros/converter/gpu/constant_pad_nd.h
similarity index 100%
rename from poros/src/poros/converter/gpu/constant_pad_nd.h
rename to poros/poros/converter/gpu/constant_pad_nd.h
diff --git a/poros/src/poros/converter/gpu/converter_util.cpp b/poros/poros/converter/gpu/converter_util.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/converter_util.cpp
rename to poros/poros/converter/gpu/converter_util.cpp
diff --git a/poros/src/poros/converter/gpu/converter_util.h b/poros/poros/converter/gpu/converter_util.h
similarity index 100%
rename from poros/src/poros/converter/gpu/converter_util.h
rename to poros/poros/converter/gpu/converter_util.h
diff --git a/poros/src/poros/converter/gpu/convolution.cpp b/poros/poros/converter/gpu/convolution.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/convolution.cpp
rename to poros/poros/converter/gpu/convolution.cpp
diff --git a/poros/src/poros/converter/gpu/convolution.h b/poros/poros/converter/gpu/convolution.h
similarity index 100%
rename from poros/src/poros/converter/gpu/convolution.h
rename to poros/poros/converter/gpu/convolution.h
diff --git a/poros/src/poros/converter/gpu/einsum.cpp b/poros/poros/converter/gpu/einsum.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/einsum.cpp
rename to poros/poros/converter/gpu/einsum.cpp
diff --git a/poros/src/poros/converter/gpu/einsum.h b/poros/poros/converter/gpu/einsum.h
similarity index 100%
rename from poros/src/poros/converter/gpu/einsum.h
rename to poros/poros/converter/gpu/einsum.h
diff --git a/poros/src/poros/converter/gpu/element_wise.cpp b/poros/poros/converter/gpu/element_wise.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/element_wise.cpp
rename to poros/poros/converter/gpu/element_wise.cpp
diff --git a/poros/src/poros/converter/gpu/element_wise.h b/poros/poros/converter/gpu/element_wise.h
similarity index 100%
rename from poros/src/poros/converter/gpu/element_wise.h
rename to poros/poros/converter/gpu/element_wise.h
diff --git a/poros/src/poros/converter/gpu/expand.cpp b/poros/poros/converter/gpu/expand.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/expand.cpp
rename to poros/poros/converter/gpu/expand.cpp
diff --git a/poros/src/poros/converter/gpu/expand.h b/poros/poros/converter/gpu/expand.h
similarity index 100%
rename from poros/src/poros/converter/gpu/expand.h
rename to poros/poros/converter/gpu/expand.h
diff --git a/poros/src/poros/converter/gpu/generate.cpp b/poros/poros/converter/gpu/generate.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/generate.cpp
rename to poros/poros/converter/gpu/generate.cpp
diff --git a/poros/src/poros/converter/gpu/generate.h b/poros/poros/converter/gpu/generate.h
similarity index 100%
rename from poros/src/poros/converter/gpu/generate.h
rename to poros/poros/converter/gpu/generate.h
diff --git a/poros/src/poros/converter/gpu/gpu_converter.cpp b/poros/poros/converter/gpu/gpu_converter.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/gpu_converter.cpp
rename to poros/poros/converter/gpu/gpu_converter.cpp
diff --git a/poros/src/poros/converter/gpu/gpu_converter.h b/poros/poros/converter/gpu/gpu_converter.h
similarity index 100%
rename from poros/src/poros/converter/gpu/gpu_converter.h
rename to poros/poros/converter/gpu/gpu_converter.h
diff --git a/poros/src/poros/converter/gpu/group_norm.cpp b/poros/poros/converter/gpu/group_norm.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/group_norm.cpp
rename to poros/poros/converter/gpu/group_norm.cpp
diff --git a/poros/src/poros/converter/gpu/group_norm.h b/poros/poros/converter/gpu/group_norm.h
similarity index 100%
rename from poros/src/poros/converter/gpu/group_norm.h
rename to poros/poros/converter/gpu/group_norm.h
diff --git a/poros/src/poros/converter/gpu/interpolate.cpp b/poros/poros/converter/gpu/interpolate.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/interpolate.cpp
rename to poros/poros/converter/gpu/interpolate.cpp
diff --git a/poros/src/poros/converter/gpu/interpolate.h b/poros/poros/converter/gpu/interpolate.h
similarity index 100%
rename from poros/src/poros/converter/gpu/interpolate.h
rename to poros/poros/converter/gpu/interpolate.h
diff --git a/poros/src/poros/converter/gpu/layer_norm.cpp b/poros/poros/converter/gpu/layer_norm.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/layer_norm.cpp
rename to poros/poros/converter/gpu/layer_norm.cpp
diff --git a/poros/src/poros/converter/gpu/layer_norm.h b/poros/poros/converter/gpu/layer_norm.h
similarity index 100%
rename from poros/src/poros/converter/gpu/layer_norm.h
rename to poros/poros/converter/gpu/layer_norm.h
diff --git a/poros/src/poros/converter/gpu/linear.cpp b/poros/poros/converter/gpu/linear.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/linear.cpp
rename to poros/poros/converter/gpu/linear.cpp
diff --git a/poros/src/poros/converter/gpu/linear.h b/poros/poros/converter/gpu/linear.h
similarity index 100%
rename from poros/src/poros/converter/gpu/linear.h
rename to poros/poros/converter/gpu/linear.h
diff --git a/poros/src/poros/converter/gpu/list.cpp b/poros/poros/converter/gpu/list.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/list.cpp
rename to poros/poros/converter/gpu/list.cpp
diff --git a/poros/src/poros/converter/gpu/list.h b/poros/poros/converter/gpu/list.h
similarity index 100%
rename from poros/src/poros/converter/gpu/list.h
rename to poros/poros/converter/gpu/list.h
diff --git a/poros/src/poros/converter/gpu/logical.cpp b/poros/poros/converter/gpu/logical.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/logical.cpp
rename to poros/poros/converter/gpu/logical.cpp
diff --git a/poros/src/poros/converter/gpu/logical.h b/poros/poros/converter/gpu/logical.h
similarity index 100%
rename from poros/src/poros/converter/gpu/logical.h
rename to poros/poros/converter/gpu/logical.h
diff --git a/poros/src/poros/converter/gpu/lstm.cpp b/poros/poros/converter/gpu/lstm.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/lstm.cpp
rename to poros/poros/converter/gpu/lstm.cpp
diff --git a/poros/src/poros/converter/gpu/lstm.h b/poros/poros/converter/gpu/lstm.h
similarity index 100%
rename from poros/src/poros/converter/gpu/lstm.h
rename to poros/poros/converter/gpu/lstm.h
diff --git a/poros/src/poros/converter/gpu/lstm_cell.cpp b/poros/poros/converter/gpu/lstm_cell.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/lstm_cell.cpp
rename to poros/poros/converter/gpu/lstm_cell.cpp
diff --git a/poros/src/poros/converter/gpu/lstm_cell.h b/poros/poros/converter/gpu/lstm_cell.h
similarity index 100%
rename from poros/src/poros/converter/gpu/lstm_cell.h
rename to poros/poros/converter/gpu/lstm_cell.h
diff --git a/poros/src/poros/converter/gpu/matrix_multiply.cpp b/poros/poros/converter/gpu/matrix_multiply.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/matrix_multiply.cpp
rename to poros/poros/converter/gpu/matrix_multiply.cpp
diff --git a/poros/src/poros/converter/gpu/matrix_multiply.h b/poros/poros/converter/gpu/matrix_multiply.h
similarity index 100%
rename from poros/src/poros/converter/gpu/matrix_multiply.h
rename to poros/poros/converter/gpu/matrix_multiply.h
diff --git a/poros/src/poros/converter/gpu/meshgrid.cpp b/poros/poros/converter/gpu/meshgrid.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/meshgrid.cpp
rename to poros/poros/converter/gpu/meshgrid.cpp
diff --git a/poros/src/poros/converter/gpu/meshgrid.h b/poros/poros/converter/gpu/meshgrid.h
similarity index 100%
rename from poros/src/poros/converter/gpu/meshgrid.h
rename to poros/poros/converter/gpu/meshgrid.h
diff --git a/poros/src/poros/converter/gpu/mul_div.cpp b/poros/poros/converter/gpu/mul_div.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/mul_div.cpp
rename to poros/poros/converter/gpu/mul_div.cpp
diff --git a/poros/src/poros/converter/gpu/mul_div.h b/poros/poros/converter/gpu/mul_div.h
similarity index 100%
rename from poros/src/poros/converter/gpu/mul_div.h
rename to poros/poros/converter/gpu/mul_div.h
diff --git a/poros/src/poros/converter/gpu/non_converterable.cpp b/poros/poros/converter/gpu/non_converterable.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/non_converterable.cpp
rename to poros/poros/converter/gpu/non_converterable.cpp
diff --git a/poros/src/poros/converter/gpu/non_converterable.h b/poros/poros/converter/gpu/non_converterable.h
similarity index 100%
rename from poros/src/poros/converter/gpu/non_converterable.h
rename to poros/poros/converter/gpu/non_converterable.h
diff --git a/poros/src/poros/converter/gpu/norm.cpp b/poros/poros/converter/gpu/norm.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/norm.cpp
rename to poros/poros/converter/gpu/norm.cpp
diff --git a/poros/src/poros/converter/gpu/norm.h b/poros/poros/converter/gpu/norm.h
similarity index 100%
rename from poros/src/poros/converter/gpu/norm.h
rename to poros/poros/converter/gpu/norm.h
diff --git a/poros/src/poros/converter/gpu/plugins/interpolate_plugin.cpp b/poros/poros/converter/gpu/plugins/interpolate_plugin.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/plugins/interpolate_plugin.cpp
rename to poros/poros/converter/gpu/plugins/interpolate_plugin.cpp
diff --git a/poros/src/poros/converter/gpu/plugins/interpolate_plugin.h b/poros/poros/converter/gpu/plugins/interpolate_plugin.h
similarity index 100%
rename from poros/src/poros/converter/gpu/plugins/interpolate_plugin.h
rename to poros/poros/converter/gpu/plugins/interpolate_plugin.h
diff --git a/poros/src/poros/converter/gpu/pooling.cpp b/poros/poros/converter/gpu/pooling.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/pooling.cpp
rename to poros/poros/converter/gpu/pooling.cpp
diff --git a/poros/src/poros/converter/gpu/pooling.h b/poros/poros/converter/gpu/pooling.h
similarity index 100%
rename from poros/src/poros/converter/gpu/pooling.h
rename to poros/poros/converter/gpu/pooling.h
diff --git a/poros/src/poros/converter/gpu/reduce.cpp b/poros/poros/converter/gpu/reduce.cpp
similarity index 80%
rename from poros/src/poros/converter/gpu/reduce.cpp
rename to poros/poros/converter/gpu/reduce.cpp
index bf497e77c..02b8f9214 100644
--- a/poros/src/poros/converter/gpu/reduce.cpp
+++ b/poros/poros/converter/gpu/reduce.cpp
@@ -271,10 +271,75 @@ bool MaxMinConverter::converter(TensorrtEngine* engine, const torch::jit::Node *
     return true;
 }
 
+/* Converts an aten::argmax / aten::argmin node into a TensorRT TopK layer with k = 1 and registers that layer's output 1 as the node output.
+Schema: "aten::argmax(Tensor self, int? dim=None, bool keepdim=False) -> (Tensor)"; dim must currently be an int (see TODO below). */
+bool ArgmaxArgminConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), 
+        "input[0] for ArgmaxArgminConverter is not Tensor as expected");
+
+    // TODO: implement dim=None; for now a dim that is not an int is rejected by the check below
+    POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::IntType::get())), 
+        "input[1] for ArgmaxArgminConverter is not int as expected");
+
+    // fetch the TensorRT tensor registered for `self` (inputs[0])
+    auto in_tensor = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+    auto in_dims = nvdim_to_sizes(in_tensor->getDimensions());
+
+    bool is_dynamic = check_nvtensor_is_dynamic(in_tensor);
+
+    POROS_CHECK_TRUE((in_dims.size() > 1), 
+        "Converter aten::argmax error: At least 2 dimensions are required for input[0].");  // NOTE(review): message names aten::argmax but this converter also handles aten::argmin
+    nvinfer1::ITensor* output_indices = nullptr;
+
+    int64_t dim = 0;
+    dim = engine->context().get_constant(inputs[1]).toInt();
+    dim = dim < 0 ? in_dims.size() + dim : dim;  // a negative dim is wrapped by adding the input rank
+    bool keep_dim = engine->context().get_constant(inputs[2]).toBool();  // NOTE(review): inputs.size() is never checked before reading inputs[1] / inputs[2]
+    uint32_t shiftDim = 1 << dim;  // mask with only bit `dim` set; passed as the last argument of addTopK below
+
+    // nvinfer1::TopKOperation only supports kFLOAT, so a kINT32 input is converted to kFLOAT through an identity layer
+    if (in_tensor->getType() == nvinfer1::DataType::kINT32) {
+        auto id_layer = engine->network()->addIdentity(*in_tensor);
+        id_layer->setOutputType(0, nvinfer1::DataType::kFLOAT);
+        id_layer->setName((layer_info(node) + "_IIdentityLayer_int32_to_float").c_str());
+        in_tensor = id_layer->getOutput(0);
+    }
+
+    nvinfer1::TopKOperation topk_option = (node->kind() == torch::jit::aten::argmax) ?
+                                            nvinfer1::TopKOperation::kMAX : 
+                                            nvinfer1::TopKOperation::kMIN;
+    nvinfer1::ITopKLayer* topk_layer =  engine->network()->addTopK(*in_tensor, topk_option, 1, shiftDim);
+    POROS_CHECK(topk_layer, "Unable to create TopK layer from node: " << *node);
+    topk_layer->setName((layer_info(node) + "_ITopKLayer").c_str());
+    output_indices = topk_layer->getOutput(1);  // only output 1 of the TopK layer is used; output 0 is ignored
+
+    // keepdim=False: reshape the TopK output so that dimension `dim` is dropped
+    if (in_tensor->getDimensions().nbDims > 1 && !keep_dim) {
+        auto shuffle_layer = engine->network()->addShuffle(*output_indices);
+        if (is_dynamic) {  // dynamic input: the target shape is derived from a shape tensor of in_tensor
+            nvinfer1::ITensor* self_shape_tensor = engine->network()->addShape(*in_tensor)->getOutput(0);
+            nvinfer1::ITensor* squeeze_output_shape = squeeze_nv_shapetensor(engine, self_shape_tensor, dim);
+            shuffle_layer->setInput(1, *squeeze_output_shape);
+        } else {  // static input: erase entry `dim` from the known sizes and reshape to the result
+            in_dims.erase(in_dims.begin() + dim);
+            nvinfer1::Dims squeeze_output_dims = sizes_to_nvdim(in_dims);
+            shuffle_layer->setReshapeDimensions(squeeze_output_dims);
+        }
+        output_indices = shuffle_layer->getOutput(0);
+    }
+    engine->context().set_tensor(node->outputs()[0], output_indices);
+    LOG(INFO) << "Output tensor shape: " << output_indices->getDimensions();
+    return true;
+}
+
+
 POROS_REGISTER_CONVERTER(TensorrtEngine, MeanConverter);
 POROS_REGISTER_CONVERTER(TensorrtEngine, SumConverter);
 POROS_REGISTER_CONVERTER(TensorrtEngine, ProdConverter);
 POROS_REGISTER_CONVERTER(TensorrtEngine, MaxMinConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, ArgmaxArgminConverter);
 
 }  // namespace poros 
 }  // namespace mirana
diff --git a/poros/src/poros/converter/gpu/reduce.h b/poros/poros/converter/gpu/reduce.h
similarity index 89%
rename from poros/src/poros/converter/gpu/reduce.h
rename to poros/poros/converter/gpu/reduce.h
index 9da7b1a62..8bc1360d7 100644
--- a/poros/src/poros/converter/gpu/reduce.h
+++ b/poros/poros/converter/gpu/reduce.h
@@ -135,6 +135,26 @@ public:
     }
 };
 
+class ArgmaxArgminConverter : public GpuConverter {  // GpuConverter subclass that advertises both aten::argmax and aten::argmin (see node_kind()).
+public:
+    ArgmaxArgminConverter() {}
+    virtual ~ArgmaxArgminConverter() {}
+    // Converts `node` with the given `engine`. Only declared here; the definition lives out of line. Presumably returns true on success - confirm against the definition.
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+    // Returns the operator schema strings this converter registers for.
+    const std::vector<std::string> schema_string() {
+        return {"aten::argmax(Tensor self, int dim, bool keepdim=False) -> (Tensor)",
+                "aten::argmax(Tensor self, int? dim=None, bool keepdim=False) -> (Tensor)",
+                "aten::argmin(Tensor self, int dim, bool keepdim=False) -> (Tensor)",  // NOTE(review): argmin lists only the `int dim` form while argmax also lists `int? dim=None` - confirm the asymmetry is intended.
+                };
+    }
+    // Returns the node kinds handled by this converter: argmax and argmin.
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::argmax,
+                torch::jit::aten::argmin};
+    }
+};
+
 }  // namespace poros 
 }  // namespace mirana
 }  // namespace baidu
diff --git a/poros/src/poros/converter/gpu/reflection_pad.cpp b/poros/poros/converter/gpu/reflection_pad.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/reflection_pad.cpp
rename to poros/poros/converter/gpu/reflection_pad.cpp
diff --git a/poros/src/poros/converter/gpu/reflection_pad.h b/poros/poros/converter/gpu/reflection_pad.h
similarity index 100%
rename from poros/src/poros/converter/gpu/reflection_pad.h
rename to poros/poros/converter/gpu/reflection_pad.h
diff --git a/poros/src/poros/converter/gpu/replication_pad.cpp b/poros/poros/converter/gpu/replication_pad.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/replication_pad.cpp
rename to poros/poros/converter/gpu/replication_pad.cpp
diff --git a/poros/src/poros/converter/gpu/replication_pad.h b/poros/poros/converter/gpu/replication_pad.h
similarity index 100%
rename from poros/src/poros/converter/gpu/replication_pad.h
rename to poros/poros/converter/gpu/replication_pad.h
diff --git a/poros/src/poros/converter/gpu/roll.cpp b/poros/poros/converter/gpu/roll.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/roll.cpp
rename to poros/poros/converter/gpu/roll.cpp
diff --git a/poros/src/poros/converter/gpu/roll.h b/poros/poros/converter/gpu/roll.h
similarity index 100%
rename from poros/src/poros/converter/gpu/roll.h
rename to poros/poros/converter/gpu/roll.h
diff --git a/poros/src/poros/converter/gpu/select.cpp b/poros/poros/converter/gpu/select.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/select.cpp
rename to poros/poros/converter/gpu/select.cpp
diff --git a/poros/src/poros/converter/gpu/select.h b/poros/poros/converter/gpu/select.h
similarity index 100%
rename from poros/src/poros/converter/gpu/select.h
rename to poros/poros/converter/gpu/select.h
diff --git a/poros/src/poros/converter/gpu/shape_handle.cpp b/poros/poros/converter/gpu/shape_handle.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/shape_handle.cpp
rename to poros/poros/converter/gpu/shape_handle.cpp
diff --git a/poros/src/poros/converter/gpu/shape_handle.h b/poros/poros/converter/gpu/shape_handle.h
similarity index 100%
rename from poros/src/poros/converter/gpu/shape_handle.h
rename to poros/poros/converter/gpu/shape_handle.h
diff --git a/poros/src/poros/converter/gpu/shuffle.cpp b/poros/poros/converter/gpu/shuffle.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/shuffle.cpp
rename to poros/poros/converter/gpu/shuffle.cpp
diff --git a/poros/src/poros/converter/gpu/shuffle.h b/poros/poros/converter/gpu/shuffle.h
similarity index 100%
rename from poros/src/poros/converter/gpu/shuffle.h
rename to poros/poros/converter/gpu/shuffle.h
diff --git a/poros/src/poros/converter/gpu/softmax.cpp b/poros/poros/converter/gpu/softmax.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/softmax.cpp
rename to poros/poros/converter/gpu/softmax.cpp
diff --git a/poros/src/poros/converter/gpu/softmax.h b/poros/poros/converter/gpu/softmax.h
similarity index 100%
rename from poros/src/poros/converter/gpu/softmax.h
rename to poros/poros/converter/gpu/softmax.h
diff --git a/poros/src/poros/converter/gpu/squeeze.cpp b/poros/poros/converter/gpu/squeeze.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/squeeze.cpp
rename to poros/poros/converter/gpu/squeeze.cpp
diff --git a/poros/src/poros/converter/gpu/squeeze.h b/poros/poros/converter/gpu/squeeze.h
similarity index 100%
rename from poros/src/poros/converter/gpu/squeeze.h
rename to poros/poros/converter/gpu/squeeze.h
diff --git a/poros/src/poros/converter/gpu/stack.cpp b/poros/poros/converter/gpu/stack.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/stack.cpp
rename to poros/poros/converter/gpu/stack.cpp
diff --git a/poros/src/poros/converter/gpu/stack.h b/poros/poros/converter/gpu/stack.h
similarity index 100%
rename from poros/src/poros/converter/gpu/stack.h
rename to poros/poros/converter/gpu/stack.h
diff --git a/poros/src/poros/converter/gpu/to.cpp b/poros/poros/converter/gpu/to.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/to.cpp
rename to poros/poros/converter/gpu/to.cpp
diff --git a/poros/src/poros/converter/gpu/to.h b/poros/poros/converter/gpu/to.h
similarity index 100%
rename from poros/src/poros/converter/gpu/to.h
rename to poros/poros/converter/gpu/to.h
diff --git a/poros/src/poros/converter/gpu/topk.cpp b/poros/poros/converter/gpu/topk.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/topk.cpp
rename to poros/poros/converter/gpu/topk.cpp
diff --git a/poros/src/poros/converter/gpu/topk.h b/poros/poros/converter/gpu/topk.h
similarity index 100%
rename from poros/src/poros/converter/gpu/topk.h
rename to poros/poros/converter/gpu/topk.h
diff --git a/poros/src/poros/converter/gpu/unary.cpp b/poros/poros/converter/gpu/unary.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/unary.cpp
rename to poros/poros/converter/gpu/unary.cpp
diff --git a/poros/src/poros/converter/gpu/unary.h b/poros/poros/converter/gpu/unary.h
similarity index 100%
rename from poros/src/poros/converter/gpu/unary.h
rename to poros/poros/converter/gpu/unary.h
diff --git a/poros/src/poros/converter/gpu/weight.cpp b/poros/poros/converter/gpu/weight.cpp
similarity index 100%
rename from poros/src/poros/converter/gpu/weight.cpp
rename to poros/poros/converter/gpu/weight.cpp
diff --git a/poros/src/poros/converter/gpu/weight.h b/poros/poros/converter/gpu/weight.h
similarity index 100%
rename from poros/src/poros/converter/gpu/weight.h
rename to poros/poros/converter/gpu/weight.h
diff --git a/poros/src/poros/converter/iconverter.h b/poros/poros/converter/iconverter.h
similarity index 98%
rename from poros/src/poros/converter/iconverter.h
rename to poros/poros/converter/iconverter.h
index 352892c24..58ec34b17 100644
--- a/poros/src/poros/converter/iconverter.h
+++ b/poros/poros/converter/iconverter.h
@@ -360,6 +360,9 @@ public:
     void init_unsupport_op_set() {
         try {
             std::vector<std::string> unsupport_op_vec = PorosGlobalContext::instance().get_poros_options().unsupport_op_list;
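+            // Reset the schema and nodekind sets every time unsupport_op_list is applied, so ops marked unsupported by a previous compile are not kept when the user sets a new list for the next compile.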
+            _unsupport_schema_set.clear();
+            _unsupport_nodekind_set.clear();
             for (size_t i = 0; i < unsupport_op_vec.size(); i++) {
                 std::string line = unsupport_op_vec[i];
                 if (line.size() == 0) {
diff --git a/poros/src/poros/engine/engine.cpp b/poros/poros/engine/engine.cpp
similarity index 100%
rename from poros/src/poros/engine/engine.cpp
rename to poros/poros/engine/engine.cpp
diff --git a/poros/src/poros/engine/engine_context.h b/poros/poros/engine/engine_context.h
similarity index 100%
rename from poros/src/poros/engine/engine_context.h
rename to poros/poros/engine/engine_context.h
diff --git a/poros/src/poros/engine/iengine.h b/poros/poros/engine/iengine.h
similarity index 100%
rename from poros/src/poros/engine/iengine.h
rename to poros/poros/engine/iengine.h
diff --git a/poros/src/poros/engine/tensorrt_engine.cpp b/poros/poros/engine/tensorrt_engine.cpp
similarity index 100%
rename from poros/src/poros/engine/tensorrt_engine.cpp
rename to poros/poros/engine/tensorrt_engine.cpp
diff --git a/poros/src/poros/engine/tensorrt_engine.h b/poros/poros/engine/tensorrt_engine.h
similarity index 100%
rename from poros/src/poros/engine/tensorrt_engine.h
rename to poros/poros/engine/tensorrt_engine.h
diff --git a/poros/src/poros/engine/trtengine_util.cpp b/poros/poros/engine/trtengine_util.cpp
similarity index 100%
rename from poros/src/poros/engine/trtengine_util.cpp
rename to poros/poros/engine/trtengine_util.cpp
diff --git a/poros/src/poros/engine/trtengine_util.h b/poros/poros/engine/trtengine_util.h
similarity index 100%
rename from poros/src/poros/engine/trtengine_util.h
rename to poros/poros/engine/trtengine_util.h
diff --git a/poros/src/poros/iplugin/plugin_create.cpp b/poros/poros/iplugin/plugin_create.cpp
similarity index 100%
rename from poros/src/poros/iplugin/plugin_create.cpp
rename to poros/poros/iplugin/plugin_create.cpp
diff --git a/poros/src/poros/iplugin/plugin_create.h b/poros/poros/iplugin/plugin_create.h
similarity index 100%
rename from poros/src/poros/iplugin/plugin_create.h
rename to poros/poros/iplugin/plugin_create.h
diff --git a/poros/src/poros/log/poros_logging.h b/poros/poros/log/poros_logging.h
similarity index 100%
rename from poros/src/poros/log/poros_logging.h
rename to poros/poros/log/poros_logging.h
diff --git a/poros/src/poros/log/tensorrt_logging.cpp b/poros/poros/log/tensorrt_logging.cpp
similarity index 100%
rename from poros/src/poros/log/tensorrt_logging.cpp
rename to poros/poros/log/tensorrt_logging.cpp
diff --git a/poros/src/poros/log/tensorrt_logging.h b/poros/poros/log/tensorrt_logging.h
similarity index 100%
rename from poros/src/poros/log/tensorrt_logging.h
rename to poros/poros/log/tensorrt_logging.h
diff --git a/poros/src/poros/lowering/eliminate_exception_pass.cpp b/poros/poros/lowering/eliminate_exception_pass.cpp
similarity index 100%
rename from poros/src/poros/lowering/eliminate_exception_pass.cpp
rename to poros/poros/lowering/eliminate_exception_pass.cpp
diff --git a/poros/src/poros/lowering/eliminate_maxpoll_with_indices.cpp b/poros/poros/lowering/eliminate_maxpoll_with_indices.cpp
similarity index 100%
rename from poros/src/poros/lowering/eliminate_maxpoll_with_indices.cpp
rename to poros/poros/lowering/eliminate_maxpoll_with_indices.cpp
diff --git a/poros/src/poros/lowering/eliminate_simple_useless_nodes.cpp b/poros/poros/lowering/eliminate_simple_useless_nodes.cpp
similarity index 100%
rename from poros/src/poros/lowering/eliminate_simple_useless_nodes.cpp
rename to poros/poros/lowering/eliminate_simple_useless_nodes.cpp
diff --git a/poros/src/poros/lowering/eliminate_some_dict.cpp b/poros/poros/lowering/eliminate_some_dict.cpp
similarity index 100%
rename from poros/src/poros/lowering/eliminate_some_dict.cpp
rename to poros/poros/lowering/eliminate_some_dict.cpp
diff --git a/poros/src/poros/lowering/eliminate_some_list.cpp b/poros/poros/lowering/eliminate_some_list.cpp
similarity index 100%
rename from poros/src/poros/lowering/eliminate_some_list.cpp
rename to poros/poros/lowering/eliminate_some_list.cpp
diff --git a/poros/src/poros/lowering/eliminate_subgraph_uesless_nodes.cpp b/poros/poros/lowering/eliminate_subgraph_uesless_nodes.cpp
similarity index 100%
rename from poros/src/poros/lowering/eliminate_subgraph_uesless_nodes.cpp
rename to poros/poros/lowering/eliminate_subgraph_uesless_nodes.cpp
diff --git a/poros/src/poros/lowering/eliminate_useless_copy.cpp b/poros/poros/lowering/eliminate_useless_copy.cpp
similarity index 100%
rename from poros/src/poros/lowering/eliminate_useless_copy.cpp
rename to poros/poros/lowering/eliminate_useless_copy.cpp
diff --git a/poros/src/poros/lowering/fuse_clip.cpp b/poros/poros/lowering/fuse_clip.cpp
similarity index 100%
rename from poros/src/poros/lowering/fuse_clip.cpp
rename to poros/poros/lowering/fuse_clip.cpp
diff --git a/poros/src/poros/lowering/fuse_clip.h b/poros/poros/lowering/fuse_clip.h
similarity index 100%
rename from poros/src/poros/lowering/fuse_clip.h
rename to poros/poros/lowering/fuse_clip.h
diff --git a/poros/src/poros/lowering/fuse_conv_bn.cpp b/poros/poros/lowering/fuse_conv_bn.cpp
similarity index 100%
rename from poros/src/poros/lowering/fuse_conv_bn.cpp
rename to poros/poros/lowering/fuse_conv_bn.cpp
diff --git a/poros/src/poros/lowering/fuse_conv_bn.h b/poros/poros/lowering/fuse_conv_bn.h
similarity index 100%
rename from poros/src/poros/lowering/fuse_conv_bn.h
rename to poros/poros/lowering/fuse_conv_bn.h
diff --git a/poros/src/poros/lowering/fuse_conv_mul.cpp b/poros/poros/lowering/fuse_conv_mul.cpp
similarity index 100%
rename from poros/src/poros/lowering/fuse_conv_mul.cpp
rename to poros/poros/lowering/fuse_conv_mul.cpp
diff --git a/poros/src/poros/lowering/fuse_conv_mul.h b/poros/poros/lowering/fuse_conv_mul.h
similarity index 100%
rename from poros/src/poros/lowering/fuse_conv_mul.h
rename to poros/poros/lowering/fuse_conv_mul.h
diff --git a/poros/src/poros/lowering/fuse_copy.cpp b/poros/poros/lowering/fuse_copy.cpp
similarity index 100%
rename from poros/src/poros/lowering/fuse_copy.cpp
rename to poros/poros/lowering/fuse_copy.cpp
diff --git a/poros/src/poros/lowering/fuse_copy.h b/poros/poros/lowering/fuse_copy.h
similarity index 100%
rename from poros/src/poros/lowering/fuse_copy.h
rename to poros/poros/lowering/fuse_copy.h
diff --git a/poros/src/poros/lowering/fuse_gelu.cpp b/poros/poros/lowering/fuse_gelu.cpp
similarity index 100%
rename from poros/src/poros/lowering/fuse_gelu.cpp
rename to poros/poros/lowering/fuse_gelu.cpp
diff --git a/poros/src/poros/lowering/fuse_gelu.h b/poros/poros/lowering/fuse_gelu.h
similarity index 100%
rename from poros/src/poros/lowering/fuse_gelu.h
rename to poros/poros/lowering/fuse_gelu.h
diff --git a/poros/src/poros/lowering/fuse_hard_swish.cpp b/poros/poros/lowering/fuse_hard_swish.cpp
similarity index 100%
rename from poros/src/poros/lowering/fuse_hard_swish.cpp
rename to poros/poros/lowering/fuse_hard_swish.cpp
diff --git a/poros/src/poros/lowering/fuse_hard_swish.h b/poros/poros/lowering/fuse_hard_swish.h
similarity index 100%
rename from poros/src/poros/lowering/fuse_hard_swish.h
rename to poros/poros/lowering/fuse_hard_swish.h
diff --git a/poros/src/poros/lowering/fuse_meshgrid.cpp b/poros/poros/lowering/fuse_meshgrid.cpp
similarity index 100%
rename from poros/src/poros/lowering/fuse_meshgrid.cpp
rename to poros/poros/lowering/fuse_meshgrid.cpp
diff --git a/poros/src/poros/lowering/fuse_meshgrid.h b/poros/poros/lowering/fuse_meshgrid.h
similarity index 100%
rename from poros/src/poros/lowering/fuse_meshgrid.h
rename to poros/poros/lowering/fuse_meshgrid.h
diff --git a/poros/src/poros/lowering/input_param_propagate.cpp b/poros/poros/lowering/input_param_propagate.cpp
similarity index 100%
rename from poros/src/poros/lowering/input_param_propagate.cpp
rename to poros/poros/lowering/input_param_propagate.cpp
diff --git a/poros/src/poros/lowering/link_mutable_list_pass.cpp b/poros/poros/lowering/link_mutable_list_pass.cpp
similarity index 100%
rename from poros/src/poros/lowering/link_mutable_list_pass.cpp
rename to poros/poros/lowering/link_mutable_list_pass.cpp
diff --git a/poros/src/poros/lowering/lowering_pass.h b/poros/poros/lowering/lowering_pass.h
similarity index 100%
rename from poros/src/poros/lowering/lowering_pass.h
rename to poros/poros/lowering/lowering_pass.h
diff --git a/poros/src/poros/lowering/op_fuse_pass.cpp b/poros/poros/lowering/op_fuse_pass.cpp
similarity index 100%
rename from poros/src/poros/lowering/op_fuse_pass.cpp
rename to poros/poros/lowering/op_fuse_pass.cpp
diff --git a/poros/src/poros/lowering/op_fuse_pass.h b/poros/poros/lowering/op_fuse_pass.h
similarity index 100%
rename from poros/src/poros/lowering/op_fuse_pass.h
rename to poros/poros/lowering/op_fuse_pass.h
diff --git a/poros/src/poros/lowering/remove_simple_type_profile_nodes.cpp b/poros/poros/lowering/remove_simple_type_profile_nodes.cpp
similarity index 100%
rename from poros/src/poros/lowering/remove_simple_type_profile_nodes.cpp
rename to poros/poros/lowering/remove_simple_type_profile_nodes.cpp
diff --git a/poros/src/poros/lowering/replace_illegal_constant.cpp b/poros/poros/lowering/replace_illegal_constant.cpp
similarity index 100%
rename from poros/src/poros/lowering/replace_illegal_constant.cpp
rename to poros/poros/lowering/replace_illegal_constant.cpp
diff --git a/poros/src/poros/lowering/replace_pad.cpp b/poros/poros/lowering/replace_pad.cpp
similarity index 100%
rename from poros/src/poros/lowering/replace_pad.cpp
rename to poros/poros/lowering/replace_pad.cpp
diff --git a/poros/src/poros/lowering/segment_post_processing.cpp b/poros/poros/lowering/segment_post_processing.cpp
similarity index 100%
rename from poros/src/poros/lowering/segment_post_processing.cpp
rename to poros/poros/lowering/segment_post_processing.cpp
diff --git a/poros/src/poros/lowering/segment_post_processing.h b/poros/poros/lowering/segment_post_processing.h
similarity index 100%
rename from poros/src/poros/lowering/segment_post_processing.h
rename to poros/poros/lowering/segment_post_processing.h
diff --git a/poros/src/poros/lowering/try_to_freeze_aten_dim.cpp b/poros/poros/lowering/try_to_freeze_aten_dim.cpp
similarity index 100%
rename from poros/src/poros/lowering/try_to_freeze_aten_dim.cpp
rename to poros/poros/lowering/try_to_freeze_aten_dim.cpp
diff --git a/poros/src/poros/lowering/try_to_freeze_aten_len.cpp b/poros/poros/lowering/try_to_freeze_aten_len.cpp
similarity index 100%
rename from poros/src/poros/lowering/try_to_freeze_aten_len.cpp
rename to poros/poros/lowering/try_to_freeze_aten_len.cpp
diff --git a/poros/src/poros/lowering/try_to_freeze_aten_size.cpp b/poros/poros/lowering/try_to_freeze_aten_size.cpp
similarity index 100%
rename from poros/src/poros/lowering/try_to_freeze_aten_size.cpp
rename to poros/poros/lowering/try_to_freeze_aten_size.cpp
diff --git a/poros/src/poros/lowering/try_to_freeze_list_construct.cpp b/poros/poros/lowering/try_to_freeze_list_construct.cpp
similarity index 100%
rename from poros/src/poros/lowering/try_to_freeze_list_construct.cpp
rename to poros/poros/lowering/try_to_freeze_list_construct.cpp
diff --git a/poros/src/poros/lowering/try_to_freeze_percentformat.cpp b/poros/poros/lowering/try_to_freeze_percentformat.cpp
similarity index 100%
rename from poros/src/poros/lowering/try_to_freeze_percentformat.cpp
rename to poros/poros/lowering/try_to_freeze_percentformat.cpp
diff --git a/poros/src/poros/lowering/unpack_certain_ops.cpp b/poros/poros/lowering/unpack_certain_ops.cpp
similarity index 100%
rename from poros/src/poros/lowering/unpack_certain_ops.cpp
rename to poros/poros/lowering/unpack_certain_ops.cpp
diff --git a/poros/src/poros/lowering/unrolling_loop.cpp b/poros/poros/lowering/unrolling_loop.cpp
similarity index 100%
rename from poros/src/poros/lowering/unrolling_loop.cpp
rename to poros/poros/lowering/unrolling_loop.cpp
diff --git a/poros/src/poros/util/graph_test_helper.cpp b/poros/poros/util/graph_test_helper.cpp
similarity index 100%
rename from poros/src/poros/util/graph_test_helper.cpp
rename to poros/poros/util/graph_test_helper.cpp
diff --git a/poros/src/poros/util/graph_test_helper.h b/poros/poros/util/graph_test_helper.h
similarity index 100%
rename from poros/src/poros/util/graph_test_helper.h
rename to poros/poros/util/graph_test_helper.h
diff --git a/poros/src/poros/util/macros.h b/poros/poros/util/macros.h
similarity index 100%
rename from poros/src/poros/util/macros.h
rename to poros/poros/util/macros.h
diff --git a/poros/src/poros/util/poros_util.cpp b/poros/poros/util/poros_util.cpp
similarity index 100%
rename from poros/src/poros/util/poros_util.cpp
rename to poros/poros/util/poros_util.cpp
diff --git a/poros/src/poros/util/poros_util.h b/poros/poros/util/poros_util.h
similarity index 100%
rename from poros/src/poros/util/poros_util.h
rename to poros/poros/util/poros_util.h
diff --git a/poros/src/poros/util/test_util.cpp b/poros/poros/util/test_util.cpp
similarity index 100%
rename from poros/src/poros/util/test_util.cpp
rename to poros/poros/util/test_util.cpp
diff --git a/poros/src/poros/util/test_util.h b/poros/poros/util/test_util.h
similarity index 100%
rename from poros/src/poros/util/test_util.h
rename to poros/poros/util/test_util.h
diff --git a/poros/python/poros/_compile.py b/poros/python/poros/_compile.py
index 056a3b0be..0df9f2093 100644
--- a/poros/python/poros/_compile.py
+++ b/poros/python/poros/_compile.py
@@ -73,7 +73,7 @@ def compile(module, prewarm_inputs, poros_options):
     Args:
         module (torch.nn.Module / torch.jit.ScriptModule): Source module
         input (list of tensor input): prewarmed data.
-        poros_options(PorosOptions / Dict of settings): compile settings for poros
+        poros_options(PorosOptions): compile settings for poros
     Returns:
         PorosModule: Compiled Module of poros, 
                     when run it will partially execute via inlined engine (which is TensorRT)
diff --git a/poros/python/poros/_input_convert.py b/poros/python/poros/_input_convert.py
index 39f6bcdc3..23eebfc5d 100644
--- a/poros/python/poros/_input_convert.py
+++ b/poros/python/poros/_input_convert.py
@@ -61,6 +61,8 @@ def convert_prewarm_inputs(prewarm_inputs):
     else:
         raise TypeError("prewarm_inputs for poros should be torch.Tensor or wraped as tuple or inputs-lists, fix it")
     return wraped_prewarm_inputs     
+    # info = poros._C.PreWarmDatas()
+    # info.set_data(prewarm_inputs)
 
 def convert_poros_option(poros_option):
     # type: Dict[str, Any] -> poros._C.PorosOptions
@@ -70,39 +72,6 @@ def convert_poros_option(poros_option):
     option = poros._C.PorosOptions()
     if poros_option is None:
         #default situation. if user do not set the poros_option
-        return option
-    elif isinstance(poros_option, dict):
-        if "debug" in poros_option:
-            assert isinstance(poros_option["debug"], bool)
-            option.debug = poros_option["debug"]
-
-        if "use_fp16" in poros_option:
-            assert isinstance(poros_option["use_fp16"], bool)
-            option.use_fp16 = poros_option["use_fp16"]
-        
-        if "max_workspace_size" in poros_option:
-            assert type(poros_option["max_workspace_size"]) is int
-            option.max_workspace_size = poros_option["max_workspace_size"]
-
-        if "device" in poros_option:
-            option.device = _parse_device(poros_option["device"])
-        
-        if "is_dynamic" in poros_option:
-            assert isinstance(poros_option["is_dynamic"], bool)
-            option.is_dynamic = poros_option["is_dynamic"]
-
-        if "long_to_int" in poros_option:
-            assert isinstance(poros_option["long_to_int"], bool)
-            option.long_to_int = poros_option["long_to_int"]
-        
-        if "device_id" in poros_option:
-            assert type(poros_option["device_id"]) is int
-            option.device_id = poros_option["device_id"]
-
-        if "preprocess_mode" in poros_option:
-            assert type(poros_option["preprocess_mode"]) is int
-            option.preprocess_mode= poros_option["preprocess_mode"]
-
         return option
     elif isinstance(poros_option, PorosOptions):
         return poros_option.to_internal()
diff --git a/poros/third_party/gflags b/poros/third_party/gflags
new file mode 160000
index 000000000..a738fdf93
--- /dev/null
+++ b/poros/third_party/gflags
@@ -0,0 +1 @@
+Subproject commit a738fdf9338412f83ab3f26f31ac11ed3f3ec4bd
diff --git a/poros/third_party/googletest b/poros/third_party/googletest
new file mode 160000
index 000000000..7a7231c44
--- /dev/null
+++ b/poros/third_party/googletest
@@ -0,0 +1 @@
+Subproject commit 7a7231c442484be389fdf01594310349ca0e42a8
diff --git a/poros/unittest/CMakeLists.txt b/poros/unittest/CMakeLists.txt
index 52f44495f..4c5f30346 100644
--- a/poros/unittest/CMakeLists.txt
+++ b/poros/unittest/CMakeLists.txt
@@ -10,11 +10,11 @@ set(GRAPHTEST "graph_test"  )
 file(
         GLOB UT_FILES
         "./op_fuser/*.cpp"
-        "../src/poros/lowering/fuse_*.cpp"
+        "../poros/lowering/fuse_*.cpp"
 )
 list(APPEND UT_FILES
-        "../src/poros/lowering/op_fuse_pass.cpp"
-        "../src/poros/util/graph_test_helper.cpp")
+        "../poros/lowering/op_fuse_pass.cpp"
+        "../poros/util/graph_test_helper.cpp")
 
 add_executable(${GRAPHTEST} ${UT_FILES})
 target_link_libraries(${GRAPHTEST} gtest_main)
@@ -28,8 +28,8 @@ set(UNITTEST "unit_test"  )
 
 file(
         GLOB UT_FILES
-        "../src/poros/*/*.cpp"
-        "../src/poros/converter/*/*.cpp"
+        "../poros/*/*.cpp"
+        "../poros/converter/*/*.cpp"
         "./converter/*.cpp"
 )
 
diff --git a/poros/unittest/converter/reduce_test.cpp b/poros/unittest/converter/reduce_test.cpp
index 81c8f9514..076bc5287 100644
--- a/poros/unittest/converter/reduce_test.cpp
+++ b/poros/unittest/converter/reduce_test.cpp
@@ -29,9 +29,16 @@ static void reduce_test_helper(const std::string& graph_IR,
                             std::vector<int64_t> shape1,
                             bool single_input = true,
                             std::vector<int64_t> shape2 = {4, 4},
-                            bool single_output = true){
+                            bool single_output = true,
+                            bool int_flag = false){
     std::vector<at::Tensor> input_data;
-    input_data.push_back(at::randn(shape1, {at::kCUDA}));
+
+    if(int_flag) {
+        auto options_pyt_long = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kLong);
+        input_data.push_back(at::randint(1000, shape1, options_pyt_long));
+    } else {
+        input_data.push_back(at::randn(shape1, {at::kCUDA}));
+    }
 
     if (!single_input){
         input_data.push_back(at::randn(shape2, {at::kCUDA}));
@@ -90,6 +97,24 @@ static std::string gen_min_max_dim_graph(const std::string& op, const std::strin
         return (%3, %4))IR";
 }
 
+static std::string gen_argmin_argmax_graph(const std::string& op, const std::string& dim, const std::string& keepdim) {  // Builds TorchScript IR text for a one-input graph that returns aten::<op>(%0, dim, keepdim); `dim` is spliced in as an int constant and `keepdim` as a bool constant.
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=)IR" + dim + R"IR(]()
+        %2 : bool = prim::Constant[value=)IR" + keepdim + R"IR(]()
+        %3 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2)
+        return (%3))IR";
+}
+
+static std::string gen_argmin_argmax_dim_none_graph(const std::string& op, const std::string& keepdim) {  // Same graph shape as the explicit-dim generator, but the dim operand is a None constant; in the visible part of this file its only caller is a commented-out test.
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : None = prim::Constant()
+        %2 : bool = prim::Constant[value=)IR" + keepdim + R"IR(]()
+        %3 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2)
+        return (%3))IR";
+}
+
 static std::string gen_mean_sum_dim_graph(const std::string& op, const std::string& dim, const std::string& keepdim) {
     return R"IR(
         graph(%0 : Tensor):
@@ -359,4 +384,68 @@ TEST(Converters, ATenMinDimDynamicConvertsCorrectly) {
     
     ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
     ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[1], poros_output[1], 2e-6));
-}
\ No newline at end of file
+}
+
+TEST(Converters, ArgmaxConvertersCorrectly) {
+    // Schema under test: aten::argmax(Tensor self, int? dim=None, bool keepdim=False) -> (Tensor). Every case below passes an explicit integer dim.
+    baidu::mirana::poros::ArgmaxArgminConverter argmaxargminconverter;
+    // keepdim = "0": reduce over dims 0..3 on 2-D, 3-D and 4-D input shapes.
+    const auto graph_IR1 = gen_argmin_argmax_graph("argmax", "0", "0");
+    reduce_test_helper(graph_IR1, &argmaxargminconverter, {4, 4}, true, {}, true);
+    const auto graph_IR2 = gen_argmin_argmax_graph("argmax", "1", "0");
+    reduce_test_helper(graph_IR2, &argmaxargminconverter, {4, 4}, true, {}, true);
+    const auto graph_IR3 = gen_argmin_argmax_graph("argmax", "2", "0");
+    reduce_test_helper(graph_IR3, &argmaxargminconverter, {4, 4, 6}, true, {}, true);
+    const auto graph_IR4 = gen_argmin_argmax_graph("argmax", "3", "0");
+    reduce_test_helper(graph_IR4, &argmaxargminconverter, {4, 4, 6, 8}, true, {}, true);
+    // keepdim = "1" on dims 0, 1 and -1, followed by the negative dim -1 with keepdim = "0".
+    const auto graph_IR5 = gen_argmin_argmax_graph("argmax", "0", "1");
+    reduce_test_helper(graph_IR5, &argmaxargminconverter, {4, 4}, true, {}, true);
+    const auto graph_IR6 = gen_argmin_argmax_graph("argmax", "1", "1");
+    reduce_test_helper(graph_IR6, &argmaxargminconverter, {4, 4}, true, {}, true);
+    const auto graph_IR7 = gen_argmin_argmax_graph("argmax", "-1", "1");
+    reduce_test_helper(graph_IR7, &argmaxargminconverter, {4, 4}, true, {}, true);
+    const auto graph_IR8 = gen_argmin_argmax_graph("argmax", "-1", "0");
+    reduce_test_helper(graph_IR8, &argmaxargminconverter, {4, 4}, true, {}, true);
+
+    // Integer input tensors: the trailing `true` is int_flag, which makes reduce_test_helper build the input with at::randint and dtype kLong instead of at::randn.
+    const auto graph_IR9 = gen_argmin_argmax_graph("argmax", "1", "0");
+    reduce_test_helper(graph_IR9, &argmaxargminconverter, {4, 4}, true, {}, true, true);
+    const auto graph_IR10 = gen_argmin_argmax_graph("argmax", "-1", "0");
+    reduce_test_helper(graph_IR10, &argmaxargminconverter, {4, 4}, true, {}, true, true);
+}
+
+TEST(Converters, ArgminConvertersCorrectly) {
+    // Schema under test: aten::argmin(Tensor self, int? dim=None, bool keepdim=False) -> (Tensor). Every case below passes an explicit integer dim.
+    baidu::mirana::poros::ArgmaxArgminConverter argmaxargminconverter;
+    // keepdim = "0": reduce over dims 0..3 on 2-D, 3-D and 4-D input shapes.
+    const auto graph_IR1 = gen_argmin_argmax_graph("argmin", "0", "0");
+    reduce_test_helper(graph_IR1, &argmaxargminconverter, {4, 4}, true, {}, true);
+    const auto graph_IR2 = gen_argmin_argmax_graph("argmin", "1", "0");
+    reduce_test_helper(graph_IR2, &argmaxargminconverter, {4, 4}, true, {}, true);
+    const auto graph_IR3 = gen_argmin_argmax_graph("argmin", "2", "0");
+    reduce_test_helper(graph_IR3, &argmaxargminconverter, {4, 4, 6}, true, {}, true);
+    const auto graph_IR4 = gen_argmin_argmax_graph("argmin", "3", "0");
+    reduce_test_helper(graph_IR4, &argmaxargminconverter, {4, 4, 6, 8}, true, {}, true);
+    // keepdim = "1" on dims 0, 1 and -1. NOTE(review): there is no graph_IR8 here, i.e. no dim = -1 / keepdim = "0" float case as in the argmax test - confirm whether that omission is intended.
+    const auto graph_IR5 = gen_argmin_argmax_graph("argmin", "0", "1");
+    reduce_test_helper(graph_IR5, &argmaxargminconverter, {4, 4}, true, {}, true);
+    const auto graph_IR6 = gen_argmin_argmax_graph("argmin", "1", "1");
+    reduce_test_helper(graph_IR6, &argmaxargminconverter, {4, 4}, true, {}, true);
+    const auto graph_IR7 = gen_argmin_argmax_graph("argmin", "-1", "1");
+    reduce_test_helper(graph_IR7, &argmaxargminconverter, {4, 4}, true, {}, true);
+
+    // Integer input tensors: the trailing `true` is int_flag, which makes reduce_test_helper build the input with at::randint and dtype kLong instead of at::randn.
+    const auto graph_IR9 = gen_argmin_argmax_graph("argmin", "1", "0");
+    reduce_test_helper(graph_IR9, &argmaxargminconverter, {4, 4}, true, {}, true, true);
+    const auto graph_IR10 = gen_argmin_argmax_graph("argmin", "-1", "0");
+    reduce_test_helper(graph_IR10, &argmaxargminconverter, {4, 4}, true, {}, true, true);
+}
+
+// TODO: implement dim=None support in ArgmaxArgminConverter, then enable the test below.
+// TEST(Converters, ArgmaxNoneDimConvertersCorrectly) {
+//     // aten::argmax(Tensor self, int? dim=None, bool keepdim=False) -> (Tensor)
+//     baidu::mirana::poros::ArgmaxArgminConverter argmaxargminconverter;
+//     const auto graph_IR1 = gen_argmin_argmax_dim_none_graph("argmax", "0");
+//     reduce_test_helper(graph_IR1, &argmaxargminconverter, {4, 4}, true, {}, true);
+// }
\ No newline at end of file

From 5f8157c39876d22871be2f02c9da9a266fe20d1f Mon Sep 17 00:00:00 2001
From: TianShaoqing <45099683+TianShaoqing@users.noreply.github.com>
Date: Tue, 21 Feb 2023 13:35:30 +0800
Subject: [PATCH 07/20] update poros README.md (#1393)

Co-authored-by: tianshaoqing <tianshaoqing@baidu.com>
---
 poros/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/poros/README.md b/poros/README.md
index 75b36b623..7eaacafba 100644
--- a/poros/README.md
+++ b/poros/README.md
@@ -1,10 +1,10 @@
-# Poros AI Inference Accelerator
+# Poros AI Compiler
 
 ## Description
 
-Poros is an AI Inference Accelerator for deep learning framework. It can provide significantly lower inference latency comparing with original model, and provide much flexibility for dynamic graphs.
-Poros mainly works on the TorchScript IR currently, that means it supports the models from PyTorch, ONNX, TensorFlow and any other framework that can be converted to TorchScript. also, we are planting to support more IRs in the future.
-Poros is designed to supports multiple hardware backends conveniently, For now, Poros supports GPU and XPU (BAIDU-Kunlun) Device, It's welcomed to add additional devices.
+Poros is an AI Compiler for deep learning frameworks. It can provide significantly lower inference latency compared with the original model, and provides much flexibility for dynamic graphs.
+Poros mainly works on the TorchScript IR currently, which means it supports models from PyTorch, ONNX, TensorFlow and any other framework that can be converted to TorchScript. Also, we are planning to support more IRs in the future.
+Poros is designed to support multiple hardware backends conveniently. For now, Poros supports GPU and XPU (BAIDU-Kunlun) devices. Contributions that add more devices are welcome.
 
 ## How It Works
 

From 42817ddc18dfcbeb35d58134260ea4183d3d8553 Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Tue, 21 Feb 2023 15:41:37 +0800
Subject: [PATCH 08/20] [Doc] Update cpp benchmark docs for CPU/GPU (#1377)

* [Benchmark] Init benchmark precision api

* [Benchmark] Init benchmark precision api

* [Benchmark] Add benchmark precision api

* [Benchmark] Calculate the statis of diff

* [Benchmark] Calculate the statis of diff

* [Benchmark] Calculate the statis of diff

* [Benchmark] Calculate the statis of diff

* [Benchmark] Calculate the statis of diff

* [Benchmark] Add SplitDataLine utils

* [Benchmark] Add LexSortByXY func

* [Benchmark] Add LexSortByXY func

* [Benchmark] Add LexSortDetectionResultByXY func

* [Benchmark] Add LexSortDetectionResultByXY func

* [Benchmark] Add tensor diff presicion test

* [Benchmark] fixed conflicts

* [Benchmark] fixed calc tensor diff

* fixed build bugs

* fixed ci bugs when WITH_TESTING=ON

* [Docs] init cpp benchmark docs

* [Doc] update cpp benchmark docs

* [Doc] update cpp benchmark docs

* [Doc] update cpp  benchmark docs

* [Doc] update cpp  benchmark docs
---
 benchmark/cpp/README.md                 | 137 ++++++++++++++++++++++++
 benchmark/cpp/flags.h                   |   2 +
 benchmark/cpp/run_benchmark_ppyolov8.sh |   0
 benchmark/python/README.md              |   4 +-
 4 files changed, 141 insertions(+), 2 deletions(-)
 create mode 100644 benchmark/cpp/README.md
 create mode 100644 benchmark/cpp/run_benchmark_ppyolov8.sh

diff --git a/benchmark/cpp/README.md b/benchmark/cpp/README.md
new file mode 100644
index 000000000..abdbabaf6
--- /dev/null
+++ b/benchmark/cpp/README.md
@@ -0,0 +1,137 @@
+# FastDeploy C++ Benchmarks
+
+## 1. 编译选项  
+以下选项为benchmark相关的编译选项，在编译用来跑benchmark的sdk时，必须开启。  
+
+|选项|需要设置的值|说明|
+|---|---|---|  
+| ENABLE_BENCHMARK  | ON | 默认OFF, 是否打开BENCHMARK模式 |
+| ENABLE_VISION     | ON | 默认OFF，是否编译集成视觉模型的部署模块 |
+| ENABLE_TEXT       | ON | 默认OFF，是否编译集成文本NLP模型的部署模块 |  
+
+运行FastDeploy C++ Benchmark，需先准备好相应的环境，并在ENABLE_BENCHMARK=ON模式下从源码编译FastDeploy C++ SDK. 以下将按照硬件维度，来说明相应的系统环境要求。不同环境下的详细要求，请参考[FastDeploy环境要求](../../docs/cn/build_and_install)  
+
+## 2. Benchmark 参数设置说明  
+
+<div id="参数设置说明"></div>  
+
+
+| 参数                 | 作用                                        |
+| -------------------- | ------------------------------------------ |
+| --model              | 模型路径                                     |
+| --image              | 图片路径    |
+| --device             | 选择 CPU/GPU/XPU，默认为 CPU  |
+| --cpu_thread_nums     | CPU 线程数，默认为 8      |
+| --device_id          | GPU/XPU 卡号，默认为 0 |
+| --warmup           | 跑benchmark的warmup次数，默认为 200 |
+| --repeat           | 跑benchmark的循环次数，默认为 1000 |  
+| --profile_mode      | 指定需要测试性能的模式，可选值为`[runtime, end2end]`，默认为 runtime |  
+| --include_h2d_d2h   | 是否把H2D+D2H的耗时统计在内，该参数只在profile_mode为runtime时有效，默认为 false |  
+| --backend            | 指定后端类型，有default, ort, ov, trt, paddle, paddle_trt, lite 等，为default时，会自动选择最优后端，推荐显式指定具体的backend。默认为 default   |
+| --use_fp16    | 是否开启fp16，当前只对 trt, paddle_trt, lite后端有效，默认为 false |
+| --collect_memory_info    | 是否记录 cpu/gpu memory信息，默认 false  |
+| --sampling_interval    | 记录 cpu/gpu memory信息采样时间间隔，单位ms，默认为 50  |  
+
+## 3. X86_64 CPU 和 NVIDIA GPU 环境下运行 Benchmark
+
+### 3.1 环境准备  
+
+Linux上编译需满足:
+  - gcc/g++ >= 5.4(推荐8.2)
+  - cmake >= 3.18.0
+  - CUDA >= 11.2
+  - cuDNN >= 8.2
+  - TensorRT >= 8.5
+
+在GPU上编译FastDeploy需要准备好相应的CUDA环境以及TensorRT，详细文档请参考[GPU编译文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/gpu.md)。  
+
+### 3.2 编译FastDeploy C++ SDK  
+```bash
+# 源码编译SDK（其中 -DENABLE_BENCHMARK=ON 用于开启benchmark模式）
+git clone https://github.com/PaddlePaddle/FastDeploy.git -b develop
+cd FastDeploy
+mkdir build && cd build
+cmake .. -DWITH_GPU=ON \
+         -DENABLE_ORT_BACKEND=ON \
+         -DENABLE_PADDLE_BACKEND=ON \
+         -DENABLE_OPENVINO_BACKEND=ON \
+         -DENABLE_TRT_BACKEND=ON \
+         -DENABLE_VISION=ON \
+         -DENABLE_TEXT=ON \
+         -DENABLE_BENCHMARK=ON \
+         -DTRT_DIRECTORY=/Paddle/TensorRT-8.5.2.2 \
+         -DCUDA_DIRECTORY=/usr/local/cuda \
+         -DCMAKE_INSTALL_PREFIX=${PWD}/compiled_fastdeploy_sdk
+
+make -j12
+make install  
+
+# 配置SDK路径
+cd ..  
+export FD_GPU_SDK=${PWD}/build/compiled_fastdeploy_sdk
+```  
+### 3.3 编译 Benchmark 示例  
+```bash  
+cd benchmark/cpp
+mkdir build && cd build  
+cmake .. -DFASTDEPLOY_INSTALL_DIR=${FD_GPU_SDK}  
+make -j4
+```
+
+### 3.4 运行 Benchmark 示例  
+
+在X86 CPU + NVIDIA GPU下，FastDeploy 目前支持多种推理后端，下面以 PaddleYOLOv8 为例，跑出多后端在 CPU/GPU 对应 benchmark 数据。
+
+- 下载模型文件和测试图片  
+```bash  
+wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov8_s_500e_coco.tgz  
+wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
+tar -zxvf yolov8_s_500e_coco.tgz
+```
+
+- 运行 yolov8 benchmark 示例  
+
+```bash  
+
+# 统计性能  
+# CPU
+# Paddle Inference
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device cpu --cpu_thread_nums 8 --backend paddle --profile_mode runtime
+
+# ONNX Runtime
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device cpu --cpu_thread_nums 8 --backend ort --profile_mode runtime
+
+# OpenVINO
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device cpu --cpu_thread_nums 8 --backend ov --profile_mode runtime
+
+# GPU
+# Paddle Inference
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device gpu --device_id 0 --backend paddle --profile_mode runtime --warmup 200 --repeat 2000
+
+# Paddle Inference + TensorRT
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device gpu --device_id 0 --backend paddle_trt --profile_mode runtime --warmup 200 --repeat 2000
+
+# Paddle Inference + TensorRT + FP16
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device gpu --device_id 0 --backend paddle_trt --profile_mode runtime --warmup 200 --repeat 2000 --use_fp16
+
+# ONNX Runtime
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device gpu --device_id 0 --backend ort --profile_mode runtime --warmup 200 --repeat 2000
+
+# TensorRT
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device gpu --device_id 0 --backend trt --profile_mode runtime --warmup 200 --repeat 2000
+
+# TensorRT + FP16
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device gpu --device_id 0 --backend trt --profile_mode runtime --warmup 200 --repeat 2000 --use_fp16
+
+# 统计内存显存占用  
+# 增加--collect_memory_info选项
+./benchmark_ppyolov8 --model yolov8_s_500e_coco --image 000000014439.jpg --device cpu --cpu_thread_nums 8 --backend paddle --profile_mode runtime --collect_memory_info
+```
+注意，为避免对性能统计产生影响，测试性能时，最好不要开启内存显存统计的功能，当指定--collect_memory_info参数时，只有内存显存参数是稳定可靠的。更多参数设置，请参考[参数设置说明](#参数设置说明)
+
+
+## 4. ARM CPU 环境下运行 Benchmark
+- TODO
+
+## 5. 昆仑芯 XPU 环境下运行 Benchmark
+- TODO
diff --git a/benchmark/cpp/flags.h b/benchmark/cpp/flags.h
index 4802abe8a..59361162f 100755
--- a/benchmark/cpp/flags.h
+++ b/benchmark/cpp/flags.h
@@ -63,6 +63,7 @@ static void PrintUsage() {
 }
 
 static void PrintBenchmarkInfo() {
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
   // Get model name
   std::vector<std::string> model_names;
   fastdeploy::benchmark::Split(FLAGS_model, model_names, sep);
@@ -97,5 +98,6 @@ static void PrintBenchmarkInfo() {
        << "ms" << std::endl;
   }
   std::cout << ss.str() << std::endl;
+#endif
   return;
 }
diff --git a/benchmark/cpp/run_benchmark_ppyolov8.sh b/benchmark/cpp/run_benchmark_ppyolov8.sh
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/python/README.md b/benchmark/python/README.md
index b1f96c1be..eef8d4fc0 100644
--- a/benchmark/python/README.md
+++ b/benchmark/python/README.md
@@ -2,8 +2,8 @@
 
 在跑benchmark前，需确认以下两个步骤
 
-* 1. 软硬件环境满足要求，参考[FastDeploy环境要求](..//docs/cn/build_and_install/download_prebuilt_libraries.md)
-* 2. FastDeploy Python whl包安装，参考[FastDeploy Python安装](../docs/cn/build_and_install/download_prebuilt_libraries.md)
+* 1. 软硬件环境满足要求，参考[FastDeploy环境要求](../../docs/cn/build_and_install/download_prebuilt_libraries.md)
+* 2. FastDeploy Python whl包安装，参考[FastDeploy Python安装](../../docs/cn/build_and_install/download_prebuilt_libraries.md)
 
 FastDeploy 目前支持多种推理后端，下面以 PaddleClas MobileNetV1 为例，跑出多后端在 CPU/GPU 对应 benchmark 数据
 

From 18e33bae5c43e4bc42ad0d6015adbdbea04ca874 Mon Sep 17 00:00:00 2001
From: Jason <jiangjiajun@baidu.com>
Date: Tue, 21 Feb 2023 17:01:32 +0800
Subject: [PATCH 09/20] [Other] Optimize runtime module (#1356)

* Optimize runtime

* fix error

* [Backend] Add option to print tensorrt conversion log (#1386)

Add option to print tensorrt conversion log

Co-authored-by: root <root@bjyz-sys-gpu-kongming3.bjyz.baidu.com>

---------

Co-authored-by: root <root@bjyz-sys-gpu-kongming3.bjyz.baidu.com>
---
 .../runtime/backends/paddle/paddle_backend.cc | 27 +++++++++
 .../runtime/backends/paddle/paddle_backend.h  | 13 +++--
 fastdeploy/runtime/backends/tensorrt/option.h |  3 +
 .../backends/tensorrt/option_pybind.cc        |  1 +
 .../runtime/backends/tensorrt/trt_backend.cc  | 18 ++++--
 fastdeploy/runtime/backends/tensorrt/utils.h  | 18 ++++--
 fastdeploy/runtime/runtime.cc                 | 57 ++-----------------
 7 files changed, 70 insertions(+), 67 deletions(-)

diff --git a/fastdeploy/runtime/backends/paddle/paddle_backend.cc b/fastdeploy/runtime/backends/paddle/paddle_backend.cc
index dc804e926..1fc45e990 100755
--- a/fastdeploy/runtime/backends/paddle/paddle_backend.cc
+++ b/fastdeploy/runtime/backends/paddle/paddle_backend.cc
@@ -98,6 +98,33 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
   }
 }
 
+bool PaddleBackend::Init(const RuntimeOption& runtime_option) {  // Set up the backend from a RuntimeOption; on the supported path the result is whatever InitFromPaddle returns.
+  if (!(Supported(runtime_option.model_format, Backend::PDINFER) && Supported(runtime_option.device, Backend::PDINFER))) {
+    return false;  // Supported() rejected the model format or the device for Backend::PDINFER.
+  }
+  // Work on a copy (runtime_option is const) and mirror its top-level fields into paddle_infer_option.
+  auto option = runtime_option;
+  option.paddle_infer_option.model_file = runtime_option.model_file;
+  option.paddle_infer_option.params_file = runtime_option.params_file;
+  option.paddle_infer_option.model_from_memory_ = runtime_option.model_from_memory_;
+  option.paddle_infer_option.device = runtime_option.device;
+  option.paddle_infer_option.device_id = runtime_option.device_id;
+  option.paddle_infer_option.enable_pinned_memory = runtime_option.enable_pinned_memory;
+  option.paddle_infer_option.external_stream_ = runtime_option.external_stream_;
+  option.paddle_infer_option.trt_option = runtime_option.trt_option;
+  option.paddle_infer_option.trt_option.gpu_id = runtime_option.device_id;  // Overrides the gpu_id that came in with trt_option on the previous line.
+  if (option.model_from_memory_) {
+    return InitFromPaddle(option.model_file, option.params_file, option.paddle_infer_option);  // model_file/params_file are handed over as-is in the buffer argument positions.
+  } else {
+    std::string model_buffer = "";
+    std::string params_buffer = "";
+    FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer), "Failed to read model file from %s.", option.model_file.c_str());
+    FDASSERT(ReadBinaryFromFile(option.params_file, &params_buffer), "Failed to read parameters file from %s.", option.params_file.c_str());
+    return InitFromPaddle(model_buffer, params_buffer, option.paddle_infer_option);
+  }
+  return false;  // Not reached: both branches above return.
+}
+
 bool PaddleBackend::InitFromPaddle(const std::string& model_buffer,
                                    const std::string& params_buffer,
                                    const PaddleBackendOption& option) {
diff --git a/fastdeploy/runtime/backends/paddle/paddle_backend.h b/fastdeploy/runtime/backends/paddle/paddle_backend.h
index 60079fed6..f662ca2b6 100755
--- a/fastdeploy/runtime/backends/paddle/paddle_backend.h
+++ b/fastdeploy/runtime/backends/paddle/paddle_backend.h
@@ -54,12 +54,7 @@ class PaddleBackend : public BaseBackend {
  public:
   PaddleBackend() {}
   virtual ~PaddleBackend() = default;
-  void BuildOption(const PaddleBackendOption& option);
-
-  bool InitFromPaddle(const std::string& model_buffer,
-                     const std::string& params_buffer,
-                     const PaddleBackendOption& option = PaddleBackendOption());
-
+  bool Init(const RuntimeOption& option);
   bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
              bool copy_to_fd = true) override;
 
@@ -77,6 +72,12 @@ class PaddleBackend : public BaseBackend {
   std::vector<TensorInfo> GetOutputInfos() override;
 
  private:
+  void BuildOption(const PaddleBackendOption& option);
+
+  bool InitFromPaddle(const std::string& model_buffer,
+                     const std::string& params_buffer,
+                     const PaddleBackendOption& option = PaddleBackendOption());
+
   void
   CollectShapeRun(paddle_infer::Predictor* predictor,
                   const std::map<std::string, std::vector<int>>& shape) const;
diff --git a/fastdeploy/runtime/backends/tensorrt/option.h b/fastdeploy/runtime/backends/tensorrt/option.h
index 5cee0a7e3..ff28e3e3b 100755
--- a/fastdeploy/runtime/backends/tensorrt/option.h
+++ b/fastdeploy/runtime/backends/tensorrt/option.h
@@ -30,6 +30,9 @@ struct TrtBackendOption {
   /// `max_workspace_size` for TensorRT
   size_t max_workspace_size = 1 << 30;
 
+  /// Enable log while converting onnx model to tensorrt
+  bool enable_log_info = false;
+
   /*
    * @brief Enable half precison inference, on some device not support half precision, it will fallback to float32 mode
    */
diff --git a/fastdeploy/runtime/backends/tensorrt/option_pybind.cc b/fastdeploy/runtime/backends/tensorrt/option_pybind.cc
index d781256a5..f46f27f95 100644
--- a/fastdeploy/runtime/backends/tensorrt/option_pybind.cc
+++ b/fastdeploy/runtime/backends/tensorrt/option_pybind.cc
@@ -21,6 +21,7 @@ void BindTrtOption(pybind11::module& m) {
   pybind11::class_<TrtBackendOption>(m, "TrtBackendOption")
       .def(pybind11::init())
       .def_readwrite("enable_fp16", &TrtBackendOption::enable_fp16)
+      .def_readwrite("enable_log_info", &TrtBackendOption::enable_log_info)
       .def_readwrite("max_batch_size", &TrtBackendOption::max_batch_size)
       .def_readwrite("max_workspace_size",
                      &TrtBackendOption::max_workspace_size)
diff --git a/fastdeploy/runtime/backends/tensorrt/trt_backend.cc b/fastdeploy/runtime/backends/tensorrt/trt_backend.cc
index 74bd3ae4f..99ccbe4c7 100644
--- a/fastdeploy/runtime/backends/tensorrt/trt_backend.cc
+++ b/fastdeploy/runtime/backends/tensorrt/trt_backend.cc
@@ -114,6 +114,13 @@ bool TrtBackend::LoadTrtCache(const std::string& trt_engine_file) {
 }
 
 bool TrtBackend::Init(const RuntimeOption& runtime_option) {
+  auto trt_option = runtime_option.trt_option;
+  trt_option.model_file = runtime_option.model_file;
+  trt_option.params_file = runtime_option.params_file;
+  trt_option.model_format = runtime_option.model_format;
+  trt_option.gpu_id = runtime_option.device_id;
+  trt_option.enable_pinned_memory = runtime_option.enable_pinned_memory;
+  trt_option.external_stream_ = runtime_option.external_stream_;
   if (runtime_option.device != Device::GPU) {
     FDERROR << "TrtBackend only supports Device::GPU, but now it's "
             << runtime_option.device << "." << std::endl;
@@ -130,7 +137,7 @@ bool TrtBackend::Init(const RuntimeOption& runtime_option) {
     if (runtime_option.model_from_memory_) {
       return InitFromPaddle(runtime_option.model_file,
                             runtime_option.params_file,
-                            runtime_option.trt_option);
+                            trt_option);
     } else {
       std::string model_buffer;
       std::string params_buffer;
@@ -141,17 +148,17 @@ bool TrtBackend::Init(const RuntimeOption& runtime_option) {
                "Failed to read parameters file %s.",
                runtime_option.params_file.c_str());
       return InitFromPaddle(model_buffer, params_buffer,
-                            runtime_option.trt_option);
+                            trt_option);
     }
   } else {
     if (runtime_option.model_from_memory_) {
-      return InitFromOnnx(runtime_option.model_file, runtime_option.trt_option);
+      return InitFromOnnx(runtime_option.model_file, trt_option);
     } else {
       std::string model_buffer;
       FDASSERT(ReadBinaryFromFile(runtime_option.model_file, &model_buffer),
                "Failed to read model file %s.",
                runtime_option.model_file.c_str());
-      return InitFromOnnx(model_buffer, runtime_option.trt_option);
+      return InitFromOnnx(model_buffer, trt_option);
     }
   }
   return true;
@@ -525,6 +532,9 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs,
 }
 
 bool TrtBackend::BuildTrtEngine() {
+  if (option_.enable_log_info) {
+    FDTrtLogger::Get()->SetLog(true, true);
+  }
   auto config =
       FDUniquePtr<nvinfer1::IBuilderConfig>(builder_->createBuilderConfig());
   if (!config) {
diff --git a/fastdeploy/runtime/backends/tensorrt/utils.h b/fastdeploy/runtime/backends/tensorrt/utils.h
index 3d4c11f31..b2fe8ee99 100644
--- a/fastdeploy/runtime/backends/tensorrt/utils.h
+++ b/fastdeploy/runtime/backends/tensorrt/utils.h
@@ -220,20 +220,30 @@ class FDTrtLogger : public nvinfer1::ILogger {
     logger = new FDTrtLogger();
     return logger;
   }
+  void SetLog(bool enable_info = false, bool enable_warning = false) {
+    enable_info_ = enable_info;
+    enable_warning_ = enable_warning;
+  }
+
   void log(nvinfer1::ILogger::Severity severity,
            const char* msg) noexcept override {
     if (severity == nvinfer1::ILogger::Severity::kINFO) {
-      // Disable this log
-      //      FDINFO << msg << std::endl;
+      if (enable_info_) {
+        FDINFO << msg << std::endl;
+      }
     } else if (severity == nvinfer1::ILogger::Severity::kWARNING) {
-      // Disable this log
-      //      FDWARNING << msg << std::endl;
+      if (enable_warning_) {
+        FDWARNING << msg << std::endl;
+      }
     } else if (severity == nvinfer1::ILogger::Severity::kERROR) {
       FDERROR << msg << std::endl;
     } else if (severity == nvinfer1::ILogger::Severity::kINTERNAL_ERROR) {
       FDASSERT(false, "%s", msg);
     }
   }
+ private:
+  bool enable_info_ = false;
+  bool enable_warning_ = false;
 };
 
 struct ShapeRangeInfo {
diff --git a/fastdeploy/runtime/runtime.cc b/fastdeploy/runtime/runtime.cc
index 0e6eecf32..df000c9ac 100644
--- a/fastdeploy/runtime/runtime.cc
+++ b/fastdeploy/runtime/runtime.cc
@@ -154,19 +154,10 @@ bool Runtime::Init(const RuntimeOption& _option) {
   } else if (option.backend == Backend::SOPHGOTPU) {
     CreateSophgoNPUBackend();
   } else if (option.backend == Backend::POROS) {
-    FDASSERT(option.device == Device::CPU || option.device == Device::GPU,
-             "Backend::POROS only supports Device::CPU/Device::GPU.");
-    FDASSERT(option.model_format == ModelFormat::TORCHSCRIPT,
-             "Backend::POROS only supports model format of "
-             "ModelFormat::TORCHSCRIPT.");
-    FDINFO << "Runtime initialized with Backend::POROS in " << option.device
-           << "." << std::endl;
-    return true;
+    CreatePorosBackend();
   } else {
-    FDERROR << "Runtime only support "
-               "Backend::ORT/Backend::TRT/Backend::PDINFER/Backend::POROS as "
-               "backend now."
-            << std::endl;
+    std::string msg = Str(GetAvailableBackends());
+    FDERROR << "The compiled FastDeploy only supports " << msg << ", " << option.backend << " is not supported now." << std::endl;
     return false;
   }
   backend_->benchmark_option_ = option.benchmark_option;
@@ -264,43 +255,9 @@ void Runtime::ReleaseModelMemoryBuffer() {
 }
 
 void Runtime::CreatePaddleBackend() {
-  FDASSERT(
-      option.device == Device::CPU || option.device == Device::GPU ||
-          option.device == Device::IPU,
-      "Backend::PDINFER only supports Device::CPU/Device::GPU/Device::IPU.");
-  FDASSERT(
-      option.model_format == ModelFormat::PADDLE,
-      "Backend::PDINFER only supports model format of ModelFormat::PADDLE.");
 #ifdef ENABLE_PADDLE_BACKEND
-  option.paddle_infer_option.model_file = option.model_file;
-  option.paddle_infer_option.params_file = option.params_file;
-  option.paddle_infer_option.model_from_memory_ = option.model_from_memory_;
-  option.paddle_infer_option.device = option.device;
-  option.paddle_infer_option.device_id = option.device_id;
-  option.paddle_infer_option.enable_pinned_memory = option.enable_pinned_memory;
-  option.paddle_infer_option.external_stream_ = option.external_stream_;
-  option.paddle_infer_option.trt_option = option.trt_option;
-  option.paddle_infer_option.trt_option.gpu_id = option.device_id;
   backend_ = utils::make_unique<PaddleBackend>();
-  auto casted_backend = dynamic_cast<PaddleBackend*>(backend_.get());
-
-  if (option.model_from_memory_) {
-    FDASSERT(
-        casted_backend->InitFromPaddle(option.model_file, option.params_file,
-                                       option.paddle_infer_option),
-        "Load model from Paddle failed while initliazing PaddleBackend.");
-    ReleaseModelMemoryBuffer();
-  } else {
-    std::string model_buffer = "";
-    std::string params_buffer = "";
-    FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
-             "Fail to read binary from model file");
-    FDASSERT(ReadBinaryFromFile(option.params_file, &params_buffer),
-             "Fail to read binary from parameter file");
-    FDASSERT(casted_backend->InitFromPaddle(model_buffer, params_buffer,
-                                            option.paddle_infer_option),
-             "Load model from Paddle failed while initliazing PaddleBackend.");
-  }
+  FDASSERT(backend_->Init(option), "Failed to initialized Paddle Inference backend.");
 #else
   FDASSERT(false,
            "PaddleBackend is not available, please compiled with "
@@ -339,12 +296,6 @@ void Runtime::CreateOrtBackend() {
 
 void Runtime::CreateTrtBackend() {
 #ifdef ENABLE_TRT_BACKEND
-  option.trt_option.model_file = option.model_file;
-  option.trt_option.params_file = option.params_file;
-  option.trt_option.model_format = option.model_format;
-  option.trt_option.gpu_id = option.device_id;
-  option.trt_option.enable_pinned_memory = option.enable_pinned_memory;
-  option.trt_option.external_stream_ = option.external_stream_;
   backend_ = utils::make_unique<TrtBackend>();
   FDASSERT(backend_->Init(option), "Failed to initialize TensorRT backend.");
 #else

From 721e6efb8124ff0c8da32831a02c2f23aaa177d9 Mon Sep 17 00:00:00 2001
From: WJJ1995 <wjjisloser@163.com>
Date: Tue, 21 Feb 2023 18:01:13 +0800
Subject: [PATCH 10/20] [Benchmark] Add ClassifyDiff to compare ClassifyResult
 diff (#1381)

* add GPL lisence

* add GPL-3.0 lisence

* add GPL-3.0 lisence

* add GPL-3.0 lisence

* support yolov8

* add pybind for yolov8

* add yolov8 readme

* add cpp benchmark

* add cpu and gpu mem

* public part split

* add runtime mode

* fixed bugs

* add cpu_thread_nums

* deal with comments

* deal with comments

* deal with comments

* rm useless code

* add FASTDEPLOY_DECL

* add FASTDEPLOY_DECL

* fixed for windows

* mv rss to pss

* mv rss to pss

* Update utils.cc

* use thread to collect mem

* Add ResourceUsageMonitor

* rm useless code

* fixed bug

* fixed typo

* update ResourceUsageMonitor

* fixed bug

* fixed bug

* add note for ResourceUsageMonitor

* deal with comments

* add macros

* deal with comments

* deal with comments

* deal with comments

* re-lint

* rm pmap and use mem api

* rm pmap and use mem api

* add mem api

* Add PrintBenchmarkInfo func

* Add PrintBenchmarkInfo func

* Add PrintBenchmarkInfo func

* deal with comments

* fixed enable_paddle_to_trt

* add log for paddle_trt

* support ppcls benchmark

* use new trt option api

* update benchmark info

* simplify benchmark.cc

* simplify benchmark.cc

* deal with comments

* Add ppseg && ppocr benchmark

* add OCR rec img

* add ocr benchmark

* fixed trt shape

* add trt shape

* resolve conflict

* add ENABLE_BENCHMARK define

* Add ClassifyDiff

* Add Resize for ClassifyResult

* deal with comments

---------

Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com>
---
 benchmark/cpp/benchmark_ppcls.cc              | 26 ++++-
 benchmark/cpp/benchmark_ppocr.cc              |  7 ++
 benchmark/cpp/benchmark_precision_ppyolov8.cc | 18 ++--
 benchmark/cpp/flags.h                         |  6 --
 fastdeploy/benchmark/utils.cc                 | 94 +++++++++++++++++--
 fastdeploy/benchmark/utils.h                  | 17 +++-
 fastdeploy/vision/common/result.cc            |  5 +
 fastdeploy/vision/common/result.h             |  3 +
 8 files changed, 150 insertions(+), 26 deletions(-)
 mode change 100644 => 100755 fastdeploy/benchmark/utils.cc

diff --git a/benchmark/cpp/benchmark_ppcls.cc b/benchmark/cpp/benchmark_ppcls.cc
index 734a09a48..f925d52c6 100755
--- a/benchmark/cpp/benchmark_ppcls.cc
+++ b/benchmark/cpp/benchmark_ppcls.cc
@@ -16,6 +16,9 @@
 #include "macros.h"
 #include "option.h"
 
+namespace vision = fastdeploy::vision;
+namespace benchmark = fastdeploy::benchmark;
+
 int main(int argc, char* argv[]) {
 #if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
   // Initialization
@@ -31,9 +34,28 @@ int main(int argc, char* argv[]) {
   auto model_file = FLAGS_model + sep + "inference.pdmodel";
   auto params_file = FLAGS_model + sep + "inference.pdiparams";
   auto config_file = FLAGS_model + sep + "inference_cls.yaml";
-  auto model_ppcls = fastdeploy::vision::classification::PaddleClasModel(
+  auto model_ppcls = vision::classification::PaddleClasModel(
       model_file, params_file, config_file, option);
-  fastdeploy::vision::ClassifyResult res;
+  vision::ClassifyResult res;
+  // Run once at least
+  model_ppcls.Predict(im, &res);
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  // Save result to -> disk.
+  std::string cls_result_path = "ppcls_result.txt";
+  benchmark::ResultManager::SaveClassifyResult(res, cls_result_path);
+  // Load result from <- disk.
+  vision::ClassifyResult res_loaded;
+  benchmark::ResultManager::LoadClassifyResult(&res_loaded, cls_result_path);
+  // Calculate diff between two results.
+  auto cls_diff =
+      benchmark::ResultManager::CalculateDiffStatis(&res, &res_loaded);
+  std::cout << "Labels diff: mean=" << cls_diff.labels.mean
+            << ", max=" << cls_diff.labels.max
+            << ", min=" << cls_diff.labels.min << std::endl;
+  std::cout << "Scores diff: mean=" << cls_diff.scores.mean
+            << ", max=" << cls_diff.scores.max
+            << ", min=" << cls_diff.scores.min << std::endl;
   BENCHMARK_MODEL(model_ppcls, model_ppcls.Predict(im, &res))
 #endif
   return 0;
diff --git a/benchmark/cpp/benchmark_ppocr.cc b/benchmark/cpp/benchmark_ppocr.cc
index 398d0feb0..e81080c54 100755
--- a/benchmark/cpp/benchmark_ppocr.cc
+++ b/benchmark/cpp/benchmark_ppocr.cc
@@ -16,6 +16,13 @@
 #include "macros.h"
 #include "option.h"
 
+// Only for ppocr
+DEFINE_string(det_model, "", "Path of Detection model of PPOCR.");
+DEFINE_string(cls_model, "", "Path of Classification model of PPOCR.");
+DEFINE_string(rec_model, "", "Path of Recognization model of PPOCR.");
+DEFINE_string(rec_label_file, "", "Path of Recognization label file of PPOCR.");
+DEFINE_string(image_rec, "", "Path of Recognization img file of PPOCR.");
+
 int main(int argc, char* argv[]) {
 #if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
   // Initialization
diff --git a/benchmark/cpp/benchmark_precision_ppyolov8.cc b/benchmark/cpp/benchmark_precision_ppyolov8.cc
index caea3be19..e5caf8004 100644
--- a/benchmark/cpp/benchmark_precision_ppyolov8.cc
+++ b/benchmark/cpp/benchmark_precision_ppyolov8.cc
@@ -46,11 +46,15 @@ int main(int argc, char* argv[]) {
   // Calculate diff between two results.
   auto det_diff =
       benchmark::ResultManager::CalculateDiffStatis(&res, &res_loaded);
-  std::cout << "diff: mean=" << det_diff.mean << ",max=" << det_diff.max
-            << ",min=" << det_diff.min << std::endl;
+  std::cout << "Boxes diff: mean=" << det_diff.boxes.mean
+            << ", max=" << det_diff.boxes.max << ", min=" << det_diff.boxes.min
+            << std::endl;
+  std::cout << "Label_ids diff: mean=" << det_diff.labels.mean
+            << ", max=" << det_diff.labels.max
+            << ", min=" << det_diff.labels.min << std::endl;
   // 2. Test tensor diff
   std::cout << "=============== Test tensor diff =================\n";
-  std::vector<vision::DetectionResult> bacth_res;
+  std::vector<vision::DetectionResult> batch_res;
   std::vector<fastdeploy::FDTensor> input_tensors, output_tensors;
   std::vector<cv::Mat> imgs;
   imgs.push_back(im);
@@ -62,7 +66,7 @@ int main(int argc, char* argv[]) {
   input_tensors[2].name = "im_shape";
   input_tensors.pop_back();
   model_ppyolov8.Infer(input_tensors, &output_tensors);
-  model_ppyolov8.GetPostprocessor().Run(output_tensors, &bacth_res);
+  model_ppyolov8.GetPostprocessor().Run(output_tensors, &batch_res);
   // Save tensor to -> disk.
   auto& tensor_dump = output_tensors[0];
   std::string det_tensor_path = "ppyolov8_tensor.txt";
@@ -73,9 +77,9 @@ int main(int argc, char* argv[]) {
   // Calculate diff between two tensors.
   auto det_tensor_diff = benchmark::ResultManager::CalculateDiffStatis(
       &tensor_dump, &tensor_loaded);
-  std::cout << "diff: mean=" << det_tensor_diff.mean
-            << ",max=" << det_tensor_diff.max << ",min=" << det_tensor_diff.min
-            << std::endl;
+  std::cout << "Tensor diff: mean=" << det_tensor_diff.data.mean
+            << ", max=" << det_tensor_diff.data.max
+            << ", min=" << det_tensor_diff.data.min << std::endl;
   // 3. Run profiling
   BENCHMARK_MODEL(model_ppyolov8, model_ppyolov8.Predict(im, &res))
   auto vis_im = vision::VisDetection(im, res);
diff --git a/benchmark/cpp/flags.h b/benchmark/cpp/flags.h
index 59361162f..e32e39eab 100755
--- a/benchmark/cpp/flags.h
+++ b/benchmark/cpp/flags.h
@@ -44,12 +44,6 @@ DEFINE_bool(
 DEFINE_bool(
     collect_memory_info, false, "Whether to collect memory info");
 DEFINE_int32(sampling_interval, 50, "How often to collect memory info(ms).");
-// Only for ppocr
-DEFINE_string(det_model, "", "Path of Detection model of PPOCR.");
-DEFINE_string(cls_model, "", "Path of Classification model of PPOCR.");
-DEFINE_string(rec_model, "", "Path of Recognization model of PPOCR.");
-DEFINE_string(rec_label_file, "", "Path of Recognization label file of PPOCR.");
-DEFINE_string(image_rec, "", "Path of Recognization img file of PPOCR.");
 
 static void PrintUsage() {
   std::cout << "Usage: infer_demo --model model_path --image img_path --device "
diff --git a/fastdeploy/benchmark/utils.cc b/fastdeploy/benchmark/utils.cc
old mode 100644
new mode 100755
index 825cb5977..e4b4e8a84
--- a/fastdeploy/benchmark/utils.cc
+++ b/fastdeploy/benchmark/utils.cc
@@ -321,8 +321,8 @@ TensorDiff ResultManager::CalculateDiffStatis(FDTensor* lhs, FDTensor* rhs) {
       tensor_diff[i] = lhs_data_ptr[i] - rhs_data_ptr[i];
     }
     TensorDiff diff;
-    CalculateStatisInfo<int64_t>(tensor_diff.data(), numel, &(diff.mean),
-                                 &(diff.max), &(diff.min));
+    CalculateStatisInfo<int64_t>(tensor_diff.data(), numel, &(diff.data.mean),
+                                 &(diff.data.max), &(diff.data.min));
     return diff;
   } else if (dtype == FDDataType::INT32) {
     std::vector<int32_t> tensor_diff(numel);
@@ -332,8 +332,8 @@ TensorDiff ResultManager::CalculateDiffStatis(FDTensor* lhs, FDTensor* rhs) {
       tensor_diff[i] = lhs_data_ptr[i] - rhs_data_ptr[i];
     }
     TensorDiff diff;
-    CalculateStatisInfo<float>(tensor_diff.data(), numel, &(diff.mean),
-                               &(diff.max), &(diff.min));
+    CalculateStatisInfo<float>(tensor_diff.data(), numel, &(diff.data.mean),
+                               &(diff.data.max), &(diff.data.min));
     return diff;
   } else {  // FP32
     std::vector<float> tensor_diff(numel);
@@ -343,8 +343,8 @@ TensorDiff ResultManager::CalculateDiffStatis(FDTensor* lhs, FDTensor* rhs) {
       tensor_diff[i] = lhs_data_ptr[i] - rhs_data_ptr[i];
     }
     TensorDiff diff;
-    CalculateStatisInfo<float>(tensor_diff.data(), numel, &(diff.mean),
-                               &(diff.max), &(diff.min));
+    CalculateStatisInfo<float>(tensor_diff.data(), numel, &(diff.data.mean),
+                               &(diff.data.max), &(diff.data.min));
     return diff;
   }
 }
@@ -399,6 +399,42 @@ bool ResultManager::SaveDetectionResult(const vision::DetectionResult& res,
   return true;
 }
 
+bool ResultManager::SaveClassifyResult(const vision::ClassifyResult& res,
+                                       const std::string& path) {
+  if (res.label_ids.empty()) {
+    FDERROR << "ClassifyResult can not be empty!" << std::endl;
+    return false;
+  }
+  std::ofstream fs(path, std::ios::out);
+  if (!fs.is_open()) {
+    FDERROR << "Fail to open file:" << path << std::endl;
+    return false;
+  }
+  fs.precision(20);
+  // label_ids
+  fs << "label_ids" << KEY_VALUE_SEP;
+  for (int i = 0; i < res.label_ids.size(); ++i) {
+    if (i < res.label_ids.size() - 1) {
+      fs << res.label_ids[i] << VALUE_SEP;
+    } else {
+      fs << res.label_ids[i];
+    }
+  }
+  fs << "\n";
+  // scores
+  fs << "scores" << KEY_VALUE_SEP;
+  for (int i = 0; i < res.scores.size(); ++i) {
+    if (i < res.scores.size() - 1) {
+      fs << res.scores[i] << VALUE_SEP;
+    } else {
+      fs << res.scores[i];
+    }
+  }
+  fs << "\n";
+  fs.close();
+  return true;
+}
+
 bool ResultManager::LoadDetectionResult(vision::DetectionResult* res,
                                         const std::string& path) {
   if (!CheckFileExists(path)) {
@@ -432,6 +468,28 @@ bool ResultManager::LoadDetectionResult(vision::DetectionResult* res,
   return true;
 }
 
+bool ResultManager::LoadClassifyResult(vision::ClassifyResult* res,
+                                       const std::string& path) {
+  if (!CheckFileExists(path)) {
+    FDERROR << "Can't found file from" << path << std::endl;
+    return false;
+  }
+  auto lines = ReadLines(path);
+  std::map<std::string, std::vector<std::string>> data;
+  // label_ids
+  data = SplitDataLine(lines[0]);
+  res->Resize(data.begin()->second.size());
+  for (int i = 0; i < data.begin()->second.size(); ++i) {
+    res->label_ids[i] = std::stoi(data.begin()->second[i]);
+  }
+  // scores
+  data = SplitDataLine(lines[1]);
+  for (int i = 0; i < data.begin()->second.size(); ++i) {
+    res->scores[i] = std::stof(data.begin()->second[i]);
+  }
+  return true;
+}
+
 DetectionDiff ResultManager::CalculateDiffStatis(vision::DetectionResult* lhs,
                                                  vision::DetectionResult* rhs,
                                                  float score_threshold) {
@@ -469,11 +527,29 @@ DetectionDiff ResultManager::CalculateDiffStatis(vision::DetectionResult* lhs,
   CalculateStatisInfo<int32_t>(labels_diff.data(), labels_diff.size(),
                                &(diff.labels.mean), &(diff.labels.max),
                                &(diff.labels.min));
-  diff.mean = diff.boxes.mean;
-  diff.max = diff.boxes.max;
-  diff.min = diff.boxes.min;
   return diff;
 }
+
+ClassifyDiff ResultManager::CalculateDiffStatis(vision::ClassifyResult* lhs,
+                                                vision::ClassifyResult* rhs) {
+  const int class_nums = std::min(lhs->label_ids.size(), rhs->label_ids.size());
+  std::vector<float> scores_diff;
+  std::vector<int32_t> labels_diff;
+  for (int i = 0; i < class_nums; ++i) {
+    scores_diff.push_back(lhs->scores[i] - rhs->scores[i]);
+    labels_diff.push_back(lhs->label_ids[i] - rhs->label_ids[i]);
+  }
+
+  ClassifyDiff diff;
+  CalculateStatisInfo<float>(scores_diff.data(), scores_diff.size(),
+                             &(diff.scores.mean), &(diff.scores.max),
+                             &(diff.scores.min));
+  CalculateStatisInfo<int32_t>(labels_diff.data(), labels_diff.size(),
+                               &(diff.labels.mean), &(diff.labels.max),
+                               &(diff.labels.min));
+  return diff;
+}
+
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 
diff --git a/fastdeploy/benchmark/utils.h b/fastdeploy/benchmark/utils.h
index fc7835745..918844b51 100755
--- a/fastdeploy/benchmark/utils.h
+++ b/fastdeploy/benchmark/utils.h
@@ -101,14 +101,21 @@ struct FASTDEPLOY_DECL EvalStatis {
   double max = -1.0;
 };
 
-struct FASTDEPLOY_DECL TensorDiff: public BaseDiff, public EvalStatis {};
+struct FASTDEPLOY_DECL TensorDiff: public BaseDiff {
+  EvalStatis data;
+};
 
 #if defined(ENABLE_VISION)
-struct FASTDEPLOY_DECL DetectionDiff: public BaseDiff, public EvalStatis {
+struct FASTDEPLOY_DECL DetectionDiff: public BaseDiff {
   EvalStatis boxes;
   EvalStatis scores;
   EvalStatis labels;
 };
+
+struct FASTDEPLOY_DECL ClassifyDiff: public BaseDiff {
+  EvalStatis scores;
+  EvalStatis labels;
+};
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 
@@ -127,10 +134,16 @@ struct FASTDEPLOY_DECL ResultManager {
                                   const std::string& path);
   static bool LoadDetectionResult(vision::DetectionResult* res,
                                   const std::string& path);
+  static bool SaveClassifyResult(const vision::ClassifyResult& res,
+                                 const std::string& path);
+  static bool LoadClassifyResult(vision::ClassifyResult* res,
+                                 const std::string& path);
   /// Calculate diff value between two basic results.
   static DetectionDiff CalculateDiffStatis(vision::DetectionResult* lhs,
                                            vision::DetectionResult* rhs,
                                            float score_threshold = 0.3f);
+  static ClassifyDiff CalculateDiffStatis(vision::ClassifyResult* lhs,
+                                          vision::ClassifyResult* rhs);
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 };
diff --git a/fastdeploy/vision/common/result.cc b/fastdeploy/vision/common/result.cc
index d48d9ddc4..1c8d20c0f 100755
--- a/fastdeploy/vision/common/result.cc
+++ b/fastdeploy/vision/common/result.cc
@@ -26,6 +26,11 @@ void ClassifyResult::Clear() {
   scores.clear();
 }
 
+void ClassifyResult::Resize(int size) {
+  label_ids.resize(size);
+  scores.resize(size);
+}
+
 std::string ClassifyResult::Str() {
   std::string out;
   out = "ClassifyResult(\nlabel_ids: ";
diff --git a/fastdeploy/vision/common/result.h b/fastdeploy/vision/common/result.h
index 7c4efde23..6b40bf314 100755
--- a/fastdeploy/vision/common/result.h
+++ b/fastdeploy/vision/common/result.h
@@ -51,6 +51,9 @@ struct FASTDEPLOY_DECL ClassifyResult : public BaseResult {
   std::vector<float> scores;
   ResultType type = ResultType::CLASSIFY;
 
+  /// Resize ClassifyResult data buffer
+  void Resize(int size);
+
   /// Clear ClassifyResult
   void Clear();
 

From cc9031c17e5c0819f7142d9859197d390064fc5f Mon Sep 17 00:00:00 2001
From: Jason <jiangjiajun@baidu.com>
Date: Tue, 21 Feb 2023 20:49:13 +0800
Subject: [PATCH 11/20] Update README.md

---
 tutorials/multi_thread/cpp/single_model/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/multi_thread/cpp/single_model/README.md b/tutorials/multi_thread/cpp/single_model/README.md
index eb56c22d7..74a000f2d 100644
--- a/tutorials/multi_thread/cpp/single_model/README.md
+++ b/tutorials/multi_thread/cpp/single_model/README.md
@@ -1,6 +1,6 @@
 English | [中文](README_CN.md)
 
-# Example of PaddleClas models Python Deployment
+# Example of PaddleClas models C++ Deployment
 
 This directory provides example file `multi_thread.cc` to fast deploy PaddleClas models on CPU/GPU and GPU accelerated by TensorRT.
 

From dd527388bc227f836612352bfcb99aa82b041fe9 Mon Sep 17 00:00:00 2001
From: Jason <jiangjiajun@baidu.com>
Date: Wed, 22 Feb 2023 11:28:04 +0800
Subject: [PATCH 12/20] Update option.h

---
 fastdeploy/runtime/backends/paddle/option.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fastdeploy/runtime/backends/paddle/option.h b/fastdeploy/runtime/backends/paddle/option.h
index fd2975cc9..652b72401 100755
--- a/fastdeploy/runtime/backends/paddle/option.h
+++ b/fastdeploy/runtime/backends/paddle/option.h
@@ -67,10 +67,12 @@ struct PaddleBackendOption {
   /// initialize memory size(MB) for GPU
   int gpu_mem_init_size = 100;
 
+  /// Disable type of operators run on TensorRT
   void DisableTrtOps(const std::vector<std::string>& ops) {
     trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());
   }
 
+  /// Delete pass by name
   void DeletePass(const std::string& pass_name) {
     delete_pass_names.push_back(pass_name);
   }

From 2f8d9c9a57b25f6c5d983b0db8c9ca4b69447cdd Mon Sep 17 00:00:00 2001
From: WJJ1995 <wjjisloser@163.com>
Date: Wed, 22 Feb 2023 14:42:21 +0800
Subject: [PATCH 13/20] [Benchmark]Add SegmentationDiff to compare
 SegmentationResult diff  (#1404)

* avoid mem copy for cpp benchmark

* set CMAKE_BUILD_TYPE to Release

* Add SegmentationDiff

* change pointer to reference

* fixed bug

* cast uint8 to int32
---
 benchmark/cpp/benchmark_ppcls.cc              |   2 +-
 benchmark/cpp/benchmark_ppseg.cc              |  31 +++-
 benchmark/cpp/benchmark_precision_ppyolov8.cc |   6 +-
 fastdeploy/benchmark/utils.cc                 | 155 ++++++++++++++----
 fastdeploy/benchmark/utils.h                  |  27 ++-
 fastdeploy/runtime/backends/tensorrt/utils.h  |   1 +
 6 files changed, 177 insertions(+), 45 deletions(-)
 mode change 100755 => 100644 fastdeploy/benchmark/utils.cc
 mode change 100644 => 100755 fastdeploy/runtime/backends/tensorrt/utils.h

diff --git a/benchmark/cpp/benchmark_ppcls.cc b/benchmark/cpp/benchmark_ppcls.cc
index f925d52c6..b8bfcc989 100755
--- a/benchmark/cpp/benchmark_ppcls.cc
+++ b/benchmark/cpp/benchmark_ppcls.cc
@@ -49,7 +49,7 @@ int main(int argc, char* argv[]) {
   benchmark::ResultManager::LoadClassifyResult(&res_loaded, cls_result_path);
   // Calculate diff between two results.
   auto cls_diff =
-      benchmark::ResultManager::CalculateDiffStatis(&res, &res_loaded);
+      benchmark::ResultManager::CalculateDiffStatis(res, res_loaded);
   std::cout << "Labels diff: mean=" << cls_diff.labels.mean
             << ", max=" << cls_diff.labels.max
             << ", min=" << cls_diff.labels.min << std::endl;
diff --git a/benchmark/cpp/benchmark_ppseg.cc b/benchmark/cpp/benchmark_ppseg.cc
index 23b98b3f5..02968cf2a 100755
--- a/benchmark/cpp/benchmark_ppseg.cc
+++ b/benchmark/cpp/benchmark_ppseg.cc
@@ -16,6 +16,9 @@
 #include "macros.h"
 #include "option.h"
 
+namespace vision = fastdeploy::vision;
+namespace benchmark = fastdeploy::benchmark;
+
 int main(int argc, char* argv[]) {
 #if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
   // Initialization
@@ -34,11 +37,33 @@ int main(int argc, char* argv[]) {
     option.trt_option.SetShape("x", {1, 3, 192, 192}, {1, 3, 192, 192},
                                {1, 3, 192, 192});
   }
-  auto model_ppseg = fastdeploy::vision::segmentation::PaddleSegModel(
+  auto model_ppseg = vision::segmentation::PaddleSegModel(
       model_file, params_file, config_file, option);
-  fastdeploy::vision::SegmentationResult res;
+  vision::SegmentationResult res;
+  // Run once at least
+  model_ppseg.Predict(im, &res);
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  // Save result to -> disk.
+  std::string seg_result_path = "ppseg_result.txt";
+  benchmark::ResultManager::SaveSegmentationResult(res, seg_result_path);
+  // Load result from <- disk.
+  vision::SegmentationResult res_loaded;
+  benchmark::ResultManager::LoadSegmentationResult(&res_loaded,
+                                                   seg_result_path);
+  // Calculate diff between two results.
+  auto seg_diff =
+      benchmark::ResultManager::CalculateDiffStatis(res, res_loaded);
+  std::cout << "Labels diff: mean=" << seg_diff.labels.mean
+            << ", max=" << seg_diff.labels.max
+            << ", min=" << seg_diff.labels.min << std::endl;
+  if (res_loaded.contain_score_map) {
+    std::cout << "Scores diff: mean=" << seg_diff.scores.mean
+              << ", max=" << seg_diff.scores.max
+              << ", min=" << seg_diff.scores.min << std::endl;
+  }
   BENCHMARK_MODEL(model_ppseg, model_ppseg.Predict(im, &res))
-  auto vis_im = fastdeploy::vision::VisSegmentation(im, res, 0.5);
+  auto vis_im = vision::VisSegmentation(im, res, 0.5);
   cv::imwrite("vis_result.jpg", vis_im);
   std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
 #endif
diff --git a/benchmark/cpp/benchmark_precision_ppyolov8.cc b/benchmark/cpp/benchmark_precision_ppyolov8.cc
index e5caf8004..7792d98c6 100644
--- a/benchmark/cpp/benchmark_precision_ppyolov8.cc
+++ b/benchmark/cpp/benchmark_precision_ppyolov8.cc
@@ -45,7 +45,7 @@ int main(int argc, char* argv[]) {
   benchmark::ResultManager::LoadDetectionResult(&res_loaded, det_result_path);
   // Calculate diff between two results.
   auto det_diff =
-      benchmark::ResultManager::CalculateDiffStatis(&res, &res_loaded);
+      benchmark::ResultManager::CalculateDiffStatis(res, res_loaded);
   std::cout << "Boxes diff: mean=" << det_diff.boxes.mean
             << ", max=" << det_diff.boxes.max << ", min=" << det_diff.boxes.min
             << std::endl;
@@ -75,8 +75,8 @@ int main(int argc, char* argv[]) {
   fastdeploy::FDTensor tensor_loaded;
   benchmark::ResultManager::LoadFDTensor(&tensor_loaded, det_tensor_path);
   // Calculate diff between two tensors.
-  auto det_tensor_diff = benchmark::ResultManager::CalculateDiffStatis(
-      &tensor_dump, &tensor_loaded);
+  auto det_tensor_diff =
+      benchmark::ResultManager::CalculateDiffStatis(tensor_dump, tensor_loaded);
   std::cout << "Tensor diff: mean=" << det_tensor_diff.data.mean
             << ", max=" << det_tensor_diff.data.max
             << ", min=" << det_tensor_diff.data.min << std::endl;
diff --git a/fastdeploy/benchmark/utils.cc b/fastdeploy/benchmark/utils.cc
old mode 100755
new mode 100644
index e4b4e8a84..5af28e4b1
--- a/fastdeploy/benchmark/utils.cc
+++ b/fastdeploy/benchmark/utils.cc
@@ -298,16 +298,17 @@ bool ResultManager::LoadFDTensor(FDTensor* tensor, const std::string& path) {
   return true;
 }
 
-TensorDiff ResultManager::CalculateDiffStatis(FDTensor* lhs, FDTensor* rhs) {
-  if (lhs->Numel() != rhs->Numel() || lhs->Dtype() != rhs->Dtype()) {
+TensorDiff ResultManager::CalculateDiffStatis(const FDTensor& lhs,
+                                              const FDTensor& rhs) {
+  if (lhs.Numel() != rhs.Numel() || lhs.Dtype() != rhs.Dtype()) {
     FDASSERT(false,
              "The size and dtype of input FDTensor must be equal!"
              " But got size %d, %d, dtype %s, %s",
-             lhs->Numel(), rhs->Numel(), Str(lhs->Dtype()).c_str(),
-             Str(rhs->Dtype()).c_str())
+             lhs.Numel(), rhs.Numel(), Str(lhs.Dtype()).c_str(),
+             Str(rhs.Dtype()).c_str())
   }
-  FDDataType dtype = lhs->Dtype();
-  int numel = lhs->Numel();
+  FDDataType dtype = lhs.Dtype();
+  int numel = lhs.Numel();
   if (dtype != FDDataType::FP32 && dtype != FDDataType::INT64 &&
       dtype != FDDataType::INT32) {
     FDASSERT(false, "Only support FP32/INT64/INT32 now, but got %s",
@@ -315,8 +316,8 @@ TensorDiff ResultManager::CalculateDiffStatis(FDTensor* lhs, FDTensor* rhs) {
   }
   if (dtype == FDDataType::INT64) {
     std::vector<int64_t> tensor_diff(numel);
-    const int64_t* lhs_data_ptr = static_cast<const int64_t*>(lhs->CpuData());
-    const int64_t* rhs_data_ptr = static_cast<const int64_t*>(rhs->CpuData());
+    const int64_t* lhs_data_ptr = static_cast<const int64_t*>(lhs.CpuData());
+    const int64_t* rhs_data_ptr = static_cast<const int64_t*>(rhs.CpuData());
     for (int i = 0; i < numel; ++i) {
       tensor_diff[i] = lhs_data_ptr[i] - rhs_data_ptr[i];
     }
@@ -326,8 +327,8 @@ TensorDiff ResultManager::CalculateDiffStatis(FDTensor* lhs, FDTensor* rhs) {
     return diff;
   } else if (dtype == FDDataType::INT32) {
     std::vector<int32_t> tensor_diff(numel);
-    const int32_t* lhs_data_ptr = static_cast<const int32_t*>(lhs->CpuData());
-    const int32_t* rhs_data_ptr = static_cast<const int32_t*>(rhs->CpuData());
+    const int32_t* lhs_data_ptr = static_cast<const int32_t*>(lhs.CpuData());
+    const int32_t* rhs_data_ptr = static_cast<const int32_t*>(rhs.CpuData());
     for (int i = 0; i < numel; ++i) {
       tensor_diff[i] = lhs_data_ptr[i] - rhs_data_ptr[i];
     }
@@ -337,8 +338,8 @@ TensorDiff ResultManager::CalculateDiffStatis(FDTensor* lhs, FDTensor* rhs) {
     return diff;
   } else {  // FP32
     std::vector<float> tensor_diff(numel);
-    const float* lhs_data_ptr = static_cast<const float*>(lhs->CpuData());
-    const float* rhs_data_ptr = static_cast<const float*>(rhs->CpuData());
+    const float* lhs_data_ptr = static_cast<const float*>(lhs.CpuData());
+    const float* rhs_data_ptr = static_cast<const float*>(rhs.CpuData());
     for (int i = 0; i < numel; ++i) {
       tensor_diff[i] = lhs_data_ptr[i] - rhs_data_ptr[i];
     }
@@ -435,6 +436,44 @@ bool ResultManager::SaveClassifyResult(const vision::ClassifyResult& res,
   return true;
 }
 
+bool ResultManager::SaveSegmentationResult(
+    const vision::SegmentationResult& res, const std::string& path) {
+  if (res.label_map.empty()) {
+    FDERROR << "SegmentationResult can not be empty!" << std::endl;
+    return false;
+  }
+  std::ofstream fs(path, std::ios::out);
+  if (!fs.is_open()) {
+    FDERROR << "Fail to open file:" << path << std::endl;
+    return false;
+  }
+  fs.precision(20);
+  // label_map
+  fs << "label_map" << KEY_VALUE_SEP;
+  for (int i = 0; i < res.label_map.size(); ++i) {
+    if (i < res.label_map.size() - 1) {
+      fs << static_cast<int32_t>(res.label_map[i]) << VALUE_SEP;
+    } else {
+      fs << static_cast<int32_t>(res.label_map[i]);
+    }
+  }
+  fs << "\n";
+  // score_map
+  if (res.contain_score_map) {
+    fs << "score_map" << KEY_VALUE_SEP;
+    for (int i = 0; i < res.score_map.size(); ++i) {
+      if (i < res.score_map.size() - 1) {
+        fs << res.score_map[i] << VALUE_SEP;
+      } else {
+        fs << res.score_map[i];
+      }
+    }
+    fs << "\n";
+  }
+  fs.close();
+  return true;
+}
+
 bool ResultManager::LoadDetectionResult(vision::DetectionResult* res,
                                         const std::string& path) {
   if (!CheckFileExists(path)) {
@@ -490,32 +529,62 @@ bool ResultManager::LoadClassifyResult(vision::ClassifyResult* res,
   return true;
 }
 
-DetectionDiff ResultManager::CalculateDiffStatis(vision::DetectionResult* lhs,
-                                                 vision::DetectionResult* rhs,
-                                                 float score_threshold) {
+bool ResultManager::LoadSegmentationResult(vision::SegmentationResult* res,
+                                           const std::string& path) {
+  if (!CheckFileExists(path)) {
+    FDERROR << "Can't found file from" << path << std::endl;
+    return false;
+  }
+  auto lines = ReadLines(path);
+  if (lines.size() > 1) {
+    res->contain_score_map = true;
+  }
+  std::map<std::string, std::vector<std::string>> data;
+  // label_map
+  data = SplitDataLine(lines[0]);
+  res->Resize(data.begin()->second.size());
+  for (int i = 0; i < data.begin()->second.size(); ++i) {
+    res->label_map[i] = std::stoi(data.begin()->second[i]);
+  }
+  // score_map
+  if (lines.size() > 1) {
+    data = SplitDataLine(lines[1]);
+    for (int i = 0; i < data.begin()->second.size(); ++i) {
+      res->score_map[i] = std::stof(data.begin()->second[i]);
+    }
+  }
+  return true;
+}
+
+DetectionDiff ResultManager::CalculateDiffStatis(
+    const vision::DetectionResult& lhs, const vision::DetectionResult& rhs,
+    const float& score_threshold) {
+  vision::DetectionResult lhs_sort = lhs;
+  vision::DetectionResult rhs_sort = rhs;
   // lex sort by x(w) & y(h)
-  vision::utils::LexSortDetectionResultByXY(lhs);
-  vision::utils::LexSortDetectionResultByXY(rhs);
+  vision::utils::LexSortDetectionResultByXY(&lhs_sort);
+  vision::utils::LexSortDetectionResultByXY(&rhs_sort);
   // get value diff & trunc it by score_threshold
-  const int boxes_num = std::min(lhs->boxes.size(), rhs->boxes.size());
+  const int boxes_num = std::min(lhs_sort.boxes.size(), rhs_sort.boxes.size());
   std::vector<float> boxes_diff;
   std::vector<float> scores_diff;
   std::vector<int32_t> labels_diff;
   // TODO(qiuyanjun): process the diff of masks.
   for (int i = 0; i < boxes_num; ++i) {
-    if (lhs->scores[i] > score_threshold && rhs->scores[i] > score_threshold) {
-      scores_diff.push_back(lhs->scores[i] - rhs->scores[i]);
-      labels_diff.push_back(lhs->label_ids[i] - rhs->label_ids[i]);
-      boxes_diff.push_back(lhs->boxes[i][0] - rhs->boxes[i][0]);
-      boxes_diff.push_back(lhs->boxes[i][1] - rhs->boxes[i][1]);
-      boxes_diff.push_back(lhs->boxes[i][2] - rhs->boxes[i][2]);
-      boxes_diff.push_back(lhs->boxes[i][3] - rhs->boxes[i][3]);
+    if (lhs_sort.scores[i] > score_threshold &&
+        rhs_sort.scores[i] > score_threshold) {
+      scores_diff.push_back(lhs_sort.scores[i] - rhs_sort.scores[i]);
+      labels_diff.push_back(lhs_sort.label_ids[i] - rhs_sort.label_ids[i]);
+      boxes_diff.push_back(lhs_sort.boxes[i][0] - rhs_sort.boxes[i][0]);
+      boxes_diff.push_back(lhs_sort.boxes[i][1] - rhs_sort.boxes[i][1]);
+      boxes_diff.push_back(lhs_sort.boxes[i][2] - rhs_sort.boxes[i][2]);
+      boxes_diff.push_back(lhs_sort.boxes[i][3] - rhs_sort.boxes[i][3]);
     }
   }
   FDASSERT(boxes_diff.size() > 0,
            "Can't get any valid boxes while score_threshold is %f, "
            "The boxes.size of lhs is %d, the boxes.size of rhs is %d",
-           score_threshold, lhs->boxes.size(), rhs->boxes.size())
+           score_threshold, lhs_sort.boxes.size(), rhs_sort.boxes.size())
 
   DetectionDiff diff;
   CalculateStatisInfo<float>(boxes_diff.data(), boxes_diff.size(),
@@ -530,14 +599,14 @@ DetectionDiff ResultManager::CalculateDiffStatis(vision::DetectionResult* lhs,
   return diff;
 }
 
-ClassifyDiff ResultManager::CalculateDiffStatis(vision::ClassifyResult* lhs,
-                                                vision::ClassifyResult* rhs) {
-  const int class_nums = std::min(lhs->label_ids.size(), rhs->label_ids.size());
+ClassifyDiff ResultManager::CalculateDiffStatis(
+    const vision::ClassifyResult& lhs, const vision::ClassifyResult& rhs) {
+  const int class_nums = std::min(lhs.label_ids.size(), rhs.label_ids.size());
   std::vector<float> scores_diff;
   std::vector<int32_t> labels_diff;
   for (int i = 0; i < class_nums; ++i) {
-    scores_diff.push_back(lhs->scores[i] - rhs->scores[i]);
-    labels_diff.push_back(lhs->label_ids[i] - rhs->label_ids[i]);
+    scores_diff.push_back(lhs.scores[i] - rhs.scores[i]);
+    labels_diff.push_back(lhs.label_ids[i] - rhs.label_ids[i]);
   }
 
   ClassifyDiff diff;
@@ -550,6 +619,30 @@ ClassifyDiff ResultManager::CalculateDiffStatis(vision::ClassifyResult* lhs,
   return diff;
 }
 
+SegmentationDiff ResultManager::CalculateDiffStatis(
+    const vision::SegmentationResult& lhs,
+    const vision::SegmentationResult& rhs) {
+  const int pixel_nums = std::min(lhs.label_map.size(), rhs.label_map.size());
+  std::vector<int32_t> labels_diff;
+  std::vector<float> scores_diff;
+  for (int i = 0; i < pixel_nums; ++i) {
+    labels_diff.push_back(lhs.label_map[i] - rhs.label_map[i]);
+    if (lhs.contain_score_map && rhs.contain_score_map) {
+      scores_diff.push_back(lhs.score_map[i] - rhs.score_map[i]);
+    }
+  }
+  SegmentationDiff diff;
+  CalculateStatisInfo<int32_t>(labels_diff.data(), labels_diff.size(),
+                               &(diff.labels.mean), &(diff.labels.max),
+                               &(diff.labels.min));
+  if (lhs.contain_score_map && rhs.contain_score_map) {
+    CalculateStatisInfo<float>(scores_diff.data(), scores_diff.size(),
+                               &(diff.scores.mean), &(diff.scores.max),
+                               &(diff.scores.min));
+  }
+  return diff;
+}
+
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 
diff --git a/fastdeploy/benchmark/utils.h b/fastdeploy/benchmark/utils.h
index 918844b51..f4d608133 100755
--- a/fastdeploy/benchmark/utils.h
+++ b/fastdeploy/benchmark/utils.h
@@ -116,6 +116,12 @@ struct FASTDEPLOY_DECL ClassifyDiff: public BaseDiff {
   EvalStatis scores;
   EvalStatis labels;
 };
+
+struct FASTDEPLOY_DECL SegmentationDiff: public BaseDiff {
+  EvalStatis scores;
+  EvalStatis labels;
+};
+
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 
@@ -126,8 +132,8 @@ struct FASTDEPLOY_DECL ResultManager {
   static bool SaveFDTensor(const FDTensor& tensor, const std::string& path);
   static bool LoadFDTensor(FDTensor* tensor, const std::string& path);
   /// Calculate diff value between two FDTensor results.
-  static TensorDiff CalculateDiffStatis(FDTensor* lhs,
-                                        FDTensor* rhs);
+  static TensorDiff CalculateDiffStatis(const FDTensor& lhs,
+                                        const FDTensor& rhs);
 #if defined(ENABLE_VISION)
   /// Save & Load functions for basic results.
   static bool SaveDetectionResult(const vision::DetectionResult& res,
@@ -138,12 +144,19 @@ struct FASTDEPLOY_DECL ResultManager {
                                  const std::string& path);
   static bool LoadClassifyResult(vision::ClassifyResult* res,
                                  const std::string& path);
+  static bool SaveSegmentationResult(const vision::SegmentationResult& res,
+                                     const std::string& path);
+  static bool LoadSegmentationResult(vision::SegmentationResult* res,
+                                     const std::string& path);
   /// Calculate diff value between two basic results.
-  static DetectionDiff CalculateDiffStatis(vision::DetectionResult* lhs,
-                                           vision::DetectionResult* rhs,
-                                           float score_threshold = 0.3f);
-  static ClassifyDiff CalculateDiffStatis(vision::ClassifyResult* lhs,
-                                          vision::ClassifyResult* rhs);
+  static DetectionDiff CalculateDiffStatis(const vision::DetectionResult& lhs,
+                                           const vision::DetectionResult& rhs,
+                                           const float& score_threshold = 0.3f);
+  static ClassifyDiff CalculateDiffStatis(const vision::ClassifyResult& lhs,
+                                          const vision::ClassifyResult& rhs);
+  static SegmentationDiff CalculateDiffStatis(
+      const vision::SegmentationResult& lhs,
+      const vision::SegmentationResult& rhs);
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 };
diff --git a/fastdeploy/runtime/backends/tensorrt/utils.h b/fastdeploy/runtime/backends/tensorrt/utils.h
old mode 100644
new mode 100755
index b2fe8ee99..91402b305
--- a/fastdeploy/runtime/backends/tensorrt/utils.h
+++ b/fastdeploy/runtime/backends/tensorrt/utils.h
@@ -241,6 +241,7 @@ class FDTrtLogger : public nvinfer1::ILogger {
       FDASSERT(false, "%s", msg);
     }
   }
+
  private:
   bool enable_info_ = false;
   bool enable_warning_ = false;

From b6349d5d1dc265cd8529c0da43f1b927d6ca54aa Mon Sep 17 00:00:00 2001
From: Zeref996 <825276847@qq.com>
Date: Wed, 22 Feb 2023 14:45:42 +0800
Subject: [PATCH 14/20] refresh version 1.0.4 doc

---
 .../download_prebuilt_libraries.md            | 34 +++++++++--------
 .../download_prebuilt_libraries.md            | 38 ++++++++++---------
 2 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/docs/cn/build_and_install/download_prebuilt_libraries.md b/docs/cn/build_and_install/download_prebuilt_libraries.md
index 903b25045..afbc6b2af 100755
--- a/docs/cn/build_and_install/download_prebuilt_libraries.md
+++ b/docs/cn/build_and_install/download_prebuilt_libraries.md
@@ -20,9 +20,11 @@ FastDeploy提供各平台预编译库，供开发者直接下载安装使用。
 
 支持CPU和Nvidia GPU的部署，默认集成Paddle Inference、ONNX Runtime、OpenVINO以及TensorRT推理后端，Vision视觉模型模块，Text文本NLP模型模块
 
+版本信息：Paddle Inference==2.4-dev5，ONNXRuntime==1.12.0，OpenVINO==2022.2.0.dev20220829，TensorRT==8.5.2.2
+
 ### Python安装
 
-Release版本（当前最新1.0.3）安装
+Release版本（当前最新1.0.4）安装
 ```bash
 pip install fastdeploy-gpu-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html
 ```
@@ -43,8 +45,8 @@ Release版本
 
 | 平台 | 文件 | 说明 |
 | :--- | :--- | :---- |
-| Linux x64 | [fastdeploy-linux-x64-gpu-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-gpu-1.0.3.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2编译产出 |
-| Windows x64 | [fastdeploy-win-x64-gpu-1.0.3.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-1.0.3.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2编译产出 |
+| Linux x64 | [fastdeploy-linux-x64-gpu-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-gpu-1.0.4.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2编译产出 |
+| Windows x64 | [fastdeploy-win-x64-gpu-1.0.4.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-1.0.4.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2编译产出 |
 
 Develop版本（Nightly build）
 
@@ -63,9 +65,11 @@ Develop版本（Nightly build）
 
 仅支持CPU部署，默认集成Paddle Inference、ONNX Runtime、OpenVINO, Vision视觉模型模块(Linux aarch64和Mac OSX下仅集成ONNX Runtime模块)， Text文本NLP模型模块。
 
+版本信息：Paddle Inference==2.4-dev5，ONNXRuntime==1.12.0，OpenVINO==2022.2.0.dev20220829
+
 ### Python安装
 
-Release版本（当前最新1.0.3）安装
+Release版本（当前最新1.0.4）安装
 ```bash
 pip install fastdeploy-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html
 ```
@@ -81,23 +85,23 @@ Release版本
 
 | 平台 | 文件 | 说明 |
 | :--- | :--- | :---- |
-| Linux x64 | [fastdeploy-linux-x64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.3.tgz) | g++ 8.2编译产出 |
-| Windows x64 | [fastdeploy-win-x64-1.0.3.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-1.0.3.zip) | Visual Studio 16 2019编译产出 |
-| Mac OSX x64 | [fastdeploy-osx-x86_64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-1.0.3.tgz) | clang++ 10.0.0编译产出|
-| Mac OSX arm64 | [fastdeploy-osx-arm64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-1.0.3.tgz) | clang++ 13.0.0编译产出 |
-| Linux aarch64 | [fastdeploy-linux-aarch64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-1.0.3.tgz) | gcc 6.3编译产出 |  
-| Android armv7&v8 | [fastdeploy-android-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.3-shared.tgz) | CV API，NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a |
-| Android armv7&v8 | [fastdeploy-android-with-text-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-1.0.3-shared.tgz) | 包含 FastTokenizer、UIE 等 Text API，CV API，NDK 25 及 clang++编译产出, 支持arm64-v8a及armeabi-v7a |
-| Android armv7&v8 | [fastdeploy-android-with-text-only-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-only-1.0.3-shared.tgz) | 仅包含 FastTokenizer、UIE 等 Text API，NDK 25 及 clang++ 编译产出, 不包含 OpenCV 等 CV API。 支持 arm64-v8a 及 armeabi-v7a |
+| Linux x64 | [fastdeploy-linux-x64-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.4.tgz) | g++ 8.2编译产出 |
+| Windows x64 | [fastdeploy-win-x64-1.0.4.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-1.0.4.zip) | Visual Studio 16 2019编译产出 |
+| Mac OSX x64 | [fastdeploy-osx-x86_64-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-1.0.4.tgz) | clang++ 10.0.0编译产出|
+| Mac OSX arm64 | [fastdeploy-osx-arm64-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-1.0.4.tgz) | clang++ 13.0.0编译产出 |
+| Linux aarch64 | [fastdeploy-linux-aarch64-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-1.0.4.tgz) | gcc 6.3编译产出 |  
+| Android armv7&v8 | [fastdeploy-android-1.0.4-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.4-shared.tgz) | CV API，NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a |
+| Android armv7&v8 | [fastdeploy-android-with-text-1.0.4-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-1.0.4-shared.tgz) | 包含 FastTokenizer、UIE 等 Text API，CV API，NDK 25 及 clang++编译产出, 支持arm64-v8a及armeabi-v7a |
+| Android armv7&v8 | [fastdeploy-android-with-text-only-1.0.4-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-only-1.0.4-shared.tgz) | 仅包含 FastTokenizer、UIE 等 Text API，NDK 25 及 clang++ 编译产出, 不包含 OpenCV 等 CV API。 支持 arm64-v8a 及 armeabi-v7a |
 
 ## Java SDK安装
 
-Release版本（Java SDK 目前仅支持Android，版本为1.0.3）  
+Release版本（Java SDK 目前仅支持Android，版本为1.0.4）  
 
 | 平台 | 文件 | 说明 |
 | :--- | :--- | :---- |
-| Android Java SDK | [fastdeploy-android-sdk-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-1.0.3.aar) | CV API，NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 |
-| Android Java SDK | [fastdeploy-android-sdk-with-text-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-with-text-1.0.3.aar) | 包含 FastTokenizer、UIE 等 Text API，CV API，NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 |
+| Android Java SDK | [fastdeploy-android-sdk-1.0.4.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-1.0.4.aar) | CV API，NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 |
+| Android Java SDK | [fastdeploy-android-sdk-with-text-1.0.4.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-with-text-1.0.4.aar) | 包含 FastTokenizer、UIE 等 Text API，CV API，NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 |
 
 
 Develop版本（Nightly build）
diff --git a/docs/en/build_and_install/download_prebuilt_libraries.md b/docs/en/build_and_install/download_prebuilt_libraries.md
index 56bc864a0..db9ca1230 100644
--- a/docs/en/build_and_install/download_prebuilt_libraries.md
+++ b/docs/en/build_and_install/download_prebuilt_libraries.md
@@ -21,9 +21,11 @@ This document is divided into two parts:
 
 FastDeploy supports Computer Vision, Text and NLP model deployment on CPU and Nvidia GPU with Paddle Inference, ONNX Runtime, OpenVINO and TensorRT inference backends.
 
+Version information: Paddle Inference==2.4-dev5, ONNXRuntime==1.12.0, OpenVINO==2022.2.0.dev20220829, TensorRT==8.5.2.2
+
 ### Python SDK
 
-Install the released version（the newest 1.0.3 for now）
+Install the released version（the newest 1.0.4 for now）
 
 ```
 pip install fastdeploy-gpu-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html
@@ -43,12 +45,12 @@ conda config --add channels conda-forge && conda install cudatoolkit=11.2 cudnn=
 
 ### C++ SDK
 
-Install the released version（Latest 1.0.3）
+Install the released version（Latest 1.0.4）
 
 | Platform    | File                                                                                                                  | Description                                               |
 |:----------- |:--------------------------------------------------------------------------------------------------------------------- |:--------------------------------------------------------- |
-| Linux x64 | [fastdeploy-linux-x64-gpu-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-gpu-1.0.3.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2 |
-| Windows x64 | [fastdeploy-win-x64-gpu-1.0.3.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-1.0.3.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2 |
+| Linux x64 | [fastdeploy-linux-x64-gpu-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-gpu-1.0.4.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2 |
+| Windows x64 | [fastdeploy-win-x64-gpu-1.0.4.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-1.0.4.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2 |
 
 Install the Develop version（Nightly build）
 
@@ -68,9 +70,11 @@ Install the Develop version（Nightly build）
 
 FastDeploy supports computer vision, text and NLP model deployment on CPU with Paddle Inference, ONNX Runtime, OpenVINO inference backends. It should be noted that under Linux aarch64 and Mac OSX, only the ONNX Runtime is supported for now.
 
+Version information: Paddle Inference==2.4-dev5, ONNXRuntime==1.12.0, OpenVINO==2022.2.0.dev20220829
+
 ### Python SDK
 
-Install the released version（Latest 1.0.3 for now）
+Install the released version（Latest 1.0.4 for now）
 
 ```
 pip install fastdeploy-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html
@@ -84,27 +88,27 @@ pip install fastdeploy-python==0.0.0 -f https://www.paddlepaddle.org.cn/whl/fast
 
 ### C++ SDK
 
-Install the released version（Latest 1.0.3 for now, Android is 1.0.3）
+Install the released version（Latest 1.0.4 for now, Android is 1.0.4）
 
 | Platform      | File                                                                                                                  | Description                    |
 |:------------- |:--------------------------------------------------------------------------------------------------------------------- |:------------------------------ |
-| Linux x64 | [fastdeploy-linux-x64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.3.tgz) | g++ 8.2 |
-| Windows x64 | [fastdeploy-win-x64-1.0.3.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-1.0.3.zip) | Visual Studio 16 2019 |
-| Mac OSX x64 | [fastdeploy-osx-x86_64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-1.0.3.tgz) | clang++ 10.0.0|
-| Mac OSX arm64 | [fastdeploy-osx-arm64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-1.0.3.tgz) | clang++ 13.0.0 |
-| Linux aarch64 | [fastdeploy-osx-arm64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-1.0.3.tgz) | gcc 6.3 |  
-| Android armv7&v8 | [fastdeploy-android-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.3-shared.tgz) | CV API, NDK 25, clang++, support arm64-v8a and armeabi-v7a  |
-| Android armv7&v8 | [fastdeploy-android-with-text-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-1.0.3-shared.tgz) | contains Text API, such as FastTokenizer and UIE, CV API, NDK 25, clang++, support arm64-v8a and armeabi-v7a  |
-| Android armv7&v8 | [fastdeploy-android-with-text-only-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-only-1.0.3-shared.tgz) | only contains Text API, such as FastTokenizer and UIE, NDK 25, clang++, does not contain CV API, support arm64-v8a and armeabi-v7a  |
+| Linux x64 | [fastdeploy-linux-x64-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.4.tgz) | g++ 8.2 |
+| Windows x64 | [fastdeploy-win-x64-1.0.4.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-1.0.4.zip) | Visual Studio 16 2019 |
+| Mac OSX x64 | [fastdeploy-osx-x86_64-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-1.0.4.tgz) | clang++ 10.0.0|
+| Mac OSX arm64 | [fastdeploy-osx-arm64-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-1.0.4.tgz) | clang++ 13.0.0 |
+| Linux aarch64 | [fastdeploy-linux-aarch64-1.0.4.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-1.0.4.tgz) | gcc 6.3 |  
+| Android armv7&v8 | [fastdeploy-android-1.0.4-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.4-shared.tgz) | CV API, NDK 25, clang++, support arm64-v8a and armeabi-v7a  |
+| Android armv7&v8 | [fastdeploy-android-with-text-1.0.4-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-1.0.4-shared.tgz) | contains Text API, such as FastTokenizer and UIE, CV API, NDK 25, clang++, support arm64-v8a and armeabi-v7a  |
+| Android armv7&v8 | [fastdeploy-android-with-text-only-1.0.4-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-only-1.0.4-shared.tgz) | only contains Text API, such as FastTokenizer and UIE, NDK 25, clang++, does not contain CV API, support arm64-v8a and armeabi-v7a  |
 
 ## Java SDK
 
-Install the released version（Android is 1.0.3 pre-release）
+Install the released version（Android is 1.0.4 pre-release）
 
 | Platform | File | Description |
 | :--- | :--- | :---- |
-| Android Java SDK | [fastdeploy-android-sdk-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-1.0.3.aar) | CV API, NDK 20, minSdkVersion 15, targetSdkVersion 28 |
-| Android Java SDK | [fastdeploy-android-sdk-with-text-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-with-text-1.0.3.aar) | contains Text API, such as FastTokenizer and UIE, CV API, NDK 20, minSdkVersion 15, targetSdkVersion 28 |
+| Android Java SDK | [fastdeploy-android-sdk-1.0.4.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-1.0.4.aar) | CV API, NDK 20, minSdkVersion 15, targetSdkVersion 28 |
+| Android Java SDK | [fastdeploy-android-sdk-with-text-1.0.4.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-with-text-1.0.4.aar) | contains Text API, such as FastTokenizer and UIE, CV API, NDK 20, minSdkVersion 15, targetSdkVersion 28 |
 
 Install the Develop version（Nightly build）
 

From 1b1f68499ebff1d50421070168c522e0a515c261 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Wed, 22 Feb 2023 07:06:19 +0000
Subject: [PATCH 15/20] fix trt version bug

---
 serving/scripts/build.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/serving/scripts/build.sh b/serving/scripts/build.sh
index bf45b1820..5f16b8f22 100644
--- a/serving/scripts/build.sh
+++ b/serving/scripts/build.sh
@@ -85,6 +85,7 @@ nvidia-docker run -i --rm --name ${docker_name} \
            -v`pwd`/..:/workspace/fastdeploy \
            -e "http_proxy=${http_proxy}" \
            -e "https_proxy=${https_proxy}" \
+           -e "trt_version=${trt_version}"\
            nvcr.io/nvidia/tritonserver:21.10-py3-min \
            bash -c \
            'cd /workspace/fastdeploy/python;

From 91a1c72f98aa1a044660604b90c18408f4c72e30 Mon Sep 17 00:00:00 2001
From: Wang Xinyu <shaywxy@gmail.com>
Date: Wed, 22 Feb 2023 19:39:11 +0800
Subject: [PATCH 16/20] [CVCUDA] PP-OCR detector preprocessor integrate CV-CUDA
 (#1382)

* move manager initialized_ flag to ppcls

* update dbdetector preprocess api

* declare processor op

* ppocr detector preprocessor support cvcuda

* move cvcuda op to class member

* ppcls use manager register api

* refactor det preprocessor init api

* add set preprocessor api

* add create processor macro

* new processor call api

* ppcls preprocessor init resize on cpu

* ppocr detector preprocessor set normalize api

* revert ppcls pybind

* remove dbdetector set preprocessor

* refine dbdetector preprocessor includes

* remove mean std in py constructor

* add comments

* update comment

* Update __init__.py
---
 .../classification/ppcls/preprocessor.cc      |  11 +-
 .../classification/ppcls/preprocessor.h       |   1 +
 fastdeploy/vision/common/processors/base.cc   |  17 +-
 fastdeploy/vision/common/processors/base.h    |   9 +-
 .../vision/common/processors/center_crop.cc   |   9 +-
 .../vision/common/processors/center_crop.h    |   8 +
 .../vision/common/processors/manager.cc       |  12 +-
 fastdeploy/vision/common/processors/manager.h |   3 +-
 fastdeploy/vision/common/processors/mat.h     |   1 +
 .../vision/common/processors/mat_batch.cc     |   1 +
 .../vision/common/processors/mat_batch.h      |   1 +
 .../processors/normalize_and_permute.cu       |   2 +
 fastdeploy/vision/common/processors/pad.cc    |  54 +++
 fastdeploy/vision/common/processors/pad.h     |  19 ++
 fastdeploy/vision/common/processors/resize.cc |  11 +-
 fastdeploy/vision/common/processors/resize.h  |   8 +
 .../common/processors/resize_by_short.cc      |  16 +-
 .../common/processors/resize_by_short.h       |   8 +
 fastdeploy/vision/ocr/ppocr/dbdetector.cc     |  29 +-
 .../vision/ocr/ppocr/det_preprocessor.cc      |  94 +++---
 .../vision/ocr/ppocr/det_preprocessor.h       |  52 +--
 .../vision/ocr/ppocr/ocrmodel_pybind.cc       | 318 +++++++++++-------
 .../vision/classification/ppcls/__init__.py   |   1 -
 .../fastdeploy/vision/ocr/ppocr/__init__.py   |  93 ++---
 24 files changed, 448 insertions(+), 330 deletions(-)
 mode change 100755 => 100644 fastdeploy/vision/ocr/ppocr/dbdetector.cc
 mode change 100755 => 100644 fastdeploy/vision/ocr/ppocr/ocrmodel_pybind.cc

diff --git a/fastdeploy/vision/classification/ppcls/preprocessor.cc b/fastdeploy/vision/classification/ppcls/preprocessor.cc
index ef0da9ce5..619ba87fd 100644
--- a/fastdeploy/vision/classification/ppcls/preprocessor.cc
+++ b/fastdeploy/vision/classification/ppcls/preprocessor.cc
@@ -14,7 +14,6 @@
 
 #include "fastdeploy/vision/classification/ppcls/preprocessor.h"
 
-#include "fastdeploy/function/concat.h"
 #include "yaml-cpp/yaml.h"
 
 namespace fastdeploy {
@@ -102,13 +101,17 @@ void PaddleClasPreprocessor::DisablePermute() {
 
 bool PaddleClasPreprocessor::Apply(FDMatBatch* image_batch,
                                    std::vector<FDTensor>* outputs) {
+  if (!initialized_) {
+    FDERROR << "The preprocessor is not initialized." << std::endl;
+    return false;
+  }
   for (size_t j = 0; j < processors_.size(); ++j) {
-    ProcLib lib = ProcLib::DEFAULT;
+    image_batch->proc_lib = proc_lib_;
     if (initial_resize_on_cpu_ && j == 0 &&
         processors_[j]->Name().find("Resize") == 0) {
-      lib = ProcLib::OPENCV;
+      image_batch->proc_lib = ProcLib::OPENCV;
     }
-    if (!(*(processors_[j].get()))(image_batch, lib)) {
+    if (!(*(processors_[j].get()))(image_batch)) {
       FDERROR << "Failed to processs image in " << processors_[j]->Name() << "."
               << std::endl;
       return false;
diff --git a/fastdeploy/vision/classification/ppcls/preprocessor.h b/fastdeploy/vision/classification/ppcls/preprocessor.h
index fc347fc3d..ac2e82ef1 100644
--- a/fastdeploy/vision/classification/ppcls/preprocessor.h
+++ b/fastdeploy/vision/classification/ppcls/preprocessor.h
@@ -55,6 +55,7 @@ class FASTDEPLOY_DECL PaddleClasPreprocessor : public ProcessorManager {
 
  private:
   bool BuildPreprocessPipelineFromConfig();
+  bool initialized_ = false;
   std::vector<std::shared_ptr<Processor>> processors_;
   // for recording the switch of hwc2chw
   bool disable_permute_ = false;
diff --git a/fastdeploy/vision/common/processors/base.cc b/fastdeploy/vision/common/processors/base.cc
index 9c4a0177e..7e34d07bf 100644
--- a/fastdeploy/vision/common/processors/base.cc
+++ b/fastdeploy/vision/common/processors/base.cc
@@ -20,9 +20,9 @@
 namespace fastdeploy {
 namespace vision {
 
-bool Processor::operator()(FDMat* mat, ProcLib lib) {
-  ProcLib target = lib;
-  if (lib == ProcLib::DEFAULT) {
+bool Processor::operator()(FDMat* mat) {
+  ProcLib target = mat->proc_lib;
+  if (mat->proc_lib == ProcLib::DEFAULT) {
     target = DefaultProcLib::default_lib;
   }
   if (target == ProcLib::FLYCV) {
@@ -52,9 +52,14 @@ bool Processor::operator()(FDMat* mat, ProcLib lib) {
   return ImplByOpenCV(mat);
 }
 
-bool Processor::operator()(FDMatBatch* mat_batch, ProcLib lib) {
-  ProcLib target = lib;
-  if (lib == ProcLib::DEFAULT) {
+bool Processor::operator()(FDMat* mat, ProcLib lib) {
+  mat->proc_lib = lib;  // record the requested library on the mat itself
+  return operator()(mat);  // delegate to the single-argument overload
+}
+
+bool Processor::operator()(FDMatBatch* mat_batch) {
+  ProcLib target = mat_batch->proc_lib;
+  if (mat_batch->proc_lib == ProcLib::DEFAULT) {
     target = DefaultProcLib::default_lib;
   }
   if (target == ProcLib::FLYCV) {
diff --git a/fastdeploy/vision/common/processors/base.h b/fastdeploy/vision/common/processors/base.h
index 786e88672..a1c64a2c1 100644
--- a/fastdeploy/vision/common/processors/base.h
+++ b/fastdeploy/vision/common/processors/base.h
@@ -100,10 +100,13 @@ class FASTDEPLOY_DECL Processor {
     return true;
   }
 
-  virtual bool operator()(FDMat* mat, ProcLib lib = ProcLib::DEFAULT);
+  virtual bool operator()(FDMat* mat);
 
-  virtual bool operator()(FDMatBatch* mat_batch,
-                          ProcLib lib = ProcLib::DEFAULT);
+  // This function is for backward compatibility, will be removed in the near
+  // future, please use operator()(FDMat* mat) instead and set proc_lib in mat.
+  virtual bool operator()(FDMat* mat, ProcLib lib);
+
+  virtual bool operator()(FDMatBatch* mat_batch);
 };
 
 }  // namespace vision
diff --git a/fastdeploy/vision/common/processors/center_crop.cc b/fastdeploy/vision/common/processors/center_crop.cc
index 1857f7a81..f220ac376 100644
--- a/fastdeploy/vision/common/processors/center_crop.cc
+++ b/fastdeploy/vision/common/processors/center_crop.cc
@@ -14,12 +14,6 @@
 
 #include "fastdeploy/vision/common/processors/center_crop.h"
 
-#ifdef ENABLE_CVCUDA
-#include <cvcuda/OpCustomCrop.hpp>
-
-#include "fastdeploy/vision/common/processors/cvcuda_utils.h"
-#endif
-
 namespace fastdeploy {
 namespace vision {
 
@@ -75,9 +69,8 @@ bool CenterCrop::ImplByCvCuda(FDMat* mat) {
 
   int offset_x = static_cast<int>((mat->Width() - width_) / 2);
   int offset_y = static_cast<int>((mat->Height() - height_) / 2);
-  cvcuda::CustomCrop crop_op;
   NVCVRectI crop_roi = {offset_x, offset_y, width_, height_};
-  crop_op(mat->Stream(), src_tensor, dst_tensor, crop_roi);
+  cvcuda_crop_op_(mat->Stream(), src_tensor, dst_tensor, crop_roi);
 
   mat->SetTensor(mat->output_cache);
   mat->SetWidth(width_);
diff --git a/fastdeploy/vision/common/processors/center_crop.h b/fastdeploy/vision/common/processors/center_crop.h
index 3ca3a7391..0eddde0ed 100644
--- a/fastdeploy/vision/common/processors/center_crop.h
+++ b/fastdeploy/vision/common/processors/center_crop.h
@@ -15,6 +15,11 @@
 #pragma once
 
 #include "fastdeploy/vision/common/processors/base.h"
+#ifdef ENABLE_CVCUDA
+#include <cvcuda/OpCustomCrop.hpp>
+
+#include "fastdeploy/vision/common/processors/cvcuda_utils.h"
+#endif
 
 namespace fastdeploy {
 namespace vision {
@@ -38,6 +43,9 @@ class FASTDEPLOY_DECL CenterCrop : public Processor {
  private:
   int height_;
   int width_;
+#ifdef ENABLE_CVCUDA
+  cvcuda::CustomCrop cvcuda_crop_op_;
+#endif
 };
 
 }  // namespace vision
diff --git a/fastdeploy/vision/common/processors/manager.cc b/fastdeploy/vision/common/processors/manager.cc
index 070354da1..2f751ab80 100644
--- a/fastdeploy/vision/common/processors/manager.cc
+++ b/fastdeploy/vision/common/processors/manager.cc
@@ -31,14 +31,14 @@ void ProcessorManager::UseCuda(bool enable_cv_cuda, int gpu_id) {
   }
   FDASSERT(cudaStreamCreate(&stream_) == cudaSuccess,
            "[ERROR] Error occurs while creating cuda stream.");
-  DefaultProcLib::default_lib = ProcLib::CUDA;
+  proc_lib_ = ProcLib::CUDA;
 #else
   FDASSERT(false, "FastDeploy didn't compile with WITH_GPU.");
 #endif
 
   if (enable_cv_cuda) {
 #ifdef ENABLE_CVCUDA
-    DefaultProcLib::default_lib = ProcLib::CVCUDA;
+    proc_lib_ = ProcLib::CVCUDA;
 #else
     FDASSERT(false, "FastDeploy didn't compile with CV-CUDA.");
 #endif
@@ -46,16 +46,11 @@ void ProcessorManager::UseCuda(bool enable_cv_cuda, int gpu_id) {
 }
 
 bool ProcessorManager::CudaUsed() {
-  return (DefaultProcLib::default_lib == ProcLib::CUDA ||
-          DefaultProcLib::default_lib == ProcLib::CVCUDA);
+  return (proc_lib_ == ProcLib::CUDA || proc_lib_ == ProcLib::CVCUDA);
 }
 
 bool ProcessorManager::Run(std::vector<FDMat>* images,
                            std::vector<FDTensor>* outputs) {
-  if (!initialized_) {
-    FDERROR << "The preprocessor is not initialized." << std::endl;
-    return false;
-  }
   if (images->size() == 0) {
     FDERROR << "The size of input images should be greater than 0."
             << std::endl;
@@ -70,6 +65,7 @@ bool ProcessorManager::Run(std::vector<FDMat>* images,
   FDMatBatch image_batch(images);
   image_batch.input_cache = &batch_input_cache_;
   image_batch.output_cache = &batch_output_cache_;
+  image_batch.proc_lib = proc_lib_;
 
   for (size_t i = 0; i < images->size(); ++i) {
     if (CudaUsed()) {
diff --git a/fastdeploy/vision/common/processors/manager.h b/fastdeploy/vision/common/processors/manager.h
index 48b5575c4..aa6dde56a 100644
--- a/fastdeploy/vision/common/processors/manager.h
+++ b/fastdeploy/vision/common/processors/manager.h
@@ -17,6 +17,7 @@
 #include "fastdeploy/utils/utils.h"
 #include "fastdeploy/vision/common/processors/mat.h"
 #include "fastdeploy/vision/common/processors/mat_batch.h"
+#include "fastdeploy/vision/common/processors/base.h"
 
 namespace fastdeploy {
 namespace vision {
@@ -78,7 +79,7 @@ class FASTDEPLOY_DECL ProcessorManager {
                      std::vector<FDTensor>* outputs) = 0;
 
  protected:
-  bool initialized_ = false;
+  ProcLib proc_lib_ = ProcLib::DEFAULT;
 
  private:
 #ifdef WITH_GPU
diff --git a/fastdeploy/vision/common/processors/mat.h b/fastdeploy/vision/common/processors/mat.h
index 13ae76abd..85f121b90 100644
--- a/fastdeploy/vision/common/processors/mat.h
+++ b/fastdeploy/vision/common/processors/mat.h
@@ -145,6 +145,7 @@ struct FASTDEPLOY_DECL Mat {
   ProcLib mat_type = ProcLib::OPENCV;
   Layout layout = Layout::HWC;
   Device device = Device::CPU;
+  ProcLib proc_lib = ProcLib::DEFAULT;
 
   // Create FD Mat from FD Tensor. This method only create a
   // new FD Mat with zero copy and it's data pointer is reference
diff --git a/fastdeploy/vision/common/processors/mat_batch.cc b/fastdeploy/vision/common/processors/mat_batch.cc
index f625d6d4d..aa154f334 100644
--- a/fastdeploy/vision/common/processors/mat_batch.cc
+++ b/fastdeploy/vision/common/processors/mat_batch.cc
@@ -67,6 +67,7 @@ FDTensor* CreateCachedGpuInputTensor(FDMatBatch* mat_batch) {
       FDTensor* tensor = CreateCachedGpuInputTensor(&(*mats)[i]);
       (*mats)[i].SetTensor(tensor);
     }
+    mat_batch->device = Device::GPU;
     return mat_batch->Tensor();
   } else {
     FDASSERT(false, "FDMat is on unsupported device: %d", src->device);
diff --git a/fastdeploy/vision/common/processors/mat_batch.h b/fastdeploy/vision/common/processors/mat_batch.h
index 090d8bb59..9d876a911 100644
--- a/fastdeploy/vision/common/processors/mat_batch.h
+++ b/fastdeploy/vision/common/processors/mat_batch.h
@@ -60,6 +60,7 @@ struct FASTDEPLOY_DECL FDMatBatch {
   ProcLib mat_type = ProcLib::OPENCV;
   FDMatBatchLayout layout = FDMatBatchLayout::NHWC;
   Device device = Device::CPU;
+  ProcLib proc_lib = ProcLib::DEFAULT;
 
   // False: the data is stored in the mats separately
   // True: the data is stored in the fd_tensor continuously in 4 dimensions
diff --git a/fastdeploy/vision/common/processors/normalize_and_permute.cu b/fastdeploy/vision/common/processors/normalize_and_permute.cu
index 7f6320ba4..da3f4ffb1 100644
--- a/fastdeploy/vision/common/processors/normalize_and_permute.cu
+++ b/fastdeploy/vision/common/processors/normalize_and_permute.cu
@@ -85,6 +85,8 @@ bool NormalizeAndPermute::ImplByCuda(FDMatBatch* mat_batch) {
   // NHWC -> NCHW
   std::swap(mat_batch->output_cache->shape[1],
             mat_batch->output_cache->shape[3]);
+  std::swap(mat_batch->output_cache->shape[2],
+            mat_batch->output_cache->shape[3]);
 
   // Copy alpha and beta to GPU
   gpu_alpha_.Resize({1, 1, static_cast<int>(alpha_.size())}, FDDataType::FP32,
diff --git a/fastdeploy/vision/common/processors/pad.cc b/fastdeploy/vision/common/processors/pad.cc
index 278e8d4b7..2db1fba20 100644
--- a/fastdeploy/vision/common/processors/pad.cc
+++ b/fastdeploy/vision/common/processors/pad.cc
@@ -91,6 +91,60 @@ bool Pad::ImplByFlyCV(Mat* mat) {
 }
 #endif
 
+#ifdef ENABLE_CVCUDA
+// Pads the mat on GPU with the CV-CUDA CopyMakeBorder op (constant border).
+// Returns false (after logging) if the layout is not HWC, there are more than
+// 4 channels, or the channel count differs from the number of padding values.
+bool Pad::ImplByCvCuda(FDMat* mat) {
+  if (mat->layout != Layout::HWC) {
+    FDERROR << "Pad: The input data must be Layout::HWC format!" << std::endl;
+    return false;
+  }
+  if (mat->Channels() > 4) {
+    FDERROR << "Pad: Only support channels <= 4." << std::endl;
+    return false;
+  }
+  if (static_cast<size_t>(mat->Channels()) != value_.size()) {
+    FDERROR << "Pad: Require input channels equals to size of padding value, "
+               "but now channels = " << mat->Channels()
+            << ", the size of padding values = " << value_.size() << "."
+            << std::endl;
+    return false;
+  }
+
+  // Unused trailing components of the border value stay zero.
+  float comps[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+  for (size_t i = 0; i < value_.size(); ++i) {
+    comps[i] = value_[i];
+  }
+  float4 value = make_float4(comps[0], comps[1], comps[2], comps[3]);
+
+  // Prepare input tensor
+  FDTensor* src = CreateCachedGpuInputTensor(mat);
+  auto src_tensor = CreateCvCudaTensorWrapData(*src);
+
+  // Padded size: rows grow by top/bottom, columns grow by left/right.
+  int height = mat->Height() + top_ + bottom_;
+  int width = mat->Width() + left_ + right_;
+
+  // Prepare output tensor
+  mat->output_cache->Resize({height, width, mat->Channels()}, mat->Type(),
+                            "output_cache", Device::GPU);
+  auto dst_tensor = CreateCvCudaTensorWrapData(*(mat->output_cache));
+
+  // Only top/left are passed; dst_tensor's shape already includes bottom/right.
+  cvcuda_pad_op_(mat->Stream(), src_tensor, dst_tensor, top_, left_,
+                 NVCV_BORDER_CONSTANT, value);
+
+  mat->SetTensor(mat->output_cache);
+  mat->SetWidth(width);
+  mat->SetHeight(height);
+  mat->device = Device::GPU;
+  mat->mat_type = ProcLib::CVCUDA;
+  return true;
+}
+#endif
+
 bool Pad::Run(Mat* mat, const int& top, const int& bottom, const int& left,
               const int& right, const std::vector<float>& value, ProcLib lib) {
   auto p = Pad(top, bottom, left, right, value);
diff --git a/fastdeploy/vision/common/processors/pad.h b/fastdeploy/vision/common/processors/pad.h
index 661632e77..5d025c720 100644
--- a/fastdeploy/vision/common/processors/pad.h
+++ b/fastdeploy/vision/common/processors/pad.h
@@ -15,6 +15,11 @@
 #pragma once
 
 #include "fastdeploy/vision/common/processors/base.h"
+#ifdef ENABLE_CVCUDA
+#include <cvcuda/OpCopyMakeBorder.hpp>
+
+#include "fastdeploy/vision/common/processors/cvcuda_utils.h"
+#endif
 
 namespace fastdeploy {
 namespace vision {
@@ -32,6 +37,9 @@ class FASTDEPLOY_DECL Pad : public Processor {
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
   bool ImplByFlyCV(Mat* mat);
+#endif
+#ifdef ENABLE_CVCUDA
+  bool ImplByCvCuda(FDMat* mat);
 #endif
   std::string Name() { return "Pad"; }
 
@@ -39,12 +47,23 @@ class FASTDEPLOY_DECL Pad : public Processor {
                   const int& right, const std::vector<float>& value,
                   ProcLib lib = ProcLib::DEFAULT);
 
+  bool SetPaddingSize(int top, int bottom, int left, int right) {
+    top_ = top;  // padding added to the height, top side
+    bottom_ = bottom;  // padding added to the height, bottom side
+    left_ = left;  // padding added to the width, left side
+    right_ = right;  // padding added to the width, right side
+    return true;  // always true; the arguments are stored without validation
+  }
+
  private:
   int top_;
   int bottom_;
   int left_;
   int right_;
   std::vector<float> value_;
+#ifdef ENABLE_CVCUDA
+  cvcuda::CopyMakeBorder cvcuda_pad_op_;
+#endif
 };
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/resize.cc b/fastdeploy/vision/common/processors/resize.cc
index 0de6ddfc7..806eab643 100644
--- a/fastdeploy/vision/common/processors/resize.cc
+++ b/fastdeploy/vision/common/processors/resize.cc
@@ -14,12 +14,6 @@
 
 #include "fastdeploy/vision/common/processors/resize.h"
 
-#ifdef ENABLE_CVCUDA
-#include <cvcuda/OpResize.hpp>
-
-#include "fastdeploy/vision/common/processors/cvcuda_utils.h"
-#endif
-
 namespace fastdeploy {
 namespace vision {
 
@@ -152,9 +146,8 @@ bool Resize::ImplByCvCuda(FDMat* mat) {
   auto dst_tensor = CreateCvCudaTensorWrapData(*(mat->output_cache));
 
   // CV-CUDA Interp value is compatible with OpenCV
-  cvcuda::Resize resize_op;
-  resize_op(mat->Stream(), src_tensor, dst_tensor,
-            NVCVInterpolationType(interp_));
+  cvcuda_resize_op_(mat->Stream(), src_tensor, dst_tensor,
+                    NVCVInterpolationType(interp_));
 
   mat->SetTensor(mat->output_cache);
   mat->SetWidth(width_);
diff --git a/fastdeploy/vision/common/processors/resize.h b/fastdeploy/vision/common/processors/resize.h
index 2b4f88a35..607287d80 100644
--- a/fastdeploy/vision/common/processors/resize.h
+++ b/fastdeploy/vision/common/processors/resize.h
@@ -15,6 +15,11 @@
 #pragma once
 
 #include "fastdeploy/vision/common/processors/base.h"
+#ifdef ENABLE_CVCUDA
+#include <cvcuda/OpResize.hpp>
+
+#include "fastdeploy/vision/common/processors/cvcuda_utils.h"
+#endif
 
 namespace fastdeploy {
 namespace vision {
@@ -61,6 +66,9 @@ class FASTDEPLOY_DECL Resize : public Processor {
   float scale_h_ = -1.0;
   int interp_ = 1;
   bool use_scale_ = false;
+#ifdef ENABLE_CVCUDA
+  cvcuda::Resize cvcuda_resize_op_;
+#endif
 };
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/resize_by_short.cc b/fastdeploy/vision/common/processors/resize_by_short.cc
index 535652fc7..7fe644e0d 100644
--- a/fastdeploy/vision/common/processors/resize_by_short.cc
+++ b/fastdeploy/vision/common/processors/resize_by_short.cc
@@ -14,12 +14,6 @@
 
 #include "fastdeploy/vision/common/processors/resize_by_short.h"
 
-#ifdef ENABLE_CVCUDA
-#include <cvcuda/OpResize.hpp>
-
-#include "fastdeploy/vision/common/processors/cvcuda_utils.h"
-#endif
-
 namespace fastdeploy {
 namespace vision {
 
@@ -102,9 +96,8 @@ bool ResizeByShort::ImplByCvCuda(FDMat* mat) {
   auto dst_tensor = CreateCvCudaTensorWrapData(*(mat->output_cache));
 
   // CV-CUDA Interp value is compatible with OpenCV
-  cvcuda::Resize resize_op;
-  resize_op(mat->Stream(), src_tensor, dst_tensor,
-            NVCVInterpolationType(interp_));
+  cvcuda_resize_op_(mat->Stream(), src_tensor, dst_tensor,
+                    NVCVInterpolationType(interp_));
 
   mat->SetTensor(mat->output_cache);
   mat->SetWidth(width);
@@ -144,9 +137,8 @@ bool ResizeByShort::ImplByCvCuda(FDMatBatch* mat_batch) {
   CreateCvCudaImageBatchVarShape(dst_tensors, dst_batch);
 
   // CV-CUDA Interp value is compatible with OpenCV
-  cvcuda::Resize resize_op;
-  resize_op(mat_batch->Stream(), src_batch, dst_batch,
-            NVCVInterpolationType(interp_));
+  cvcuda_resize_op_(mat_batch->Stream(), src_batch, dst_batch,
+                    NVCVInterpolationType(interp_));
 
   for (size_t i = 0; i < mat_batch->mats->size(); ++i) {
     FDMat* mat = &(*(mat_batch->mats))[i];
diff --git a/fastdeploy/vision/common/processors/resize_by_short.h b/fastdeploy/vision/common/processors/resize_by_short.h
index 99078c708..08bec6438 100644
--- a/fastdeploy/vision/common/processors/resize_by_short.h
+++ b/fastdeploy/vision/common/processors/resize_by_short.h
@@ -15,6 +15,11 @@
 #pragma once
 
 #include "fastdeploy/vision/common/processors/base.h"
+#ifdef ENABLE_CVCUDA
+#include <cvcuda/OpResize.hpp>
+
+#include "fastdeploy/vision/common/processors/cvcuda_utils.h"
+#endif
 
 namespace fastdeploy {
 namespace vision {
@@ -49,6 +54,9 @@ class FASTDEPLOY_DECL ResizeByShort : public Processor {
   std::vector<int> max_hw_;
   int interp_;
   bool use_scale_;
+#ifdef ENABLE_CVCUDA
+  cvcuda::Resize cvcuda_resize_op_;
+#endif
 };
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/ocr/ppocr/dbdetector.cc b/fastdeploy/vision/ocr/ppocr/dbdetector.cc
old mode 100755
new mode 100644
index cd07cc262..7dd0ac84a
--- a/fastdeploy/vision/ocr/ppocr/dbdetector.cc
+++ b/fastdeploy/vision/ocr/ppocr/dbdetector.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "fastdeploy/vision/ocr/ppocr/dbdetector.h"
+
 #include "fastdeploy/utils/perf.h"
 #include "fastdeploy/vision/ocr/ppocr/utils/ocr_utils.h"
 
@@ -26,11 +27,11 @@ DBDetector::DBDetector(const std::string& model_file,
                        const RuntimeOption& custom_option,
                        const ModelFormat& model_format) {
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::ORT,
-                          Backend::OPENVINO};  
-    valid_gpu_backends = {Backend::ORT, Backend::TRT};  
+    valid_cpu_backends = {Backend::ORT, Backend::OPENVINO};
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
-    valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::OPENVINO, Backend::LITE};
+    valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::OPENVINO,
+                          Backend::LITE};
     valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
     valid_kunlunxin_backends = {Backend::LITE};
     valid_ascend_backends = {Backend::LITE};
@@ -54,7 +55,8 @@ bool DBDetector::Initialize() {
 }
 
 std::unique_ptr<DBDetector> DBDetector::Clone() const {
-  std::unique_ptr<DBDetector> clone_model = utils::make_unique<DBDetector>(DBDetector(*this));
+  std::unique_ptr<DBDetector> clone_model =
+      utils::make_unique<DBDetector>(DBDetector(*this));
   clone_model->SetRuntime(clone_model->CloneRuntime());
   return clone_model;
 }
@@ -69,14 +71,15 @@ bool DBDetector::Predict(const cv::Mat& img,
   return true;
 }
 
-bool DBDetector::BatchPredict(const std::vector<cv::Mat>& images,
-                              std::vector<std::vector<std::array<int, 8>>>* det_results) {
+bool DBDetector::BatchPredict(
+    const std::vector<cv::Mat>& images,
+    std::vector<std::vector<std::array<int, 8>>>* det_results) {
   std::vector<FDMat> fd_images = WrapMat(images);
-  std::vector<std::array<int, 4>> batch_det_img_info;
-  if (!preprocessor_.Run(&fd_images, &reused_input_tensors_, &batch_det_img_info)) {
+  if (!preprocessor_.Run(&fd_images, &reused_input_tensors_)) {
     FDERROR << "Failed to preprocess input image." << std::endl;
     return false;
   }
+  auto batch_det_img_info = preprocessor_.GetBatchImgInfo();
 
   reused_input_tensors_[0].name = InputInfoOfRuntime(0).name;
   if (!Infer(reused_input_tensors_, &reused_output_tensors_)) {
@@ -84,13 +87,15 @@ bool DBDetector::BatchPredict(const std::vector<cv::Mat>& images,
     return false;
   }
 
-  if (!postprocessor_.Run(reused_output_tensors_, det_results, batch_det_img_info)) {
-    FDERROR << "Failed to postprocess the inference cls_results by runtime." << std::endl;
+  if (!postprocessor_.Run(reused_output_tensors_, det_results,
+                          *batch_det_img_info)) {
+    FDERROR << "Failed to postprocess the inference cls_results by runtime."
+            << std::endl;
     return false;
   }
   return true;
 }
 
-}  // namesapce ocr
+}  // namespace ocr
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/ocr/ppocr/det_preprocessor.cc b/fastdeploy/vision/ocr/ppocr/det_preprocessor.cc
index 28b7e47af..69687d5cd 100644
--- a/fastdeploy/vision/ocr/ppocr/det_preprocessor.cc
+++ b/fastdeploy/vision/ocr/ppocr/det_preprocessor.cc
@@ -13,9 +13,8 @@
 // limitations under the License.
 
 #include "fastdeploy/vision/ocr/ppocr/det_preprocessor.h"
-#include "fastdeploy/utils/perf.h"
+
 #include "fastdeploy/vision/ocr/ppocr/utils/ocr_utils.h"
-#include "fastdeploy/function/concat.h"
 
 namespace fastdeploy {
 namespace vision {
@@ -39,64 +38,61 @@ std::array<int, 4> OcrDetectorGetInfo(FDMat* img, int max_size_len) {
   resize_h = std::max(int(std::round(float(resize_h) / 32) * 32), 32);
   resize_w = std::max(int(std::round(float(resize_w) / 32) * 32), 32);
 
-  return {w,h,resize_w,resize_h};
+  return {w, h, resize_w, resize_h};
   /*
-  *ratio_h = float(resize_h) / float(h);
-  *ratio_w = float(resize_w) / float(w);
-  */
+   *ratio_h = float(resize_h) / float(h);
+   *ratio_w = float(resize_w) / float(w);
+   */
 }
-bool OcrDetectorResizeImage(FDMat* img,
-                            int resize_w,
-                            int resize_h,
-                            int max_resize_w,
-                            int max_resize_h) {
-  Resize::Run(img, resize_w, resize_h);
+
+DBDetectorPreprocessor::DBDetectorPreprocessor() {
+  resize_op_ = std::make_shared<Resize>(-1, -1);
+
   std::vector<float> value = {0, 0, 0};
-  Pad::Run(img, 0, max_resize_h-resize_h, 0, max_resize_w - resize_w, value);
+  pad_op_ = std::make_shared<Pad>(0, 0, 0, 0, value);
+
+  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
+  std::vector<float> std = {0.229f, 0.224f, 0.225f};
+  bool is_scale = true;
+  normalize_permute_op_ =
+      std::make_shared<NormalizeAndPermute>(mean, std, is_scale);
+}
+
+bool DBDetectorPreprocessor::ResizeImage(FDMat* img, int resize_w, int resize_h,
+                                         int max_resize_w, int max_resize_h) {
+  resize_op_->SetWidthAndHeight(resize_w, resize_h);
+  (*resize_op_)(img);
+
+  pad_op_->SetPaddingSize(0, max_resize_h - resize_h, 0,
+                          max_resize_w - resize_w);
+  (*pad_op_)(img);
   return true;
 }
 
-bool DBDetectorPreprocessor::Run(std::vector<FDMat>* images,
-                                 std::vector<FDTensor>* outputs,
-                                 std::vector<std::array<int, 4>>* batch_det_img_info_ptr) {
-  if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
-    return false;
-  }
+bool DBDetectorPreprocessor::Apply(FDMatBatch* image_batch,
+                                   std::vector<FDTensor>* outputs) {
   int max_resize_w = 0;
   int max_resize_h = 0;
-  std::vector<std::array<int, 4>>& batch_det_img_info = *batch_det_img_info_ptr;
-  batch_det_img_info.clear();
-  batch_det_img_info.resize(images->size());
-  for (size_t i = 0; i < images->size(); ++i) {
-    FDMat* mat = &(images->at(i));
-    batch_det_img_info[i] = OcrDetectorGetInfo(mat,max_side_len_);
-    max_resize_w = std::max(max_resize_w,batch_det_img_info[i][2]);
-    max_resize_h = std::max(max_resize_h,batch_det_img_info[i][3]);
+  batch_det_img_info_.clear();
+  batch_det_img_info_.resize(image_batch->mats->size());
+  for (size_t i = 0; i < image_batch->mats->size(); ++i) {
+    FDMat* mat = &(image_batch->mats->at(i));
+    batch_det_img_info_[i] = OcrDetectorGetInfo(mat, max_side_len_);
+    max_resize_w = std::max(max_resize_w, batch_det_img_info_[i][2]);
+    max_resize_h = std::max(max_resize_h, batch_det_img_info_[i][3]);
   }
-  for (size_t i = 0; i < images->size(); ++i) {
-    FDMat* mat = &(images->at(i));
-    OcrDetectorResizeImage(mat, batch_det_img_info[i][2],batch_det_img_info[i][3],max_resize_w,max_resize_h);
-    NormalizeAndPermute::Run(mat, mean_, scale_, is_scale_);
-    /*
-    Normalize::Run(mat, mean_, scale_, is_scale_);
-    HWC2CHW::Run(mat);
-    Cast::Run(mat, "float");
-    */
+  for (size_t i = 0; i < image_batch->mats->size(); ++i) {
+    FDMat* mat = &(image_batch->mats->at(i));
+    ResizeImage(mat, batch_det_img_info_[i][2], batch_det_img_info_[i][3],
+                max_resize_w, max_resize_h);
   }
-  // Only have 1 output Tensor.
+  (*normalize_permute_op_)(image_batch);
+
   outputs->resize(1);
-  // Concat all the preprocessed data to a batch tensor
-  std::vector<FDTensor> tensors(images->size()); 
-  for (size_t i = 0; i < images->size(); ++i) {
-    (*images)[i].ShareWithTensor(&(tensors[i]));
-    tensors[i].ExpandDim(0);
-  }
-  if (tensors.size() == 1) {
-    (*outputs)[0] = std::move(tensors[0]);
-  } else {
-    function::Concat(tensors, &((*outputs)[0]), 0);
-  }
+  FDTensor* tensor = image_batch->Tensor();
+  (*outputs)[0].SetExternalData(tensor->Shape(), tensor->Dtype(),
+                                tensor->Data(), tensor->device,
+                                tensor->device_id);
   return true;
 }
 
diff --git a/fastdeploy/vision/ocr/ppocr/det_preprocessor.h b/fastdeploy/vision/ocr/ppocr/det_preprocessor.h
index 552d0628a..fd7b77de1 100644
--- a/fastdeploy/vision/ocr/ppocr/det_preprocessor.h
+++ b/fastdeploy/vision/ocr/ppocr/det_preprocessor.h
@@ -13,7 +13,10 @@
 // limitations under the License.
 
 #pragma once
-#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/processors/manager.h"
+#include "fastdeploy/vision/common/processors/resize.h"
+#include "fastdeploy/vision/common/processors/pad.h"
+#include "fastdeploy/vision/common/processors/normalize_and_permute.h"
 #include "fastdeploy/vision/common/result.h"
 
 namespace fastdeploy {
@@ -22,43 +25,48 @@ namespace vision {
 namespace ocr {
 /*! @brief Preprocessor object for DBDetector serials model.
  */
-class FASTDEPLOY_DECL DBDetectorPreprocessor {
+class FASTDEPLOY_DECL DBDetectorPreprocessor : public ProcessorManager {
  public:
+  DBDetectorPreprocessor();
+
   /** \brief Process the input image and prepare input tensors for runtime
    *
-   * \param[in] images The input data list, all the elements are FDMat
+   * \param[in] image_batch The input image batch
    * \param[in] outputs The output tensors which will feed in runtime
-   * \param[in] batch_det_img_info_ptr The output of preprocess
    * \return true if the preprocess successed, otherwise false
    */
-  bool Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
-           std::vector<std::array<int, 4>>* batch_det_img_info_ptr);
+  virtual bool Apply(FDMatBatch* image_batch, std::vector<FDTensor>* outputs);
 
   /// Set max_side_len for the detection preprocess, default is 960
   void SetMaxSideLen(int max_side_len) { max_side_len_ = max_side_len; }
+
   /// Get max_side_len of the detection preprocess
   int GetMaxSideLen() const { return max_side_len_; }
 
-  /// Set mean value for the image normalization in detection preprocess
-  void SetMean(const std::vector<float>& mean) { mean_ = mean; }
-  /// Get mean value of the image normalization in detection preprocess
-  std::vector<float> GetMean() const { return mean_; }
+  /// Set the normalization parameters used in preprocessing. Call this API
+  /// to customize them; if it is never called, the default mean, std and
+  /// is_scale values shown below are used.
+  void SetNormalize(const std::vector<float>& mean = {0.485f, 0.456f, 0.406f},
+                    const std::vector<float>& std = {0.229f, 0.224f, 0.225f},
+                    bool is_scale = true) {
+    normalize_permute_op_ =
+        std::make_shared<NormalizeAndPermute>(mean, std, is_scale);
+  }
 
-  /// Set scale value for the image normalization in detection preprocess
-  void SetScale(const std::vector<float>& scale) { scale_ = scale; }
-  /// Get scale value of the image normalization in detection preprocess
-  std::vector<float> GetScale() const { return scale_; }
-
-  /// Set is_scale for the image normalization in detection preprocess
-  void SetIsScale(bool is_scale) { is_scale_ = is_scale; }
-  /// Get is_scale of the image normalization in detection preprocess
-  bool GetIsScale() const { return is_scale_; }
+  /// Get the image info of the last processed batch: a pointer to a list of
+  /// arrays {image width, image height, resize width, resize height}
+  const std::vector<std::array<int, 4>>* GetBatchImgInfo() {
+    return &batch_det_img_info_;
+  }
 
  private:
+  bool ResizeImage(FDMat* img, int resize_w, int resize_h, int max_resize_w,
+                   int max_resize_h);
   int max_side_len_ = 960;
-  std::vector<float> mean_ = {0.485f, 0.456f, 0.406f};
-  std::vector<float> scale_ = {0.229f, 0.224f, 0.225f};
-  bool is_scale_ = true;
+  std::vector<std::array<int, 4>> batch_det_img_info_;
+  std::shared_ptr<Resize> resize_op_;
+  std::shared_ptr<Pad> pad_op_;
+  std::shared_ptr<NormalizeAndPermute> normalize_permute_op_;
 };
 
 }  // namespace ocr
diff --git a/fastdeploy/vision/ocr/ppocr/ocrmodel_pybind.cc b/fastdeploy/vision/ocr/ppocr/ocrmodel_pybind.cc
old mode 100755
new mode 100644
index 2bcb697a8..aa77542af
--- a/fastdeploy/vision/ocr/ppocr/ocrmodel_pybind.cc
+++ b/fastdeploy/vision/ocr/ppocr/ocrmodel_pybind.cc
@@ -12,80 +12,106 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <pybind11/stl.h>
+
 #include "fastdeploy/pybind/main.h"
 
 namespace fastdeploy {
 void BindPPOCRModel(pybind11::module& m) {
   m.def("sort_boxes", [](std::vector<std::array<int, 8>>& boxes) {
-       vision::ocr::SortBoxes(&boxes);
-       return boxes;
+    vision::ocr::SortBoxes(&boxes);
+    return boxes;
   });
-  
+
   // DBDetector
-  pybind11::class_<vision::ocr::DBDetectorPreprocessor>(m, "DBDetectorPreprocessor")
+  pybind11::class_<vision::ocr::DBDetectorPreprocessor>(
+      m, "DBDetectorPreprocessor")
       .def(pybind11::init<>())
-      .def_property("max_side_len", &vision::ocr::DBDetectorPreprocessor::GetMaxSideLen, &vision::ocr::DBDetectorPreprocessor::SetMaxSideLen)
-      .def_property("mean", &vision::ocr::DBDetectorPreprocessor::GetMean, &vision::ocr::DBDetectorPreprocessor::SetMean)
-      .def_property("scale", &vision::ocr::DBDetectorPreprocessor::GetScale, &vision::ocr::DBDetectorPreprocessor::SetScale)
-      .def_property("is_scale", &vision::ocr::DBDetectorPreprocessor::GetIsScale, &vision::ocr::DBDetectorPreprocessor::SetIsScale)
-      .def("run", [](vision::ocr::DBDetectorPreprocessor& self, std::vector<pybind11::array>& im_list) {
+      .def_property("max_side_len",
+                    &vision::ocr::DBDetectorPreprocessor::GetMaxSideLen,
+                    &vision::ocr::DBDetectorPreprocessor::SetMaxSideLen)
+      .def("set_normalize",
+           [](vision::ocr::DBDetectorPreprocessor& self,
+              const std::vector<float>& mean, const std::vector<float>& std,
+              bool is_scale) { self.SetNormalize(mean, std, is_scale); })
+      .def("run", [](vision::ocr::DBDetectorPreprocessor& self,
+                     std::vector<pybind11::array>& im_list) {
         std::vector<vision::FDMat> images;
         for (size_t i = 0; i < im_list.size(); ++i) {
           images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
         }
         std::vector<FDTensor> outputs;
-        std::vector<std::array<int, 4>> batch_det_img_info;
-        self.Run(&images, &outputs, &batch_det_img_info);
-        for(size_t i = 0; i< outputs.size(); ++i){
+        if (!self.Run(&images, &outputs)) throw std::runtime_error(
+            "Failed to preprocess the input data in DBDetectorPreprocessor.");
+        for (size_t i = 0; i < outputs.size(); ++i) {
           outputs[i].StopSharing();
         }
-        return std::make_pair(outputs, batch_det_img_info);
+        return std::make_pair(outputs, *self.GetBatchImgInfo());
       });
 
-  pybind11::class_<vision::ocr::DBDetectorPostprocessor>(m, "DBDetectorPostprocessor")
+  pybind11::class_<vision::ocr::DBDetectorPostprocessor>(
+      m, "DBDetectorPostprocessor")
       .def(pybind11::init<>())
-      .def_property("det_db_thresh", &vision::ocr::DBDetectorPostprocessor::GetDetDBThresh, &vision::ocr::DBDetectorPostprocessor::SetDetDBThresh) 
-      .def_property("det_db_box_thresh", &vision::ocr::DBDetectorPostprocessor::GetDetDBBoxThresh, &vision::ocr::DBDetectorPostprocessor::SetDetDBBoxThresh) 
-      .def_property("det_db_unclip_ratio", &vision::ocr::DBDetectorPostprocessor::GetDetDBUnclipRatio, &vision::ocr::DBDetectorPostprocessor::SetDetDBUnclipRatio) 
-      .def_property("det_db_score_mode", &vision::ocr::DBDetectorPostprocessor::GetDetDBScoreMode, &vision::ocr::DBDetectorPostprocessor::SetDetDBScoreMode) 
-      .def_property("use_dilation", &vision::ocr::DBDetectorPostprocessor::GetUseDilation, &vision::ocr::DBDetectorPostprocessor::SetUseDilation) 
+      .def_property("det_db_thresh",
+                    &vision::ocr::DBDetectorPostprocessor::GetDetDBThresh,
+                    &vision::ocr::DBDetectorPostprocessor::SetDetDBThresh)
+      .def_property("det_db_box_thresh",
+                    &vision::ocr::DBDetectorPostprocessor::GetDetDBBoxThresh,
+                    &vision::ocr::DBDetectorPostprocessor::SetDetDBBoxThresh)
+      .def_property("det_db_unclip_ratio",
+                    &vision::ocr::DBDetectorPostprocessor::GetDetDBUnclipRatio,
+                    &vision::ocr::DBDetectorPostprocessor::SetDetDBUnclipRatio)
+      .def_property("det_db_score_mode",
+                    &vision::ocr::DBDetectorPostprocessor::GetDetDBScoreMode,
+                    &vision::ocr::DBDetectorPostprocessor::SetDetDBScoreMode)
+      .def_property("use_dilation",
+                    &vision::ocr::DBDetectorPostprocessor::GetUseDilation,
+                    &vision::ocr::DBDetectorPostprocessor::SetUseDilation)
 
-      .def("run", [](vision::ocr::DBDetectorPostprocessor& self,
-                     std::vector<FDTensor>& inputs,
-                     const std::vector<std::array<int, 4>>& batch_det_img_info) {
-        std::vector<std::vector<std::array<int, 8>>> results;
+      .def("run",
+           [](vision::ocr::DBDetectorPostprocessor& self,
+              std::vector<FDTensor>& inputs,
+              const std::vector<std::array<int, 4>>& batch_det_img_info) {
+             std::vector<std::vector<std::array<int, 8>>> results;
 
-        if (!self.Run(inputs, &results, batch_det_img_info)) {
-          throw std::runtime_error("Failed to preprocess the input data in DBDetectorPostprocessor.");
-        }
-        return results;
-      })
-      .def("run", [](vision::ocr::DBDetectorPostprocessor& self,
-                     std::vector<pybind11::array>& input_array,
-                     const std::vector<std::array<int, 4>>& batch_det_img_info) {
-        std::vector<std::vector<std::array<int, 8>>> results;
-        std::vector<FDTensor> inputs;
-        PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true);
-        if (!self.Run(inputs, &results, batch_det_img_info)) {
-          throw std::runtime_error("Failed to preprocess the input data in DBDetectorPostprocessor.");
-        }
-        return results;
-      });
+             if (!self.Run(inputs, &results, batch_det_img_info)) {
+               throw std::runtime_error(
+                   "Failed to preprocess the input data in "
+                   "DBDetectorPostprocessor.");
+             }
+             return results;
+           })
+      .def("run",
+           [](vision::ocr::DBDetectorPostprocessor& self,
+              std::vector<pybind11::array>& input_array,
+              const std::vector<std::array<int, 4>>& batch_det_img_info) {
+             std::vector<std::vector<std::array<int, 8>>> results;
+             std::vector<FDTensor> inputs;
+             PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true);
+             if (!self.Run(inputs, &results, batch_det_img_info)) {
+               throw std::runtime_error(
+                   "Failed to preprocess the input data in "
+                   "DBDetectorPostprocessor.");
+             }
+             return results;
+           });
 
   pybind11::class_<vision::ocr::DBDetector, FastDeployModel>(m, "DBDetector")
       .def(pybind11::init<std::string, std::string, RuntimeOption,
                           ModelFormat>())
       .def(pybind11::init<>())
-      .def_property_readonly("preprocessor", &vision::ocr::DBDetector::GetPreprocessor)
-      .def_property_readonly("postprocessor", &vision::ocr::DBDetector::GetPostprocessor)
-      .def("predict", [](vision::ocr::DBDetector& self,
-                         pybind11::array& data) {
-        auto mat = PyArrayToCvMat(data);
-        std::vector<std::array<int, 8>> boxes_result;
-        self.Predict(mat, &boxes_result);
-        return boxes_result;
-      })
-      .def("batch_predict", [](vision::ocr::DBDetector& self, std::vector<pybind11::array>& data) {
+      .def_property_readonly("preprocessor",
+                             &vision::ocr::DBDetector::GetPreprocessor)
+      .def_property_readonly("postprocessor",
+                             &vision::ocr::DBDetector::GetPostprocessor)
+      .def("predict",
+           [](vision::ocr::DBDetector& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             std::vector<std::array<int, 8>> boxes_result;
+             self.Predict(mat, &boxes_result);
+             return boxes_result;
+           })
+      .def("batch_predict", [](vision::ocr::DBDetector& self,
+                               std::vector<pybind11::array>& data) {
         std::vector<cv::Mat> images;
         std::vector<std::vector<std::array<int, 8>>> det_results;
         for (size_t i = 0; i < data.size(); ++i) {
@@ -96,39 +122,54 @@ void BindPPOCRModel(pybind11::module& m) {
       });
 
   // Classifier
-  pybind11::class_<vision::ocr::ClassifierPreprocessor>(m, "ClassifierPreprocessor")
+  pybind11::class_<vision::ocr::ClassifierPreprocessor>(
+      m, "ClassifierPreprocessor")
       .def(pybind11::init<>())
-      .def_property("cls_image_shape", &vision::ocr::ClassifierPreprocessor::GetClsImageShape, &vision::ocr::ClassifierPreprocessor::SetClsImageShape)
-      .def_property("mean", &vision::ocr::ClassifierPreprocessor::GetMean, &vision::ocr::ClassifierPreprocessor::SetMean)
-      .def_property("scale", &vision::ocr::ClassifierPreprocessor::GetScale, &vision::ocr::ClassifierPreprocessor::SetScale)
-      .def_property("is_scale", &vision::ocr::ClassifierPreprocessor::GetIsScale, &vision::ocr::ClassifierPreprocessor::SetIsScale)
-      .def("run", [](vision::ocr::ClassifierPreprocessor& self, std::vector<pybind11::array>& im_list) {
+      .def_property("cls_image_shape",
+                    &vision::ocr::ClassifierPreprocessor::GetClsImageShape,
+                    &vision::ocr::ClassifierPreprocessor::SetClsImageShape)
+      .def_property("mean", &vision::ocr::ClassifierPreprocessor::GetMean,
+                    &vision::ocr::ClassifierPreprocessor::SetMean)
+      .def_property("scale", &vision::ocr::ClassifierPreprocessor::GetScale,
+                    &vision::ocr::ClassifierPreprocessor::SetScale)
+      .def_property("is_scale",
+                    &vision::ocr::ClassifierPreprocessor::GetIsScale,
+                    &vision::ocr::ClassifierPreprocessor::SetIsScale)
+      .def("run", [](vision::ocr::ClassifierPreprocessor& self,
+                     std::vector<pybind11::array>& im_list) {
         std::vector<vision::FDMat> images;
         for (size_t i = 0; i < im_list.size(); ++i) {
           images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
         }
         std::vector<FDTensor> outputs;
         if (!self.Run(&images, &outputs)) {
-          throw std::runtime_error("Failed to preprocess the input data in ClassifierPreprocessor.");
+          throw std::runtime_error(
+              "Failed to preprocess the input data in ClassifierPreprocessor.");
         }
-        for(size_t i = 0; i< outputs.size(); ++i){
+        for (size_t i = 0; i < outputs.size(); ++i) {
           outputs[i].StopSharing();
         }
         return outputs;
       });
 
-  pybind11::class_<vision::ocr::ClassifierPostprocessor>(m, "ClassifierPostprocessor")
+  pybind11::class_<vision::ocr::ClassifierPostprocessor>(
+      m, "ClassifierPostprocessor")
       .def(pybind11::init<>())
-      .def_property("cls_thresh", &vision::ocr::ClassifierPostprocessor::GetClsThresh, &vision::ocr::ClassifierPostprocessor::SetClsThresh) 
-      .def("run", [](vision::ocr::ClassifierPostprocessor& self,
-                     std::vector<FDTensor>& inputs) {
-        std::vector<int> cls_labels;
-        std::vector<float> cls_scores;
-        if (!self.Run(inputs, &cls_labels, &cls_scores)) {
-          throw std::runtime_error("Failed to preprocess the input data in ClassifierPostprocessor.");
-        }
-        return std::make_pair(cls_labels,cls_scores);
-      })
+      .def_property("cls_thresh",
+                    &vision::ocr::ClassifierPostprocessor::GetClsThresh,
+                    &vision::ocr::ClassifierPostprocessor::SetClsThresh)
+      .def("run",
+           [](vision::ocr::ClassifierPostprocessor& self,
+              std::vector<FDTensor>& inputs) {
+             std::vector<int> cls_labels;
+             std::vector<float> cls_scores;
+             if (!self.Run(inputs, &cls_labels, &cls_scores)) {
+               throw std::runtime_error(
+                   "Failed to preprocess the input data in "
+                   "ClassifierPostprocessor.");
+             }
+             return std::make_pair(cls_labels, cls_scores);
+           })
       .def("run", [](vision::ocr::ClassifierPostprocessor& self,
                      std::vector<pybind11::array>& input_array) {
         std::vector<FDTensor> inputs;
@@ -136,26 +177,31 @@ void BindPPOCRModel(pybind11::module& m) {
         std::vector<int> cls_labels;
         std::vector<float> cls_scores;
         if (!self.Run(inputs, &cls_labels, &cls_scores)) {
-          throw std::runtime_error("Failed to preprocess the input data in ClassifierPostprocessor.");
+          throw std::runtime_error(
+              "Failed to preprocess the input data in "
+              "ClassifierPostprocessor.");
         }
-        return std::make_pair(cls_labels,cls_scores);
+        return std::make_pair(cls_labels, cls_scores);
       });
-  
+
   pybind11::class_<vision::ocr::Classifier, FastDeployModel>(m, "Classifier")
       .def(pybind11::init<std::string, std::string, RuntimeOption,
                           ModelFormat>())
       .def(pybind11::init<>())
-      .def_property_readonly("preprocessor", &vision::ocr::Classifier::GetPreprocessor)
-      .def_property_readonly("postprocessor", &vision::ocr::Classifier::GetPostprocessor)
-      .def("predict", [](vision::ocr::Classifier& self,
-                         pybind11::array& data) {
-        auto mat = PyArrayToCvMat(data);
-        int32_t cls_label;
-        float cls_score;
-        self.Predict(mat, &cls_label, &cls_score);
-        return std::make_pair(cls_label, cls_score);
-      })
-      .def("batch_predict", [](vision::ocr::Classifier& self, std::vector<pybind11::array>& data) {
+      .def_property_readonly("preprocessor",
+                             &vision::ocr::Classifier::GetPreprocessor)
+      .def_property_readonly("postprocessor",
+                             &vision::ocr::Classifier::GetPostprocessor)
+      .def("predict",
+           [](vision::ocr::Classifier& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             int32_t cls_label;
+             float cls_score;
+             self.Predict(mat, &cls_label, &cls_score);
+             return std::make_pair(cls_label, cls_score);
+           })
+      .def("batch_predict", [](vision::ocr::Classifier& self,
+                               std::vector<pybind11::array>& data) {
         std::vector<cv::Mat> images;
         std::vector<int32_t> cls_labels;
         std::vector<float> cls_scores;
@@ -167,39 +213,54 @@ void BindPPOCRModel(pybind11::module& m) {
       });
 
   // Recognizer
-  pybind11::class_<vision::ocr::RecognizerPreprocessor>(m, "RecognizerPreprocessor")
-    .def(pybind11::init<>())
-    .def_property("static_shape_infer", &vision::ocr::RecognizerPreprocessor::GetStaticShapeInfer, &vision::ocr::RecognizerPreprocessor::SetStaticShapeInfer) 
-    .def_property("rec_image_shape", &vision::ocr::RecognizerPreprocessor::GetRecImageShape, &vision::ocr::RecognizerPreprocessor::SetRecImageShape)
-    .def_property("mean", &vision::ocr::RecognizerPreprocessor::GetMean, &vision::ocr::RecognizerPreprocessor::SetMean)
-    .def_property("scale", &vision::ocr::RecognizerPreprocessor::GetScale, &vision::ocr::RecognizerPreprocessor::SetScale)
-    .def_property("is_scale", &vision::ocr::RecognizerPreprocessor::GetIsScale, &vision::ocr::RecognizerPreprocessor::SetIsScale)
-    .def("run", [](vision::ocr::RecognizerPreprocessor& self, std::vector<pybind11::array>& im_list) {
-      std::vector<vision::FDMat> images;
-      for (size_t i = 0; i < im_list.size(); ++i) {
-        images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
-      }
-      std::vector<FDTensor> outputs;
-      if (!self.Run(&images, &outputs)) {
-        throw std::runtime_error("Failed to preprocess the input data in RecognizerPreprocessor.");
-      }
-      for(size_t i = 0; i< outputs.size(); ++i){
-        outputs[i].StopSharing();
-      }
-      return outputs;
-    });
-
-  pybind11::class_<vision::ocr::RecognizerPostprocessor>(m, "RecognizerPostprocessor")
-      .def(pybind11::init<std::string>())
-      .def("run", [](vision::ocr::RecognizerPostprocessor& self,
-                     std::vector<FDTensor>& inputs) {
-        std::vector<std::string> texts;
-        std::vector<float> rec_scores;
-        if (!self.Run(inputs, &texts, &rec_scores)) {
-          throw std::runtime_error("Failed to preprocess the input data in RecognizerPostprocessor.");
+  pybind11::class_<vision::ocr::RecognizerPreprocessor>(
+      m, "RecognizerPreprocessor")
+      .def(pybind11::init<>())
+      .def_property("static_shape_infer",
+                    &vision::ocr::RecognizerPreprocessor::GetStaticShapeInfer,
+                    &vision::ocr::RecognizerPreprocessor::SetStaticShapeInfer)
+      .def_property("rec_image_shape",
+                    &vision::ocr::RecognizerPreprocessor::GetRecImageShape,
+                    &vision::ocr::RecognizerPreprocessor::SetRecImageShape)
+      .def_property("mean", &vision::ocr::RecognizerPreprocessor::GetMean,
+                    &vision::ocr::RecognizerPreprocessor::SetMean)
+      .def_property("scale", &vision::ocr::RecognizerPreprocessor::GetScale,
+                    &vision::ocr::RecognizerPreprocessor::SetScale)
+      .def_property("is_scale",
+                    &vision::ocr::RecognizerPreprocessor::GetIsScale,
+                    &vision::ocr::RecognizerPreprocessor::SetIsScale)
+      .def("run", [](vision::ocr::RecognizerPreprocessor& self,
+                     std::vector<pybind11::array>& im_list) {
+        std::vector<vision::FDMat> images;
+        for (size_t i = 0; i < im_list.size(); ++i) {
+          images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
         }
-        return std::make_pair(texts, rec_scores);
-      })
+        std::vector<FDTensor> outputs;
+        if (!self.Run(&images, &outputs)) {
+          throw std::runtime_error(
+              "Failed to preprocess the input data in RecognizerPreprocessor.");
+        }
+        for (size_t i = 0; i < outputs.size(); ++i) {
+          outputs[i].StopSharing();
+        }
+        return outputs;
+      });
+
+  pybind11::class_<vision::ocr::RecognizerPostprocessor>(
+      m, "RecognizerPostprocessor")
+      .def(pybind11::init<std::string>())
+      .def("run",
+           [](vision::ocr::RecognizerPostprocessor& self,
+              std::vector<FDTensor>& inputs) {
+             std::vector<std::string> texts;
+             std::vector<float> rec_scores;
+             if (!self.Run(inputs, &texts, &rec_scores)) {
+               throw std::runtime_error(
+                   "Failed to preprocess the input data in "
+                   "RecognizerPostprocessor.");
+             }
+             return std::make_pair(texts, rec_scores);
+           })
       .def("run", [](vision::ocr::RecognizerPostprocessor& self,
                      std::vector<pybind11::array>& input_array) {
         std::vector<FDTensor> inputs;
@@ -207,7 +268,9 @@ void BindPPOCRModel(pybind11::module& m) {
         std::vector<std::string> texts;
         std::vector<float> rec_scores;
         if (!self.Run(inputs, &texts, &rec_scores)) {
-          throw std::runtime_error("Failed to preprocess the input data in RecognizerPostprocessor.");
+          throw std::runtime_error(
+              "Failed to preprocess the input data in "
+              "RecognizerPostprocessor.");
         }
         return std::make_pair(texts, rec_scores);
       });
@@ -216,17 +279,20 @@ void BindPPOCRModel(pybind11::module& m) {
       .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
                           ModelFormat>())
       .def(pybind11::init<>())
-      .def_property_readonly("preprocessor", &vision::ocr::Recognizer::GetPreprocessor)
-      .def_property_readonly("postprocessor", &vision::ocr::Recognizer::GetPostprocessor)
-      .def("predict", [](vision::ocr::Recognizer& self,
-                         pybind11::array& data) {
-        auto mat = PyArrayToCvMat(data);
-        std::string text;
-        float rec_score;
-        self.Predict(mat, &text, &rec_score);
-        return std::make_pair(text, rec_score);
-      })
-      .def("batch_predict", [](vision::ocr::Recognizer& self, std::vector<pybind11::array>& data) {
+      .def_property_readonly("preprocessor",
+                             &vision::ocr::Recognizer::GetPreprocessor)
+      .def_property_readonly("postprocessor",
+                             &vision::ocr::Recognizer::GetPostprocessor)
+      .def("predict",
+           [](vision::ocr::Recognizer& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             std::string text;
+             float rec_score;
+             self.Predict(mat, &text, &rec_score);
+             return std::make_pair(text, rec_score);
+           })
+      .def("batch_predict", [](vision::ocr::Recognizer& self,
+                               std::vector<pybind11::array>& data) {
         std::vector<cv::Mat> images;
         std::vector<std::string> texts;
         std::vector<float> rec_scores;
diff --git a/python/fastdeploy/vision/classification/ppcls/__init__.py b/python/fastdeploy/vision/classification/ppcls/__init__.py
index 7215bcfbc..e873a5256 100644
--- a/python/fastdeploy/vision/classification/ppcls/__init__.py
+++ b/python/fastdeploy/vision/classification/ppcls/__init__.py
@@ -46,7 +46,6 @@ class PaddleClasPreprocessor(ProcessorManager):
         When the initial operator is Resize, and input image size is large,
         maybe it's better to run resize on CPU, because the HostToDevice memcpy
         is time consuming. Set this True to run the initial resize on CPU.
-
         :param: v: True or False
         """
         self._manager.initial_resize_on_cpu(v)
diff --git a/python/fastdeploy/vision/ocr/ppocr/__init__.py b/python/fastdeploy/vision/ocr/ppocr/__init__.py
index 842532301..e19fb686e 100755
--- a/python/fastdeploy/vision/ocr/ppocr/__init__.py
+++ b/python/fastdeploy/vision/ocr/ppocr/__init__.py
@@ -37,43 +37,31 @@ class DBDetectorPreprocessor:
 
     @property
     def max_side_len(self):
+        """Get max_side_len value.
+        """
         return self._preprocessor.max_side_len
 
     @max_side_len.setter
     def max_side_len(self, value):
+        """Set max_side_len value.
+        :param: value: (int) max_side_len value
+        """
         assert isinstance(
             value, int), "The value to set `max_side_len` must be type of int."
         self._preprocessor.max_side_len = value
 
-    @property
-    def is_scale(self):
-        return self._preprocessor.is_scale
-
-    @is_scale.setter
-    def is_scale(self, value):
-        assert isinstance(
-            value, bool), "The value to set `is_scale` must be type of bool."
-        self._preprocessor.is_scale = value
-
-    @property
-    def scale(self):
-        return self._preprocessor.scale
-
-    @scale.setter
-    def scale(self, value):
-        assert isinstance(
-            value, list), "The value to set `scale` must be type of list."
-        self._preprocessor.scale = value
-
-    @property
-    def mean(self):
-        return self._preprocessor.mean
-
-    @mean.setter
-    def mean(self, value):
-        assert isinstance(
-            value, list), "The value to set `mean` must be type of list."
-        self._preprocessor.mean = value
+    def set_normalize(self,
+                      mean=[0.485, 0.456, 0.406],
+                      std=[0.229, 0.224, 0.225],
+                      is_scale=True):
+        """Set the normalize parameters used in preprocessing by forwarding
+           them to the wrapped preprocessor. Call this API to customize them,
+           otherwise the default normalize parameters are used.
+        :param: mean: (list of float) mean values
+        :param: std: (list of float) std values
+        :param: is_scale: (boolean) whether to scale
+        """
+        self._preprocessor.set_normalize(mean, std, is_scale)
 
 
 class DBDetectorPostprocessor:
@@ -174,6 +162,7 @@ class DBDetector(FastDeployModel):
         """Clone OCR detection model object
         :return: a new OCR detection model object
         """
+
         class DBDetectorClone(DBDetector):
             def __init__(self, model):
                 self._model = model
@@ -203,18 +192,10 @@ class DBDetector(FastDeployModel):
     def preprocessor(self):
         return self._model.preprocessor
 
-    @preprocessor.setter
-    def preprocessor(self, value):
-        self._model.preprocessor = value
-
     @property
     def postprocessor(self):
         return self._model.postprocessor
 
-    @postprocessor.setter
-    def postprocessor(self, value):
-        self._model.postprocessor = value
-
     # Det Preprocessor Property
     @property
     def max_side_len(self):
@@ -226,36 +207,6 @@ class DBDetector(FastDeployModel):
             value, int), "The value to set `max_side_len` must be type of int."
         self._model.preprocessor.max_side_len = value
 
-    @property
-    def is_scale(self):
-        return self._model.preprocessor.is_scale
-
-    @is_scale.setter
-    def is_scale(self, value):
-        assert isinstance(
-            value, bool), "The value to set `is_scale` must be type of bool."
-        self._model.preprocessor.is_scale = value
-
-    @property
-    def scale(self):
-        return self._model.preprocessor.scale
-
-    @scale.setter
-    def scale(self, value):
-        assert isinstance(
-            value, list), "The value to set `scale` must be type of list."
-        self._model.preprocessor.scale = value
-
-    @property
-    def mean(self):
-        return self._model.preprocessor.mean
-
-    @mean.setter
-    def mean(self, value):
-        assert isinstance(
-            value, list), "The value to set `mean` must be type of list."
-        self._model.preprocessor.mean = value
-
     # Det Ppstprocessor Property
     @property
     def det_db_thresh(self):
@@ -421,6 +372,7 @@ class Classifier(FastDeployModel):
         """Clone OCR classification model object
         :return: a new OCR classification model object
         """
+
         class ClassifierClone(Classifier):
             def __init__(self, model):
                 self._model = model
@@ -629,6 +581,7 @@ class Recognizer(FastDeployModel):
         """Clone OCR recognition model object
         :return: a new OCR recognition model object
         """
+
         class RecognizerClone(Recognizer):
             def __init__(self, model):
                 self._model = model
@@ -734,7 +687,7 @@ class PPOCRv3(FastDeployModel):
         assert det_model is not None and rec_model is not None, "The det_model and rec_model cannot be None."
         if cls_model is None:
             self.system_ = C.vision.ocr.PPOCRv3(det_model._model,
-                                               rec_model._model)
+                                                rec_model._model)
         else:
             self.system_ = C.vision.ocr.PPOCRv3(
                 det_model._model, cls_model._model, rec_model._model)
@@ -743,6 +696,7 @@ class PPOCRv3(FastDeployModel):
         """Clone PPOCRv3 pipeline object
         :return: a new PPOCRv3 pipeline object
         """
+
         class PPOCRv3Clone(PPOCRv3):
             def __init__(self, system):
                 self.system_ = system
@@ -809,7 +763,7 @@ class PPOCRv2(FastDeployModel):
         assert det_model is not None and rec_model is not None, "The det_model and rec_model cannot be None."
         if cls_model is None:
             self.system_ = C.vision.ocr.PPOCRv2(det_model._model,
-                                               rec_model._model)
+                                                rec_model._model)
         else:
             self.system_ = C.vision.ocr.PPOCRv2(
                 det_model._model, cls_model._model, rec_model._model)
@@ -818,6 +772,7 @@ class PPOCRv2(FastDeployModel):
         """Clone PPOCRv3 pipeline object
         :return: a new PPOCRv3 pipeline object
         """
+
         class PPOCRv2Clone(PPOCRv2):
             def __init__(self, system):
                 self.system_ = system

From 062b4fd3276b6b223a057fbf5a82bdbfe80e6cb4 Mon Sep 17 00:00:00 2001
From: Jason <jiangjiajun@baidu.com>
Date: Thu, 23 Feb 2023 12:45:25 +0800
Subject: [PATCH 17/20] [Bug Fix] Fix memory leak problem for paddleseg model
 (#1421)

Fix memory leak problem for paddleseg model

Co-authored-by: root <root@bjyz-sys-gpu-kongming3.bjyz.baidu.com>
---
 fastdeploy/vision/segmentation/ppseg/postprocessor.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fastdeploy/vision/segmentation/ppseg/postprocessor.cc b/fastdeploy/vision/segmentation/ppseg/postprocessor.cc
index 953cf2c68..364fe286d 100644
--- a/fastdeploy/vision/segmentation/ppseg/postprocessor.cc
+++ b/fastdeploy/vision/segmentation/ppseg/postprocessor.cc
@@ -163,16 +163,16 @@ bool PaddleSegPostprocessor::FDTensorCast2Uint8(FDTensor* infer_result,
     // cv::resize don't support `CV_8S` or `CV_32S`
     // refer to https://github.com/opencv/opencv/issues/20991
     // https://github.com/opencv/opencv/issues/7862
-    uint8_result_buffer = new std::vector<uint8_t>(
-        infer_result_buffer, infer_result_buffer + offset);
+    uint8_result_buffer->resize(offset * sizeof(int64_t));
+    memcpy(uint8_result_buffer->data(), infer_result_buffer, offset * sizeof(int64_t));
   } else if (infer_result_dtype == FDDataType::INT32) {
     const int32_t* infer_result_buffer =
         reinterpret_cast<const int32_t*>(infer_result->CpuData());
     // cv::resize don't support `CV_8S` or `CV_32S`
     // refer to https://github.com/opencv/opencv/issues/20991
     // https://github.com/opencv/opencv/issues/7862
-    uint8_result_buffer = new std::vector<uint8_t>(
-        infer_result_buffer, infer_result_buffer + offset);
+    uint8_result_buffer->resize(offset * sizeof(int32_t));
+    memcpy(uint8_result_buffer->data(), infer_result_buffer, offset * sizeof(int32_t));
   } else {
     FDASSERT(false, 
              "Require the data type for casting uint8 is int64, int32, but now "

From a1f9aa1c5a505b9a326841aa00669bba6e94f0b7 Mon Sep 17 00:00:00 2001
From: Zeref996 <53218160+Zeref996@users.noreply.github.com>
Date: Thu, 23 Feb 2023 14:59:50 +0800
Subject: [PATCH 18/20] refresh docker compile doc (#1423)

* refresh docker compile doc

* refresh docker compile doc 1
---
 serving/README.md    | 4 ++--
 serving/README_CN.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/serving/README.md b/serving/README.md
index 61b5ede11..1016607b7 100644
--- a/serving/README.md
+++ b/serving/README.md
@@ -20,7 +20,7 @@ FastDeploy builds an end-to-end serving deployment based on [Triton Inference Se
 CPU images only support Paddle/ONNX models for serving deployment on CPUs, and supported inference backends include OpenVINO, Paddle Inference, and ONNX Runtime
 
 ```shell
-docker pull registry.baidubce.com/paddlepaddle/fastdeploy:1.0.2-cpu-only-21.10
+docker pull registry.baidubce.com/paddlepaddle/fastdeploy:1.0.4-cpu-only-21.10
 ```
 
 #### GPU Image
@@ -28,7 +28,7 @@ docker pull registry.baidubce.com/paddlepaddle/fastdeploy:1.0.2-cpu-only-21.10
 GPU images support Paddle/ONNX models for serving deployment on GPU and CPU, and supported inference backends including OpenVINO, TensorRT, Paddle Inference, and ONNX Runtime
 
 ```
-docker pull registry.baidubce.com/paddlepaddle/fastdeploy:1.0.2-gpu-cuda11.4-trt8.4-21.10
+docker pull registry.baidubce.com/paddlepaddle/fastdeploy:1.0.4-gpu-cuda11.4-trt8.5-21.10
 ```
 
 Users can also compile the image by themselves according to their own needs, referring to the following documents:
diff --git a/serving/README_CN.md b/serving/README_CN.md
index f961f4eaf..895df1222 100644
--- a/serving/README_CN.md
+++ b/serving/README_CN.md
@@ -17,13 +17,13 @@ FastDeploy基于[Triton Inference Server](https://github.com/triton-inference-se
 #### CPU镜像
 CPU镜像仅支持Paddle/ONNX模型在CPU上进行服务化部署，支持的推理后端包括OpenVINO、Paddle Inference和ONNX Runtime
 ``` shell
-docker pull registry.baidubce.com/paddlepaddle/fastdeploy:1.0.2-cpu-only-21.10
+docker pull registry.baidubce.com/paddlepaddle/fastdeploy:1.0.4-cpu-only-21.10
 ```
 
 #### GPU镜像
 GPU镜像支持Paddle/ONNX模型在GPU/CPU上进行服务化部署，支持的推理后端包括OpenVINO、TensorRT、Paddle Inference和ONNX Runtime
 ```
-docker pull registry.baidubce.com/paddlepaddle/fastdeploy:1.0.2-gpu-cuda11.4-trt8.4-21.10
+docker pull registry.baidubce.com/paddlepaddle/fastdeploy:1.0.4-gpu-cuda11.4-trt8.5-21.10
 ```
 
 用户也可根据自身需求，参考如下文档自行编译镜像

From 0c664fd006b6e70fff81c4500a1c405a9b42ef0c Mon Sep 17 00:00:00 2001
From: Jason <jiangjiajun@baidu.com>
Date: Thu, 23 Feb 2023 16:03:40 +0800
Subject: [PATCH 19/20] [Other] Fix some memory leak problem (#1422)

* Fix memory leak problem for paddleseg model

* Fix bug

* Update postprocessor.cc

---------

Co-authored-by: root <root@bjyz-sys-gpu-kongming3.bjyz.baidu.com>
---
 .../matting/ppmatting/ppmatting_pybind.cc     |  4 +-
 .../segmentation/ppseg/postprocessor.cc       | 37 +------------------
 .../vision/segmentation/ppseg/postprocessor.h |  4 --
 .../tracking/pptracking/pptracking_pybind.cc  |  4 +-
 4 files changed, 6 insertions(+), 43 deletions(-)

diff --git a/fastdeploy/vision/matting/ppmatting/ppmatting_pybind.cc b/fastdeploy/vision/matting/ppmatting/ppmatting_pybind.cc
index 97837fa6f..a16d36f72 100644
--- a/fastdeploy/vision/matting/ppmatting/ppmatting_pybind.cc
+++ b/fastdeploy/vision/matting/ppmatting/ppmatting_pybind.cc
@@ -21,8 +21,8 @@ void BindPPMatting(pybind11::module& m) {
       .def("predict",
            [](vision::matting::PPMatting& self, pybind11::array& data) {
              auto mat = PyArrayToCvMat(data);
-             vision::MattingResult* res = new vision::MattingResult();
-             self.Predict(&mat, res);
+             vision::MattingResult res;
+             self.Predict(&mat, &res);
              return res;
            });
 }
diff --git a/fastdeploy/vision/segmentation/ppseg/postprocessor.cc b/fastdeploy/vision/segmentation/ppseg/postprocessor.cc
index 364fe286d..df8dcdba6 100644
--- a/fastdeploy/vision/segmentation/ppseg/postprocessor.cc
+++ b/fastdeploy/vision/segmentation/ppseg/postprocessor.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "fastdeploy/vision/segmentation/ppseg/postprocessor.h"
+#include "fastdeploy/function/cast.h"
 #include "yaml-cpp/yaml.h"
 
 namespace fastdeploy {
@@ -153,39 +154,6 @@ bool PaddleSegPostprocessor::ProcessWithLabelResult(const FDTensor& infer_result
   return true;
 }
 
-bool PaddleSegPostprocessor::FDTensorCast2Uint8(FDTensor* infer_result,
-                                                const int64_t& offset, 
-                                                std::vector<uint8_t>* uint8_result_buffer) {
-  FDDataType infer_result_dtype = infer_result->dtype;
-  if (infer_result_dtype == FDDataType::INT64) {
-    const int64_t* infer_result_buffer =
-        reinterpret_cast<const int64_t*>(infer_result->CpuData());
-    // cv::resize don't support `CV_8S` or `CV_32S`
-    // refer to https://github.com/opencv/opencv/issues/20991
-    // https://github.com/opencv/opencv/issues/7862
-    uint8_result_buffer->resize(offset * sizeof(int64_t));
-    memcpy(uint8_result_buffer->data(), infer_result_buffer, offset * sizeof(int64_t));
-  } else if (infer_result_dtype == FDDataType::INT32) {
-    const int32_t* infer_result_buffer =
-        reinterpret_cast<const int32_t*>(infer_result->CpuData());
-    // cv::resize don't support `CV_8S` or `CV_32S`
-    // refer to https://github.com/opencv/opencv/issues/20991
-    // https://github.com/opencv/opencv/issues/7862
-    uint8_result_buffer->resize(offset * sizeof(int32_t));
-    memcpy(uint8_result_buffer->data(), infer_result_buffer, offset * sizeof(int32_t));
-  } else {
-    FDASSERT(false, 
-             "Require the data type for casting uint8 is int64, int32, but now "
-             "it's %s.",
-             Str(infer_result_dtype).c_str());
-    return false;
-  }
-  infer_result->SetExternalData(
-      infer_result->shape, FDDataType::UINT8,
-      reinterpret_cast<void*>(uint8_result_buffer->data()));
-  return true;
-}
-
 bool PaddleSegPostprocessor::Run(
     const std::vector<FDTensor>& infer_results,
     std::vector<SegmentationResult>* results,
@@ -279,13 +247,12 @@ bool PaddleSegPostprocessor::Run(
     }
 
     FDMat mat;
-    std::vector<uint8_t> uint8_result_buffer;
     // Resize interpration 
     int interpolation = cv::INTER_LINEAR;
     if (is_resized) {
       if (infer_results_dtype == FDDataType::INT64 ||
           infer_results_dtype == FDDataType::INT32 ){
-        FDTensorCast2Uint8(&infer_result, infer_chw, &uint8_result_buffer);
+        function::Cast(infer_result, &infer_result, FDDataType::UINT8);
         // label map resize with nearest interpolation
         interpolation = cv::INTER_NEAREST;
       }
diff --git a/fastdeploy/vision/segmentation/ppseg/postprocessor.h b/fastdeploy/vision/segmentation/ppseg/postprocessor.h
index 89c8371ee..966b11645 100644
--- a/fastdeploy/vision/segmentation/ppseg/postprocessor.h
+++ b/fastdeploy/vision/segmentation/ppseg/postprocessor.h
@@ -80,10 +80,6 @@ class FASTDEPLOY_DECL PaddleSegPostprocessor {
                                       const int64_t& out_num,
                                       SegmentationResult* result);
 
-  virtual bool FDTensorCast2Uint8(FDTensor* infer_result,
-                                  const int64_t& offset,
-                                  std::vector<uint8_t>* uint8_result_buffer);
-
   bool is_with_softmax_ = false;
 
   bool is_with_argmax_ = true;
diff --git a/fastdeploy/vision/tracking/pptracking/pptracking_pybind.cc b/fastdeploy/vision/tracking/pptracking/pptracking_pybind.cc
index a5638628e..72273e838 100644
--- a/fastdeploy/vision/tracking/pptracking/pptracking_pybind.cc
+++ b/fastdeploy/vision/tracking/pptracking/pptracking_pybind.cc
@@ -28,8 +28,8 @@ void BindPPTracking(pybind11::module &m) {
          [](vision::tracking::PPTracking &self,
             pybind11::array &data) {
              auto mat = PyArrayToCvMat(data);
-             vision::MOTResult *res = new vision::MOTResult();
-             self.Predict(&mat, res);
+             vision::MOTResult res;
+             self.Predict(&mat, &res);
              return res;
          })
     .def("bind_recorder", &vision::tracking::PPTracking::BindRecorder)

From d3845eb4e1aac7d223109e58a89255d7465105c7 Mon Sep 17 00:00:00 2001
From: WJJ1995 <wjjisloser@163.com>
Date: Thu, 23 Feb 2023 18:57:39 +0800
Subject: [PATCH 20/20] [Benchmark]Compare diff for OCR (#1415)

* avoid mem copy for cpp benchmark

* set CMAKE_BUILD_TYPE to Release

* Add SegmentationDiff

* change pointer to reference

* fixed bug

* cast uint8 to int32

* Add diff compare for OCR

* Add diff compare for OCR

* rm ppocr pipeline

* Add yolov5 diff compare

* Add yolov5 diff compare

* deal with comments

* deal with comments

* fixed bug

* fixed bug
---
 benchmark/cpp/CMakeLists.txt                  | 12 ++-
 benchmark/cpp/benchmark_ppocr.cc              | 97 -------------------
 benchmark/cpp/benchmark_ppocr_cls.cc          | 57 +++++++++++
 benchmark/cpp/benchmark_ppocr_det.cc          | 63 ++++++++++++
 benchmark/cpp/benchmark_ppocr_rec.cc          | 59 +++++++++++
 benchmark/cpp/benchmark_yolov5.cc             | 29 +++++-
 benchmark/cpp/run_benchmark_ppyolov8.sh       |  0
 fastdeploy/benchmark/utils.cc                 | 73 ++++++++++++++
 fastdeploy/benchmark/utils.h                  | 11 +++
 .../vision/classification/contrib/resnet.cc   | 47 ++++-----
 .../contrib/yolov5cls/preprocessor.cc         | 21 ++--
 .../contrib/fastestdet/preprocessor.cc        | 25 ++---
 .../vision/detection/contrib/nanodet_plus.cc  |  6 +-
 .../vision/detection/contrib/scaledyolov4.cc  |  6 +-
 fastdeploy/vision/detection/contrib/yolor.cc  |  6 +-
 .../detection/contrib/yolov5/preprocessor.cc  | 17 ++--
 .../vision/detection/contrib/yolov5lite.cc    |  4 +-
 .../contrib/yolov5seg/preprocessor.cc         | 17 ++--
 fastdeploy/vision/detection/contrib/yolov6.cc |  4 +-
 .../detection/contrib/yolov7/preprocessor.cc  | 17 ++--
 .../detection/contrib/yolov7end2end_ort.cc    |  5 +-
 .../detection/contrib/yolov7end2end_trt.cc    |  4 +-
 .../detection/contrib/yolov8/preprocessor.cc  |  2 +-
 fastdeploy/vision/detection/contrib/yolox.cc  |  2 +-
 .../facealign/contrib/face_landmark_1000.cc   |  2 +-
 fastdeploy/vision/facealign/contrib/pfld.cc   | 20 ++--
 fastdeploy/vision/facealign/contrib/pipnet.cc |  2 +-
 .../vision/facedet/contrib/retinaface.cc      |  6 +-
 .../vision/facedet/contrib/ultraface.cc       |  4 +-
 .../vision/facedet/contrib/yolov5face.cc      | 18 ++--
 .../contrib/yolov7face/preprocessor.cc        | 17 ++--
 .../faceid/contrib/adaface/preprocessor.cc    |  8 +-
 .../contrib/insightface/preprocessor.cc       |  2 +-
 fastdeploy/vision/headpose/contrib/fsanet.cc  | 22 ++---
 fastdeploy/vision/matting/contrib/modnet.cc   |  6 +-
 fastdeploy/vision/matting/contrib/rvm.cc      | 10 +-
 fastdeploy/vision/utils/sort_det_res.cc       | 62 ++++++++++--
 fastdeploy/vision/utils/utils.h               |  5 +-
 38 files changed, 513 insertions(+), 255 deletions(-)
 delete mode 100755 benchmark/cpp/benchmark_ppocr.cc
 create mode 100644 benchmark/cpp/benchmark_ppocr_cls.cc
 create mode 100644 benchmark/cpp/benchmark_ppocr_det.cc
 create mode 100644 benchmark/cpp/benchmark_ppocr_rec.cc
 mode change 100755 => 100644 benchmark/cpp/benchmark_yolov5.cc
 delete mode 100644 benchmark/cpp/run_benchmark_ppyolov8.sh
 mode change 100644 => 100755 fastdeploy/benchmark/utils.cc
 mode change 100755 => 100644 fastdeploy/vision/detection/contrib/scaledyolov4.cc
 mode change 100755 => 100644 fastdeploy/vision/detection/contrib/yolor.cc
 mode change 100755 => 100644 fastdeploy/vision/detection/contrib/yolov5/preprocessor.cc
 mode change 100755 => 100644 fastdeploy/vision/detection/contrib/yolov7/preprocessor.cc
 mode change 100755 => 100644 fastdeploy/vision/detection/contrib/yolov7end2end_ort.cc
 mode change 100755 => 100644 fastdeploy/vision/faceid/contrib/adaface/preprocessor.cc
 mode change 100755 => 100644 fastdeploy/vision/matting/contrib/rvm.cc
 mode change 100644 => 100755 fastdeploy/vision/utils/sort_det_res.cc
 mode change 100644 => 100755 fastdeploy/vision/utils/utils.h

diff --git a/benchmark/cpp/CMakeLists.txt b/benchmark/cpp/CMakeLists.txt
index f839eb228..a6f0b87c2 100755
--- a/benchmark/cpp/CMakeLists.txt
+++ b/benchmark/cpp/CMakeLists.txt
@@ -13,7 +13,9 @@ add_executable(benchmark_ppyolov8 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov8.cc)
 add_executable(benchmark_ppcls ${PROJECT_SOURCE_DIR}/benchmark_ppcls.cc)
 add_executable(benchmark_precision_ppyolov8 ${PROJECT_SOURCE_DIR}/benchmark_precision_ppyolov8.cc)
 add_executable(benchmark_ppseg ${PROJECT_SOURCE_DIR}/benchmark_ppseg.cc)
-add_executable(benchmark_ppocr ${PROJECT_SOURCE_DIR}/benchmark_ppocr.cc)
+add_executable(benchmark_ppocr_det ${PROJECT_SOURCE_DIR}/benchmark_ppocr_det.cc)
+add_executable(benchmark_ppocr_cls ${PROJECT_SOURCE_DIR}/benchmark_ppocr_cls.cc)
+add_executable(benchmark_ppocr_rec ${PROJECT_SOURCE_DIR}/benchmark_ppocr_rec.cc)
 
 if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
   target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags pthread)
@@ -21,12 +23,16 @@ if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
   target_link_libraries(benchmark_ppcls ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_precision_ppyolov8 ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppseg ${FASTDEPLOY_LIBS} gflags pthread)
-  target_link_libraries(benchmark_ppocr ${FASTDEPLOY_LIBS} gflags pthread)
+  target_link_libraries(benchmark_ppocr_det ${FASTDEPLOY_LIBS} gflags pthread)
+  target_link_libraries(benchmark_ppocr_cls ${FASTDEPLOY_LIBS} gflags pthread)
+  target_link_libraries(benchmark_ppocr_rec ${FASTDEPLOY_LIBS} gflags pthread)
 else()
   target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppyolov8 ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppcls ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_precision_ppyolov8 ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppseg ${FASTDEPLOY_LIBS} gflags)
-  target_link_libraries(benchmark_ppocr ${FASTDEPLOY_LIBS} gflags)
+  target_link_libraries(benchmark_ppocr_det ${FASTDEPLOY_LIBS} gflags)
+  target_link_libraries(benchmark_ppocr_cls ${FASTDEPLOY_LIBS} gflags)
+  target_link_libraries(benchmark_ppocr_rec ${FASTDEPLOY_LIBS} gflags)
 endif()
diff --git a/benchmark/cpp/benchmark_ppocr.cc b/benchmark/cpp/benchmark_ppocr.cc
deleted file mode 100755
index e81080c54..000000000
--- a/benchmark/cpp/benchmark_ppocr.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "flags.h"
-#include "macros.h"
-#include "option.h"
-
-// Only for ppocr
-DEFINE_string(det_model, "", "Path of Detection model of PPOCR.");
-DEFINE_string(cls_model, "", "Path of Classification model of PPOCR.");
-DEFINE_string(rec_model, "", "Path of Recognization model of PPOCR.");
-DEFINE_string(rec_label_file, "", "Path of Recognization label file of PPOCR.");
-DEFINE_string(image_rec, "", "Path of Recognization img file of PPOCR.");
-
-int main(int argc, char* argv[]) {
-#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
-  // Initialization
-  auto option = fastdeploy::RuntimeOption();
-  if (!CreateRuntimeOption(&option, argc, argv, true)) {
-    return -1;
-  }
-  auto im = cv::imread(FLAGS_image);
-  auto im_rec = cv::imread(FLAGS_image_rec);
-  // Detection Model
-  auto det_model_file =
-      FLAGS_model + sep + FLAGS_det_model + sep + "inference.pdmodel";
-  auto det_params_file =
-      FLAGS_model + sep + FLAGS_det_model + sep + "inference.pdiparams";
-  // Classification Model
-  auto cls_model_file =
-      FLAGS_model + sep + FLAGS_cls_model + sep + "inference.pdmodel";
-  auto cls_params_file =
-      FLAGS_model + sep + FLAGS_cls_model + sep + "inference.pdiparams";
-  // Recognition Model
-  auto rec_model_file =
-      FLAGS_model + sep + FLAGS_rec_model + sep + "inference.pdmodel";
-  auto rec_params_file =
-      FLAGS_model + sep + FLAGS_rec_model + sep + "inference.pdiparams";
-  auto rec_label_file = FLAGS_rec_label_file;
-  if (FLAGS_backend == "paddle_trt") {
-    option.paddle_infer_option.collect_trt_shape = true;
-  }
-  auto det_option = option;
-  auto cls_option = option;
-  auto rec_option = option;
-  if (FLAGS_backend == "paddle_trt" || FLAGS_backend == "trt") {
-    det_option.trt_option.SetShape("x", {1, 3, 64, 64}, {1, 3, 640, 640},
-                                   {1, 3, 960, 960});
-    cls_option.trt_option.SetShape("x", {1, 3, 48, 10}, {4, 3, 48, 320},
-                                   {8, 3, 48, 1024});
-    rec_option.trt_option.SetShape("x", {1, 3, 48, 10}, {4, 3, 48, 320},
-                                   {8, 3, 48, 2304});
-  }
-  auto det_model = fastdeploy::vision::ocr::DBDetector(
-      det_model_file, det_params_file, det_option);
-  auto cls_model = fastdeploy::vision::ocr::Classifier(
-      cls_model_file, cls_params_file, cls_option);
-  auto rec_model = fastdeploy::vision::ocr::Recognizer(
-      rec_model_file, rec_params_file, rec_label_file, rec_option);
-  // Only for runtime
-  if (FLAGS_profile_mode == "runtime") {
-    std::vector<std::array<int, 8>> boxes_result;
-    std::cout << "====Detection model====" << std::endl;
-    BENCHMARK_MODEL(det_model, det_model.Predict(im, &boxes_result));
-    int32_t cls_label;
-    float cls_score;
-    std::cout << "====Classification model====" << std::endl;
-    BENCHMARK_MODEL(cls_model,
-                    cls_model.Predict(im_rec, &cls_label, &cls_score));
-    std::string text;
-    float rec_score;
-    std::cout << "====Recognization model====" << std::endl;
-    BENCHMARK_MODEL(rec_model, rec_model.Predict(im_rec, &text, &rec_score));
-  }
-  auto model_ppocrv3 =
-      fastdeploy::pipeline::PPOCRv3(&det_model, &cls_model, &rec_model);
-  fastdeploy::vision::OCRResult res;
-  if (FLAGS_profile_mode == "end2end") {
-    BENCHMARK_MODEL(model_ppocrv3, model_ppocrv3.Predict(im, &res))
-  }
-  auto vis_im = fastdeploy::vision::VisOcr(im, res);
-  cv::imwrite("vis_result.jpg", vis_im);
-  std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
-#endif
-  return 0;
-}
\ No newline at end of file
diff --git a/benchmark/cpp/benchmark_ppocr_cls.cc b/benchmark/cpp/benchmark_ppocr_cls.cc
new file mode 100644
index 000000000..0ddd939bc
--- /dev/null
+++ b/benchmark/cpp/benchmark_ppocr_cls.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "flags.h"
+#include "macros.h"
+#include "option.h"
+
+// PP-OCR Classifier benchmark: print result diff, then run BENCHMARK_MODEL.
+int main(int argc, char* argv[]) {
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
+  auto option = fastdeploy::RuntimeOption();
+  if (!CreateRuntimeOption(&option, argc, argv, true)) {
+    return -1;
+  }
+  auto im = cv::imread(FLAGS_image);
+  // Classification model files live directly under FLAGS_model.
+  auto cls_model_file = FLAGS_model + sep + "inference.pdmodel";
+  auto cls_params_file = FLAGS_model + sep + "inference.pdiparams";
+  if (FLAGS_backend == "paddle_trt") {
+    option.paddle_infer_option.collect_trt_shape = true;
+  }
+  if (FLAGS_backend == "paddle_trt" || FLAGS_backend == "trt") {
+    option.trt_option.SetShape("x", {1, 3, 48, 10}, {4, 3, 48, 320},
+                               {8, 3, 48, 1024});
+  }
+  auto model_ppocr_cls = fastdeploy::vision::ocr::Classifier(
+      cls_model_file, cls_params_file, option);
+  int32_t res_label = -1;  // defined values even if Predict fails
+  float res_score = 0.0f;
+  // Run once at least so there is a result to compare below.
+  model_ppocr_cls.Predict(im, &res_label, &res_score);
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  int32_t res_label_expect = 0;
+  float res_score_expect = 1.0f;
+  // Diff vs. expected values; std::abs selects the float overload below.
+  auto ppocr_cls_label_diff = res_label - res_label_expect;
+  auto ppocr_cls_score_diff = res_score - res_score_expect;
+  std::cout << "PPOCR Cls label diff: " << ppocr_cls_label_diff << std::endl;
+  std::cout << "PPOCR Cls score diff: " << std::abs(ppocr_cls_score_diff)
+            << std::endl;
+  BENCHMARK_MODEL(model_ppocr_cls,
+                  model_ppocr_cls.Predict(im, &res_label, &res_score));
+#endif
+  return 0;
+}
\ No newline at end of file
diff --git a/benchmark/cpp/benchmark_ppocr_det.cc b/benchmark/cpp/benchmark_ppocr_det.cc
new file mode 100644
index 000000000..f98b1c9f3
--- /dev/null
+++ b/benchmark/cpp/benchmark_ppocr_det.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "flags.h"
+#include "macros.h"
+#include "option.h"
+
+namespace vision = fastdeploy::vision;
+namespace benchmark = fastdeploy::benchmark;
+
+// Saves/reloads the PP-OCR detector output, prints the diff, then benchmarks.
+int main(int argc, char* argv[]) {
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
+  using ResultManager = benchmark::ResultManager;
+  // Initialization
+  auto option = fastdeploy::RuntimeOption();
+  if (!CreateRuntimeOption(&option, argc, argv, true)) {
+    return -1;
+  }
+  auto im = cv::imread(FLAGS_image);
+  // Detection Model
+  auto model_prefix = FLAGS_model + sep;
+  auto det_model_file = model_prefix + "inference.pdmodel";
+  auto det_params_file = model_prefix + "inference.pdiparams";
+  // SetShape on input "x" is only configured for the paddle_trt/trt backends.
+  const bool is_paddle_trt = (FLAGS_backend == "paddle_trt");
+  if (is_paddle_trt) {
+    option.paddle_infer_option.collect_trt_shape = true;
+  }
+  if (is_paddle_trt || FLAGS_backend == "trt") {
+    option.trt_option.SetShape("x", {1, 3, 64, 64}, {1, 3, 640, 640},
+                               {1, 3, 960, 960});
+  }
+  auto model_ppocr_det =
+      vision::ocr::DBDetector(det_model_file, det_params_file, option);
+  std::vector<std::array<int, 8>> res;
+  // Run once at least
+  model_ppocr_det.Predict(im, &res);
+  // 1. Test result diff: write the boxes to disk, read them back, compare.
+  std::cout << "=============== Test result diff =================\n";
+  std::string dump_path = "ppocr_det_result.txt";
+  ResultManager::SaveOCRDetResult(res, dump_path);
+  std::vector<std::array<int, 8>> res_loaded;
+  ResultManager::LoadOCRDetResult(&res_loaded, dump_path);
+  const auto diff_stats = ResultManager::CalculateDiffStatis(res, res_loaded);
+  std::cout << "PPOCR Boxes diff: mean=" << diff_stats.boxes.mean
+            << ", max=" << diff_stats.boxes.max
+            << ", min=" << diff_stats.boxes.min << std::endl;
+  BENCHMARK_MODEL(model_ppocr_det, model_ppocr_det.Predict(im, &res));
+#endif
+  return 0;
+}
\ No newline at end of file
diff --git a/benchmark/cpp/benchmark_ppocr_rec.cc b/benchmark/cpp/benchmark_ppocr_rec.cc
new file mode 100644
index 000000000..71bb6b353
--- /dev/null
+++ b/benchmark/cpp/benchmark_ppocr_rec.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "flags.h"
+#include "macros.h"
+#include "option.h"
+
+DEFINE_string(rec_label_file, "", "Path of Recognization label file of PPOCR.");
+
+// Prints the PP-OCR recognizer's diff to the expected result, then benchmarks.
+int main(int argc, char* argv[]) {
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
+  auto option = fastdeploy::RuntimeOption();
+  if (!CreateRuntimeOption(&option, argc, argv, true)) {
+    return -1;
+  }
+  auto im = cv::imread(FLAGS_image);
+  // Recognition Model
+  auto rec_model_file = FLAGS_model + sep + "inference.pdmodel";
+  auto rec_params_file = FLAGS_model + sep + "inference.pdiparams";
+  if (FLAGS_backend == "paddle_trt") {
+    option.paddle_infer_option.collect_trt_shape = true;
+  }
+  if (FLAGS_backend == "paddle_trt" || FLAGS_backend == "trt") {
+    option.trt_option.SetShape("x", {1, 3, 48, 10}, {4, 3, 48, 320},
+                               {8, 3, 48, 2304});
+  }
+  auto model_ppocr_rec = fastdeploy::vision::ocr::Recognizer(
+      rec_model_file, rec_params_file, FLAGS_rec_label_file, option);
+  std::string text;
+  float rec_score = 0.0f;  // defined output even if Predict() fails
+  // Run once at least
+  model_ppocr_rec.Predict(im, &text, &rec_score);
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  std::string text_expect = "上海斯格威铂尔大酒店";
+  float res_score_expect = 0.993308f;
+  auto ppocr_rec_text_diff = text.compare(text_expect);
+  auto score_diff = rec_score - res_score_expect;
+  std::cout << "PPOCR Rec text diff: " << ppocr_rec_text_diff << std::endl;
+  // Magnitude by hand: unqualified abs() may pick int abs(int) and truncate.
+  std::cout << "PPOCR Rec score diff: "
+            << (score_diff < 0 ? -score_diff : score_diff) << std::endl;
+  BENCHMARK_MODEL(model_ppocr_rec,
+                  model_ppocr_rec.Predict(im, &text, &rec_score));
+#endif
+  return 0;
+}
\ No newline at end of file
diff --git a/benchmark/cpp/benchmark_yolov5.cc b/benchmark/cpp/benchmark_yolov5.cc
old mode 100755
new mode 100644
index 07c36e31e..848851de9
--- a/benchmark/cpp/benchmark_yolov5.cc
+++ b/benchmark/cpp/benchmark_yolov5.cc
@@ -16,6 +16,9 @@
 #include "macros.h"
 #include "option.h"
 
+namespace vision = fastdeploy::vision;
+namespace benchmark = fastdeploy::benchmark;
+
 int main(int argc, char* argv[]) {
 #if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
   // Initialization
@@ -24,11 +27,29 @@ int main(int argc, char* argv[]) {
     return -1;
   }
   auto im = cv::imread(FLAGS_image);
-  auto model_yolov5 =
-      fastdeploy::vision::detection::YOLOv5(FLAGS_model, "", option);
-  fastdeploy::vision::DetectionResult res;
+  auto model_yolov5 = vision::detection::YOLOv5(FLAGS_model, "", option);
+  vision::DetectionResult res;
+  // Run once at least
+  model_yolov5.Predict(im, &res);
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  // Save result to -> disk.
+  std::string det_result_path = "yolov5_result.txt";
+  benchmark::ResultManager::SaveDetectionResult(res, det_result_path);
+  // Load result from <- disk.
+  vision::DetectionResult res_loaded;
+  benchmark::ResultManager::LoadDetectionResult(&res_loaded, det_result_path);
+  // Calculate diff between two results.
+  auto det_diff =
+      benchmark::ResultManager::CalculateDiffStatis(res, res_loaded);
+  std::cout << "Boxes diff: mean=" << det_diff.boxes.mean
+            << ", max=" << det_diff.boxes.max << ", min=" << det_diff.boxes.min
+            << std::endl;
+  std::cout << "Label_ids diff: mean=" << det_diff.labels.mean
+            << ", max=" << det_diff.labels.max
+            << ", min=" << det_diff.labels.min << std::endl;
   BENCHMARK_MODEL(model_yolov5, model_yolov5.Predict(im, &res))
-  auto vis_im = fastdeploy::vision::VisDetection(im, res);
+  auto vis_im = vision::VisDetection(im, res);
   cv::imwrite("vis_result.jpg", vis_im);
   std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
 #endif
diff --git a/benchmark/cpp/run_benchmark_ppyolov8.sh b/benchmark/cpp/run_benchmark_ppyolov8.sh
deleted file mode 100644
index e69de29bb..000000000
diff --git a/fastdeploy/benchmark/utils.cc b/fastdeploy/benchmark/utils.cc
old mode 100644
new mode 100755
index 5af28e4b1..a66bdb6c0
--- a/fastdeploy/benchmark/utils.cc
+++ b/fastdeploy/benchmark/utils.cc
@@ -474,6 +474,34 @@ bool ResultManager::SaveSegmentationResult(
   return true;
 }
 
+// Writes `res` to `path` as a single "boxes" line; false if empty/unopenable.
+bool ResultManager::SaveOCRDetResult(const std::vector<std::array<int, 8>>& res,
+                                     const std::string& path) {
+  if (res.empty()) {
+    FDERROR << "OCRDetResult can not be empty!" << std::endl;
+    return false;
+  }
+  std::ofstream fs(path, std::ios::out);
+  if (!fs.is_open()) {
+    FDERROR << "Fail to open file:" << path << std::endl;
+    return false;
+  }
+  fs.precision(20);
+  // boxes: all coordinates flattened, separator only between values
+  fs << "boxes" << KEY_VALUE_SEP;
+  bool is_first_value = true;
+  for (const auto& box : res) {
+    for (const int coord : box) {
+      if (!is_first_value) fs << VALUE_SEP;
+      fs << coord;
+      is_first_value = false;
+    }
+  }
+  fs << "\n";
+  fs.close();
+  return true;
+}
+
 bool ResultManager::LoadDetectionResult(vision::DetectionResult* res,
                                         const std::string& path) {
   if (!CheckFileExists(path)) {
@@ -556,6 +584,26 @@ bool ResultManager::LoadSegmentationResult(vision::SegmentationResult* res,
   return true;
 }
 
+// Reads boxes saved by SaveOCRDetResult from `path` into *res; false on error.
+bool ResultManager::LoadOCRDetResult(std::vector<std::array<int, 8>>* res,
+                                     const std::string& path) {
+  if (!CheckFileExists(path)) {
+    FDERROR << "Can't found file from" << path << std::endl;
+    return false;
+  }
+  // Line 0 is expected to hold the boxes as a flat list, 8 ints per box.
+  auto lines = ReadLines(path);
+  if (lines.empty()) return false;  // empty file: lines[0] does not exist
+  auto data = SplitDataLine(lines[0]);
+  if (data.empty()) return false;  // nothing parsed: begin() == end()
+  const auto& values = data.begin()->second;
+  res->resize(values.size() / 8);  // a trailing partial box is dropped
+  for (size_t k = 0; k < res->size() * 8; ++k) {
+    (*res)[k / 8][k % 8] = std::stoi(values[k]);
+  }
+  return true;
+}
+
 DetectionDiff ResultManager::CalculateDiffStatis(
     const vision::DetectionResult& lhs, const vision::DetectionResult& rhs,
     const float& score_threshold) {
@@ -643,6 +691,31 @@ SegmentationDiff ResultManager::CalculateDiffStatis(
   return diff;
 }
 
+// Sorts copies of lhs/rhs and summarizes their per-coordinate differences.
+OCRDetDiff ResultManager::CalculateDiffStatis(
+    const std::vector<std::array<int, 8>>& lhs,
+    const std::vector<std::array<int, 8>>& rhs) {
+  std::vector<std::array<int, 8>> lhs_sort = lhs;
+  std::vector<std::array<int, 8>> rhs_sort = rhs;
+  // lex sort by x(w) & y(h)
+  vision::utils::LexSortOCRDetResultByXY(&lhs_sort);
+  vision::utils::LexSortOCRDetResultByXY(&rhs_sort);
+  // get value diff; boxes beyond the shorter list are ignored
+  const size_t boxes_num = std::min(lhs_sort.size(), rhs_sort.size());
+  std::vector<float> boxes_diff;
+  boxes_diff.reserve(boxes_num * 8);
+  for (size_t i = 0; i < boxes_num; ++i) {
+    for (size_t j = 0; j < 8; ++j) {
+      boxes_diff.push_back(lhs_sort[i][j] - rhs_sort[i][j]);
+    }
+  }
+  OCRDetDiff diff;
+  CalculateStatisInfo<float>(boxes_diff.data(), boxes_diff.size(),
+                             &(diff.boxes.mean), &(diff.boxes.max),
+                             &(diff.boxes.min));
+  return diff;
+}
+
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 
diff --git a/fastdeploy/benchmark/utils.h b/fastdeploy/benchmark/utils.h
index f4d608133..2ad0ae4aa 100755
--- a/fastdeploy/benchmark/utils.h
+++ b/fastdeploy/benchmark/utils.h
@@ -122,6 +122,10 @@ struct FASTDEPLOY_DECL SegmentationDiff: public BaseDiff {
   EvalStatis labels;
 };
 
+struct FASTDEPLOY_DECL OCRDetDiff: public BaseDiff {
+  EvalStatis boxes;  // statistics over box-coordinate differences
+};
+
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 
@@ -148,6 +152,10 @@ struct FASTDEPLOY_DECL ResultManager {
                                      const std::string& path);
   static bool LoadSegmentationResult(vision::SegmentationResult* res,
                                      const std::string& path);
+  static bool SaveOCRDetResult(const std::vector<std::array<int, 8>>& res,
+                               const std::string& path);
+  static bool LoadOCRDetResult(std::vector<std::array<int, 8>>* res,
+                               const std::string& path);
   /// Calculate diff value between two basic results.
   static DetectionDiff CalculateDiffStatis(const vision::DetectionResult& lhs,
                                            const vision::DetectionResult& rhs,
@@ -157,6 +165,9 @@ struct FASTDEPLOY_DECL ResultManager {
   static SegmentationDiff CalculateDiffStatis(
       const vision::SegmentationResult& lhs,
       const vision::SegmentationResult& rhs);
+  static OCRDetDiff CalculateDiffStatis(
+      const std::vector<std::array<int, 8>>& lhs,
+      const std::vector<std::array<int, 8>>& rhs);
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 };
diff --git a/fastdeploy/vision/classification/contrib/resnet.cc b/fastdeploy/vision/classification/contrib/resnet.cc
index 2eed67992..fffbeada6 100644
--- a/fastdeploy/vision/classification/contrib/resnet.cc
+++ b/fastdeploy/vision/classification/contrib/resnet.cc
@@ -13,23 +13,22 @@
 // limitations under the License.
 
 #include "fastdeploy/vision/classification/contrib/resnet.h"
-#include "fastdeploy/vision/utils/utils.h"
 #include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
 
 namespace fastdeploy {
 namespace vision {
 namespace classification {
 
-ResNet::ResNet(const std::string& model_file,
-               const std::string& params_file,
+ResNet::ResNet(const std::string& model_file, const std::string& params_file,
                const RuntimeOption& custom_option,
                const ModelFormat& model_format) {
   // In constructor, the 3 steps below are necessary.
   // 1. set the Backend 2. set RuntimeOption 3. call Initialize()
 
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::ORT, Backend::OPENVINO}; 
-    valid_gpu_backends = {Backend::ORT, Backend::TRT};  
+    valid_cpu_backends = {Backend::ORT, Backend::OPENVINO};
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
     valid_cpu_backends = {Backend::PDINFER};
     valid_gpu_backends = {Backend::PDINFER};
@@ -42,7 +41,6 @@ ResNet::ResNet(const std::string& model_file,
 }
 
 bool ResNet::Initialize() {
-
   // In this function, the 3 steps below are necessary.
   // 1. assign values to the global variables 2. call InitRuntime()
 
@@ -57,14 +55,15 @@ bool ResNet::Initialize() {
   return true;
 }
 
-
 bool ResNet::Preprocess(Mat* mat, FDTensor* output) {
+  // In this function, the preprocessing needs to be implemented according to
+  // the original repository.
+  // The preprocessed result has to be saved in an FDTensor, because the input
+  // of Infer() needs to be std::vector<FDTensor>.
+  // 1. Resize 2. BGR2RGB 3. Normalize 4. HWC2CHW 5. Put the result into the
+  // FDTensor variable.
 
-// In this function, the preprocess need be implemented according to the original Repos,
-// The result of preprocess has to be saved in FDTensor variable, because the input of Infer() need to be std::vector<FDTensor>.
-// 1. Resize 2. BGR2RGB 3. Normalize 4. HWC2CHW 5. Put the result into FDTensor variable.
-        
-  if (mat->Height()!=size[0] || mat->Width()!=size[1]){
+  if (mat->Height() != size[0] || mat->Width() != size[1]) {
     int interp = cv::INTER_LINEAR;
     Resize::Run(mat, size[1], size[0], -1, -1, interp);
   }
@@ -75,20 +74,23 @@ bool ResNet::Preprocess(Mat* mat, FDTensor* output) {
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
-bool ResNet::Postprocess(FDTensor& infer_result,
-                                  ClassifyResult* result, int topk) {
-
-  // In this function, the postprocess need be implemented according to the original Repos,
-  // Finally the reslut of postprocess should be saved in ClassifyResult variable.
-  // 1. Softmax 2. Choose topk labels 3. Put the result into ClassifyResult variable.
+bool ResNet::Postprocess(FDTensor& infer_result, ClassifyResult* result,
+                         int topk) {
+  // In this function, the postprocessing needs to be implemented according to
+  // the original repository.
+  // Finally, the result of the postprocessing should be saved in the
+  // ClassifyResult variable.
+  // 1. Softmax 2. Choose topk labels 3. Put the result into the ClassifyResult
+  // variable.
 
   int num_classes = infer_result.shape[1];
   function::Softmax(infer_result, &infer_result);
-  const float* infer_result_buffer = reinterpret_cast<float*>(infer_result.Data());
+  const float* infer_result_buffer =
+      reinterpret_cast<float*>(infer_result.Data());
   topk = std::min(num_classes, topk);
   result->label_ids =
       utils::TopKIndices(infer_result_buffer, num_classes, topk);
@@ -100,8 +102,8 @@ bool ResNet::Postprocess(FDTensor& infer_result,
 }
 
 bool ResNet::Predict(cv::Mat* im, ClassifyResult* result, int topk) {
-
-  // In this function, the Preprocess(), Infer(), and Postprocess() are called sequentially.
+  // In this function, the Preprocess(), Infer(), and Postprocess() are called
+  // sequentially.
 
   Mat mat(*im);
   std::vector<FDTensor> processed_data(1);
@@ -128,7 +130,6 @@ bool ResNet::Predict(cv::Mat* im, ClassifyResult* result, int topk) {
   return true;
 }
 
-
 }  // namespace classification
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.cc b/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.cc
index e252ba0ee..35b3e17bb 100644
--- a/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.cc
+++ b/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.cc
@@ -20,18 +20,19 @@ namespace vision {
 namespace classification {
 
 YOLOv5ClsPreprocessor::YOLOv5ClsPreprocessor() {
-  size_ = {224, 224}; //{h,w}
+  size_ = {224, 224};  //{h,w}
 }
 
-bool YOLOv5ClsPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
-            std::map<std::string, std::array<float, 2>>* im_info) {
+bool YOLOv5ClsPreprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
   // Record the shape of image and the shape of preprocessed image
   (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
                                static_cast<float>(mat->Width())};
 
   // process after image load
   double ratio = (size_[0] * 1.0) / std::max(static_cast<float>(mat->Height()),
-                                            static_cast<float>(mat->Width()));
+                                             static_cast<float>(mat->Width()));
 
   // yolov5cls's preprocess steps
   // 1. CenterCrop
@@ -54,20 +55,22 @@ bool YOLOv5ClsPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                 static_cast<float>(mat->Width())};
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
-bool YOLOv5ClsPreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
-                             std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
+bool YOLOv5ClsPreprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
   if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
     return false;
   }
   ims_info->resize(images->size());
   outputs->resize(1);
   // Concat all the preprocessed data to a batch tensor
-  std::vector<FDTensor> tensors(images->size()); 
+  std::vector<FDTensor> tensors(images->size());
   for (size_t i = 0; i < images->size(); ++i) {
     if (!Preprocess(&(*images)[i], &tensors[i], &(*ims_info)[i])) {
       FDERROR << "Failed to preprocess input image." << std::endl;
diff --git a/fastdeploy/vision/detection/contrib/fastestdet/preprocessor.cc b/fastdeploy/vision/detection/contrib/fastestdet/preprocessor.cc
index f4ff11e8f..7b8fcc399 100644
--- a/fastdeploy/vision/detection/contrib/fastestdet/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/fastestdet/preprocessor.cc
@@ -20,26 +20,27 @@ namespace vision {
 namespace detection {
 
 FastestDetPreprocessor::FastestDetPreprocessor() {
-  size_ = {352, 352}; //{h,w}
+  size_ = {352, 352};  //{h,w}
 }
 
-bool FastestDetPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
-            std::map<std::string, std::array<float, 2>>* im_info) {
+bool FastestDetPreprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
   // Record the shape of image and the shape of preprocessed image
   (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
                                static_cast<float>(mat->Width())};
 
   // process after image load
   double ratio = (size_[0] * 1.0) / std::max(static_cast<float>(mat->Height()),
-                                            static_cast<float>(mat->Width()));
+                                             static_cast<float>(mat->Width()));
 
   // fastestdet's preprocess steps
   // 1. resize
   // 2. convert_and_permute(swap_rb=false)
-  Resize::Run(mat, size_[0], size_[1]); //resize
+  Resize::Run(mat, size_[0], size_[1]);  // resize
   std::vector<float> alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f};
   std::vector<float> beta = {0.0f, 0.0f, 0.0f};
-  //convert to float and HWC2CHW
+  // convert to float and HWC2CHW
   ConvertAndPermute::Run(mat, alpha, beta, false);
 
   // Record output shape of preprocessed image
@@ -47,20 +48,22 @@ bool FastestDetPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                 static_cast<float>(mat->Width())};
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
-bool FastestDetPreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
-                             std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
+bool FastestDetPreprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
   if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
     return false;
   }
   ims_info->resize(images->size());
   outputs->resize(1);
   // Concat all the preprocessed data to a batch tensor
-  std::vector<FDTensor> tensors(images->size()); 
+  std::vector<FDTensor> tensors(images->size());
   for (size_t i = 0; i < images->size(); ++i) {
     if (!Preprocess(&(*images)[i], &tensors[i], &(*ims_info)[i])) {
       FDERROR << "Failed to preprocess input image." << std::endl;
diff --git a/fastdeploy/vision/detection/contrib/nanodet_plus.cc b/fastdeploy/vision/detection/contrib/nanodet_plus.cc
index 2babae49c..0b89cdbe2 100644
--- a/fastdeploy/vision/detection/contrib/nanodet_plus.cc
+++ b/fastdeploy/vision/detection/contrib/nanodet_plus.cc
@@ -117,8 +117,8 @@ NanoDetPlus::NanoDetPlus(const std::string& model_file,
                          const RuntimeOption& custom_option,
                          const ModelFormat& model_format) {
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::ORT}; 
-    valid_gpu_backends = {Backend::ORT, Backend::TRT};  
+    valid_cpu_backends = {Backend::ORT};
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
     valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
     valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
@@ -182,7 +182,7 @@ bool NanoDetPlus::Preprocess(
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/detection/contrib/scaledyolov4.cc b/fastdeploy/vision/detection/contrib/scaledyolov4.cc
old mode 100755
new mode 100644
index 8678ea181..88c34352b
--- a/fastdeploy/vision/detection/contrib/scaledyolov4.cc
+++ b/fastdeploy/vision/detection/contrib/scaledyolov4.cc
@@ -62,8 +62,8 @@ ScaledYOLOv4::ScaledYOLOv4(const std::string& model_file,
                            const RuntimeOption& custom_option,
                            const ModelFormat& model_format) {
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::ORT}; 
-    valid_gpu_backends = {Backend::ORT, Backend::TRT};  
+    valid_cpu_backends = {Backend::ORT};
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
     valid_cpu_backends = {Backend::PDINFER};
     valid_gpu_backends = {Backend::PDINFER};
@@ -144,7 +144,7 @@ bool ScaledYOLOv4::Preprocess(
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/detection/contrib/yolor.cc b/fastdeploy/vision/detection/contrib/yolor.cc
old mode 100755
new mode 100644
index dd4ef728a..cad66eb08
--- a/fastdeploy/vision/detection/contrib/yolor.cc
+++ b/fastdeploy/vision/detection/contrib/yolor.cc
@@ -61,8 +61,8 @@ YOLOR::YOLOR(const std::string& model_file, const std::string& params_file,
              const RuntimeOption& custom_option,
              const ModelFormat& model_format) {
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::ORT};  
-    valid_gpu_backends = {Backend::ORT, Backend::TRT}; 
+    valid_cpu_backends = {Backend::ORT};
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
     valid_cpu_backends = {Backend::PDINFER};
     valid_gpu_backends = {Backend::PDINFER};
@@ -142,7 +142,7 @@ bool YOLOR::Preprocess(Mat* mat, FDTensor* output,
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/detection/contrib/yolov5/preprocessor.cc b/fastdeploy/vision/detection/contrib/yolov5/preprocessor.cc
old mode 100755
new mode 100644
index 846e25131..658987b75
--- a/fastdeploy/vision/detection/contrib/yolov5/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/yolov5/preprocessor.cc
@@ -64,8 +64,9 @@ void YOLOv5Preprocessor::LetterBox(FDMat* mat) {
   }
 }
 
-bool YOLOv5Preprocessor::Preprocess(FDMat* mat, FDTensor* output,
-            std::map<std::string, std::array<float, 2>>* im_info) {
+bool YOLOv5Preprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
   // Record the shape of image and the shape of preprocessed image
   (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
                                static_cast<float>(mat->Width())};
@@ -82,20 +83,22 @@ bool YOLOv5Preprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                 static_cast<float>(mat->Width())};
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
-bool YOLOv5Preprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
-                             std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
+bool YOLOv5Preprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
   if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
     return false;
   }
   ims_info->resize(images->size());
   outputs->resize(1);
   // Concat all the preprocessed data to a batch tensor
-  std::vector<FDTensor> tensors(images->size()); 
+  std::vector<FDTensor> tensors(images->size());
   for (size_t i = 0; i < images->size(); ++i) {
     if (!Preprocess(&(*images)[i], &tensors[i], &(*ims_info)[i])) {
       FDERROR << "Failed to preprocess input image." << std::endl;
diff --git a/fastdeploy/vision/detection/contrib/yolov5lite.cc b/fastdeploy/vision/detection/contrib/yolov5lite.cc
index be4116eed..8d8f325dc 100644
--- a/fastdeploy/vision/detection/contrib/yolov5lite.cc
+++ b/fastdeploy/vision/detection/contrib/yolov5lite.cc
@@ -195,7 +195,7 @@ bool YOLOv5Lite::Preprocess(
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
@@ -253,7 +253,7 @@ bool YOLOv5Lite::CudaPreprocess(
   output->SetExternalData({mat->Channels(), size[0], size[1]}, FDDataType::FP32,
                           input_tensor_cuda_buffer_device_);
   output->device = Device::GPU;
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 #else
   FDERROR << "CUDA src code was not enabled." << std::endl;
diff --git a/fastdeploy/vision/detection/contrib/yolov5seg/preprocessor.cc b/fastdeploy/vision/detection/contrib/yolov5seg/preprocessor.cc
index b880ed337..e5bd82630 100644
--- a/fastdeploy/vision/detection/contrib/yolov5seg/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/yolov5seg/preprocessor.cc
@@ -64,8 +64,9 @@ void YOLOv5SegPreprocessor::LetterBox(FDMat* mat) {
   }
 }
 
-bool YOLOv5SegPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
-            std::map<std::string, std::array<float, 2>>* im_info) {
+bool YOLOv5SegPreprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
   // Record the shape of image and the shape of preprocessed image
   (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
                                static_cast<float>(mat->Width())};
@@ -82,20 +83,22 @@ bool YOLOv5SegPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                 static_cast<float>(mat->Width())};
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
-bool YOLOv5SegPreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
-                             std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
+bool YOLOv5SegPreprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
   if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
     return false;
   }
   ims_info->resize(images->size());
   outputs->resize(1);
   // Concat all the preprocessed data to a batch tensor
-  std::vector<FDTensor> tensors(images->size()); 
+  std::vector<FDTensor> tensors(images->size());
   for (size_t i = 0; i < images->size(); ++i) {
     if (!Preprocess(&(*images)[i], &tensors[i], &(*ims_info)[i])) {
       FDERROR << "Failed to preprocess input image." << std::endl;
diff --git a/fastdeploy/vision/detection/contrib/yolov6.cc b/fastdeploy/vision/detection/contrib/yolov6.cc
index cae9ce3a6..bf3368242 100644
--- a/fastdeploy/vision/detection/contrib/yolov6.cc
+++ b/fastdeploy/vision/detection/contrib/yolov6.cc
@@ -168,7 +168,7 @@ bool YOLOv6::Preprocess(Mat* mat, FDTensor* output,
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
@@ -226,7 +226,7 @@ bool YOLOv6::CudaPreprocess(
   output->SetExternalData({mat->Channels(), size[0], size[1]}, FDDataType::FP32,
                           input_tensor_cuda_buffer_device_);
   output->device = Device::GPU;
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 #else
   FDERROR << "CUDA src code was not enabled." << std::endl;
diff --git a/fastdeploy/vision/detection/contrib/yolov7/preprocessor.cc b/fastdeploy/vision/detection/contrib/yolov7/preprocessor.cc
old mode 100755
new mode 100644
index 91e22f32b..3374e16bb
--- a/fastdeploy/vision/detection/contrib/yolov7/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/yolov7/preprocessor.cc
@@ -64,8 +64,9 @@ void YOLOv7Preprocessor::LetterBox(FDMat* mat) {
   }
 }
 
-bool YOLOv7Preprocessor::Preprocess(FDMat* mat, FDTensor* output,
-            std::map<std::string, std::array<float, 2>>* im_info) {
+bool YOLOv7Preprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
   // Record the shape of image and the shape of preprocessed image
   (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
                                static_cast<float>(mat->Width())};
@@ -82,20 +83,22 @@ bool YOLOv7Preprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                 static_cast<float>(mat->Width())};
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
-bool YOLOv7Preprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
-                             std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
+bool YOLOv7Preprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
   if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
     return false;
   }
   ims_info->resize(images->size());
   outputs->resize(1);
   // Concat all the preprocessed data to a batch tensor
-  std::vector<FDTensor> tensors(images->size()); 
+  std::vector<FDTensor> tensors(images->size());
   for (size_t i = 0; i < images->size(); ++i) {
     if (!Preprocess(&(*images)[i], &tensors[i], &(*ims_info)[i])) {
       FDERROR << "Failed to preprocess input image." << std::endl;
diff --git a/fastdeploy/vision/detection/contrib/yolov7end2end_ort.cc b/fastdeploy/vision/detection/contrib/yolov7end2end_ort.cc
old mode 100755
new mode 100644
index daf4ee66b..af7ff0e5c
--- a/fastdeploy/vision/detection/contrib/yolov7end2end_ort.cc
+++ b/fastdeploy/vision/detection/contrib/yolov7end2end_ort.cc
@@ -137,7 +137,7 @@ bool YOLOv7End2EndORT::Preprocess(
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
@@ -235,7 +235,8 @@ bool YOLOv7End2EndORT::Predict(cv::Mat* im, DetectionResult* result,
     return false;
   }
 
-  if (!Postprocess(reused_output_tensors_[0], result, im_info, conf_threshold)) {
+  if (!Postprocess(reused_output_tensors_[0], result, im_info,
+                   conf_threshold)) {
     FDERROR << "Failed to post process." << std::endl;
     return false;
   }
diff --git a/fastdeploy/vision/detection/contrib/yolov7end2end_trt.cc b/fastdeploy/vision/detection/contrib/yolov7end2end_trt.cc
index 49961df65..e969771a2 100644
--- a/fastdeploy/vision/detection/contrib/yolov7end2end_trt.cc
+++ b/fastdeploy/vision/detection/contrib/yolov7end2end_trt.cc
@@ -169,7 +169,7 @@ bool YOLOv7End2EndTRT::Preprocess(
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
@@ -227,7 +227,7 @@ bool YOLOv7End2EndTRT::CudaPreprocess(
   output->SetExternalData({mat->Channels(), size[0], size[1]}, FDDataType::FP32,
                           input_tensor_cuda_buffer_device_);
   output->device = Device::GPU;
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 #else
   FDERROR << "CUDA src code was not enabled." << std::endl;
diff --git a/fastdeploy/vision/detection/contrib/yolov8/preprocessor.cc b/fastdeploy/vision/detection/contrib/yolov8/preprocessor.cc
index 1c6d9f62c..ebb8b28cd 100644
--- a/fastdeploy/vision/detection/contrib/yolov8/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/yolov8/preprocessor.cc
@@ -83,7 +83,7 @@ bool YOLOv8Preprocessor::Preprocess(
                                 static_cast<float>(mat->Width())};
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/detection/contrib/yolox.cc b/fastdeploy/vision/detection/contrib/yolox.cc
index c1c071826..e7d931c42 100755
--- a/fastdeploy/vision/detection/contrib/yolox.cc
+++ b/fastdeploy/vision/detection/contrib/yolox.cc
@@ -129,7 +129,7 @@ bool YOLOX::Preprocess(Mat* mat, FDTensor* output,
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/facealign/contrib/face_landmark_1000.cc b/fastdeploy/vision/facealign/contrib/face_landmark_1000.cc
index f7b689575..0b914fb05 100644
--- a/fastdeploy/vision/facealign/contrib/face_landmark_1000.cc
+++ b/fastdeploy/vision/facealign/contrib/face_landmark_1000.cc
@@ -70,7 +70,7 @@ bool FaceLandmark1000::Preprocess(
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/facealign/contrib/pfld.cc b/fastdeploy/vision/facealign/contrib/pfld.cc
index 5978f10b7..d57427090 100644
--- a/fastdeploy/vision/facealign/contrib/pfld.cc
+++ b/fastdeploy/vision/facealign/contrib/pfld.cc
@@ -22,13 +22,12 @@ namespace vision {
 
 namespace facealign {
 
-PFLD::PFLD(const std::string& model_file,
-           const std::string& params_file,
+PFLD::PFLD(const std::string& model_file, const std::string& params_file,
            const RuntimeOption& custom_option,
            const ModelFormat& model_format) {
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; 
-    valid_gpu_backends = {Backend::ORT, Backend::TRT}; 
+    valid_cpu_backends = {Backend::OPENVINO, Backend::ORT};
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
     valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
     valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
@@ -71,12 +70,13 @@ bool PFLD::Preprocess(Mat* mat, FDTensor* output,
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
-bool PFLD::Postprocess(FDTensor& infer_result, FaceAlignmentResult* result,
-                       const std::map<std::string, std::array<int, 2>>& im_info) {
+bool PFLD::Postprocess(
+    FDTensor& infer_result, FaceAlignmentResult* result,
+    const std::map<std::string, std::array<int, 2>>& im_info) {
   FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
   if (infer_result.dtype != FDDataType::FP32) {
     FDERROR << "Only support post process with float32 data." << std::endl;
@@ -84,8 +84,7 @@ bool PFLD::Postprocess(FDTensor& infer_result, FaceAlignmentResult* result,
   }
 
   auto iter_in = im_info.find("input_shape");
-  FDASSERT(iter_in != im_info.end(),
-           "Cannot find input_shape from im_info.");
+  FDASSERT(iter_in != im_info.end(), "Cannot find input_shape from im_info.");
   int in_h = iter_in->second[0];
   int in_w = iter_in->second[1];
 
@@ -97,8 +96,7 @@ bool PFLD::Postprocess(FDTensor& infer_result, FaceAlignmentResult* result,
     x = std::min(std::max(0.f, x), 1.0f);
     y = std::min(std::max(0.f, y), 1.0f);
     // decode landmarks (default 106 landmarks)
-    result->landmarks.emplace_back(
-        std::array<float, 2>{x * in_w, y * in_h});
+    result->landmarks.emplace_back(std::array<float, 2>{x * in_w, y * in_h});
   }
 
   return true;
diff --git a/fastdeploy/vision/facealign/contrib/pipnet.cc b/fastdeploy/vision/facealign/contrib/pipnet.cc
index 27ec35c0d..3af16fa91 100644
--- a/fastdeploy/vision/facealign/contrib/pipnet.cc
+++ b/fastdeploy/vision/facealign/contrib/pipnet.cc
@@ -632,7 +632,7 @@ bool PIPNet::Preprocess(Mat* mat, FDTensor* output,
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/facedet/contrib/retinaface.cc b/fastdeploy/vision/facedet/contrib/retinaface.cc
index 6f38f5636..cd5f93ab9 100644
--- a/fastdeploy/vision/facedet/contrib/retinaface.cc
+++ b/fastdeploy/vision/facedet/contrib/retinaface.cc
@@ -81,8 +81,8 @@ RetinaFace::RetinaFace(const std::string& model_file,
                        const RuntimeOption& custom_option,
                        const ModelFormat& model_format) {
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::ORT};  
-    valid_gpu_backends = {Backend::ORT, Backend::TRT};  
+    valid_cpu_backends = {Backend::ORT};
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
     valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
     valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
@@ -145,7 +145,7 @@ bool RetinaFace::Preprocess(
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/facedet/contrib/ultraface.cc b/fastdeploy/vision/facedet/contrib/ultraface.cc
index e7dd99dc4..cf398b2e4 100644
--- a/fastdeploy/vision/facedet/contrib/ultraface.cc
+++ b/fastdeploy/vision/facedet/contrib/ultraface.cc
@@ -27,7 +27,7 @@ UltraFace::UltraFace(const std::string& model_file,
                      const RuntimeOption& custom_option,
                      const ModelFormat& model_format) {
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::ORT};  
+    valid_cpu_backends = {Backend::ORT};
     valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
     valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
@@ -90,7 +90,7 @@ bool UltraFace::Preprocess(
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/facedet/contrib/yolov5face.cc b/fastdeploy/vision/facedet/contrib/yolov5face.cc
index d508e905a..3fb309bea 100644
--- a/fastdeploy/vision/facedet/contrib/yolov5face.cc
+++ b/fastdeploy/vision/facedet/contrib/yolov5face.cc
@@ -64,8 +64,8 @@ YOLOv5Face::YOLOv5Face(const std::string& model_file,
                        const RuntimeOption& custom_option,
                        const ModelFormat& model_format) {
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::ORT}; 
-    valid_gpu_backends = {Backend::ORT, Backend::TRT}; 
+    valid_cpu_backends = {Backend::ORT};
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
     valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE};
     valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
@@ -115,11 +115,11 @@ bool YOLOv5Face::Preprocess(
   // process after image load
   float ratio = std::min(size[1] * 1.0f / static_cast<float>(mat->Height()),
                          size[0] * 1.0f / static_cast<float>(mat->Width()));
-#ifndef __ANDROID__     
-  // Because of the low CPU performance on the Android device, 
-  // we decided to hide this extra resize. It won't make much 
+#ifndef __ANDROID__
+  // Because of the low CPU performance on the Android device,
+  // we decided to hide this extra resize. It won't make much
   // difference to the final result.
-  if (std::fabs(ratio - 1.0f) > 1e-06) {  
+  if (std::fabs(ratio - 1.0f) > 1e-06) {
     int interp = cv::INTER_LINEAR;
     if (ratio > 1.0) {
       interp = cv::INTER_LINEAR;
@@ -128,7 +128,7 @@ bool YOLOv5Face::Preprocess(
     int resize_w = int(round(static_cast<float>(mat->Width()) * ratio));
     Resize::Run(mat, resize_w, resize_h, -1, -1, interp);
   }
-#endif  
+#endif
   // yolov5face's preprocess steps
   // 1. letterbox
   // 2. BGR->RGB
@@ -149,9 +149,9 @@ bool YOLOv5Face::Preprocess(
 
   HWC2CHW::Run(mat);
   Cast::Run(mat, "float");
-  
+
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/facedet/contrib/yolov7face/preprocessor.cc b/fastdeploy/vision/facedet/contrib/yolov7face/preprocessor.cc
index ad5dd7e33..7af63f585 100644
--- a/fastdeploy/vision/facedet/contrib/yolov7face/preprocessor.cc
+++ b/fastdeploy/vision/facedet/contrib/yolov7face/preprocessor.cc
@@ -32,10 +32,12 @@ Yolov7FacePreprocessor::Yolov7FacePreprocessor() {
   max_wh_ = 7680.0;
 }
 
-bool Yolov7FacePreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
-                                 std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
+bool Yolov7FacePreprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
   if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
     return false;
   }
   ims_info->resize(images->size());
@@ -56,8 +58,9 @@ bool Yolov7FacePreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTenso
   return true;
 }
 
-bool Yolov7FacePreprocessor::Preprocess(FDMat* mat, FDTensor* output,
-                                        std::map<std::string, std::array<float, 2>>* im_info){
+bool Yolov7FacePreprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
   // Record the shape of image and the shape of preprocessed image
   (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
                                static_cast<float>(mat->Width())};
@@ -75,13 +78,13 @@ bool Yolov7FacePreprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                 static_cast<float>(mat->Width())};
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
 void Yolov7FacePreprocessor::LetterBox(FDMat* mat) {
   float scale =
-      std::min(size_[1] * 1.0 / mat->Height(), size_[0] * 1.0 / mat->Width()); 
+      std::min(size_[1] * 1.0 / mat->Height(), size_[0] * 1.0 / mat->Width());
   if (!is_scale_up_) {
     scale = std::min(scale, 1.0f);
   }
diff --git a/fastdeploy/vision/faceid/contrib/adaface/preprocessor.cc b/fastdeploy/vision/faceid/contrib/adaface/preprocessor.cc
old mode 100755
new mode 100644
index 8e8f95950..cb0d90310
--- a/fastdeploy/vision/faceid/contrib/adaface/preprocessor.cc
+++ b/fastdeploy/vision/faceid/contrib/adaface/preprocessor.cc
@@ -26,8 +26,7 @@ AdaFacePreprocessor::AdaFacePreprocessor() {
   permute_ = true;
 }
 
-bool AdaFacePreprocessor::Preprocess(FDMat * mat, FDTensor* output) {
-
+bool AdaFacePreprocessor::Preprocess(FDMat* mat, FDTensor* output) {
   // face recognition model's preprocess steps in insightface
   // reference: insightface/recognition/arcface_torch/inference.py
   // 1. Resize
@@ -48,14 +47,15 @@ bool AdaFacePreprocessor::Preprocess(FDMat * mat, FDTensor* output) {
   Cast::Run(mat, "float");
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
 bool AdaFacePreprocessor::Run(std::vector<FDMat>* images,
                               std::vector<FDTensor>* outputs) {
   if (images->empty()) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
     return false;
   }
   FDASSERT(images->size() == 1, "Only support batch = 1 now.");
diff --git a/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc b/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc
index 398a7016e..e7f55cf65 100644
--- a/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc
+++ b/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc
@@ -50,7 +50,7 @@ bool InsightFaceRecognitionPreprocessor::Preprocess(FDMat* mat,
   }
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
diff --git a/fastdeploy/vision/headpose/contrib/fsanet.cc b/fastdeploy/vision/headpose/contrib/fsanet.cc
index 59f25ac5a..c22909134 100644
--- a/fastdeploy/vision/headpose/contrib/fsanet.cc
+++ b/fastdeploy/vision/headpose/contrib/fsanet.cc
@@ -22,13 +22,12 @@ namespace vision {
 
 namespace headpose {
 
-FSANet::FSANet(const std::string& model_file,
-               const std::string& params_file,
+FSANet::FSANet(const std::string& model_file, const std::string& params_file,
                const RuntimeOption& custom_option,
                const ModelFormat& model_format) {
   if (model_format == ModelFormat::ONNX) {
-    valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; 
-    valid_gpu_backends = {Backend::ORT, Backend::TRT}; 
+    valid_cpu_backends = {Backend::OPENVINO, Backend::ORT};
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};
   } else {
     valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
     valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
@@ -52,7 +51,7 @@ bool FSANet::Initialize() {
 }
 
 bool FSANet::Preprocess(Mat* mat, FDTensor* output,
-                      std::map<std::string, std::array<int, 2>>* im_info) {
+                        std::map<std::string, std::array<int, 2>>* im_info) {
   // Resize
   int resize_w = size[0];
   int resize_h = size[1];
@@ -62,7 +61,8 @@ bool FSANet::Preprocess(Mat* mat, FDTensor* output,
 
   // Normalize
   std::vector<float> alpha = {1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f};
-  std::vector<float> beta = {-127.5f / 128.0f, -127.5f / 128.0f, -127.5f / 128.0f};
+  std::vector<float> beta = {-127.5f / 128.0f, -127.5f / 128.0f,
+                             -127.5f / 128.0f};
   Convert::Run(mat, alpha, beta);
 
   // Record output shape of preprocessed image
@@ -72,12 +72,13 @@ bool FSANet::Preprocess(Mat* mat, FDTensor* output,
   Cast::Run(mat, "float");
 
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
-bool FSANet::Postprocess(FDTensor& infer_result, HeadPoseResult* result,
-                       const std::map<std::string, std::array<int, 2>>& im_info) {
+bool FSANet::Postprocess(
+    FDTensor& infer_result, HeadPoseResult* result,
+    const std::map<std::string, std::array<int, 2>>& im_info) {
   FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
   if (infer_result.dtype != FDDataType::FP32) {
     FDERROR << "Only support post process with float32 data." << std::endl;
@@ -85,8 +86,7 @@ bool FSANet::Postprocess(FDTensor& infer_result, HeadPoseResult* result,
   }
 
   auto iter_in = im_info.find("input_shape");
-  FDASSERT(iter_in != im_info.end(),
-           "Cannot find input_shape from im_info.");
+  FDASSERT(iter_in != im_info.end(), "Cannot find input_shape from im_info.");
   int in_h = iter_in->second[0];
   int in_w = iter_in->second[1];
 
diff --git a/fastdeploy/vision/matting/contrib/modnet.cc b/fastdeploy/vision/matting/contrib/modnet.cc
index c3a89733d..05141a926 100644
--- a/fastdeploy/vision/matting/contrib/modnet.cc
+++ b/fastdeploy/vision/matting/contrib/modnet.cc
@@ -77,7 +77,7 @@ bool MODNet::Preprocess(Mat* mat, FDTensor* output,
   Cast::Run(mat, "float");
 
   mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
   return true;
 }
 
@@ -106,8 +106,8 @@ bool MODNet::Postprocess(
   float* alpha_ptr = static_cast<float*>(alpha_tensor.Data());
   // cv::Mat alpha_zero_copy_ref(out_h, out_w, CV_32FC1, alpha_ptr);
   // Mat alpha_resized(alpha_zero_copy_ref);  // ref-only, zero copy.
-  Mat alpha_resized = Mat::Create(out_h, out_w, 1, FDDataType::FP32, 
-                                  alpha_ptr); // ref-only, zero copy.
+  Mat alpha_resized = Mat::Create(out_h, out_w, 1, FDDataType::FP32,
+                                  alpha_ptr);  // ref-only, zero copy.
   if ((out_h != ipt_h) || (out_w != ipt_w)) {
     Resize::Run(&alpha_resized, ipt_w, ipt_h, -1, -1);
   }
diff --git a/fastdeploy/vision/matting/contrib/rvm.cc b/fastdeploy/vision/matting/contrib/rvm.cc
old mode 100755
new mode 100644
index 258205cf8..2b16aab83
--- a/fastdeploy/vision/matting/contrib/rvm.cc
+++ b/fastdeploy/vision/matting/contrib/rvm.cc
@@ -74,7 +74,7 @@ bool RobustVideoMatting::Preprocess(
   (*im_info)["output_shape"] = {mat->Height(), mat->Width()};
 
   mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
   return true;
 }
 
@@ -118,16 +118,16 @@ bool RobustVideoMatting::Postprocess(
 
   // for alpha
   float* alpha_ptr = static_cast<float*>(alpha.Data());
-  Mat alpha_resized = Mat::Create(out_h, out_w, 1, FDDataType::FP32, 
-                                  alpha_ptr); // ref-only, zero copy.
+  Mat alpha_resized = Mat::Create(out_h, out_w, 1, FDDataType::FP32,
+                                  alpha_ptr);  // ref-only, zero copy.
   if ((out_h != in_h) || (out_w != in_w)) {
     Resize::Run(&alpha_resized, in_w, in_h, -1, -1);
   }
 
   // for foreground
   float* fgr_ptr = static_cast<float*>(fgr.Data());
-  Mat fgr_resized = Mat::Create(out_h, out_w, 1, FDDataType::FP32, 
-                                fgr_ptr); // ref-only, zero copy.
+  Mat fgr_resized = Mat::Create(out_h, out_w, 1, FDDataType::FP32,
+                                fgr_ptr);  // ref-only, zero copy.
   if ((out_h != in_h) || (out_w != in_w)) {
     Resize::Run(&fgr_resized, in_w, in_h, -1, -1);
   }
diff --git a/fastdeploy/vision/utils/sort_det_res.cc b/fastdeploy/vision/utils/sort_det_res.cc
old mode 100644
new mode 100755
index dd33478a3..d0813a260
--- a/fastdeploy/vision/utils/sort_det_res.cc
+++ b/fastdeploy/vision/utils/sort_det_res.cc
@@ -77,27 +77,42 @@ void SortDetectionResult(DetectionResult* result) {
   MergeSort(result, low, high);
 }
 
-bool LexSortByXYCompare(const std::array<float, 4>& box_a,
-                        const std::array<float, 4>& box_b) {
+template <typename T>
+bool LexSortByXYCompare(const std::array<T, 4>& box_a,
+                        const std::array<T, 4>& box_b) {
   // WARN: The status shoule be false if (a==b).
   // https://blog.csdn.net/xxxwrq/article/details/83080640
-  auto is_equal = [](const float& a, const float& b) -> bool {
+  auto is_equal = [](const T& a, const T& b) -> bool {
     return std::abs(a - b) < 1e-6f;
   };
-  const float& x0_a = box_a[0];
-  const float& y0_a = box_a[1];
-  const float& x0_b = box_b[0];
-  const float& y0_b = box_b[1];
+  const T& x0_a = box_a[0];
+  const T& y0_a = box_a[1];
+  const T& x0_b = box_b[0];
+  const T& y0_b = box_b[1];
   if (is_equal(x0_a, x0_b)) {
     return is_equal(y0_a, y0_b) ? false : y0_a > y0_b;
   }
   return x0_a > x0_b;
 }
 
+// Specialization for int boxes: compares exactly, without a float tolerance.
+template <>
+bool LexSortByXYCompare(const std::array<int, 4>& box_a,
+                        const std::array<int, 4>& box_b) {
+  const int& x0_a = box_a[0];
+  const int& y0_a = box_a[1];
+  const int& x0_b = box_b[0];
+  const int& y0_b = box_b[1];
+  if (x0_a == x0_b) {  // same x: decide on y
+    return y0_a == y0_b ? false : y0_a > y0_b;  // false when keys are equal
+  }
+  return x0_a > x0_b;  // true when box_a has the larger x
+}
+
 void ReorderDetectionResultByIndices(DetectionResult* result,
                                      const std::vector<size_t>& indices) {
   // reorder boxes, scores, label_ids, masks
-  DetectionResult backup = (*result);  // move
+  DetectionResult backup = (*result);
   const bool contain_masks = backup.contain_masks;
   const int boxes_num = backup.boxes.size();
   result->Clear();
@@ -122,7 +137,7 @@ void ReorderDetectionResultByIndices(DetectionResult* result,
 }
 
 void LexSortDetectionResultByXY(DetectionResult* result) {
-  if (result->boxes.size() == 0) {
+  if (result->boxes.empty()) {
     return;
   }
   std::vector<size_t> indices;
@@ -138,6 +153,35 @@ void LexSortDetectionResultByXY(DetectionResult* result) {
   ReorderDetectionResultByIndices(result, indices);
 }
 
+void LexSortOCRDetResultByXY(std::vector<std::array<int, 8>>* result) {
+  if (result->empty()) {  // nothing to sort
+    return;
+  }
+  std::vector<size_t> indices;
+  indices.resize(result->size());
+  std::vector<std::array<int, 4>> boxes;
+  boxes.resize(result->size());
+  for (size_t i = 0; i < result->size(); ++i) {
+    indices[i] = i;
+    // Keep elements 0,1 and 6,7 of each entry as its 4-int sort key
+    boxes[i] = {(*result)[i][0], (*result)[i][1], (*result)[i][6],
+                (*result)[i][7]};
+  }
+  // Lex sort the indices by key x (element 0), then y (element 1)
+  std::sort(indices.begin(), indices.end(), [&boxes](size_t a, size_t b) {
+    return LexSortByXYCompare(boxes[a], boxes[b]);
+  });
+  // Snapshot the input so *result can be rewritten in sorted order
+  std::vector<std::array<int, 8>> backup = (*result);
+  const int boxes_num = backup.size();
+  result->clear();
+  result->resize(boxes_num);
+  // Output entry i is the input entry at indices[i]
+  for (int i = 0; i < boxes_num; ++i) {
+    (*result)[i] = backup[indices[i]];
+  }
+}
+
 }  // namespace utils
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/utils/utils.h b/fastdeploy/vision/utils/utils.h
old mode 100644
new mode 100755
index bca781973..1f8f21b48
--- a/fastdeploy/vision/utils/utils.h
+++ b/fastdeploy/vision/utils/utils.h
@@ -67,8 +67,11 @@ void NMS(FaceDetectionResult* result, float iou_threshold = 0.5);
 /// Sort DetectionResult/FaceDetectionResult by score
 FASTDEPLOY_DECL void SortDetectionResult(DetectionResult* result);
 FASTDEPLOY_DECL void SortDetectionResult(FaceDetectionResult* result);
-/// Lex Sort DetectionResult/FaceDetectionResult by x(w) & y(h) axis
+/// Lex Sort DetectionResult by x(w) & y(h) axis
 FASTDEPLOY_DECL void LexSortDetectionResultByXY(DetectionResult* result);
+/// Lex sort OCRDet results (arrays of 8 ints) in place by x(w) & y(h) axis
+FASTDEPLOY_DECL void LexSortOCRDetResultByXY(
+                     std::vector<std::array<int, 8>>* result);
 
 /// L2 Norm / cosine similarity  (for face recognition, ...)
 FASTDEPLOY_DECL std::vector<float>