Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-11 19:40:25 +08:00)
Merge branch 'develop' of https://github.com/felixhjh/FastDeploy into develop
.github/ISSUE_TEMPLATE/报告issue.md (vendored, 4 changes)
@@ -19,3 +19,7 @@ assignees: ''
- Performance issues: describe clearly how the comparison was made
  - For performance tests, run the loop N times and average only the last 80% of the timings (right after the model starts it runs slower, because resource allocation is still in progress)
  - FastDeploy's Predict time includes data pre-/post-processing in addition to the model itself
- Model deployment errors
  - First run the deployment demos under `examples`, including the models provided there, to confirm they execute correctly
  - If the code under `examples` runs, but your own model or your own code does not
    - Provide the way you use your code, or your own model, so that engineers can quickly locate the problem
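To illustrate the timing guidance above, here is a minimal Python sketch; the helper name, the fixed 80% ratio, and the `predict_fn` callable are illustrative assumptions rather than part of the template:

```python
import time
import numpy as np

def average_latency_ms(predict_fn, data, iters=100, keep_ratio=0.8):
    """Run predict_fn iters times and average only the last keep_ratio of the timings."""
    times = []
    for _ in range(iters):
        start = time.time()
        predict_fn(data)                      # hypothetical predict callable
        times.append(time.time() - start)
    warmup = int(iters * (1.0 - keep_ratio))  # drop the slow start-up iterations
    return float(np.mean(times[warmup:])) * 1000.0
```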
.github/PULL_REQUEST_TEMPLATE.md (vendored, 2 changes)
@@ -4,7 +4,7 @@
### PR types(PR类型)
<!-- One of PR types [ Model | Backend | Serving | Quantization | Doc | Bug Fix | Other] -->

### Describe
### Description
<!-- Describe what this PR does -->
@@ -37,6 +37,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/utils.cmake)
if(NOT MSVC)
  set(CMAKE_CXX_STANDARD 11)
  set(CMAKE_CXX_FLAGS "-Wno-format")
  add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
endif(NOT MSVC)

if(UNIX AND (NOT APPLE) AND (NOT ANDROID) AND (NOT ENABLE_TIMVX))
@@ -425,13 +426,6 @@ if(ENABLE_VISION)
  endif()
endif()

if(ANDROID OR IOS)
  if(ENABLE_TEXT)
    set(ENABLE_TEXT OFF CACHE BOOL "Force ENABLE_TEXT OFF" FORCE)
    message(STATUS "Found Android or IOS, force ENABLE_TEXT OFF. We do not support fast_tokenizer with Android/IOS now.")
  endif()
endif()

if(ENABLE_TEXT)
  add_definitions(-DENABLE_TEXT)
  list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_TEXT_SRCS})
@@ -488,6 +482,8 @@ set_target_properties(${LIBRARY_NAME} PROPERTIES VERSION ${FASTDEPLOY_VERSION})
if(MSVC)
  # disable warnings for dll export
  target_compile_options(${LIBRARY_NAME} PRIVATE "$<$<BUILD_INTERFACE:$<COMPILE_LANGUAGE:CXX>>:/wd4251>$<$<BUILD_INTERFACE:$<COMPILE_LANGUAGE:CUDA>>:-Xcompiler=/wd4251>")
  file(GLOB FD_FILES_REQUIRE_BIGOBJ ${CSRCS_DIR_NAME}/fastdeploy/function/reduce.cc)
  set_source_files_properties(${FD_FILES_REQUIRE_BIGOBJ} PROPERTIES COMPILE_FLAGS "/bigobj")
endif()

# extra depend libs for android
@@ -506,8 +502,12 @@ target_link_libraries(${LIBRARY_NAME} ${DEPEND_LIBS})

if(WIN32)
  if(ENABLE_VISION)
    add_custom_target(copy_yaml_library ALL COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/third_party/yaml-cpp/Release ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/yaml-cpp/lib DEPENDS ${LIBRARY_NAME})
    add_custom_target(copy_yaml_include ALL COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/third_party/yaml-cpp/include ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/yaml-cpp/include DEPENDS ${LIBRARY_NAME})
    if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
      add_custom_target(copy_yaml_library ALL COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/third_party/yaml-cpp ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/yaml-cpp/lib DEPENDS ${LIBRARY_NAME})
    else()
      add_custom_target(copy_yaml_library ALL COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/third_party/yaml-cpp/Release ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/yaml-cpp/lib DEPENDS ${LIBRARY_NAME})
      add_custom_target(copy_yaml_include ALL COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/third_party/yaml-cpp/include ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/yaml-cpp/include DEPENDS ${LIBRARY_NAME})
    endif()
  endif()
endif()
@@ -35,9 +35,12 @@ list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/include)
# Note(zhoushunjie): include some useful utils function
include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake)

if(NOT CMAKE_CXX_STANDARD)
# Set C++11 as standard for the whole project
if(NOT MSVC)
  set(CMAKE_CXX_STANDARD 11)
endif()
  set(CMAKE_CXX_FLAGS "-Wno-format")
  add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
endif(NOT MSVC)

if(ANDROID)
  add_library(fastdeploy STATIC IMPORTED GLOBAL)
@@ -210,11 +213,18 @@ endif()

if (ENABLE_TEXT)
  if(ANDROID)
    message(FATAL_ERROR "Not support fastdeploy text APIs with Android now!")
    if(NOT ANDROID_TOOLCHAIN MATCHES "clang")
      message(FATAL_ERROR "Currently, only support clang toolchain while cross compiling FastDeploy for Android with FastTokenizer, but found ${ANDROID_TOOLCHAIN}.")
    endif()
    add_library(core_tokenizers STATIC IMPORTED GLOBAL)
    set_property(TARGET core_tokenizers PROPERTY IMPORTED_LOCATION
      ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/lib/${ANDROID_ABI}/libcore_tokenizers.so)
    list(APPEND FASTDEPLOY_LIBS core_tokenizers)
  else()
    # Add dependency libs later: Linux/Mac/Win/...
    find_library(FAST_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/lib NO_DEFAULT_PATH)
    list(APPEND FASTDEPLOY_LIBS ${FAST_TOKENIZER_LIB})
  endif()
  # Add dependency libs later
  find_library(FAST_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/lib NO_DEFAULT_PATH)
  list(APPEND FASTDEPLOY_LIBS ${FAST_TOKENIZER_LIB})
  list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/include)
  list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/third_party/include)
endif()
README_CN.md (Normal file → Executable file, 137 changes)
@@ -35,87 +35,19 @@
|
||||
| [**Pose Estimation**](examples/vision/keypointdetection) | [**Behavior Recognition**](https://github.com/PaddlePaddle/FastDeploy/issues/6) | [**NLP**](examples/text) | [**Speech**](examples/audio/pp-tts) |
|
||||
| <img src='https://user-images.githubusercontent.com/54695910/188054671-394db8dd-537c-42b1-9d90-468d7ad1530e.gif' height="126px" width="190px"> | <img src='https://user-images.githubusercontent.com/48054808/173034825-623e4f78-22a5-4f14-9b83-dc47aa868478.gif' height="126px" width="190px"> | <img src='https://user-images.githubusercontent.com/54695910/200162475-f5d85d70-18fb-4930-8e7e-9ca065c1d618.gif' height="126px" width="190px"> | <p align="left">**input** :早上好今天是2020<br>/10/29,最低温度是-3°C。<br><br> <p align="left">**output**: [<img src="https://user-images.githubusercontent.com/54695910/200161645-871e08da-5a31-4736-879c-a88bb171a676.png" width="170" style="max-width: 100%;">](https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/001.wav)</p> |
|
||||
|
||||
## Recent Updates

- 🔥 [**[Replay of the three-day deployment live course]**](https://aistudio.baidu.com/aistudio/course/introduce/27800)
## **Community**

- **Community**
* **Slack**: Join our [Slack community](https://join.slack.com/t/fastdeployworkspace/shared_invite/zt-1jznah134-3rxY~ytRb8rcPqkn9g~PDg) and chat with other community members about ideas

- **Slack**: Join our [Slack community](https://join.slack.com/t/fastdeployworkspace/shared_invite/zt-1hhvpb279-iw2pNPwrDaMBQ5OQhO3Siw) and chat with other community members about ideas
* **WeChat**: scan the QR code and fill out the questionnaire to join the technical community, and discuss deployment pain points and solutions with community developers

- **WeChat**: scan the QR code and fill out the questionnaire to join the technical community, and discuss deployment pain points and solutions with community developers

<div align="center">
<img src="https://user-images.githubusercontent.com/54695910/200145290-d5565d18-6707-4a0b-a9af-85fd36d35d13.jpg" width = "120" height = "120" />
</div>

- 🔥 **2022.11.23: Release FastDeploy [release v0.8.0](https://github.com/PaddlePaddle/FastDeploy/tree/release/0.8)**

- **🖥️ Server-side deployment: more models supported, inference performance further improved**

  - Added deployment support for the PIPNet and FaceLandmark1000 [face alignment models](./examples/vision/facealign);
  - Added deployment examples for the [video super-resolution models](./examples/vision/sr) PP-MSVSR, EDVR, and BasicVSR;
  - Upgraded the [YOLOv7 deployment code](https://github.com/PaddlePaddle/FastDeploy/pull/611) to support predict and batch_predict;
  - Added a [UIE serving deployment](./examples/text/uie) example;
  - [Experimental] Added an OpenVINO backend device setting that supports integrated/discrete GPUs;

- **📲 Mobile and edge deployment: more models supported**

  - Added Android APK projects and examples for image classification, object detection, semantic segmentation, OCR, and face detection.
|
||||
| <font size=3>图像分类</font> | <font size=3>目标检测</font> | <font size=3>语义分割</font> | <font size=3>文字识别</font> | <font size=3>人脸检测</font> |
|
||||
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
| <font size=2>[工程代码](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/classification)</font> | <font size=2>[工程代码](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/detection)</font> | <font size=2>[工程代码](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/segmentation) | <font size=2>[工程代码](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr)</font> | <font size=2>[工程代码](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/facedet)</font> |
|
||||
| <font size=2>[扫码或点击链接<br>安装试用](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-MobileNetV1.apk)</font> | <font size=2>[扫码或点击链接<br>安装试用](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-PicoDet.apk)</font> | <font size=2>[扫码或点击链接<br>安装试用](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-Portrait-HumanSegV2-Lite.apk)</font> | <font size=2> [扫码或点击链接<br>安装试用](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-PP-OCRv2.apk)</font> | <font size=2> [扫码或点击链接<br>安装试用](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-SCRFD.apk)</font> |
|
||||
| <img src=https://user-images.githubusercontent.com/54695910/203604502-991972a8-5a9c-49cd-9e58-ed8e2a942b9b.png height="90" width="100"> | <img src=https://user-images.githubusercontent.com/54695910/203604475-724be708-27d6-4e56-9c2f-ae2eca24118c.png height="90" width="100"> | <img src=https://user-images.githubusercontent.com/54695910/203604459-9a2915bc-91dc-460c-bff6-a0e2584d2eff.png height="90" width="100"> | <img src=https://user-images.githubusercontent.com/54695910/203604453-6ce0118e-7b93-4044-8a92-56f2ab65c26a.png height="90" width="100"> | <img src=https://user-images.githubusercontent.com/54695910/203604418-7c9703b5-1805-457e-966c-5a6625f212ff.png height="90" width="100"> |
|
||||
|
||||
- [**more releases information**](./releases)
|
||||
<div align="center">
|
||||
<img src="https://user-images.githubusercontent.com/54695910/200145290-d5565d18-6707-4a0b-a9af-85fd36d35d13.jpg" width = "220" height = "220" />
|
||||
</div>
|
||||
|
||||
## Contents

* <details open> <summary><b>📖 Documentation tutorials (click to collapse)</b></summary><div>

  - Installation docs

    - [Download and install prebuilt libraries](docs/cn/build_and_install/download_prebuilt_libraries.md)
    - [Build and install the GPU deployment environment](docs/cn/build_and_install/gpu.md)
    - [Build and install the CPU deployment environment](docs/cn/build_and_install/cpu.md)
    - [Build and install the IPU deployment environment](docs/cn/build_and_install/ipu.md)
    - [Build and install the Jetson deployment environment](docs/cn/build_and_install/jetson.md)
    - [Build and install the Android deployment environment](docs/cn/build_and_install/android.md)

  - Quick start

    - [Python deployment example](docs/cn/quick_start/models/python.md)
    - [C++ deployment example](docs/cn/quick_start/models/cpp.md)
    - [Runtime Python example](docs/cn/quick_start/runtime/python.md)
    - [Runtime C++ example](docs/cn/quick_start/runtime/cpp.md)

  - API documentation (in progress)

    - [Python API documentation](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/python/html/)
    - [C++ API documentation](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/cpp/html/)

  - Performance tuning

    - [Quantization acceleration](docs/cn/quantize.md)

  - FAQ

    - [1. How to configure the inference backend for model deployment](docs/cn/faq/how_to_change_backend.md)
    - [2. How to use the C++ SDK on Windows](docs/cn/faq/use_sdk_on_windows.md)
    - [3. How to use FastDeploy on Android](java/android/README.md)
    - [4. Tips for using TensorRT](docs/cn/faq/tensorrt_tricks.md)
    - [5. How to add a new model](docs/cn/faq/develop_a_new_model.md) (in progress)

  - More FastDeploy deployment modules

    - [Serving deployment](./serving)

    - [Benchmark testing](./benchmark)

</div></details>

* **🖥️ Server-side deployment**

  * [Python SDK quick start](#fastdeploy-quick-start-python)
@@ -124,24 +56,21 @@

* **📲 Mobile and edge deployment**

  * [Paddle Lite NPU deployment](#fastdeploy-edge-sdk-npu)
  * [Supported edge-side model list](#fastdeploy-edge-models)

* **🌐 Web and mini program deployment**

  * [Supported web model list](#fastdeploy-web-models)

* [**Community**](#fastdeploy-community)

* [**Acknowledge**](#fastdeploy-acknowledge)

* [**License**](#fastdeploy-license)
* [Acknowledge](#fastdeploy-acknowledge)
* [License](#fastdeploy-license)

## 🖥️ Server-side Deployment

<div id="fastdeploy-quick-start-python"></div>

<details close> <summary><b>Python SDK Quick Start (click to expand)</b></summary><div>
<details close>

<summary><b>Python SDK Quick Start (click to expand)</b></summary><div>

#### Quick Installation
@@ -180,11 +109,10 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000

* Test the inference result

```python
# For GPU/TensorRT deployment, refer to examples/vision/detection/paddledetection/python
import cv2
import fastdeploy.vision as vision
```
```python
# For GPU/TensorRT deployment, refer to examples/vision/detection/paddledetection/python
import cv2
import fastdeploy.vision as vision

model = vision.detection.PPYOLOE("ppyoloe_crn_l_300e_coco/model.pdmodel",
                                 "ppyoloe_crn_l_300e_coco/model.pdiparams",
@@ -197,11 +125,13 @@ vis_im = vision.vis_detection(im, result, score_threshold=0.5)
cv2.imwrite("vis_image.jpg", vis_im)

```

</div></details>
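For reference, switching the quick start above from CPU to GPU or TensorRT only requires passing a `RuntimeOption`. This is a minimal sketch assembled from option calls used by the benchmark scripts elsewhere in this commit; the exact combination shown is an assumption, not the only valid one:

```python
import fastdeploy as fd
import fastdeploy.vision as vision

option = fd.RuntimeOption()
option.use_gpu()          # run on the GPU
option.use_trt_backend()  # optional: switch to the TensorRT backend

model = vision.detection.PPYOLOE(
    "ppyoloe_crn_l_300e_coco/model.pdmodel",
    "ppyoloe_crn_l_300e_coco/model.pdiparams",
    "ppyoloe_crn_l_300e_coco/infer_cfg.yml",  # assumed config file name for this model
    runtime_option=option)
```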
|
||||
<div id="fastdeploy-quick-start-cpp"></div>
|
||||
|
||||
<details>
|
||||
<details close>
|
||||
|
||||
<summary><b>C++ SDK快速开始(点开查看详情)</b></summary><div>
|
||||
|
||||
|
||||
@@ -274,14 +204,14 @@ int main(int argc, char* argv[]) {
|
||||
| Classification | [PaddleClas/SqueeezeNetV1.1](./examples/vision/classification/paddleclas) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Classification | [PaddleClas/Inceptionv3](./examples/vision/classification/paddleclas) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Classification | [PaddleClas/PP-HGNet](./examples/vision/classification/paddleclas) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLOE](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PicoDet](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/YOLOX](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/YOLOv3](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLO](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLOv2](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/Faster-RCNN](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/Mask-RCNN](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLOE](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PicoDet](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/YOLOX](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/YOLOv3](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLO](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLOv2](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/Faster-RCNN](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/Mask-RCNN](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [Megvii-BaseDetection/YOLOX](./examples/vision/detection/yolox) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Detection | [WongKinYiu/YOLOv7](./examples/vision/detection/yolov7) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Detection | [WongKinYiu/YOLOv7end2end_trt](./examples/vision/detection/yolov7end2end_trt) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
@@ -318,14 +248,14 @@ int main(int argc, char* argv[]) {
|
||||
| Matting | [ZHKKKe/MODNet](./examples/vision/matting/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PeterL1n/RobustVideoMatting]() | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PaddleSeg/PP-Matting](./examples/vision/matting/ppmatting) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PaddleSeg/PP-HumanMatting](./examples/vision/matting/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PaddleSeg/PP-HumanMatting](./examples/vision/matting/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PaddleSeg/ModNet](./examples/vision/matting/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Video Super-Resolution | [PaddleGAN/BasicVSR](./) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Video Super-Resolution | [PaddleGAN/EDVR](./examples/vision/sr/edvr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Video Super-Resolution | [PaddleGAN/PP-MSVSR](./examples/vision/sr/ppmsvsr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Information Extraction | [PaddleNLP/UIE](./examples/text/uie) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | |
|
||||
| NLP | [PaddleNLP/ERNIE-3.0](./examples/text/ernie-3.0) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ | ❔ | ✅ |
|
||||
| Speech | [PaddleSpeech/PP-TTS](./examples/text/uie) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ | -- | ✅ |
|
||||
| Speech | [PaddleSpeech/PP-TTS](./examples/audio/pp-tts) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ | -- | ✅ |
|
||||
|
||||
|
||||
</div></details>
|
||||
@@ -344,7 +274,7 @@ int main(int argc, char* argv[]) {
|
||||
<img src="https://user-images.githubusercontent.com/54695910/198619323-c9b1cbce-1c1c-4f92-9737-4805c7c0ff2f.png" />
|
||||
</div>
|
||||
|
||||
| Task | Model | Size (MB) | Linux | Android | iOS | Linux | Linux | Linux | Linux | Updating... |
| Task | Model | Size (MB) | Linux | Android | Linux | Linux | Linux | Linux | Linux | Updating... |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| --- | --- | --- | ARM CPU | ARM CPU | Rockchip NPU<br>RK3568/RK3588 | Rockchip NPU<br>RV1109/RV1126/RK1808 | Amlogic NPU<br>A311D/S905D/C308X | NXP NPU<br>i.MX 8M Plus | Updating... | |
| Classification | [PaddleClas/ResNet50](examples/vision/classification/paddleclas) | 98 | ✅ | ✅ | ❔ | ✅ | | | |
|
||||
@@ -392,15 +322,6 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
</div></details>
|
||||
|
||||
<div id="fastdeploy-community"></div>
|
||||
|
||||
## Community

- **Join the community 👬:** scan the QR code with WeChat to join the **FastDeploy technical discussion group**

<div align="center">
|
||||
<img src="https://user-images.githubusercontent.com/54695910/200145290-d5565d18-6707-4a0b-a9af-85fd36d35d13.jpg" width = "225" height = "225" />
|
||||
</div>
|
||||
|
||||
<div id="fastdeploy-acknowledge"></div>
|
||||
|
||||
|
README_EN.md (Normal file → Executable file, 115 changes)
@@ -36,87 +36,19 @@ Including image classification, object detection, image segmentation, face detec
|
||||
| [**Pose Estimation**](examples/vision/keypointdetection) | [**Behavior Recognition**](https://github.com/PaddlePaddle/FastDeploy/issues/6) | [**NLP**](examples/text) | [**Speech**](examples/audio/pp-tts) |
|
||||
| <img src='https://user-images.githubusercontent.com/54695910/188054671-394db8dd-537c-42b1-9d90-468d7ad1530e.gif' height="126px" width="190px"> | <img src='https://user-images.githubusercontent.com/48054808/173034825-623e4f78-22a5-4f14-9b83-dc47aa868478.gif' height="126px" width="190px"> | <img src='https://user-images.githubusercontent.com/54695910/200162475-f5d85d70-18fb-4930-8e7e-9ca065c1d618.gif' height="126px" width="190px"> | <p align="left">**input**:Life was like a box<br> of chocolates, you never<br> know what you're <br>gonna get.<br> <p align="left">**output**: [<img src="https://user-images.githubusercontent.com/54695910/200161645-871e08da-5a31-4736-879c-a88bb171a676.png" width="150" style="max-width: 100%;">](https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav)</p> |
|
||||
|
||||
## 📣 Recent Updates
|
||||
## 👬 Community
|
||||
|
||||
- **Community**
|
||||
- **Slack**:Join our [Slack community](https://join.slack.com/t/fastdeployworkspace/shared_invite/zt-1jznah134-3rxY~ytRb8rcPqkn9g~PDg) and chat with other community members about ideas.
|
||||
|
||||
- **Slack**:Join our [Slack community](https://join.slack.com/t/fastdeployworkspace/shared_invite/zt-1hhvpb279-iw2pNPwrDaMBQ5OQhO3Siw) and chat with other community members about ideas.
|
||||
|
||||
- **WeChat**:Scan the QR code below using WeChat, follow the PaddlePaddle official account and fill out the questionnaire to join the WeChat group.
|
||||
- **WeChat**:Scan the QR code below using WeChat, follow the PaddlePaddle official account and fill out the questionnaire to join the WeChat group.
|
||||
|
||||
<div align="center">
|
||||
<img src="https://user-images.githubusercontent.com/54695910/200145290-d5565d18-6707-4a0b-a9af-85fd36d35d13.jpg" width = "100" height = "100" />
|
||||
<img src="https://user-images.githubusercontent.com/54695910/200145290-d5565d18-6707-4a0b-a9af-85fd36d35d13.jpg" width = "200" height = "200" />
|
||||
</div>
|
||||
|
||||
- 🔥 **2022.11.23:Release FastDeploy [release v0.8.0](https://github.com/PaddlePaddle/FastDeploy/tree/release/0.8.0)** <br>
|
||||
|
||||
- **🖥️ Server-side and Cloud Deployment: Support more CV models, improve deployment performance**
|
||||
|
||||
- Support [PIPNet](./examples/vision/facealign/pipnet), [FaceLandmark1000](./examples/vision/facealign/face_landmark_1000) face alignment models deployment;
|
||||
- Support [Video Super-Resolution](./examples/vision/sr) series model PP-MSVSR、EDVR、BasicVSR;
|
||||
- Upgrade YOLOv7 deployment code to add `batch_predict` deployment;
|
||||
- Support [UIE service-based](./examples/text/uie) deployment;
|
||||
- Add Python API to_dlpack interface for FDTensor to support copyless transfer of FDTensor between frameworks.
|
||||
|
||||
- **📱 Mobile and Edge Device Deployment: support more CV model**
|
||||
|
||||
- Support Android image classification, target detection, semantic segmentation, OCR, face detection APK projects and examples.
|
||||
|
||||
| <font size=3>Image Classification</font> | <font size=3>Object Detection</font> | <font size=3>Semantic Segmentation</font> | <font size=3>OCR</font> | <font size=3>Face Detection</font> |
|
||||
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
| <font size=2>[Project Code](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/classification)</font> | <font size=2>[Project Code](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/detection)</font> | <font size=2>[Project Code](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/segmentation) | <font size=2>[Project Code](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr)</font> | <font size=2>[Project Code](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/facedet)</font> |
|
||||
| <font size=2>[Scan the code<br>or click on the link<br>to install](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-MobileNetV1.apk)</font> | <font size=2>[Scan the code<br>or click on the link<br>to install](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-PicoDet.apk)</font> | <font size=2>[Scan the code<br>or click on the link<br>to install](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-Portrait-HumanSegV2-Lite.apk)</font> | <font size=2> [Scan the code<br>or click on the link<br>to install](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-PP-OCRv2.apk)</font> | <font size=2> [Scan the code<br>or click on the link<br>to install](https://bj.bcebos.com/fastdeploy/release/android/FastDeploy-SCRFD.apk)</font> |
|
||||
| <img src=https://user-images.githubusercontent.com/54695910/203604502-991972a8-5a9c-49cd-9e58-ed8e2a942b9b.png height="100" width="110"> | <img src=https://user-images.githubusercontent.com/54695910/203604475-724be708-27d6-4e56-9c2f-ae2eca24118c.png height="100" width="110"> | <img src=https://user-images.githubusercontent.com/54695910/203604459-9a2915bc-91dc-460c-bff6-a0e2584d2eff.png height="100" width="110"> | <img src=https://user-images.githubusercontent.com/54695910/203604453-6ce0118e-7b93-4044-8a92-56f2ab65c26a.png height="100" width="110"> | <img src=https://user-images.githubusercontent.com/54695910/203604418-7c9703b5-1805-457e-966c-5a6625f212ff.png height="100" width="110"> |
|
||||
|
||||
- [**more releases information**](./releases)
|
||||
|
||||
## Contents
|
||||
|
||||
* <details open><summary><b>📖 Tutorials(click to fold)</b></summary><div>
|
||||
|
||||
- Install
|
||||
|
||||
- [Install FastDeploy Prebuilt Libraries](docs/en/build_and_install/download_prebuilt_libraries.md)
|
||||
- [Build and Install FastDeploy Library on GPU Platform](docs/en/build_and_install/gpu.md)
|
||||
- [Build and Install FastDeploy Library on CPU Platform](docs/en/build_and_install/cpu.md)
|
||||
- [Build and Install FastDeploy Library on IPU Platform](docs/en/build_and_install/ipu.md)
|
||||
- [Build and Install FastDeploy Library on Nvidia Jetson Platform](docs/en/build_and_install/jetson.md)
|
||||
- [Build and Install FastDeploy Library on Android Platform](docs/en/build_and_install/android.md)
|
||||
|
||||
- A Quick Start - Demos
|
||||
|
||||
- [Python Deployment Demo](docs/en/quick_start/models/python.md)
|
||||
- [C++ Deployment Demo](docs/en/quick_start/models/cpp.md)
|
||||
- [A Quick Start on Runtime Python](docs/en/quick_start/runtime/python.md)
|
||||
- [A Quick Start on Runtime C++](docs/en/quick_start/runtime/cpp.md)
|
||||
|
||||
- API (To be continued)
|
||||
|
||||
- [Python API](https://baidu-paddle.github.io/fastdeploy-api/python/html/)
|
||||
- [C++ API](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/)
|
||||
|
||||
- Performance Optimization
|
||||
|
||||
- [Quantization Acceleration](docs/en/quantize.md)
|
||||
|
||||
- Frequent Q&As
|
||||
|
||||
- [1. How to Change Inference Backends](docs/en/faq/how_to_change_backend.md)
|
||||
- [2. How to Use FastDeploy C++ SDK on Windows Platform](docs/en/faq/use_sdk_on_windows.md)
|
||||
- [3. How to Use FastDeploy C++ SDK on Android Platform](java/android/README_EN.md)(To be Continued)
|
||||
- [4. Tricks of TensorRT](docs/en/faq/tensorrt_tricks.md)
|
||||
- [5. How to Develop a New Model](docs/en/faq/develop_a_new_model.md)(To be Continued)
|
||||
|
||||
- More FastDeploy Deployment Module
|
||||
|
||||
- [deployment AI Model as a Service](./serving)
|
||||
|
||||
- [Benchmark Testing](./benchmark)
|
||||
|
||||
</div></details>
|
||||
|
||||
* **🖥️ Server-side and Cloud Deployment**
|
||||
|
||||
* [A Quick Start for Python SDK](#fastdeploy-quick-start-python)
|
||||
* [A Quick Start for C++ SDK](#fastdeploy-quick-start-cpp)
|
||||
* [Supported Server-side and Cloud Model List](#fastdeploy-server-models)
|
||||
@@ -129,11 +61,9 @@ Including image classification, object detection, image segmentation, face detec
|
||||
|
||||
* [Supported Web and Mini Program Model List](#fastdeploy-web-models)
|
||||
|
||||
* [**Community**](#fastdeploy-community)
|
||||
* [Acknowledge](#fastdeploy-acknowledge)
|
||||
|
||||
* [**Acknowledge**](#fastdeploy-acknowledge)
|
||||
|
||||
* [**License**](#fastdeploy-license)
|
||||
* [License](#fastdeploy-license)
|
||||
|
||||
## 🖥️ Server-side and Cloud Deployment
|
||||
|
||||
@@ -272,15 +202,14 @@ Notes: ✅: already supported; ❔: to be supported in the future; N/A: Not Ava
|
||||
| Classification | [PaddleClas/SqueeezeNetV1.1](./examples/vision/classification/paddleclas) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Classification | [PaddleClas/Inceptionv3](./examples/vision/classification/paddleclas) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Classification | [PaddleClas/PP-HGNet](./examples/vision/classification/paddleclas) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Classification | [PaddleClas/SwinTransformer](./examples/vision/classification/paddleclas) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLOE](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PicoDet](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/YOLOX](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/YOLOv3](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLO](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLOv2](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/Faster-RCNN](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/Mask-RCNN](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLOE](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PicoDet](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/YOLOX](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/YOLOv3](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLO](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/PP-YOLOv2](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/Faster-RCNN](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [PaddleDetection/Mask-RCNN](./examples/vision/detection/paddledetection) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
| Detection | [Megvii-BaseDetection/YOLOX](./examples/vision/detection/yolox) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Detection | [WongKinYiu/YOLOv7](./examples/vision/detection/yolov7) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Detection | [WongKinYiu/YOLOv7end2end_trt](./examples/vision/detection/yolov7end2end_trt) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
@@ -317,14 +246,14 @@ Notes: ✅: already supported; ❔: to be supported in the future; N/A: Not Ava
|
||||
| Matting | [ZHKKKe/MODNet](./examples/vision/matting/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PeterL1n/RobustVideoMatting]() | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PaddleSeg/PP-Matting](./examples/vision/matting/ppmatting) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PaddleSeg/PP-HumanMatting](./examples/vision/matting/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PaddleSeg/PP-HumanMatting](./examples/vision/matting/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Matting | [PaddleSeg/ModNet](./examples/vision/matting/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ |
|
||||
| Video Super-Resolution | [PaddleGAN/BasicVSR](./) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Video Super-Resolution | [PaddleGAN/EDVR](./examples/vision/sr/edvr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Video Super-Resolution | [PaddleGAN/PP-MSVSR](./examples/vision/sr/ppmsvsr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | ❔ |
|
||||
| Information Extraction | [PaddleNLP/UIE](./examples/text/uie) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | ❔ | |
|
||||
| NLP | [PaddleNLP/ERNIE-3.0](./examples/text/ernie-3.0) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ | ❔ | ✅ |
|
||||
| Speech | [PaddleSpeech/PP-TTS](./examples/text/uie) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ | -- | ✅ |
|
||||
| Speech | [PaddleSpeech/PP-TTS](./examples/audio/pp-tts) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❔ | -- | ✅ |
|
||||
|
||||
</div></details>
|
||||
|
||||
@@ -390,18 +319,6 @@ Notes: ✅: already supported; ❔: to be supported in the future; N/A: Not Ava
|
||||
|
||||
</div></details>
|
||||
|
||||
## Community
|
||||
|
||||
<div id="fastdeploy-community"></div>
|
||||
|
||||
- If you have any question or suggestion, please give us your valuable input via GitHub Issues
|
||||
- **Join Us👬:**
|
||||
- **Slack**:Join our [Slack community](https://join.slack.com/t/fastdeployworkspace/shared_invite/zt-1hhvpb279-iw2pNPwrDaMBQ5OQhO3Siw) and chat with other community members about ideas
|
||||
- **WeChat**:join our **WeChat community** and chat with other community members about ideas
|
||||
|
||||
<div align="center">
|
||||
<img src="https://user-images.githubusercontent.com/54695910/200145290-d5565d18-6707-4a0b-a9af-85fd36d35d13.jpg" width = "225" height = "225" />
|
||||
</div>
|
||||
|
||||
## Acknowledge
|
||||
|
||||
|
@@ -75,6 +75,11 @@ def build_option(args):
|
||||
option.use_ort_backend()
|
||||
elif backend == "paddle":
|
||||
option.use_paddle_backend()
|
||||
elif backend == "ov":
|
||||
option.use_openvino_backend()
|
||||
option.set_openvino_device(name="GPU")
|
||||
# change name and shape for models
|
||||
option.set_openvino_shape_info({"x": [1, 3, 224, 224]})
|
||||
elif backend in ["trt", "paddle_trt"]:
|
||||
option.use_trt_backend()
|
||||
if backend == "paddle_trt":
|
||||
@@ -108,27 +113,109 @@ def build_option(args):
|
||||
return option
|
||||
|
||||
|
||||
def get_current_memory_mb(gpu_id=None):
|
||||
import pynvml
|
||||
import psutil
|
||||
pid = os.getpid()
|
||||
p = psutil.Process(pid)
|
||||
info = p.memory_full_info()
|
||||
cpu_mem = info.uss / 1024. / 1024.
|
||||
gpu_mem = 0
|
||||
if gpu_id is not None:
|
||||
pynvml.nvmlInit()
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
|
||||
meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
|
||||
gpu_mem = meminfo.used / 1024. / 1024.
|
||||
return cpu_mem, gpu_mem
|
||||
class StatBase(object):
|
||||
"""StatBase"""
|
||||
nvidia_smi_path = "nvidia-smi"
|
||||
gpu_keys = ('index', 'uuid', 'name', 'timestamp', 'memory.total',
|
||||
'memory.free', 'memory.used', 'utilization.gpu',
|
||||
'utilization.memory')
|
||||
nu_opt = ',nounits'
|
||||
cpu_keys = ('cpu.util', 'memory.util', 'memory.used')
|
||||
|
||||
|
||||
def get_current_gputil(gpu_id):
|
||||
import GPUtil
|
||||
GPUs = GPUtil.getGPUs()
|
||||
gpu_load = GPUs[gpu_id].load
|
||||
return gpu_load
|
||||
class Monitor(StatBase):
|
||||
"""Monitor"""
|
||||
|
||||
def __init__(self, use_gpu=False, gpu_id=0, interval=0.1):
|
||||
self.result = {}
|
||||
self.gpu_id = gpu_id
|
||||
self.use_gpu = use_gpu
|
||||
self.interval = interval
|
||||
self.cpu_stat_q = multiprocessing.Queue()
|
||||
|
||||
def start(self):
|
||||
cmd = '%s --id=%s --query-gpu=%s --format=csv,noheader%s -lms 50' % (
|
||||
StatBase.nvidia_smi_path, self.gpu_id, ','.join(StatBase.gpu_keys),
|
||||
StatBase.nu_opt)
|
||||
if self.use_gpu:
|
||||
self.gpu_stat_worker = subprocess.Popen(
|
||||
cmd,
|
||||
stderr=subprocess.STDOUT,
|
||||
stdout=subprocess.PIPE,
|
||||
shell=True,
|
||||
close_fds=True,
|
||||
preexec_fn=os.setsid)
|
||||
# cpu stat
|
||||
pid = os.getpid()
|
||||
self.cpu_stat_worker = multiprocessing.Process(
|
||||
target=self.cpu_stat_func,
|
||||
args=(self.cpu_stat_q, pid, self.interval))
|
||||
self.cpu_stat_worker.start()
|
||||
|
||||
def stop(self):
|
||||
try:
|
||||
if self.use_gpu:
|
||||
os.killpg(self.gpu_stat_worker.pid, signal.SIGUSR1)
|
||||
# os.killpg(p.pid, signal.SIGTERM)
|
||||
self.cpu_stat_worker.terminate()
|
||||
self.cpu_stat_worker.join(timeout=0.01)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return
|
||||
|
||||
# gpu
|
||||
if self.use_gpu:
|
||||
lines = self.gpu_stat_worker.stdout.readlines()
|
||||
lines = [
|
||||
line.strip().decode("utf-8") for line in lines
|
||||
if line.strip() != ''
|
||||
]
|
||||
gpu_info_list = [{
|
||||
k: v
|
||||
for k, v in zip(StatBase.gpu_keys, line.split(', '))
|
||||
} for line in lines]
|
||||
if len(gpu_info_list) == 0:
|
||||
return
|
||||
result = gpu_info_list[0]
|
||||
for item in gpu_info_list:
|
||||
for k in item.keys():
|
||||
if k not in ["name", "uuid", "timestamp"]:
|
||||
result[k] = max(int(result[k]), int(item[k]))
|
||||
else:
|
||||
result[k] = max(result[k], item[k])
|
||||
self.result['gpu'] = result
|
||||
|
||||
# cpu
|
||||
cpu_result = {}
|
||||
if self.cpu_stat_q.qsize() > 0:
|
||||
cpu_result = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
while not self.cpu_stat_q.empty():
|
||||
item = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
for k in StatBase.cpu_keys:
|
||||
cpu_result[k] = max(cpu_result[k], item[k])
|
||||
cpu_result['name'] = cpuinfo.get_cpu_info()['brand_raw']
|
||||
self.result['cpu'] = cpu_result
|
||||
|
||||
def output(self):
|
||||
return self.result
|
||||
|
||||
def cpu_stat_func(self, q, pid, interval=0.0):
|
||||
"""cpu stat function"""
|
||||
stat_info = psutil.Process(pid)
|
||||
while True:
|
||||
# pid = os.getpid()
|
||||
cpu_util, mem_util, mem_use = stat_info.cpu_percent(
|
||||
), stat_info.memory_percent(), round(stat_info.memory_info().rss /
|
||||
1024.0 / 1024.0, 4)
|
||||
q.put([cpu_util, mem_util, mem_use])
|
||||
time.sleep(interval)
|
||||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -141,6 +228,7 @@ if __name__ == '__main__':
|
||||
|
||||
gpu_id = args.device_id
|
||||
enable_collect_memory_info = args.enable_collect_memory_info
|
||||
dump_result = dict()
|
||||
end2end_statis = list()
|
||||
cpu_mem = list()
|
||||
gpu_mem = list()
|
||||
@@ -160,6 +248,16 @@ if __name__ == '__main__':
|
||||
try:
|
||||
model = fd.vision.classification.PaddleClasModel(
|
||||
model_file, params_file, config_file, runtime_option=option)
|
||||
if enable_collect_memory_info:
|
||||
import multiprocessing
|
||||
import subprocess
|
||||
import psutil
|
||||
import signal
|
||||
import cpuinfo
|
||||
enable_gpu = args.device == "gpu"
|
||||
monitor = Monitor(enable_gpu, gpu_id)
|
||||
monitor.start()
|
||||
|
||||
model.enable_record_time_of_runtime()
|
||||
im_ori = cv2.imread(args.image)
|
||||
for i in range(args.iter_num):
|
||||
@@ -167,31 +265,28 @@ if __name__ == '__main__':
|
||||
start = time.time()
|
||||
result = model.predict(im)
|
||||
end2end_statis.append(time.time() - start)
|
||||
if enable_collect_memory_info:
|
||||
gpu_util.append(get_current_gputil(gpu_id))
|
||||
cm, gm = get_current_memory_mb(gpu_id)
|
||||
cpu_mem.append(cm)
|
||||
gpu_mem.append(gm)
|
||||
|
||||
runtime_statis = model.print_statis_info_of_runtime()
|
||||
|
||||
warmup_iter = args.iter_num // 5
|
||||
end2end_statis_repeat = end2end_statis[warmup_iter:]
|
||||
if enable_collect_memory_info:
|
||||
cpu_mem_repeat = cpu_mem[warmup_iter:]
|
||||
gpu_mem_repeat = gpu_mem[warmup_iter:]
|
||||
gpu_util_repeat = gpu_util[warmup_iter:]
|
||||
monitor.stop()
|
||||
mem_info = monitor.output()
|
||||
dump_result["cpu_rss_mb"] = mem_info['cpu'][
|
||||
'memory.used'] if 'cpu' in mem_info else 0
|
||||
dump_result["gpu_rss_mb"] = mem_info['gpu'][
|
||||
'memory.used'] if 'gpu' in mem_info else 0
|
||||
dump_result["gpu_util"] = mem_info['gpu'][
|
||||
'utilization.gpu'] if 'gpu' in mem_info else 0
|
||||
|
||||
dump_result = dict()
|
||||
dump_result["runtime"] = runtime_statis["avg_time"] * 1000
|
||||
dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
|
||||
if enable_collect_memory_info:
|
||||
dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
|
||||
dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
|
||||
dump_result["gpu_util"] = np.mean(gpu_util_repeat)
|
||||
|
||||
f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
print("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
if enable_collect_memory_info:
|
||||
f.writelines("cpu_rss_mb: {} \n".format(
|
||||
str(dump_result["cpu_rss_mb"])))
|
||||
@@ -199,6 +294,9 @@ if __name__ == '__main__':
|
||||
str(dump_result["gpu_rss_mb"])))
|
||||
f.writelines("gpu_util: {} \n".format(
|
||||
str(dump_result["gpu_util"])))
|
||||
print("cpu_rss_mb: {} \n".format(str(dump_result["cpu_rss_mb"])))
|
||||
print("gpu_rss_mb: {} \n".format(str(dump_result["gpu_rss_mb"])))
|
||||
print("gpu_util: {} \n".format(str(dump_result["gpu_util"])))
|
||||
except:
|
||||
f.writelines("!!!!!Infer Failed\n")
|
||||
|
||||
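Condensed from the benchmark flow above, the newly added `Monitor` is driven roughly as follows (names and dictionary keys are taken from this diff; error handling omitted):

```python
monitor = Monitor(use_gpu=True, gpu_id=0)  # samples CPU via psutil and GPU via nvidia-smi
monitor.start()                            # spawn the sampling workers

# ... run the timed inference loop here ...

monitor.stop()                             # stop the workers and aggregate peak readings
mem_info = monitor.output()
cpu_rss_mb = mem_info['cpu']['memory.used'] if 'cpu' in mem_info else 0
gpu_rss_mb = mem_info['gpu']['memory.used'] if 'gpu' in mem_info else 0
gpu_util = mem_info['gpu']['utilization.gpu'] if 'gpu' in mem_info else 0
```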
|
@@ -75,6 +75,17 @@ def build_option(args):
|
||||
option.use_ort_backend()
|
||||
elif backend == "paddle":
|
||||
option.use_paddle_backend()
|
||||
elif backend == "ov":
|
||||
option.use_openvino_backend()
|
||||
# Using GPU and CPU heterogeneous execution mode
|
||||
option.set_openvino_device("HETERO:GPU,CPU")
|
||||
# change name and shape for models
|
||||
option.set_openvino_shape_info({
|
||||
"image": [1, 3, 320, 320],
|
||||
"scale_factor": [1, 2]
|
||||
})
|
||||
# Set CPU up operator
|
||||
option.set_openvino_cpu_operators(["MulticlassNms"])
|
||||
elif backend in ["trt", "paddle_trt"]:
|
||||
option.use_trt_backend()
|
||||
if backend == "paddle_trt":
|
||||
@@ -108,27 +119,109 @@ def build_option(args):
|
||||
return option
|
||||
|
||||
|
||||
def get_current_memory_mb(gpu_id=None):
|
||||
import pynvml
|
||||
import psutil
|
||||
pid = os.getpid()
|
||||
p = psutil.Process(pid)
|
||||
info = p.memory_full_info()
|
||||
cpu_mem = info.uss / 1024. / 1024.
|
||||
gpu_mem = 0
|
||||
if gpu_id is not None:
|
||||
pynvml.nvmlInit()
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
|
||||
meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
|
||||
gpu_mem = meminfo.used / 1024. / 1024.
|
||||
return cpu_mem, gpu_mem
|
||||
class StatBase(object):
|
||||
"""StatBase"""
|
||||
nvidia_smi_path = "nvidia-smi"
|
||||
gpu_keys = ('index', 'uuid', 'name', 'timestamp', 'memory.total',
|
||||
'memory.free', 'memory.used', 'utilization.gpu',
|
||||
'utilization.memory')
|
||||
nu_opt = ',nounits'
|
||||
cpu_keys = ('cpu.util', 'memory.util', 'memory.used')
|
||||
|
||||
|
||||
def get_current_gputil(gpu_id):
|
||||
import GPUtil
|
||||
GPUs = GPUtil.getGPUs()
|
||||
gpu_load = GPUs[gpu_id].load
|
||||
return gpu_load
|
||||
class Monitor(StatBase):
|
||||
"""Monitor"""
|
||||
|
||||
def __init__(self, use_gpu=False, gpu_id=0, interval=0.1):
|
||||
self.result = {}
|
||||
self.gpu_id = gpu_id
|
||||
self.use_gpu = use_gpu
|
||||
self.interval = interval
|
||||
self.cpu_stat_q = multiprocessing.Queue()
|
||||
|
||||
def start(self):
|
||||
cmd = '%s --id=%s --query-gpu=%s --format=csv,noheader%s -lms 50' % (
|
||||
StatBase.nvidia_smi_path, self.gpu_id, ','.join(StatBase.gpu_keys),
|
||||
StatBase.nu_opt)
|
||||
if self.use_gpu:
|
||||
self.gpu_stat_worker = subprocess.Popen(
|
||||
cmd,
|
||||
stderr=subprocess.STDOUT,
|
||||
stdout=subprocess.PIPE,
|
||||
shell=True,
|
||||
close_fds=True,
|
||||
preexec_fn=os.setsid)
|
||||
# cpu stat
|
||||
pid = os.getpid()
|
||||
self.cpu_stat_worker = multiprocessing.Process(
|
||||
target=self.cpu_stat_func,
|
||||
args=(self.cpu_stat_q, pid, self.interval))
|
||||
self.cpu_stat_worker.start()
|
||||
|
||||
def stop(self):
|
||||
try:
|
||||
if self.use_gpu:
|
||||
os.killpg(self.gpu_stat_worker.pid, signal.SIGUSR1)
|
||||
# os.killpg(p.pid, signal.SIGTERM)
|
||||
self.cpu_stat_worker.terminate()
|
||||
self.cpu_stat_worker.join(timeout=0.01)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return
|
||||
|
||||
# gpu
|
||||
if self.use_gpu:
|
||||
lines = self.gpu_stat_worker.stdout.readlines()
|
||||
lines = [
|
||||
line.strip().decode("utf-8") for line in lines
|
||||
if line.strip() != ''
|
||||
]
|
||||
gpu_info_list = [{
|
||||
k: v
|
||||
for k, v in zip(StatBase.gpu_keys, line.split(', '))
|
||||
} for line in lines]
|
||||
if len(gpu_info_list) == 0:
|
||||
return
|
||||
result = gpu_info_list[0]
|
||||
for item in gpu_info_list:
|
||||
for k in item.keys():
|
||||
if k not in ["name", "uuid", "timestamp"]:
|
||||
result[k] = max(int(result[k]), int(item[k]))
|
||||
else:
|
||||
result[k] = max(result[k], item[k])
|
||||
self.result['gpu'] = result
|
||||
|
||||
# cpu
|
||||
cpu_result = {}
|
||||
if self.cpu_stat_q.qsize() > 0:
|
||||
cpu_result = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
while not self.cpu_stat_q.empty():
|
||||
item = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
for k in StatBase.cpu_keys:
|
||||
cpu_result[k] = max(cpu_result[k], item[k])
|
||||
cpu_result['name'] = cpuinfo.get_cpu_info()['brand_raw']
|
||||
self.result['cpu'] = cpu_result
|
||||
|
||||
def output(self):
|
||||
return self.result
|
||||
|
||||
def cpu_stat_func(self, q, pid, interval=0.0):
|
||||
"""cpu stat function"""
|
||||
stat_info = psutil.Process(pid)
|
||||
while True:
|
||||
# pid = os.getpid()
|
||||
cpu_util, mem_util, mem_use = stat_info.cpu_percent(
|
||||
), stat_info.memory_percent(), round(stat_info.memory_info().rss /
|
||||
1024.0 / 1024.0, 4)
|
||||
q.put([cpu_util, mem_util, mem_use])
|
||||
time.sleep(interval)
|
||||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -141,6 +234,7 @@ if __name__ == '__main__':
|
||||
|
||||
gpu_id = args.device_id
|
||||
enable_collect_memory_info = args.enable_collect_memory_info
|
||||
dump_result = dict()
|
||||
end2end_statis = list()
|
||||
cpu_mem = list()
|
||||
gpu_mem = list()
|
||||
@@ -178,6 +272,16 @@ if __name__ == '__main__':
|
||||
else:
|
||||
raise Exception("model {} not support now in ppdet series".format(
|
||||
args.model))
|
||||
if enable_collect_memory_info:
|
||||
import multiprocessing
|
||||
import subprocess
|
||||
import psutil
|
||||
import signal
|
||||
import cpuinfo
|
||||
enable_gpu = args.device == "gpu"
|
||||
monitor = Monitor(enable_gpu, gpu_id)
|
||||
monitor.start()
|
||||
|
||||
model.enable_record_time_of_runtime()
|
||||
im_ori = cv2.imread(args.image)
|
||||
for i in range(args.iter_num):
|
||||
@@ -185,31 +289,28 @@ if __name__ == '__main__':
|
||||
start = time.time()
|
||||
result = model.predict(im)
|
||||
end2end_statis.append(time.time() - start)
|
||||
if enable_collect_memory_info:
|
||||
gpu_util.append(get_current_gputil(gpu_id))
|
||||
cm, gm = get_current_memory_mb(gpu_id)
|
||||
cpu_mem.append(cm)
|
||||
gpu_mem.append(gm)
|
||||
|
||||
runtime_statis = model.print_statis_info_of_runtime()
|
||||
|
||||
warmup_iter = args.iter_num // 5
|
||||
end2end_statis_repeat = end2end_statis[warmup_iter:]
|
||||
if enable_collect_memory_info:
|
||||
cpu_mem_repeat = cpu_mem[warmup_iter:]
|
||||
gpu_mem_repeat = gpu_mem[warmup_iter:]
|
||||
gpu_util_repeat = gpu_util[warmup_iter:]
|
||||
monitor.stop()
|
||||
mem_info = monitor.output()
|
||||
dump_result["cpu_rss_mb"] = mem_info['cpu'][
|
||||
'memory.used'] if 'cpu' in mem_info else 0
|
||||
dump_result["gpu_rss_mb"] = mem_info['gpu'][
|
||||
'memory.used'] if 'gpu' in mem_info else 0
|
||||
dump_result["gpu_util"] = mem_info['gpu'][
|
||||
'utilization.gpu'] if 'gpu' in mem_info else 0
|
||||
|
||||
dump_result = dict()
|
||||
dump_result["runtime"] = runtime_statis["avg_time"] * 1000
|
||||
dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
|
||||
if enable_collect_memory_info:
|
||||
dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
|
||||
dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
|
||||
dump_result["gpu_util"] = np.mean(gpu_util_repeat)
|
||||
|
||||
f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
print("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
if enable_collect_memory_info:
|
||||
f.writelines("cpu_rss_mb: {} \n".format(
|
||||
str(dump_result["cpu_rss_mb"])))
|
||||
@@ -217,6 +318,9 @@ if __name__ == '__main__':
|
||||
str(dump_result["gpu_rss_mb"])))
|
||||
f.writelines("gpu_util: {} \n".format(
|
||||
str(dump_result["gpu_util"])))
|
||||
print("cpu_rss_mb: {} \n".format(str(dump_result["cpu_rss_mb"])))
|
||||
print("gpu_rss_mb: {} \n".format(str(dump_result["gpu_rss_mb"])))
|
||||
print("gpu_util: {} \n".format(str(dump_result["gpu_util"])))
|
||||
except:
|
||||
f.writelines("!!!!!Infer Failed\n")
|
||||
|
||||
|
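For reference, the OpenVINO heterogeneous settings added to the detection benchmark above reduce to the following sketch, using only option calls that appear verbatim in this diff:

```python
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_openvino_backend()
# GPU and CPU heterogeneous execution mode
option.set_openvino_device("HETERO:GPU,CPU")
# Fix the input names and shapes for the detection model
option.set_openvino_shape_info({
    "image": [1, 3, 320, 320],
    "scale_factor": [1, 2]
})
# Keep the MulticlassNms operator on the CPU
option.set_openvino_cpu_operators(["MulticlassNms"])
```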
benchmark/benchmark_ppocr.py (new file, 377 lines)
@@ -0,0 +1,377 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import fastdeploy as fd
|
||||
import cv2
|
||||
import os
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
import argparse
|
||||
import ast
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--model_dir", required=True, help="Model dir of PPOCR.")
|
||||
parser.add_argument(
|
||||
"--det_model", required=True, help="Path of Detection model of PPOCR.")
|
||||
parser.add_argument(
|
||||
"--cls_model",
|
||||
required=True,
|
||||
help="Path of Classification model of PPOCR.")
|
||||
parser.add_argument(
|
||||
"--rec_model",
|
||||
required=True,
|
||||
help="Path of Recognization model of PPOCR.")
|
||||
parser.add_argument(
|
||||
"--rec_label_file",
|
||||
required=True,
|
||||
help="Path of Recognization model of PPOCR.")
|
||||
parser.add_argument(
|
||||
"--image", type=str, required=False, help="Path of test image file.")
|
||||
parser.add_argument(
|
||||
"--cpu_num_thread",
|
||||
type=int,
|
||||
default=8,
|
||||
help="default number of cpu thread.")
|
||||
parser.add_argument(
|
||||
"--device_id", type=int, default=0, help="device(gpu) id")
|
||||
parser.add_argument(
|
||||
"--iter_num",
|
||||
required=True,
|
||||
type=int,
|
||||
default=300,
|
||||
help="number of iterations for computing performace.")
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
default="cpu",
|
||||
help="Type of inference device, support 'cpu' or 'gpu'.")
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
type=str,
|
||||
default="default",
|
||||
help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
|
||||
parser.add_argument(
|
||||
"--enable_trt_fp16",
|
||||
type=ast.literal_eval,
|
||||
default=False,
|
||||
help="whether enable fp16 in trt backend")
|
||||
parser.add_argument(
|
||||
"--enable_collect_memory_info",
|
||||
type=ast.literal_eval,
|
||||
default=False,
|
||||
help="whether enable collect memory info")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def build_option(args):
    option = fd.RuntimeOption()
    device = args.device
    backend = args.backend
    enable_trt_fp16 = args.enable_trt_fp16
    option.set_cpu_thread_num(args.cpu_num_thread)
    if device == "gpu":
        option.use_gpu()
        if backend == "ort":
            option.use_ort_backend()
        elif backend == "paddle":
            option.use_paddle_backend()
        elif backend in ["trt", "paddle_trt"]:
            option.use_trt_backend()
            if backend == "paddle_trt":
                option.enable_paddle_to_trt()
            if enable_trt_fp16:
                option.enable_trt_fp16()
        elif backend == "default":
            return option
        else:
            raise Exception(
                "When running on GPU, only default/ort/paddle/trt/paddle_trt backends are supported for now, {} is not supported.".
                format(backend))
    elif device == "cpu":
        if backend == "ort":
            option.use_ort_backend()
        elif backend == "ov":
            option.use_openvino_backend()
        elif backend == "paddle":
            option.use_paddle_backend()
        elif backend == "default":
            return option
        else:
            raise Exception(
                "When running on CPU, only default/ort/ov/paddle backends are supported for now, {} is not supported.".
                format(backend))
    else:
        raise Exception(
            "Only CPU/GPU devices are supported for now, {} is not supported.".format(
                device))

    return option
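
# Example (a sketch, not part of the original script): build_option() only reads
# device, backend, enable_trt_fp16 and cpu_num_thread from the parsed arguments, so
# it can also be driven programmatically, e.g.
#   opt = build_option(argparse.Namespace(device="gpu", backend="trt",
#                                         enable_trt_fp16=True, cpu_num_thread=8))
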
class StatBase(object):
    """StatBase"""
    nvidia_smi_path = "nvidia-smi"
    gpu_keys = ('index', 'uuid', 'name', 'timestamp', 'memory.total',
                'memory.free', 'memory.used', 'utilization.gpu',
                'utilization.memory')
    nu_opt = ',nounits'
    cpu_keys = ('cpu.util', 'memory.util', 'memory.used')


class Monitor(StatBase):
|
||||
"""Monitor"""
|
||||
|
||||
def __init__(self, use_gpu=False, gpu_id=0, interval=0.1):
|
||||
self.result = {}
|
||||
self.gpu_id = gpu_id
|
||||
self.use_gpu = use_gpu
|
||||
self.interval = interval
|
||||
self.cpu_stat_q = multiprocessing.Queue()
|
||||
|
||||
def start(self):
|
||||
cmd = '%s --id=%s --query-gpu=%s --format=csv,noheader%s -lms 50' % (
|
||||
StatBase.nvidia_smi_path, self.gpu_id, ','.join(StatBase.gpu_keys),
|
||||
StatBase.nu_opt)
|
||||
if self.use_gpu:
|
||||
self.gpu_stat_worker = subprocess.Popen(
|
||||
cmd,
|
||||
stderr=subprocess.STDOUT,
|
||||
stdout=subprocess.PIPE,
|
||||
shell=True,
|
||||
close_fds=True,
|
||||
preexec_fn=os.setsid)
|
||||
# cpu stat
|
||||
pid = os.getpid()
|
||||
self.cpu_stat_worker = multiprocessing.Process(
|
||||
target=self.cpu_stat_func,
|
||||
args=(self.cpu_stat_q, pid, self.interval))
|
||||
self.cpu_stat_worker.start()
|
||||
|
||||
def stop(self):
|
||||
try:
|
||||
if self.use_gpu:
|
||||
os.killpg(self.gpu_stat_worker.pid, signal.SIGUSR1)
|
||||
# os.killpg(p.pid, signal.SIGTERM)
|
||||
self.cpu_stat_worker.terminate()
|
||||
self.cpu_stat_worker.join(timeout=0.01)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return
|
||||
|
||||
# gpu
|
||||
if self.use_gpu:
|
||||
lines = self.gpu_stat_worker.stdout.readlines()
|
||||
lines = [
|
||||
line.strip().decode("utf-8") for line in lines
|
||||
if line.strip() != ''
|
||||
]
|
||||
gpu_info_list = [{
|
||||
k: v
|
||||
for k, v in zip(StatBase.gpu_keys, line.split(', '))
|
||||
} for line in lines]
|
||||
if len(gpu_info_list) == 0:
|
||||
return
|
||||
result = gpu_info_list[0]
|
||||
for item in gpu_info_list:
|
||||
for k in item.keys():
|
||||
if k not in ["name", "uuid", "timestamp"]:
|
||||
result[k] = max(int(result[k]), int(item[k]))
|
||||
else:
|
||||
result[k] = max(result[k], item[k])
|
||||
self.result['gpu'] = result
|
||||
|
||||
# cpu
|
||||
cpu_result = {}
|
||||
if self.cpu_stat_q.qsize() > 0:
|
||||
cpu_result = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
while not self.cpu_stat_q.empty():
|
||||
item = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
for k in StatBase.cpu_keys:
|
||||
cpu_result[k] = max(cpu_result[k], item[k])
|
||||
cpu_result['name'] = cpuinfo.get_cpu_info()['brand_raw']
|
||||
self.result['cpu'] = cpu_result
|
||||
|
||||
def output(self):
|
||||
return self.result
|
||||
|
||||
def cpu_stat_func(self, q, pid, interval=0.0):
|
||||
"""cpu stat function"""
|
||||
stat_info = psutil.Process(pid)
|
||||
while True:
|
||||
# pid = os.getpid()
|
||||
cpu_util, mem_util, mem_use = stat_info.cpu_percent(
|
||||
), stat_info.memory_percent(), round(stat_info.memory_info().rss /
|
||||
1024.0 / 1024.0, 4)
|
||||
q.put([cpu_util, mem_util, mem_use])
|
||||
time.sleep(interval)
|
||||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
args = parse_arguments()
|
||||
option = build_option(args)
|
||||
# Detection Model
|
||||
det_model_file = os.path.join(args.model_dir, args.det_model,
|
||||
"inference.pdmodel")
|
||||
det_params_file = os.path.join(args.model_dir, args.det_model,
|
||||
"inference.pdiparams")
|
||||
# Classification Model
|
||||
cls_model_file = os.path.join(args.model_dir, args.cls_model,
|
||||
"inference.pdmodel")
|
||||
cls_params_file = os.path.join(args.model_dir, args.cls_model,
|
||||
"inference.pdiparams")
|
||||
# Recognition Model
|
||||
rec_model_file = os.path.join(args.model_dir, args.rec_model,
|
||||
"inference.pdmodel")
|
||||
rec_params_file = os.path.join(args.model_dir, args.rec_model,
|
||||
"inference.pdiparams")
|
||||
rec_label_file = os.path.join(args.model_dir, args.rec_label_file)
|
||||
|
||||
gpu_id = args.device_id
|
||||
enable_collect_memory_info = args.enable_collect_memory_info
|
||||
dump_result = dict()
|
||||
end2end_statis = list()
|
||||
cpu_mem = list()
|
||||
gpu_mem = list()
|
||||
gpu_util = list()
|
||||
if args.device == "cpu":
|
||||
file_path = args.model_dir + "_model_" + args.backend + "_" + \
|
||||
args.device + "_" + str(args.cpu_num_thread) + ".txt"
|
||||
else:
|
||||
if args.enable_trt_fp16:
|
||||
file_path = args.model_dir + "_model_" + args.backend + "_fp16_" + args.device + ".txt"
|
||||
else:
|
||||
file_path = args.model_dir + "_model_" + args.backend + "_" + args.device + ".txt"
|
||||
f = open(file_path, "w")
|
||||
f.writelines("===={}====: \n".format(os.path.split(file_path)[-1][:-4]))
|
||||
|
||||
try:
|
||||
rec_option = option
|
||||
if "OCRv2" in args.model_dir:
|
||||
det_option = option
|
||||
if args.backend in ["trt", "paddle_trt"]:
|
||||
det_option.set_trt_input_shape(
|
||||
"x", [1, 3, 64, 64], [1, 3, 640, 640], [1, 3, 960, 960])
|
||||
det_model = fd.vision.ocr.DBDetector(
|
||||
det_model_file, det_params_file, runtime_option=det_option)
|
||||
cls_option = option
|
||||
if args.backend in ["trt", "paddle_trt"]:
|
||||
cls_option.set_trt_input_shape(
|
||||
"x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 1024])
|
||||
cls_model = fd.vision.ocr.Classifier(
|
||||
cls_model_file, cls_params_file, runtime_option=cls_option)
|
||||
rec_option = option
|
||||
if args.backend in ["trt", "paddle_trt"]:
|
||||
rec_option.set_trt_input_shape(
|
||||
"x", [1, 3, 32, 10], [10, 3, 32, 320], [32, 3, 32, 2304])
|
||||
rec_model = fd.vision.ocr.Recognizer(
|
||||
rec_model_file,
|
||||
rec_params_file,
|
||||
rec_label_file,
|
||||
runtime_option=rec_option)
|
||||
model = fd.vision.ocr.PPOCRv2(
|
||||
det_model=det_model, cls_model=cls_model, rec_model=rec_model)
|
||||
elif "OCRv3" in args.model_dir:
|
||||
if args.backend in ["trt", "paddle_trt"]:
|
||||
det_option.set_trt_input_shape(
|
||||
"x", [1, 3, 64, 64], [1, 3, 640, 640], [1, 3, 960, 960])
|
||||
det_model = fd.vision.ocr.DBDetector(
|
||||
det_model_file, det_params_file, runtime_option=det_option)
|
||||
            cls_option = option
            if args.backend in ["trt", "paddle_trt"]:
                cls_option.set_trt_input_shape(
                    "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 1024])
|
||||
cls_model = fd.vision.ocr.Classifier(
|
||||
cls_model_file, cls_params_file, runtime_option=cls_option)
|
||||
if args.backend in ["trt", "paddle_trt"]:
|
||||
rec_option.set_trt_input_shape(
|
||||
"x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 2304])
|
||||
rec_model = fd.vision.ocr.Recognizer(
|
||||
rec_model_file,
|
||||
rec_params_file,
|
||||
rec_label_file,
|
||||
runtime_option=rec_option)
|
||||
model = fd.vision.ocr.PPOCRv3(
|
||||
det_model=det_model, cls_model=cls_model, rec_model=rec_model)
|
||||
else:
|
||||
raise Exception("model {} not support now in ppocr series".format(
|
||||
args.model_dir))
|
||||
if enable_collect_memory_info:
|
||||
import multiprocessing
|
||||
import subprocess
|
||||
import psutil
|
||||
import signal
|
||||
import cpuinfo
|
||||
enable_gpu = args.device == "gpu"
|
||||
monitor = Monitor(enable_gpu, gpu_id)
|
||||
monitor.start()
|
||||
|
||||
det_model.enable_record_time_of_runtime()
|
||||
cls_model.enable_record_time_of_runtime()
|
||||
rec_model.enable_record_time_of_runtime()
|
||||
im_ori = cv2.imread(args.image)
|
||||
for i in range(args.iter_num):
|
||||
im = im_ori
|
||||
start = time.time()
|
||||
result = model.predict(im)
|
||||
end2end_statis.append(time.time() - start)
|
||||
|
||||
runtime_statis_det = det_model.print_statis_info_of_runtime()
|
||||
runtime_statis_cls = cls_model.print_statis_info_of_runtime()
|
||||
runtime_statis_rec = rec_model.print_statis_info_of_runtime()
|
||||
|
||||
warmup_iter = args.iter_num // 5
|
||||
end2end_statis_repeat = end2end_statis[warmup_iter:]
|
||||
if enable_collect_memory_info:
|
||||
monitor.stop()
|
||||
mem_info = monitor.output()
|
||||
dump_result["cpu_rss_mb"] = mem_info['cpu'][
|
||||
'memory.used'] if 'cpu' in mem_info else 0
|
||||
dump_result["gpu_rss_mb"] = mem_info['gpu'][
|
||||
'memory.used'] if 'gpu' in mem_info else 0
|
||||
dump_result["gpu_util"] = mem_info['gpu'][
|
||||
'utilization.gpu'] if 'gpu' in mem_info else 0
|
||||
|
||||
dump_result["runtime"] = (
|
||||
runtime_statis_det["avg_time"] + runtime_statis_cls["avg_time"] +
|
||||
runtime_statis_rec["avg_time"]) * 1000
|
||||
dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
|
||||
|
||||
f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
print("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
if enable_collect_memory_info:
|
||||
f.writelines("cpu_rss_mb: {} \n".format(
|
||||
str(dump_result["cpu_rss_mb"])))
|
||||
f.writelines("gpu_rss_mb: {} \n".format(
|
||||
str(dump_result["gpu_rss_mb"])))
|
||||
f.writelines("gpu_util: {} \n".format(
|
||||
str(dump_result["gpu_util"])))
|
||||
print("cpu_rss_mb: {} \n".format(str(dump_result["cpu_rss_mb"])))
|
||||
print("gpu_rss_mb: {} \n".format(str(dump_result["gpu_rss_mb"])))
|
||||
print("gpu_util: {} \n".format(str(dump_result["gpu_util"])))
|
||||
    except Exception:
        f.writelines("!!!!!Infer Failed\n")
|
||||
|
||||
f.close()
|
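
The benchmark convention used by this script (and by the other benchmark scripts below) is to
treat the first fifth of the iterations as warmup and average only the remainder. A minimal,
self-contained sketch of that post-processing step (function and variable names here are
illustrative, not part of the repository):

```python
import numpy as np

def summarize_latency(latencies_s, warmup_ratio=0.2):
    """Drop the leading warmup iterations and return the mean latency in milliseconds."""
    warmup_iter = int(len(latencies_s) * warmup_ratio)
    steady = latencies_s[warmup_iter:] or latencies_s
    return float(np.mean(steady)) * 1000.0

# e.g. with the end2end_statis list collected above:
# print("End2End(ms): {}".format(summarize_latency(end2end_statis)))
```
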
@@ -75,6 +75,11 @@ def build_option(args):
|
||||
option.use_ort_backend()
|
||||
elif backend == "paddle":
|
||||
option.use_paddle_backend()
|
||||
elif backend == "ov":
|
||||
option.use_openvino_backend()
|
||||
option.set_openvino_device(name="GPU") # use gpu
|
||||
# change name and shape for models
|
||||
option.set_openvino_shape_info({"x": [1, 3, 512, 512]})
|
||||
elif backend in ["trt", "paddle_trt"]:
|
||||
option.use_trt_backend()
|
||||
if backend == "paddle_trt":
|
||||
@@ -108,27 +113,109 @@ def build_option(args):
|
||||
return option
|
||||
|
||||
|
||||
def get_current_memory_mb(gpu_id=None):
|
||||
import pynvml
|
||||
import psutil
|
||||
pid = os.getpid()
|
||||
p = psutil.Process(pid)
|
||||
info = p.memory_full_info()
|
||||
cpu_mem = info.uss / 1024. / 1024.
|
||||
gpu_mem = 0
|
||||
if gpu_id is not None:
|
||||
pynvml.nvmlInit()
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
|
||||
meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
|
||||
gpu_mem = meminfo.used / 1024. / 1024.
|
||||
return cpu_mem, gpu_mem
|
||||
class StatBase(object):
|
||||
"""StatBase"""
|
||||
nvidia_smi_path = "nvidia-smi"
|
||||
gpu_keys = ('index', 'uuid', 'name', 'timestamp', 'memory.total',
|
||||
'memory.free', 'memory.used', 'utilization.gpu',
|
||||
'utilization.memory')
|
||||
nu_opt = ',nounits'
|
||||
cpu_keys = ('cpu.util', 'memory.util', 'memory.used')
|
||||
|
||||
|
||||
def get_current_gputil(gpu_id):
|
||||
import GPUtil
|
||||
GPUs = GPUtil.getGPUs()
|
||||
gpu_load = GPUs[gpu_id].load
|
||||
return gpu_load
|
||||
class Monitor(StatBase):
|
||||
"""Monitor"""
|
||||
|
||||
def __init__(self, use_gpu=False, gpu_id=0, interval=0.1):
|
||||
self.result = {}
|
||||
self.gpu_id = gpu_id
|
||||
self.use_gpu = use_gpu
|
||||
self.interval = interval
|
||||
self.cpu_stat_q = multiprocessing.Queue()
|
||||
|
||||
def start(self):
|
||||
cmd = '%s --id=%s --query-gpu=%s --format=csv,noheader%s -lms 50' % (
|
||||
StatBase.nvidia_smi_path, self.gpu_id, ','.join(StatBase.gpu_keys),
|
||||
StatBase.nu_opt)
|
||||
if self.use_gpu:
|
||||
self.gpu_stat_worker = subprocess.Popen(
|
||||
cmd,
|
||||
stderr=subprocess.STDOUT,
|
||||
stdout=subprocess.PIPE,
|
||||
shell=True,
|
||||
close_fds=True,
|
||||
preexec_fn=os.setsid)
|
||||
# cpu stat
|
||||
pid = os.getpid()
|
||||
self.cpu_stat_worker = multiprocessing.Process(
|
||||
target=self.cpu_stat_func,
|
||||
args=(self.cpu_stat_q, pid, self.interval))
|
||||
self.cpu_stat_worker.start()
|
||||
|
||||
def stop(self):
|
||||
try:
|
||||
if self.use_gpu:
|
||||
os.killpg(self.gpu_stat_worker.pid, signal.SIGUSR1)
|
||||
# os.killpg(p.pid, signal.SIGTERM)
|
||||
self.cpu_stat_worker.terminate()
|
||||
self.cpu_stat_worker.join(timeout=0.01)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return
|
||||
|
||||
# gpu
|
||||
if self.use_gpu:
|
||||
lines = self.gpu_stat_worker.stdout.readlines()
|
||||
lines = [
|
||||
line.strip().decode("utf-8") for line in lines
|
||||
if line.strip() != ''
|
||||
]
|
||||
gpu_info_list = [{
|
||||
k: v
|
||||
for k, v in zip(StatBase.gpu_keys, line.split(', '))
|
||||
} for line in lines]
|
||||
if len(gpu_info_list) == 0:
|
||||
return
|
||||
result = gpu_info_list[0]
|
||||
for item in gpu_info_list:
|
||||
for k in item.keys():
|
||||
if k not in ["name", "uuid", "timestamp"]:
|
||||
result[k] = max(int(result[k]), int(item[k]))
|
||||
else:
|
||||
result[k] = max(result[k], item[k])
|
||||
self.result['gpu'] = result
|
||||
|
||||
# cpu
|
||||
cpu_result = {}
|
||||
if self.cpu_stat_q.qsize() > 0:
|
||||
cpu_result = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
while not self.cpu_stat_q.empty():
|
||||
item = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
for k in StatBase.cpu_keys:
|
||||
cpu_result[k] = max(cpu_result[k], item[k])
|
||||
cpu_result['name'] = cpuinfo.get_cpu_info()['brand_raw']
|
||||
self.result['cpu'] = cpu_result
|
||||
|
||||
def output(self):
|
||||
return self.result
|
||||
|
||||
def cpu_stat_func(self, q, pid, interval=0.0):
|
||||
"""cpu stat function"""
|
||||
stat_info = psutil.Process(pid)
|
||||
while True:
|
||||
# pid = os.getpid()
|
||||
cpu_util, mem_util, mem_use = stat_info.cpu_percent(
|
||||
), stat_info.memory_percent(), round(stat_info.memory_info().rss /
|
||||
1024.0 / 1024.0, 4)
|
||||
q.put([cpu_util, mem_util, mem_use])
|
||||
time.sleep(interval)
|
||||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -141,6 +228,7 @@ if __name__ == '__main__':
|
||||
|
||||
gpu_id = args.device_id
|
||||
enable_collect_memory_info = args.enable_collect_memory_info
|
||||
dump_result = dict()
|
||||
end2end_statis = list()
|
||||
cpu_mem = list()
|
||||
gpu_mem = list()
|
||||
@@ -159,6 +247,16 @@ if __name__ == '__main__':
|
||||
try:
|
||||
model = fd.vision.segmentation.PaddleSegModel(
|
||||
model_file, params_file, config_file, runtime_option=option)
|
||||
if enable_collect_memory_info:
|
||||
import multiprocessing
|
||||
import subprocess
|
||||
import psutil
|
||||
import signal
|
||||
import cpuinfo
|
||||
enable_gpu = args.device == "gpu"
|
||||
monitor = Monitor(enable_gpu, gpu_id)
|
||||
monitor.start()
|
||||
|
||||
model.enable_record_time_of_runtime()
|
||||
im_ori = cv2.imread(args.image)
|
||||
for i in range(args.iter_num):
|
||||
@@ -166,31 +264,28 @@ if __name__ == '__main__':
|
||||
start = time.time()
|
||||
result = model.predict(im)
|
||||
end2end_statis.append(time.time() - start)
|
||||
if enable_collect_memory_info:
|
||||
gpu_util.append(get_current_gputil(gpu_id))
|
||||
cm, gm = get_current_memory_mb(gpu_id)
|
||||
cpu_mem.append(cm)
|
||||
gpu_mem.append(gm)
|
||||
|
||||
runtime_statis = model.print_statis_info_of_runtime()
|
||||
|
||||
warmup_iter = args.iter_num // 5
|
||||
end2end_statis_repeat = end2end_statis[warmup_iter:]
|
||||
if enable_collect_memory_info:
|
||||
cpu_mem_repeat = cpu_mem[warmup_iter:]
|
||||
gpu_mem_repeat = gpu_mem[warmup_iter:]
|
||||
gpu_util_repeat = gpu_util[warmup_iter:]
|
||||
monitor.stop()
|
||||
mem_info = monitor.output()
|
||||
dump_result["cpu_rss_mb"] = mem_info['cpu'][
|
||||
'memory.used'] if 'cpu' in mem_info else 0
|
||||
dump_result["gpu_rss_mb"] = mem_info['gpu'][
|
||||
'memory.used'] if 'gpu' in mem_info else 0
|
||||
dump_result["gpu_util"] = mem_info['gpu'][
|
||||
'utilization.gpu'] if 'gpu' in mem_info else 0
|
||||
|
||||
dump_result = dict()
|
||||
dump_result["runtime"] = runtime_statis["avg_time"] * 1000
|
||||
dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
|
||||
if enable_collect_memory_info:
|
||||
dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
|
||||
dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
|
||||
dump_result["gpu_util"] = np.mean(gpu_util_repeat)
|
||||
|
||||
f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
print("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
if enable_collect_memory_info:
|
||||
f.writelines("cpu_rss_mb: {} \n".format(
|
||||
str(dump_result["cpu_rss_mb"])))
|
||||
@@ -198,6 +293,9 @@ if __name__ == '__main__':
|
||||
str(dump_result["gpu_rss_mb"])))
|
||||
f.writelines("gpu_util: {} \n".format(
|
||||
str(dump_result["gpu_util"])))
|
||||
print("cpu_rss_mb: {} \n".format(str(dump_result["cpu_rss_mb"])))
|
||||
print("gpu_rss_mb: {} \n".format(str(dump_result["gpu_rss_mb"])))
|
||||
print("gpu_util: {} \n".format(str(dump_result["gpu_util"])))
|
||||
except:
|
||||
f.writelines("!!!!!Infer Failed\n")
|
||||
|
||||
|
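
Note that get_current_memory_mb() above always opens NVML device index 0, regardless of the
gpu_id it receives. A hedged sketch of a per-device variant (same psutil/pynvml calls as the
original, only the handle lookup changes):

```python
import os
import psutil
import pynvml

def get_current_memory_mb(gpu_id=None):
    """Return (cpu_uss_mb, gpu_used_mb) for this process and, optionally, one GPU."""
    proc = psutil.Process(os.getpid())
    cpu_mem = proc.memory_full_info().uss / 1024. / 1024.
    gpu_mem = 0
    if gpu_id is not None:
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)  # use the requested device
        gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024. / 1024.
    return cpu_mem, gpu_mem
```
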
@@ -75,6 +75,11 @@ def build_option(args):
|
||||
option.use_ort_backend()
|
||||
elif backend == "paddle":
|
||||
option.use_paddle_backend()
|
||||
elif backend == "ov":
|
||||
option.use_openvino_backend()
|
||||
option.set_openvino_device(name="GPU")
|
||||
# change name and shape for models
|
||||
option.set_openvino_shape_info({"images": [1, 3, 640, 640]})
|
||||
elif backend in ["trt", "paddle_trt"]:
|
||||
option.use_trt_backend()
|
||||
if backend == "paddle_trt":
|
||||
@@ -108,27 +113,109 @@ def build_option(args):
|
||||
return option
|
||||
|
||||
|
||||
def get_current_memory_mb(gpu_id=None):
|
||||
import pynvml
|
||||
import psutil
|
||||
pid = os.getpid()
|
||||
p = psutil.Process(pid)
|
||||
info = p.memory_full_info()
|
||||
cpu_mem = info.uss / 1024. / 1024.
|
||||
gpu_mem = 0
|
||||
if gpu_id is not None:
|
||||
pynvml.nvmlInit()
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
|
||||
meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
|
||||
gpu_mem = meminfo.used / 1024. / 1024.
|
||||
return cpu_mem, gpu_mem
|
||||
class StatBase(object):
|
||||
"""StatBase"""
|
||||
nvidia_smi_path = "nvidia-smi"
|
||||
gpu_keys = ('index', 'uuid', 'name', 'timestamp', 'memory.total',
|
||||
'memory.free', 'memory.used', 'utilization.gpu',
|
||||
'utilization.memory')
|
||||
nu_opt = ',nounits'
|
||||
cpu_keys = ('cpu.util', 'memory.util', 'memory.used')
|
||||
|
||||
|
||||
def get_current_gputil(gpu_id):
|
||||
import GPUtil
|
||||
GPUs = GPUtil.getGPUs()
|
||||
gpu_load = GPUs[gpu_id].load
|
||||
return gpu_load
|
||||
class Monitor(StatBase):
|
||||
"""Monitor"""
|
||||
|
||||
def __init__(self, use_gpu=False, gpu_id=0, interval=0.1):
|
||||
self.result = {}
|
||||
self.gpu_id = gpu_id
|
||||
self.use_gpu = use_gpu
|
||||
self.interval = interval
|
||||
self.cpu_stat_q = multiprocessing.Queue()
|
||||
|
||||
def start(self):
|
||||
cmd = '%s --id=%s --query-gpu=%s --format=csv,noheader%s -lms 50' % (
|
||||
StatBase.nvidia_smi_path, self.gpu_id, ','.join(StatBase.gpu_keys),
|
||||
StatBase.nu_opt)
|
||||
if self.use_gpu:
|
||||
self.gpu_stat_worker = subprocess.Popen(
|
||||
cmd,
|
||||
stderr=subprocess.STDOUT,
|
||||
stdout=subprocess.PIPE,
|
||||
shell=True,
|
||||
close_fds=True,
|
||||
preexec_fn=os.setsid)
|
||||
# cpu stat
|
||||
pid = os.getpid()
|
||||
self.cpu_stat_worker = multiprocessing.Process(
|
||||
target=self.cpu_stat_func,
|
||||
args=(self.cpu_stat_q, pid, self.interval))
|
||||
self.cpu_stat_worker.start()
|
||||
|
||||
def stop(self):
|
||||
try:
|
||||
if self.use_gpu:
|
||||
os.killpg(self.gpu_stat_worker.pid, signal.SIGUSR1)
|
||||
# os.killpg(p.pid, signal.SIGTERM)
|
||||
self.cpu_stat_worker.terminate()
|
||||
self.cpu_stat_worker.join(timeout=0.01)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return
|
||||
|
||||
# gpu
|
||||
if self.use_gpu:
|
||||
lines = self.gpu_stat_worker.stdout.readlines()
|
||||
lines = [
|
||||
line.strip().decode("utf-8") for line in lines
|
||||
if line.strip() != ''
|
||||
]
|
||||
gpu_info_list = [{
|
||||
k: v
|
||||
for k, v in zip(StatBase.gpu_keys, line.split(', '))
|
||||
} for line in lines]
|
||||
if len(gpu_info_list) == 0:
|
||||
return
|
||||
result = gpu_info_list[0]
|
||||
for item in gpu_info_list:
|
||||
for k in item.keys():
|
||||
if k not in ["name", "uuid", "timestamp"]:
|
||||
result[k] = max(int(result[k]), int(item[k]))
|
||||
else:
|
||||
result[k] = max(result[k], item[k])
|
||||
self.result['gpu'] = result
|
||||
|
||||
# cpu
|
||||
cpu_result = {}
|
||||
if self.cpu_stat_q.qsize() > 0:
|
||||
cpu_result = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
while not self.cpu_stat_q.empty():
|
||||
item = {
|
||||
k: v
|
||||
for k, v in zip(StatBase.cpu_keys, self.cpu_stat_q.get())
|
||||
}
|
||||
for k in StatBase.cpu_keys:
|
||||
cpu_result[k] = max(cpu_result[k], item[k])
|
||||
cpu_result['name'] = cpuinfo.get_cpu_info()['brand_raw']
|
||||
self.result['cpu'] = cpu_result
|
||||
|
||||
def output(self):
|
||||
return self.result
|
||||
|
||||
def cpu_stat_func(self, q, pid, interval=0.0):
|
||||
"""cpu stat function"""
|
||||
stat_info = psutil.Process(pid)
|
||||
while True:
|
||||
# pid = os.getpid()
|
||||
cpu_util, mem_util, mem_use = stat_info.cpu_percent(
|
||||
), stat_info.memory_percent(), round(stat_info.memory_info().rss /
|
||||
1024.0 / 1024.0, 4)
|
||||
q.put([cpu_util, mem_util, mem_use])
|
||||
time.sleep(interval)
|
||||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -139,6 +226,7 @@ if __name__ == '__main__':
|
||||
|
||||
gpu_id = args.device_id
|
||||
enable_collect_memory_info = args.enable_collect_memory_info
|
||||
dump_result = dict()
|
||||
end2end_statis = list()
|
||||
cpu_mem = list()
|
||||
gpu_mem = list()
|
||||
@@ -170,6 +258,16 @@ if __name__ == '__main__':
|
||||
else:
|
||||
raise Exception("model {} not support now in yolo series".format(
|
||||
args.model))
|
||||
if enable_collect_memory_info:
|
||||
import multiprocessing
|
||||
import subprocess
|
||||
import psutil
|
||||
import signal
|
||||
import cpuinfo
|
||||
enable_gpu = args.device == "gpu"
|
||||
monitor = Monitor(enable_gpu, gpu_id)
|
||||
monitor.start()
|
||||
|
||||
model.enable_record_time_of_runtime()
|
||||
im_ori = cv2.imread(args.image)
|
||||
for i in range(args.iter_num):
|
||||
@@ -177,31 +275,28 @@ if __name__ == '__main__':
|
||||
start = time.time()
|
||||
result = model.predict(im)
|
||||
end2end_statis.append(time.time() - start)
|
||||
if enable_collect_memory_info:
|
||||
gpu_util.append(get_current_gputil(gpu_id))
|
||||
cm, gm = get_current_memory_mb(gpu_id)
|
||||
cpu_mem.append(cm)
|
||||
gpu_mem.append(gm)
|
||||
|
||||
runtime_statis = model.print_statis_info_of_runtime()
|
||||
|
||||
warmup_iter = args.iter_num // 5
|
||||
end2end_statis_repeat = end2end_statis[warmup_iter:]
|
||||
if enable_collect_memory_info:
|
||||
cpu_mem_repeat = cpu_mem[warmup_iter:]
|
||||
gpu_mem_repeat = gpu_mem[warmup_iter:]
|
||||
gpu_util_repeat = gpu_util[warmup_iter:]
|
||||
monitor.stop()
|
||||
mem_info = monitor.output()
|
||||
dump_result["cpu_rss_mb"] = mem_info['cpu'][
|
||||
'memory.used'] if 'cpu' in mem_info else 0
|
||||
dump_result["gpu_rss_mb"] = mem_info['gpu'][
|
||||
'memory.used'] if 'gpu' in mem_info else 0
|
||||
dump_result["gpu_util"] = mem_info['gpu'][
|
||||
'utilization.gpu'] if 'gpu' in mem_info else 0
|
||||
|
||||
dump_result = dict()
|
||||
dump_result["runtime"] = runtime_statis["avg_time"] * 1000
|
||||
dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
|
||||
if enable_collect_memory_info:
|
||||
dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
|
||||
dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
|
||||
dump_result["gpu_util"] = np.mean(gpu_util_repeat)
|
||||
|
||||
f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
|
||||
print("End2End(ms): {} \n".format(str(dump_result["end2end"])))
|
||||
if enable_collect_memory_info:
|
||||
f.writelines("cpu_rss_mb: {} \n".format(
|
||||
str(dump_result["cpu_rss_mb"])))
|
||||
@@ -209,6 +304,9 @@ if __name__ == '__main__':
|
||||
str(dump_result["gpu_rss_mb"])))
|
||||
f.writelines("gpu_util: {} \n".format(
|
||||
str(dump_result["gpu_util"])))
|
||||
print("cpu_rss_mb: {} \n".format(str(dump_result["cpu_rss_mb"])))
|
||||
print("gpu_rss_mb: {} \n".format(str(dump_result["gpu_rss_mb"])))
|
||||
print("gpu_util: {} \n".format(str(dump_result["gpu_util"])))
|
||||
except:
|
||||
f.writelines("!!!!!Infer Failed\n")
|
||||
|
||||
|
@@ -70,10 +70,8 @@ for i in range(line_nums):
|
||||
cpu_rss_mb_list = cpu_rss_mb_ori.split(".")
|
||||
cpu_rss_mb = cpu_rss_mb_list[0] + "." + cpu_rss_mb_list[1][:2]
|
||||
if "gpu_rss_mb" in lines[i + 4]:
|
||||
gpu_rss_mb_ori = lines[i + 4].split(": ")[1]
|
||||
# two decimal places
|
||||
gpu_rss_mb_list = gpu_rss_mb_ori.split(".")
|
||||
gpu_rss_mb = gpu_rss_mb_list[0] + "." + gpu_rss_mb_list[1][:2]
|
||||
gpu_rss_mb_ori = lines[i + 4].split(": ")[1].strip()
|
||||
gpu_rss_mb = str(gpu_rss_mb_ori) + ".0"
|
||||
if "ort_cpu_1" in lines[i]:
|
||||
ort_cpu_thread1[
|
||||
model_name] = runtime + "\t" + end2end + "\t" + cpu_rss_mb
|
||||
@@ -111,7 +109,7 @@ for i in range(line_nums):
|
||||
|
||||
f2 = open("struct_cpu_" + domain + ".txt", "w")
|
||||
f2.writelines(
|
||||
"model_name\tthread_nums\tort_run\tort_end2end\tcpu_rss_mb\tov_run\tov_end2end\tcpu_rss_mb\tpaddle_run\tpaddle_end2end\tcpu_rss_mb\n"
|
||||
"model_name\tthread_nums\tort_run\tort_end2end\tcpu_mem\tov_run\tov_end2end\tcpu_mem\tpaddle_run\tpaddle_end2end\tcpu_mem\n"
|
||||
)
|
||||
for model_name in model_name_set:
|
||||
lines1 = model_name + '\t1\t'
|
||||
@@ -148,7 +146,7 @@ f2.close()
|
||||
|
||||
f3 = open("struct_gpu_" + domain + ".txt", "w")
|
||||
f3.writelines(
|
||||
"model_name\tort_run\tort_end2end\tgpu_rss_mb\tpaddle_run\tpaddle_end2end\tgpu_rss_mb\tpaddle_trt_run\tpaddle_trt_end2end\tgpu_rss_mb\tpaddle_trt_fp16_run\tpaddle_trt_fp16_end2end\tgpu_rss_mb\ttrt_run\ttrt_end2end\tgpu_rss_mb\ttrt_fp16_run\ttrt_fp16_end2end\tgpu_rss_mb\n"
|
||||
"model_name\tort_run\tort_end2end\tgpu_mem\tpaddle_run\tpaddle_end2end\tgpu_mem\tpaddle_trt_run\tpaddle_trt_end2end\tgpu_mem\tpaddle_trt_fp16_run\tpaddle_trt_fp16_end2end\tgpu_mem\ttrt_run\ttrt_end2end\tgpu_mem\ttrt_fp16_run\ttrt_fp16_end2end\tgpu_mem\n"
|
||||
)
|
||||
for model_name in model_name_set:
|
||||
lines1 = model_name + '\t'
|
||||
|
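
The memory columns in this summary script are trimmed to two decimal places by splitting the
string on the dot. An equivalent, less fragile formulation (a sketch only; note it rounds
rather than truncates) would be:

```python
def two_decimals(text):
    """e.g. ' 1234.56789\n' -> '1234.57'"""
    return "{:.2f}".format(float(text.strip()))
```
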
23
benchmark/run_benchmark_ppocr.sh
Normal file
@@ -0,0 +1,23 @@
|
||||
echo "[FastDeploy] Running PPOCR benchmark..."
|
||||
|
||||
# for PPOCRv2
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend ort --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend paddle --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend ov --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend ort --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle_trt --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle_trt --enable_trt_fp16 True --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend trt --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend trt --enable_trt_fp16 True --enable_collect_memory_info True
|
||||
|
||||
# for PPOCRv3
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend ort --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend paddle --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend ov --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend ort --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle_trt --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle_trt --enable_trt_fp16 True --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend trt --enable_collect_memory_info True
|
||||
python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend trt --enable_trt_fp16 True --enable_collect_memory_info True
|
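
run_benchmark_ppocr.sh repeats one long command per backend/precision combination. If the sweep
ever needs to grow, the same matrix can be generated from a short driver; the sketch below reuses
the file names and flags from the commands above, while the helper itself is hypothetical:

```python
import subprocess

CLS = "ch_ppocr_mobile_v2.0_cls_infer"

def run(model_dir, backend, device="cpu", fp16=False):
    """Launch benchmark_ppocr.py once with the same arguments the shell script uses."""
    cmd = ["python", "benchmark_ppocr.py", "--model_dir", model_dir,
           "--det_model", model_dir + "_det_infer", "--cls_model", CLS,
           "--rec_model", model_dir + "_rec_infer",
           "--rec_label_file", "ppocr_keys_v1.txt", "--image", "12.jpg",
           "--iter_num", "2000", "--backend", backend,
           "--enable_collect_memory_info", "True"]
    cmd += ["--device", "gpu"] if device == "gpu" else ["--cpu_num_thread", "8"]
    if fp16:
        cmd += ["--enable_trt_fp16", "True"]
    subprocess.run(cmd, check=False)

for model in ("ch_PP-OCRv2", "ch_PP-OCRv3"):
    for backend in ("ort", "paddle", "ov"):
        run(model, backend, device="cpu")
    for backend, fp16 in (("ort", False), ("paddle", False), ("paddle_trt", False),
                          ("paddle_trt", True), ("trt", False), ("trt", True)):
        run(model, backend, device="gpu", fp16=fp16)
```
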
@@ -24,9 +24,16 @@ set(FASTTOKENIZER_INC_DIR
|
||||
"${FASTTOKENIZER_INSTALL_DIR}/include"
|
||||
"${FASTTOKENIZER_INSTALL_DIR}/third_party/include"
|
||||
CACHE PATH "fast_tokenizer include directory." FORCE)
|
||||
set(FASTTOKENIZER_LIB_DIR
|
||||
"${FASTTOKENIZER_INSTALL_DIR}/lib/"
|
||||
CACHE PATH "fast_tokenizer lib directory." FORCE)
|
||||
if(ANDROID)
|
||||
set(FASTTOKENIZER_LIB_DIR
|
||||
"${FASTTOKENIZER_INSTALL_DIR}/lib/${ANDROID_ABI}"
|
||||
CACHE PATH "fast_tokenizer lib directory." FORCE)
|
||||
else()
|
||||
set(FASTTOKENIZER_LIB_DIR
|
||||
"${FASTTOKENIZER_INSTALL_DIR}/lib/"
|
||||
CACHE PATH "fast_tokenizer lib directory." FORCE)
|
||||
endif()
|
||||
|
||||
set(FASTTOKENIZER_THIRD_LIB_DIR
|
||||
"${FASTTOKENIZER_INSTALL_DIR}/third_party/lib/"
|
||||
CACHE PATH "fast_tokenizer lib directory." FORCE)
|
||||
@@ -37,21 +44,21 @@ include_directories(${FASTTOKENIZER_INC_DIR})
|
||||
|
||||
# Set lib path
|
||||
if(WIN32)
|
||||
set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/core_tokenizers.lib"
|
||||
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
|
||||
message("FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}")
|
||||
set(ICUDT_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icudt.lib")
|
||||
set(ICUUC_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icuuc.lib")
|
||||
|
||||
set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/core_tokenizers.lib"
|
||||
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
|
||||
set(ICUDT_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icudt.lib")
|
||||
set(ICUUC_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icuuc.lib")
|
||||
elseif(APPLE)
|
||||
set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.dylib"
|
||||
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
|
||||
set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.dylib"
|
||||
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
|
||||
elseif(ANDROID)
|
||||
set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.so"
|
||||
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
|
||||
else()
|
||||
|
||||
set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.so"
|
||||
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
|
||||
message("FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}")
|
||||
set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.so"
|
||||
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
|
||||
endif(WIN32)
|
||||
message("FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}")
|
||||
|
||||
set(FASTTOKENIZER_URL_BASE "https://bj.bcebos.com/paddlenlp/fast_tokenizer/")
|
||||
set(FASTTOKENIZER_VERSION "1.0.0")
|
||||
@@ -68,6 +75,15 @@ elseif(APPLE)
|
||||
else()
|
||||
set(FASTTOKENIZER_FILE "fast_tokenizer-osx-x86_64-${FASTTOKENIZER_VERSION}.tgz")
|
||||
endif()
|
||||
elseif(ANDROID)
|
||||
# check ABI, toolchain
|
||||
if((NOT ANDROID_ABI MATCHES "armeabi-v7a") AND (NOT ANDROID_ABI MATCHES "arm64-v8a"))
|
||||
message(FATAL_ERROR "FastDeploy with FastTokenizer on Android only support armeabi-v7a, arm64-v8a now.")
|
||||
endif()
|
||||
if(NOT ANDROID_TOOLCHAIN MATCHES "clang")
|
||||
message(FATAL_ERROR "Currently, only support clang toolchain while cross compiling FastDeploy for Android with FastTokenizer, but found ${ANDROID_TOOLCHAIN}.")
|
||||
endif()
|
||||
set(FASTTOKENIZER_FILE "fast_tokenizer-android-${ANDROID_ABI}-${FASTTOKENIZER_VERSION}.tgz")
|
||||
else()
|
||||
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
set(FASTTOKENIZER_FILE "fast_tokenizer-linux-aarch64-${FASTTOKENIZER_VERSION}.tgz")
|
||||
@@ -77,18 +93,39 @@ else()
|
||||
endif()
|
||||
set(FASTTOKENIZER_URL "${FASTTOKENIZER_URL_BASE}${FASTTOKENIZER_FILE}")
|
||||
|
||||
ExternalProject_Add(
|
||||
${FASTTOKENIZER_PROJECT}
|
||||
${EXTERNAL_PROJECT_LOG_ARGS}
|
||||
URL ${FASTTOKENIZER_URL}
|
||||
PREFIX ${FASTTOKENIZER_PREFIX_DIR}
|
||||
DOWNLOAD_NO_PROGRESS 1
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
UPDATE_COMMAND ""
|
||||
INSTALL_COMMAND
|
||||
${CMAKE_COMMAND} -E copy_directory ${FASTTOKENIZER_SOURCE_DIR} ${FASTTOKENIZER_INSTALL_DIR}
|
||||
BUILD_BYPRODUCTS ${FASTTOKENIZER_COMPILE_LIB})
|
||||
if(ANDROID)
|
||||
ExternalProject_Add(
|
||||
${FASTTOKENIZER_PROJECT}
|
||||
${EXTERNAL_PROJECT_LOG_ARGS}
|
||||
URL ${FASTTOKENIZER_URL}
|
||||
PREFIX ${FASTTOKENIZER_PREFIX_DIR}
|
||||
DOWNLOAD_NO_PROGRESS 1
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
UPDATE_COMMAND ""
|
||||
INSTALL_COMMAND
|
||||
${CMAKE_COMMAND} -E remove_directory ${FASTTOKENIZER_INSTALL_DIR} &&
|
||||
${CMAKE_COMMAND} -E make_directory ${FASTTOKENIZER_INSTALL_DIR} &&
|
||||
${CMAKE_COMMAND} -E make_directory ${FASTTOKENIZER_INSTALL_DIR}/lib &&
|
||||
${CMAKE_COMMAND} -E make_directory ${FASTTOKENIZER_INSTALL_DIR}/third_party &&
|
||||
${CMAKE_COMMAND} -E rename ${FASTTOKENIZER_SOURCE_DIR}/lib/ ${FASTTOKENIZER_INSTALL_DIR}/lib/${ANDROID_ABI} &&
|
||||
${CMAKE_COMMAND} -E copy_directory ${FASTTOKENIZER_SOURCE_DIR}/include ${FASTTOKENIZER_INSTALL_DIR}/include &&
|
||||
${CMAKE_COMMAND} -E copy_directory ${FASTTOKENIZER_SOURCE_DIR}/third_party/include ${FASTTOKENIZER_INSTALL_DIR}/third_party/include
|
||||
BUILD_BYPRODUCTS ${FASTTOKENIZER_COMPILE_LIB})
|
||||
else()
|
||||
ExternalProject_Add(
|
||||
${FASTTOKENIZER_PROJECT}
|
||||
${EXTERNAL_PROJECT_LOG_ARGS}
|
||||
URL ${FASTTOKENIZER_URL}
|
||||
PREFIX ${FASTTOKENIZER_PREFIX_DIR}
|
||||
DOWNLOAD_NO_PROGRESS 1
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
UPDATE_COMMAND ""
|
||||
INSTALL_COMMAND
|
||||
${CMAKE_COMMAND} -E copy_directory ${FASTTOKENIZER_SOURCE_DIR} ${FASTTOKENIZER_INSTALL_DIR}
|
||||
BUILD_BYPRODUCTS ${FASTTOKENIZER_COMPILE_LIB})
|
||||
endif()
|
||||
|
||||
add_library(fast_tokenizer STATIC IMPORTED GLOBAL)
|
||||
set_property(TARGET fast_tokenizer PROPERTY IMPORTED_LOCATION ${FASTTOKENIZER_COMPILE_LIB})
|
||||
|
@@ -41,10 +41,12 @@ elseif(IOS)
|
||||
else()
|
||||
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
set(OPENCV_FILENAME "opencv-linux-aarch64-3.4.14")
|
||||
elseif(TARGET_ABI MATCHES "armhf")
|
||||
set(OPENCV_FILENAME "opencv-armv7hf")
|
||||
else()
|
||||
set(OPENCV_FILENAME "opencv-linux-x64-3.4.16")
|
||||
if(ENABLE_TIMVX)
|
||||
set(OPENCV_FILENAME "opencv-armv7hf")
|
||||
else()
|
||||
set(OPENCV_FILENAME "opencv-linux-x64-3.4.16")
|
||||
endif()
|
||||
endif()
|
||||
if(ENABLE_OPENCV_CUDA)
|
||||
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
@@ -57,7 +59,7 @@ endif()
|
||||
set(OPENCV_INSTALL_DIR ${THIRD_PARTY_PATH}/install/)
|
||||
if(ANDROID)
|
||||
set(OPENCV_URL_PREFIX "https://bj.bcebos.com/fastdeploy/third_libs")
|
||||
elseif(TARGET_ABI MATCHES "armhf")
|
||||
elseif(ENABLE_TIMVX)
|
||||
set(OPENCV_URL_PREFIX "https://bj.bcebos.com/fastdeploy/test")
|
||||
else() # TODO: use fastdeploy/third_libs instead.
|
||||
set(OPENCV_URL_PREFIX "https://bj.bcebos.com/paddle2onnx/libs")
|
||||
@@ -185,7 +187,7 @@ else()
|
||||
file(RENAME ${THIRD_PARTY_PATH}/install/${OPENCV_FILENAME}/ ${THIRD_PARTY_PATH}/install/opencv)
|
||||
set(OPENCV_FILENAME opencv)
|
||||
set(OpenCV_DIR ${THIRD_PARTY_PATH}/install/${OPENCV_FILENAME})
|
||||
if(TARGET_ABI MATCHES "armhf")
|
||||
if(ENABLE_TIMVX)
|
||||
set(OpenCV_DIR ${OpenCV_DIR}/lib/cmake/opencv4)
|
||||
endif()
|
||||
if (WIN32)
|
||||
|
@@ -43,7 +43,7 @@ else()
|
||||
endif(WIN32)
|
||||
|
||||
set(PADDLE2ONNX_URL_BASE "https://bj.bcebos.com/fastdeploy/third_libs/")
|
||||
set(PADDLE2ONNX_VERSION "1.0.4rc0")
|
||||
set(PADDLE2ONNX_VERSION "1.0.5")
|
||||
if(WIN32)
|
||||
set(PADDLE2ONNX_FILE "paddle2onnx-win-x64-${PADDLE2ONNX_VERSION}.zip")
|
||||
if(NOT CMAKE_CL_64)
|
||||
|
@@ -13,6 +13,8 @@
|
||||
# limitations under the License.
|
||||
include(ExternalProject)
|
||||
|
||||
option(PADDLEINFERENCE_DIRECTORY "Directory of Paddle Inference library" OFF)
|
||||
|
||||
set(PADDLEINFERENCE_PROJECT "extern_paddle_inference")
|
||||
set(PADDLEINFERENCE_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle_inference)
|
||||
set(PADDLEINFERENCE_SOURCE_DIR
|
||||
@@ -27,6 +29,10 @@ set(PADDLEINFERENCE_LIB_DIR
|
||||
set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}"
|
||||
"${PADDLEINFERENCE_LIB_DIR}")
|
||||
|
||||
if(PADDLEINFERENCE_DIRECTORY)
|
||||
set(PADDLEINFERENCE_INC_DIR ${PADDLEINFERENCE_DIRECTORY}/paddle/include)
|
||||
endif()
|
||||
|
||||
include_directories(${PADDLEINFERENCE_INC_DIR})
|
||||
if(WIN32)
|
||||
set(PADDLEINFERENCE_COMPILE_LIB
|
||||
@@ -47,50 +53,59 @@ else()
|
||||
endif(WIN32)
|
||||
|
||||
|
||||
set(PADDLEINFERENCE_URL_BASE "https://bj.bcebos.com/fastdeploy/third_libs/")
|
||||
set(PADDLEINFERENCE_VERSION "2.4-dev3")
|
||||
if(WIN32)
|
||||
if (WITH_GPU)
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-win-x64-gpu-trt-${PADDLEINFERENCE_VERSION}.zip")
|
||||
else()
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-win-x64-${PADDLEINFERENCE_VERSION}.zip")
|
||||
endif()
|
||||
elseif(APPLE)
|
||||
if(CURRENT_OSX_ARCH MATCHES "arm64")
|
||||
message(FATAL_ERROR "Paddle Backend doesn't support Mac OSX with Arm64 now.")
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-osx-arm64-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
else()
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-osx-x86_64-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
if(PADDLEINFERENCE_DIRECTORY)
|
||||
if(EXISTS "${THIRD_PARTY_PATH}/install/paddle_inference")
|
||||
file(REMOVE_RECURSE "${THIRD_PARTY_PATH}/install/paddle_inference")
|
||||
endif()
|
||||
find_package(Python COMPONENTS Interpreter Development REQUIRED)
|
||||
message(STATUS "Copying ${PADDLEINFERENCE_DIRECTORY} to ${THIRD_PARTY_PATH}/install/paddle_inference ...")
|
||||
execute_process(COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/copy_directory.py ${PADDLEINFERENCE_DIRECTORY} ${THIRD_PARTY_PATH}/install/paddle_inference)
|
||||
else()
|
||||
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
message(FATAL_ERROR "Paddle Backend doesn't support linux aarch64 now.")
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-linux-aarch64-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
else()
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
if(WITH_GPU)
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-gpu-trt-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
set(PADDLEINFERENCE_URL_BASE "https://bj.bcebos.com/fastdeploy/third_libs/")
|
||||
set(PADDLEINFERENCE_VERSION "2.4-dev3")
|
||||
if(WIN32)
|
||||
if (WITH_GPU)
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-win-x64-gpu-trt-${PADDLEINFERENCE_VERSION}.zip")
|
||||
else()
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-win-x64-${PADDLEINFERENCE_VERSION}.zip")
|
||||
endif()
|
||||
if (WITH_IPU)
|
||||
set(PADDLEINFERENCE_VERSION "2.4-dev1")
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-ipu-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
elseif(APPLE)
|
||||
if(CURRENT_OSX_ARCH MATCHES "arm64")
|
||||
message(FATAL_ERROR "Paddle Backend doesn't support Mac OSX with Arm64 now.")
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-osx-arm64-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
else()
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-osx-x86_64-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
endif()
|
||||
else()
|
||||
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
message(FATAL_ERROR "Paddle Backend doesn't support linux aarch64 now.")
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-linux-aarch64-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
else()
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
if(WITH_GPU)
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-gpu-trt-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
endif()
|
||||
if (WITH_IPU)
|
||||
set(PADDLEINFERENCE_VERSION "2.4-dev1")
|
||||
set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-ipu-${PADDLEINFERENCE_VERSION}.tgz")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
set(PADDLEINFERENCE_URL "${PADDLEINFERENCE_URL_BASE}${PADDLEINFERENCE_FILE}")
|
||||
set(PADDLEINFERENCE_URL "${PADDLEINFERENCE_URL_BASE}${PADDLEINFERENCE_FILE}")
|
||||
|
||||
ExternalProject_Add(
|
||||
${PADDLEINFERENCE_PROJECT}
|
||||
${EXTERNAL_PROJECT_LOG_ARGS}
|
||||
URL ${PADDLEINFERENCE_URL}
|
||||
PREFIX ${PADDLEINFERENCE_PREFIX_DIR}
|
||||
DOWNLOAD_NO_PROGRESS 1
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
UPDATE_COMMAND ""
|
||||
INSTALL_COMMAND
|
||||
${CMAKE_COMMAND} -E copy_directory ${PADDLEINFERENCE_SOURCE_DIR} ${PADDLEINFERENCE_INSTALL_DIR}
|
||||
BUILD_BYPRODUCTS ${PADDLEINFERENCE_COMPILE_LIB})
|
||||
ExternalProject_Add(
|
||||
${PADDLEINFERENCE_PROJECT}
|
||||
${EXTERNAL_PROJECT_LOG_ARGS}
|
||||
URL ${PADDLEINFERENCE_URL}
|
||||
PREFIX ${PADDLEINFERENCE_PREFIX_DIR}
|
||||
DOWNLOAD_NO_PROGRESS 1
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
UPDATE_COMMAND ""
|
||||
INSTALL_COMMAND
|
||||
${CMAKE_COMMAND} -E copy_directory ${PADDLEINFERENCE_SOURCE_DIR} ${PADDLEINFERENCE_INSTALL_DIR}
|
||||
BUILD_BYPRODUCTS ${PADDLEINFERENCE_COMPILE_LIB})
|
||||
endif(PADDLEINFERENCE_DIRECTORY)
|
||||
|
||||
if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
|
||||
add_custom_target(patchelf_paddle_inference ALL COMMAND bash -c "PATCHELF_EXE=${PATCHELF_EXE} python ${PROJECT_SOURCE_DIR}/scripts/patch_paddle_inference.py ${PADDLEINFERENCE_INSTALL_DIR}/paddle/lib/libpaddle_inference.so" DEPENDS ${LIBRARY_NAME})
|
||||
|
@@ -52,17 +52,19 @@ endif()
|
||||
if(WIN32 OR APPLE OR IOS)
|
||||
message(FATAL_ERROR "Doesn't support windows/mac/ios platform with backend Paddle Lite now.")
|
||||
elseif(ANDROID)
|
||||
set(PADDLELITE_URL "${PADDLELITE_URL_PREFIX}/lite-android-${ANDROID_ABI}-latest.tgz")
|
||||
set(PADDLELITE_URL "${PADDLELITE_URL_PREFIX}/lite-android-${ANDROID_ABI}-latest-dev.tgz")
|
||||
if(ANDROID_ABI MATCHES "arm64-v8a")
|
||||
set(PADDLELITE_URL "${PADDLELITE_URL_PREFIX}/lite-android-${ANDROID_ABI}-fp16-latest.tgz")
|
||||
set(PADDLELITE_URL "${PADDLELITE_URL_PREFIX}/lite-android-${ANDROID_ABI}-fp16-latest-dev.tgz")
|
||||
endif()
|
||||
else() # Linux
|
||||
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
set(PADDLELITE_URL "${PADDLELITE_URL_PREFIX}/lite-linux-arm64-20220920.tgz")
|
||||
elseif(TARGET_ABI MATCHES "armhf")
|
||||
set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/test/lite-linux_armhf_1101.tgz")
|
||||
set(PADDLELITE_URL "${PADDLELITE_URL_PREFIX}/lite-linux-arm64-20221209.tgz")
|
||||
else()
|
||||
message(FATAL_ERROR "Only support Linux aarch64 now, x64 is not supported with backend Paddle Lite.")
|
||||
if(ENABLE_TIMVX)
|
||||
set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/test/lite-linux_armhf_1130.tgz")
|
||||
else()
|
||||
message(FATAL_ERROR "Only support Linux aarch64 or ENABLE_TIMVX now, x64 is not supported with backend Paddle Lite.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
@@ -1,11 +1,10 @@
|
||||
if (NOT DEFINED TARGET_ABI)
|
||||
if (NOT DEFINED CMAKE_SYSTEM_PROCESSOR)
|
||||
set(CMAKE_SYSTEM_NAME Linux)
|
||||
set(CMAKE_SYSTEM_PROCESSOR arm)
|
||||
set(CMAKE_C_COMPILER "arm-linux-gnueabihf-gcc")
|
||||
set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++")
|
||||
set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}")
|
||||
set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" )
|
||||
set(TARGET_ABI armhf)
|
||||
set(CMAKE_BUILD_TYPE MinSizeRel)
|
||||
else()
|
||||
if(NOT ${ENABLE_LITE_BACKEND})
|
||||
|
@@ -10,6 +10,7 @@
|
||||
- [IPU部署环境编译安装](cn/build_and_install/ipu.md)
|
||||
- [Jetson部署环境编译安装](cn/build_and_install/jetson.md)
|
||||
- [Android平台部署环境编译安装](cn/build_and_install/android.md)
|
||||
- [服务化部署镜像编译安装](../serving/docs/zh_CN/compile.md)
|
||||
|
||||
## 快速使用
|
||||
|
||||
@@ -22,6 +23,7 @@
|
||||
|
||||
- [Python API文档](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/python/html/)
|
||||
- [C++ API文档](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/cpp/html/)
|
||||
- [Android Java API文档](../java/android)
|
||||
|
||||
## 性能调优
|
||||
|
||||
@@ -31,9 +33,9 @@
|
||||
|
||||
- [1. 如何配置模型部署的推理后端](cn/faq/how_to_change_backend.md)
|
||||
- [2. Windows上C++ SDK如何使用](cn/faq/use_sdk_on_windows.md)
|
||||
- [3. Android上如何使用FastDeploy](cn/faq/use_sdk_on_android.md)(进行中)
|
||||
- [3. Android上如何使用FastDeploy C++ SDK](cn/faq/use_cpp_sdk_on_android.md)
|
||||
- [4. TensorRT使用中的一些技巧](cn/faq/tensorrt_tricks.md)
|
||||
- [5. 如何增加新的模型](cn/faq/develop_a_new_model.md)(进行中)
|
||||
- [5. 如何增加新的模型](cn/faq/develop_a_new_model.md)
|
||||
|
||||
## 更多FastDeploy部署模块
|
||||
|
||||
|
@@ -4,12 +4,13 @@
|
||||
|
||||
## Install
|
||||
|
||||
- [How to Install FastDeploy Prebuilt Libraries](en/build_and_install/download_prebuilt_libraries.md)
|
||||
- [How to Build and Install FastDeploy Library on GPU Platform](en/build_and_install/gpu.md)
|
||||
- [How to Build and Install FastDeploy Library on CPU Platform](en/build_and_install/cpu.md)
|
||||
- [How to Build and Install FastDeploy Library on IPU Platform](en/build_and_install/ipu.md)
|
||||
- [How to Build and Install FastDeploy Library on Nvidia Jetson Platform](en/build_and_install/jetson.md)
|
||||
- [How to Build and Install FastDeploy Library on Android Platform](en/build_and_install/android.md)
|
||||
- [Install FastDeploy Prebuilt Libraries](en/build_and_install/download_prebuilt_libraries.md)
|
||||
- [Build and Install FastDeploy Library on GPU Platform](en/build_and_install/gpu.md)
|
||||
- [Build and Install FastDeploy Library on CPU Platform](en/build_and_install/cpu.md)
|
||||
- [Build and Install FastDeploy Library on IPU Platform](en/build_and_install/ipu.md)
|
||||
- [Build and Install FastDeploy Library on Nvidia Jetson Platform](en/build_and_install/jetson.md)
|
||||
- [Build and Install FastDeploy Library on Android Platform](en/build_and_install/android.md)
|
||||
- [Build and Install FastDeploy Serving Deployment Image](../serving/docs/EN/compile-en.md)
|
||||
|
||||
## A Quick Start - Demos
|
||||
|
||||
@@ -22,6 +23,7 @@
|
||||
|
||||
- [Python API](https://baidu-paddle.github.io/fastdeploy-api/python/html/)
|
||||
- [C++ API](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/)
|
||||
- [Android Java API](../java/android)
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
@@ -31,11 +33,11 @@
|
||||
|
||||
- [1. How to Change Inference Backends](en/faq/how_to_change_backend.md)
|
||||
- [2. How to Use FastDeploy C++ SDK on Windows Platform](en/faq/use_sdk_on_windows.md)
|
||||
- [3. How to Use FastDeploy C++ SDK on Android Platform](en/faq/use_sdk_on_android.md)
|
||||
- [3. How to Use FastDeploy C++ SDK on Android Platform](en/faq/use_cpp_sdk_on_android.md)
|
||||
- [4. Tricks of TensorRT](en/faq/tensorrt_tricks.md)
|
||||
- [5. How to Develop a New Model](en/faq/develop_a_new_model.md)
|
||||
|
||||
## More FastDeploy Deployment Module
|
||||
|
||||
- [deployment AI Model as a Service](../serving)
|
||||
- [Deployment AI Model as a Service](../serving)
|
||||
- [Benchmark Testing](../benchmark)
|
||||
|
@@ -2100,7 +2100,7 @@ INCLUDE_FILE_PATTERNS =
|
||||
# recursively expanded use the := operator instead of the = operator.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
PREDEFINED = protected=private
|
||||
PREDEFINED = protected=private ENABLE_VISION_VISUALIZE=1
|
||||
|
||||
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
|
||||
# tag can be used to specify a list of macro names that should be expanded. The
|
||||
|
@@ -1,31 +1,7 @@
|
||||
# FastDeploy C++ API Summary
|
||||
|
||||
## Runtime
|
||||
|
||||
FastDeploy Runtime can be used as an inference engine with the same code, we can deploy Paddle/ONNX model on different device by different backends.
|
||||
Currently, FastDeploy supported backends listed as below,
|
||||
|
||||
| Backend | Hardware | Support Model Format | Platform |
|
||||
| :------ | :------- | :------------------- | :------- |
|
||||
| Paddle Inference | CPU/Nvidia GPU | Paddle | Windows(x64)/Linux(x64) |
|
||||
| ONNX Runtime | CPU/Nvidia GPU | Paddle/ONNX | Windows(x64)/Linux(x64/aarch64)/Mac(x86/arm64) |
|
||||
| TensorRT | Nvidia GPU | Paddle/ONNX | Windows(x64)/Linux(x64)/Jetson |
|
||||
| OpenVINO | CPU | Paddle/ONNX | Windows(x64)/Linux(x64)/Mac(x86) |
|
||||
| Poros | CPU/Nvidia GPU | TorchScript | Linux(x64) |
|
||||
|
||||
### Example code
|
||||
- [Python examples](./)
|
||||
- [C++ examples](./)
|
||||
|
||||
### Related APIs
|
||||
- [RuntimeOption](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/structfastdeploy_1_1RuntimeOption.html)
|
||||
- [Runtime](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/structfastdeploy_1_1Runtime.html)
|
||||
|
||||
## Vision Models
|
||||
|
||||
| Task | Model | API | Example |
|
||||
| :---- | :---- | :---- | :----- |
|
||||
| object detection | PaddleDetection/PPYOLOE | [fastdeploy::vision::detection::PPYOLOE](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/classfastdeploy_1_1vision_1_1detection_1_1PPYOLOE.html) | [C++](./)/[Python](./) |
|
||||
| keypoint detection | PaddleDetection/PPTinyPose | [fastdeploy::vision::keypointdetection::PPTinyPose](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/classfastdeploy_1_1pipeline_1_1PPTinyPose.html) | [C++](./)/[Python](./) |
|
||||
| image classification | PaddleClassification serials | [fastdeploy::vision::classification::PaddleClasModel](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/classfastdeploy_1_1vision_1_1classification_1_1PaddleClasModel.html) | [C++](./)/[Python](./) |
|
||||
| semantic segmentation | PaddleSegmentation serials | [fastdeploy::vision::classification::PaddleSegModel](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/classfastdeploy_1_1vision_1_1segmentation_1_1PaddleSegModel.html) | [C++](./)/[Python](./) |
|
||||
- Github: [https://github.com/PaddlePaddle/FastDeploy](https://github.com/PaddlePaddle/FastDeploy)
|
||||
- [Installation](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/en/build_and_install)
|
||||
- [Usage Documents](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/README_EN.md)
|
||||
- [Release Notes](https://github.com/PaddlePaddle/FastDeploy/releases)
|
||||
- [Examples](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples)
|
||||
|
@@ -1,5 +1,14 @@
|
||||
# Semantic Segmentation(语义分割)
|
||||
|
||||
|
||||
## fastdeploy.vision.segmentation.PaddleSegPreprocessor
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: fastdeploy.vision.segmentation.PaddleSegPreprocessor
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
||||
|
||||
## fastdeploy.vision.segmentation.PaddleSegModel
|
||||
|
||||
```{eval-rst}
|
||||
@@ -7,3 +16,11 @@
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
||||
|
||||
## fastdeploy.vision.segmentation.PaddleSegPostprocessor
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: fastdeploy.vision.segmentation.PaddleSegPostprocessor
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
||||
|
57
docs/api_docs/python/visualize.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# Visualize(可视化)
|
||||
|
||||
## fastdeploy.vision.vis_detection
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: fastdeploy.vision.vis_detection
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
||||
|
||||
## fastdeploy.vision.vis_segmentation
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: fastdeploy.vision.vis_segmentation
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
||||
|
||||
## fastdeploy.vision.vis_keypoint_detection
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: fastdeploy.vision.vis_keypoint_detection
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
||||
## fastdeploy.vision.vis_face_detection
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: fastdeploy.vision.vis_face_detection
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
||||
|
||||
|
||||
## fastdeploy.vision.vis_face_alignment
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: fastdeploy.vision.vis_face_alignment
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
||||
|
||||
## fastdeploy.vision.vis_matting
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: fastdeploy.vision.vis_matting
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
||||
|
||||
## fastdeploy.vision.vis_ppocr
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: fastdeploy.vision.vis_ppocr
|
||||
:members:
|
||||
:inherited-members:
|
||||
```
|
@@ -8,10 +8,10 @@
|
||||
## 自行编译安装
|
||||
- [GPU部署环境](gpu.md)
|
||||
- [CPU部署环境](cpu.md)
|
||||
- [CPU部署环境](ipu.md)
|
||||
- [IPU部署环境](ipu.md)
|
||||
- [Jetson部署环境](jetson.md)
|
||||
- [Android平台部署环境](android.md)
|
||||
- [瑞芯微RK1126部署环境](rk1126.md)
|
||||
- [瑞芯微RV1126部署环境](rv1126.md)
|
||||
|
||||
|
||||
## FastDeploy编译选项说明
|
||||
@@ -22,6 +22,7 @@
|
||||
| ENABLE_PADDLE_BACKEND | 默认OFF,是否编译集成Paddle Inference后端(CPU/GPU上推荐打开) |
|
||||
| ENABLE_LITE_BACKEND | 默认OFF,是否编译集成Paddle Lite后端(编译Android库时需要设置为ON) |
|
||||
| ENABLE_RKNPU2_BACKEND | 默认OFF,是否编译集成RKNPU2后端(RK3588/RK3568/RK3566上推荐打开) |
|
||||
| ENABLE_TIMVX | 默认OFF,需要在RV1126/RV1109上部署时,需设置为ON |
|
||||
| ENABLE_TRT_BACKEND | 默认OFF,是否编译集成TensorRT后端(GPU上推荐打开) |
|
||||
| ENABLE_OPENVINO_BACKEND | 默认OFF,是否编译集成OpenVINO后端(CPU上推荐打开) |
|
||||
| ENABLE_VISION | 默认OFF,是否编译集成视觉模型的部署模块 |
|
||||
|
@@ -27,6 +27,11 @@ Linux上编译需满足
|
||||
- gcc/g++ >= 5.4(推荐8.2)
|
||||
- cmake >= 3.18.0
|
||||
|
||||
此外更推荐开发者自行安装,编译时通过`-DOPENCV_DIRECTORY`来指定环境中的OpenCV(如若不指定-DOPENCV_DIRECTORY,会自动下载FastDeploy提供的预编译的OpenCV,但在**Linux平台**无法支持Video的读取,以及imshow等可视化界面功能)
|
||||
```
|
||||
sudo apt-get install libopencv-dev
|
||||
```
|
||||
|
||||
```bash
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
@@ -36,6 +41,7 @@ cmake .. -DENABLE_ORT_BACKEND=ON \
|
||||
-DENABLE_OPENVINO_BACKEND=ON \
|
||||
-DCMAKE_INSTALL_PREFIX=${PWD}/compiled_fastdeploy_sdk \
|
||||
-DENABLE_VISION=ON \
|
||||
-DOPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4 \
|
||||
-DENABLE_TEXT=ON
|
||||
make -j12
|
||||
make install
|
||||
@@ -90,6 +96,8 @@ export ENABLE_PADDLE_BACKEND=ON
|
||||
export ENABLE_OPENVINO_BACKEND=ON
|
||||
export ENABLE_VISION=ON
|
||||
export ENABLE_TEXT=ON
|
||||
# OPENCV_DIRECTORY可选,不指定会自动下载FastDeploy提供的预编译OpenCV库
|
||||
export OPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4
|
||||
|
||||
python setup.py build
|
||||
python setup.py bdist_wheel
|
||||
|
@@ -20,7 +20,7 @@ FastDeploy提供各平台预编译库,供开发者直接下载安装使用。
|
||||
|
||||
### Python安装
|
||||
|
||||
Release版本(当前最新0.8.0)安装
|
||||
Release版本(当前最新1.0.1)安装
|
||||
```bash
|
||||
pip install fastdeploy-gpu-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html
|
||||
```
|
||||
@@ -41,8 +41,8 @@ Release版本
|
||||
|
||||
| 平台 | 文件 | 说明 |
|
||||
| :--- | :--- | :---- |
|
||||
| Linux x64 | [fastdeploy-linux-x64-gpu-0.8.0.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-gpu-0.8.0.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2编译产出 |
|
||||
| Windows x64 | [fastdeploy-win-x64-gpu-0.8.0.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-0.8.0.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2编译产出 |
|
||||
| Linux x64 | [fastdeploy-linux-x64-gpu-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-gpu-1.0.1.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2编译产出 |
|
||||
| Windows x64 | [fastdeploy-win-x64-gpu-1.0.1.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-1.0.1.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2编译产出 |
|
||||
|
||||
Develop版本(Nightly build)
|
||||
|
||||
@@ -63,7 +63,7 @@ Develop版本(Nightly build)
|
||||
|
||||
### Python安装
|
||||
|
||||
Release版本(当前最新0.7.0)安装
|
||||
Release版本(当前最新1.0.1)安装
|
||||
```bash
|
||||
pip install fastdeploy-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html
|
||||
```
|
||||
@@ -79,16 +79,16 @@ Release版本
|
||||
|
||||
| 平台 | 文件 | 说明 |
|
||||
| :--- | :--- | :---- |
|
||||
| Linux x64 | [fastdeploy-linux-x64-0.8.0.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-0.8.0.tgz) | g++ 8.2编译产出 |
|
||||
| Windows x64 | [fastdeploy-win-x64-0.8.0.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-0.8.0.zip) | Visual Studio 16 2019编译产出 |
|
||||
| Mac OSX x64 | [fastdeploy-osx-x86_64-0.8.0.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-0.8.0.tgz) | clang++ 10.0.0编译产出|
|
||||
| Mac OSX arm64 | [fastdeploy-osx-arm64-0.8.0.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-0.8.0.tgz) | clang++ 13.0.0编译产出 |
|
||||
| Linux aarch64 | - | 自行编译,可集成ONNX Runtime、Paddle Lite后端 |
|
||||
| Linux x64 | [fastdeploy-linux-x64-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.1.tgz) | g++ 8.2编译产出 |
|
||||
| Windows x64 | [fastdeploy-win-x64-1.0.1.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-1.0.1.zip) | Visual Studio 16 2019编译产出 |
|
||||
| Mac OSX x64 | [fastdeploy-osx-x86_64-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-1.0.1.tgz) | clang++ 10.0.0编译产出|
|
||||
| Mac OSX arm64 | [fastdeploy-osx-arm64-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-1.0.1.tgz) | clang++ 13.0.0编译产出 |
|
||||
| Linux aarch64 | [fastdeploy-linux-aarch64-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-1.0.1.tgz) | gcc 6.3编译产出 |
|
||||
| Android armv7&v8 | [fastdeploy-android-1.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.0-shared.tgz) | NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a |
|
||||
|
||||
## Java SDK安装
|
||||
|
||||
Release版本(Java SDK 目前仅支持Android,版本为1.0.0 pre-release)
|
||||
Release版本(Java SDK 目前仅支持Android,版本为1.0.0)
|
||||
|
||||
| 平台 | 文件 | 说明 |
|
||||
| :--- | :--- | :---- |
|
||||
|
@@ -33,6 +33,11 @@ Linux上编译需满足
|
||||
- cuda >= 11.2
|
||||
- cudnn >= 8.2
|
||||
|
||||
此外更推荐开发者自行安装,编译时通过`-DOPENCV_DIRECTORY`来指定环境中的OpenCV(如若不指定-DOPENCV_DIRECTORY,会自动下载FastDeploy提供的预编译的OpenCV,但在**Linux平台**无法支持Video的读取,以及imshow等可视化界面功能)
|
||||
```
|
||||
sudo apt-get install libopencv-dev
|
||||
```
|
||||
|
||||
```bash
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
@@ -46,6 +51,7 @@ cmake .. -DENABLE_ORT_BACKEND=ON \
|
||||
-DCUDA_DIRECTORY=/usr/local/cuda \
|
||||
-DCMAKE_INSTALL_PREFIX=${PWD}/compiled_fastdeploy_sdk \
|
||||
-DENABLE_VISION=ON \
|
||||
-DOPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4 \
|
||||
-DENABLE_TEXT=ON
|
||||
make -j12
|
||||
make install
|
||||
@@ -115,6 +121,8 @@ export ENABLE_TRT_BACKEND=ON
|
||||
export WITH_GPU=ON
|
||||
export TRT_DIRECTORY=/Paddle/TensorRT-8.4.1.5
|
||||
export CUDA_DIRECTORY=/usr/local/cuda
|
||||
# OPENCV_DIRECTORY可选,不指定会在编译过程下载FastDeploy预编译的OpenCV库
|
||||
export OPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4 \
|
||||
|
||||
python setup.py build
|
||||
python setup.py bdist_wheel
|
||||
|
@@ -1,7 +1,7 @@
|
||||
|
||||
# Jetson部署库编译
|
||||
|
||||
FastDeploy当前在Jetson仅支持ONNX Runtime CPU和TensorRT GPU两种后端推理
|
||||
FastDeploy当前在Jetson仅支持ONNX Runtime CPU和TensorRT GPU/Paddle Inference两种后端推理
|
||||
|
||||
## C++ SDK编译安装
|
||||
|
||||
@@ -10,12 +10,17 @@ FastDeploy当前在Jetson仅支持ONNX Runtime CPU和TensorRT GPU两种后端推
|
||||
- cmake >= 3.10.0
|
||||
- jetpack >= 4.6.1
|
||||
|
||||
|
||||
如果需要集成Paddle Inference后端,在[Paddle Inference预编译库](https://www.paddlepaddle.org.cn/inference/v2.4/guides/install/download_lib.html#c)页面根据开发环境选择对应的Jetpack C++包下载,并解压。
|
||||
|
||||
```bash
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
cmake .. -DBUILD_ON_JETSON=ON \
|
||||
-DENABLE_VISION=ON \
|
||||
-DENABLE_PADDLE_BACKEND=ON \ # 可选项,如若不需要Paddle Inference后端,可关闭
|
||||
-DPADDLEINFERENCE_DIRECTORY=/Download/paddle_inference_jetson \
|
||||
-DCMAKE_INSTALL_PREFIX=${PWD}/installed_fastdeploy
|
||||
make -j8
|
||||
make install
|
||||
@@ -34,6 +39,8 @@ make install
|
||||
|
||||
Python打包依赖`wheel`,编译前请先执行`pip install wheel`
|
||||
|
||||
如果需要集成Paddle Inference后端,在[Paddle Inference预编译库](https://www.paddlepaddle.org.cn/inference/v2.4/guides/install/download_lib.html#c)页面根据开发环境选择对应的Jetpack C++包下载,并解压。
|
||||
|
||||
所有编译选项通过环境变量导入
|
||||
|
||||
```bash
|
||||
@@ -42,6 +49,10 @@ cd FastDeploy/python
|
||||
export BUILD_ON_JETSON=ON
|
||||
export ENABLE_VISION=ON
|
||||
|
||||
# ENABLE_PADDLE_BACKEND & PADDLEINFERENCE_DIRECTORY为可选项
|
||||
export ENABLE_PADDLE_BACKEND=ON
|
||||
export PADDLEINFERENCE_DIRECTORY=/Download/paddle_inference_jetson
|
||||
|
||||
python setup.py build
|
||||
python setup.py bdist_wheel
|
||||
```
|
||||
|
@@ -1,63 +0,0 @@
|
||||
# 瑞芯微 RK1126 部署环境编译安装
|
||||
|
||||
FastDeploy基于 Paddle-Lite 后端支持在瑞芯微(Rockchip)Soc 上进行部署推理。
|
||||
更多详细的信息请参考:[PaddleLite部署示例](https://paddle-lite.readthedocs.io/zh/develop/demo_guides/verisilicon_timvx.html)。
|
||||
|
||||
本文档介绍如何编译基于 PaddleLite 的 C++ FastDeploy 交叉编译库。
|
||||
|
||||
相关编译选项说明如下:
|
||||
|编译选项|默认值|说明|备注|
|
||||
|:---|:---|:---|:---|
|
||||
|ENABLE_LITE_BACKEND|OFF|编译RK库时需要设置为ON| - |
|
||||
|
||||
更多编译选项请参考[FastDeploy编译选项说明](./README.md)
|
||||
|
||||
## 交叉编译环境搭建
|
||||
|
||||
### 宿主机环境需求
|
||||
- os:Ubuntu == 16.04
|
||||
- cmake: version >= 3.10.0
|
||||
|
||||
### 环境搭建
|
||||
```bash
|
||||
# 1. Install basic software
|
||||
apt update
|
||||
apt-get install -y --no-install-recommends \
|
||||
gcc g++ git make wget python unzip
|
||||
|
||||
# 2. Install arm gcc toolchains
|
||||
apt-get install -y --no-install-recommends \
|
||||
g++-arm-linux-gnueabi gcc-arm-linux-gnueabi \
|
||||
g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf \
|
||||
gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
|
||||
|
||||
# 3. Install cmake 3.10 or above
|
||||
wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
|
||||
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
|
||||
mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
|
||||
ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
|
||||
ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
|
||||
```
|
||||
|
||||
## 基于 PaddleLite 的 FastDeploy 交叉编译库编译
|
||||
搭建好交叉编译环境之后,编译命令如下:
|
||||
```bash
|
||||
# Download the latest source code
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
|
||||
# CMake configuration with RK toolchain
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=./../cmake/timvx.cmake \
|
||||
-DENABLE_TIMVX=ON \
|
||||
-DCMAKE_INSTALL_PREFIX=fastdeploy-tmivx \
|
||||
-DENABLE_VISION=ON \ # 是否编译集成视觉模型的部署模块,可选择开启
|
||||
-Wno-dev ..
|
||||
|
||||
# Build FastDeploy RK1126 C++ SDK
|
||||
make -j8
|
||||
make install
|
||||
```
|
||||
编译完成之后,会生成 fastdeploy-tmivx 目录,表示基于 PaddleLite TIM-VX 的 FastDeploy 库编译完成。
|
||||
|
||||
RK1126 上部署 PaddleClas 分类模型请参考:[PaddleClas RK1126开发板 C++ 部署示例](../../../examples/vision/classification/paddleclas/rk1126/README.md)
|
100
docs/cn/build_and_install/rv1126.md
Executable file
@@ -0,0 +1,100 @@
|
||||
# 瑞芯微 RV1126 部署环境编译安装
|
||||
|
||||
FastDeploy基于 Paddle-Lite 后端支持在瑞芯微(Rockchip)Soc 上进行部署推理。
|
||||
更多详细的信息请参考:[PaddleLite部署示例](https://www.paddlepaddle.org.cn/lite/develop/demo_guides/verisilicon_timvx.html)。
|
||||
|
||||
本文档介绍如何编译基于 PaddleLite 的 C++ FastDeploy 交叉编译库。
|
||||
|
||||
相关编译选项说明如下:
|
||||
|编译选项|默认值|说明|备注|
|
||||
|:---|:---|:---|:---|
|
||||
|ENABLE_LITE_BACKEND|OFF|编译RK库时需要设置为ON| - |
|
||||
|ENABLE_TIMVX|OFF|编译RK库时需要设置为ON| - |
|
||||
|
||||
更多编译选项请参考[FastDeploy编译选项说明](./README.md)
|
||||
|
||||
## 交叉编译环境搭建
|
||||
|
||||
### 宿主机环境需求
|
||||
- os:Ubuntu == 16.04
|
||||
- cmake: version >= 3.10.0
|
||||
|
||||
### 环境搭建
|
||||
```bash
|
||||
# 1. Install basic software
|
||||
apt update
|
||||
apt-get install -y --no-install-recommends \
|
||||
gcc g++ git make wget python unzip
|
||||
|
||||
# 2. Install arm gcc toolchains
|
||||
apt-get install -y --no-install-recommends \
|
||||
g++-arm-linux-gnueabi gcc-arm-linux-gnueabi \
|
||||
g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf \
|
||||
gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
|
||||
|
||||
# 3. Install cmake 3.10 or above
|
||||
wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
|
||||
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
|
||||
mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
|
||||
ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
|
||||
ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
|
||||
```
|
||||
|
||||
## 基于 PaddleLite 的 FastDeploy 交叉编译库编译
|
||||
搭建好交叉编译环境之后,编译命令如下:
|
||||
```bash
|
||||
# Download the latest source code
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
|
||||
# CMake configuration with RK toolchain
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=./../cmake/timvx.cmake \
|
||||
-DENABLE_TIMVX=ON \
|
||||
-DCMAKE_INSTALL_PREFIX=fastdeploy-tmivx \
|
||||
-DENABLE_VISION=ON \ # 是否编译集成视觉模型的部署模块,可选择开启
|
||||
-Wno-dev ..
|
||||
|
||||
# Build FastDeploy RV1126 C++ SDK
|
||||
make -j8
|
||||
make install
|
||||
```
|
||||
编译完成之后,会生成 fastdeploy-tmivx 目录,表示基于 PaddleLite TIM-VX 的 FastDeploy 库编译完成。
|
||||
|
||||
## 准备设备运行环境
|
||||
部署前要保证芯原 Linux Kernel NPU 驱动 galcore.so 版本及所适用的芯片型号与依赖库保持一致,在部署前,请登录开发板,并通过命令行输入以下命令查询 NPU 驱动版本,Rockchip建议的驱动版本为: 6.4.6.5
|
||||
```bash
|
||||
dmesg | grep Galcore
|
||||
```
|
||||
|
||||
如果当前版本不符合上述,请用户仔细阅读以下内容,以保证底层 NPU 驱动环境正确。
|
||||
|
||||
有两种方式可以修改当前的 NPU 驱动版本:
|
||||
1. 手动替换 NPU 驱动版本。(推荐)
|
||||
2. 刷机,刷取 NPU 驱动版本符合要求的固件。
|
||||
|
||||
### 手动替换 NPU 驱动版本
|
||||
1. 使用如下命令下载解压 PaddleLite demo,其中提供了现成的驱动文件
|
||||
```bash
|
||||
wget https://paddlelite-demo.bj.bcebos.com/devices/generic/PaddleLite-generic-demo.tar.gz
|
||||
tar -xf PaddleLite-generic-demo.tar.gz
|
||||
```
|
||||
2. 使用 `uname -a` 查看 `Linux Kernel` 版本,确定为 `Linux` 系统 4.19.111 版本。
|
||||
3. 将 `PaddleLite-generic-demo/libs/PaddleLite/linux/armhf/lib/verisilicon_timvx/viv_sdk_6_4_6_5/lib/1126/4.19.111/` 路径下的 `galcore.ko` 上传至开发板。
|
||||
|
||||
4. 登录开发板,命令行输入 `sudo rmmod galcore` 来卸载原始驱动,输入 `sudo insmod galcore.ko` 来加载传上设备的驱动。(是否需要 sudo 根据开发板实际情况,部分 adb 链接的设备请提前 adb root)。此步骤如果操作失败,请跳转至方法 2。
|
||||
5. 在开发板中输入 `dmesg | grep Galcore` 查询 NPU 驱动版本,确定为:6.4.6.5
|
||||
|
||||
### 刷机
|
||||
根据具体的开发板型号,向开发板卖家或官网客服索要 6.4.6.5 版本 NPU 驱动对应的固件和刷机方法。
|
||||
|
||||
更多细节请参考:[PaddleLite准备设备环境](https://www.paddlepaddle.org.cn/lite/develop/demo_guides/verisilicon_timvx.html#zhunbeishebeihuanjing)
|
||||
|
||||
## 基于 FastDeploy 在 RV1126 上的部署示例
|
||||
1. RV1126 上部署 PaddleClas 分类模型请参考:[PaddleClas 分类模型在 RV1126 上的 C++ 部署示例](../../../examples/vision/classification/paddleclas/rv1126/README.md)
|
||||
|
||||
2. RV1126 上部署 PPYOLOE 检测模型请参考:[PPYOLOE 检测模型在 RV1126 上的 C++ 部署示例](../../../examples/vision/detection/paddledetection/rv1126/README.md)
|
||||
|
||||
3. RV1126 上部署 YOLOv5 检测模型请参考:[YOLOv5 检测模型在 RV1126 上的 C++ 部署示例](../../../examples/vision/detection/yolov5/rv1126/README.md)
|
||||
|
||||
4. RV1126 上部署 PP-LiteSeg 分割模型请参考:[PP-LiteSeg 分割模型在 RV1126 上的 C++ 部署示例](../../../examples/vision/segmentation/paddleseg/rv1126/README.md)
|
35
docs/cn/faq/custom_opencv.md
Normal file
@@ -0,0 +1,35 @@
|
||||
[English](../../en/faq/custom_opencv.md) | 中文
|
||||
|
||||
# 自定义OpenCV版本
|
||||
|
||||
受限于不同平台限制,目前FastDeploy提供的预编译包在**Linux平台**内置的OpenCV无法读取视频,或调用`imshow`等操作。对于有这类需求的开发者,可根据本文档来自行编译FastDeploy。
|
||||
|
||||
FastDeploy目前支持通过`-DOPENCV_DIRECTORY`来指定环境中的OpenCV版本,以Ubuntu为例,我们可以按照如下方式编译安装。
|
||||
|
||||
|
||||
## CPU C++ SDK
|
||||
|
||||
### 1. 安装Opencv
|
||||
```
|
||||
sudo apt-get install libopencv-dev
|
||||
```
|
||||
|
||||
### 2. 指定OpenCV编译FastDeploy
|
||||
```
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
cmake .. -DENABLE_ORT_BACKEND=ON \
|
||||
-DENABLE_PADDLE_BACKEND=ON \
|
||||
-DENABLE_OPENVINO_BACKEND=ON \
|
||||
-DENABLE_VISION=ON \
|
||||
-DCMAKE_INSTALL_PREFIX=${PWD}/installed_fastdeploy \
|
||||
-DOPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4
|
||||
make -j8
|
||||
make install
|
||||
```
|
||||
编译完成的C++ SDK即为当前目录下的`installed_fastdeploy`,使用这个新的SDK即可。
|
||||
|
||||
其它部署硬件上的编译方式同理,通过`-DOPENCV_DIRECTORY`指定环境中的OpenCV编译即可, 注意此处的路径`/usr/lib/x86_64-linux-gnu/cmake/opencv4`需根据你的实际环境路径来设定,此目录下包含`OpenCVConfig-version.cmake`、`OpenCVConfig.cmake`等文件。
|
||||
|
||||
- [FastDeploy更多部署环境的编译](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/README_CN.md)
|
10
docs/cn/quantize.md
Normal file → Executable file
@@ -36,7 +36,7 @@ FastDeploy基于PaddleSlim的Auto Compression Toolkit(ACT), 给用户提供了
|
||||
目前, FastDeploy支持自动化压缩,并完成部署测试的模型的Runtime Benchmark和端到端Benchmark如下所示.
|
||||
|
||||
Benchmark表格说明:
|
||||
- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间.
|
||||
- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间.
|
||||
- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理.
|
||||
- 所测时延均为推理1000次后求得的平均值, 单位是毫秒(计时方式可参考本节下方的示意代码).
|
||||
- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项
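
The sketch below shows one way the end-to-end latency in these tables could be measured. It is an illustration only, with placeholder model files; the Runtime-only latency would additionally exclude pre/post-processing:

```python
import time
import cv2
import fastdeploy as fd

# Placeholder model; any FastDeploy vision model exposes the same predict() interface.
model = fd.vision.detection.PPYOLOE("model.pdmodel", "model.pdiparams", "infer_cfg.yml")
im = cv2.imread("test.jpg")

for _ in range(50):          # warm-up runs, not timed
    model.predict(im)

runs = 1000
start = time.perf_counter()
for _ in range(runs):
    model.predict(im)        # includes pre/post-processing, i.e. end-to-end latency
elapsed_ms = (time.perf_counter() - start) * 1000.0
print("average end-to-end latency: %.2f ms" % (elapsed_ms / runs))
```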
|
||||
@@ -63,7 +63,7 @@ Benchmark表格说明:
|
||||
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 995.85 | 477.93|None|None | 2.08 |51.1 | 46.2|量化蒸馏训练 |
|
||||
|
||||
#### 端到端 Benchmark
|
||||
| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 |
|
||||
| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 |
|
||||
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
|
||||
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | 量化蒸馏训练 |
|
||||
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | 量化蒸馏训练 |
|
||||
@@ -94,7 +94,7 @@ Benchmark表格说明:
|
||||
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 12.29 | 4.68 | None|None|2.62 |77.89 | 71.36 |离线量化 |
|
||||
|
||||
#### 端到端 Benchmark
|
||||
| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 |
|
||||
| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 |
|
||||
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
|
||||
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 4.92| 2.28|2.24|2.23 | 2.21 | 79.12 | 79.06 | 离线量化 |
|
||||
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 4.48|None |2.09|2.10 | 2.14 | 79.12 | 79.06 | 离线量化 |
|
||||
@@ -119,7 +119,7 @@ NOTE:
|
||||
- TensorRT比Paddle-TensorRT快的原因是在runtime移除了multiclass_nms3算子
|
||||
|
||||
#### 端到端 Benchmark
|
||||
| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 |
|
||||
| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 |
|
||||
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
|
||||
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | TensorRT | GPU | 35.75 | 15.42 |20.70|20.85 | 2.32 | 51.4 | 50.7 | 量化蒸馏训练 |
|
||||
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | Paddle-TensorRT | GPU | 33.48 |None | 18.47 |18.03 | 1.81 | 51.4 | 50.5| 量化蒸馏训练 |
|
||||
@@ -134,6 +134,6 @@ NOTE:
|
||||
| [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 1138.04| 602.62 |None|None | 1.89 |77.37 | 71.62 |量化蒸馏训练 |
|
||||
|
||||
#### 端到端 Benchmark
|
||||
| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 |
|
||||
| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 |
|
||||
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
|
||||
| [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 4726.65| 4134.91|None|None | 1.14 |77.37 | 71.62 |量化蒸馏训练 |
|
||||
|
@@ -68,7 +68,7 @@ target_link_libraries(infer_demo ${FASTDEPLOY_LIBS})
|
||||
|
||||
## 4. 编译可执行程序
|
||||
|
||||
假设当前目录已经准备好`infer_demo.cc`和`CMakeLists.txt`两个文件,目录结构如下所示,即可进行编译
|
||||
假设当前目录已经准备好`infer_demo.cc`和`CMakeLists.txt`两个文件,即可进行编译
|
||||
|
||||
### Linux & Mac
|
||||
|
||||
|
@@ -30,6 +30,11 @@ Prerequisite for Compiling on Linux & Mac:
|
||||
- gcc/g++ >= 5.4 (8.2 is recommended)
|
||||
- cmake >= 3.18.0
|
||||
|
||||
It is recommended to install the OpenCV library manually and define `-DOPENCV_DIRECTORY` to set the path of the OpenCV library (if the flag is not defined, a prebuilt OpenCV library will be downloaded automatically while building FastDeploy, but the prebuilt OpenCV cannot read video files or support functions such as `imshow`).
|
||||
```
|
||||
sudo apt-get install libopencv-dev
|
||||
```
|
||||
|
||||
```
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
@@ -38,7 +43,8 @@ cmake .. -DENABLE_ORT_BACKEND=ON \
|
||||
-DENABLE_PADDLE_BACKEND=ON \
|
||||
-DENABLE_OPENVINO_BACKEND=ON \
|
||||
-DCMAKE_INSTALL_PREFIX=${PWD}/compiled_fastdeploy_sdk \
|
||||
-DENABLE_VISION=ON
|
||||
-DENABLE_VISION=ON \
|
||||
-DOPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4
|
||||
make -j12
|
||||
make install
|
||||
```
|
||||
@@ -84,6 +90,11 @@ All compilation options are introduced via environment variables
|
||||
|
||||
### Linux & Mac
|
||||
|
||||
It is recommended to install the OpenCV library manually and define `-DOPENCV_DIRECTORY` to set the path of the OpenCV library (if the flag is not defined, a prebuilt OpenCV library will be downloaded automatically while building FastDeploy, but the prebuilt OpenCV cannot read video files or support functions such as `imshow`).
|
||||
```
|
||||
sudo apt-get install libopencv-dev
|
||||
```
|
||||
|
||||
```
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy/python
|
||||
@@ -91,6 +102,8 @@ export ENABLE_ORT_BACKEND=ON
|
||||
export ENABLE_PADDLE_BACKEND=ON
|
||||
export ENABLE_OPENVINO_BACKEND=ON
|
||||
export ENABLE_VISION=ON
|
||||
# The OPENCV_DIRECTORY is optional, if not exported, a prebuilt OpenCV library will be downloaded
|
||||
export OPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4
|
||||
|
||||
python setup.py build
|
||||
python setup.py bdist_wheel
|
||||
|
@@ -1,3 +1,4 @@
|
||||
English | [中文](../../cn/build_and_install/download_prebuilt_libraries.md)
|
||||
# How to Install Prebuilt Library
|
||||
|
||||
FastDeploy provides prebuilt libraries for developers to download and install directly. FastDeploy can also be built from source, so that developers can compile it according to their own needs.
|
||||
@@ -21,7 +22,7 @@ FastDeploy supports Computer Vision, Text and NLP model deployment on CPU and Nv
|
||||
|
||||
### Python SDK
|
||||
|
||||
Install the released version(the newest 0.8.0 for now)
|
||||
Install the released version(the newest 1.0.1 for now)
|
||||
|
||||
```
|
||||
pip install fastdeploy-gpu-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html
|
||||
@@ -41,12 +42,12 @@ conda config --add channels conda-forge && conda install cudatoolkit=11.2 cudnn=
|
||||
|
||||
### C++ SDK
|
||||
|
||||
Install the released version(Latest 0.8.0)
|
||||
Install the released version(Latest 1.0.1)
|
||||
|
||||
| Platform | File | Description |
|
||||
|:----------- |:--------------------------------------------------------------------------------------------------------------------- |:--------------------------------------------------------- |
|
||||
| Linux x64 | [fastdeploy-linux-x64-gpu-0.8.0.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-gpu-0.8.0.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2 |
|
||||
| Windows x64 | [fastdeploy-win-x64-gpu-0.8.0.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-0.8.0.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2 |
|
||||
| Linux x64 | [fastdeploy-linux-x64-gpu-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-gpu-1.0.1.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2 |
|
||||
| Windows x64 | [fastdeploy-win-x64-gpu-1.0.1.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-1.0.1.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2 |
|
||||
|
||||
Install the Develop version(Nightly build)
|
||||
|
||||
@@ -68,7 +69,7 @@ FastDeploy supports computer vision, text and NLP model deployment on CPU with P
|
||||
|
||||
### Python SDK
|
||||
|
||||
Install the released version(Latest 0.8.0 for now)
|
||||
Install the released version(Latest 1.0.1 for now)
|
||||
|
||||
```
|
||||
pip install fastdeploy-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html
|
||||
@@ -82,15 +83,15 @@ pip install fastdeploy-python==0.0.0 -f https://www.paddlepaddle.org.cn/whl/fast
|
||||
|
||||
### C++ SDK
|
||||
|
||||
Install the released version(Latest 0.8.0 for now, Android is 1.0.0 pre-release)
|
||||
Install the released version(Latest 1.0.1 for now, Android is 1.0.1)
|
||||
|
||||
| Platform | File | Description |
|
||||
|:------------- |:--------------------------------------------------------------------------------------------------------------------- |:------------------------------ |
|
||||
| Linux x64 | [fastdeploy-linux-x64-0.8.0.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-0.8.0.tgz) | g++ 8.2 |
|
||||
| Windows x64 | [fastdeploy-win-x64-0.8.0.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-0.8.0.zip) | Visual Studio 16 2019 |
|
||||
| Mac OSX x64 | [fastdeploy-osx-x86_64-0.8.0.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-0.8.0.tgz) | clang++ 10.0.0|
|
||||
| Mac OSX arm64 | [fastdeploy-osx-arm64-0.8.0.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-0.8.0.tgz) | clang++ 13.0.0 |
|
||||
| Linux aarch64 | - | - |
|
||||
| Linux x64 | [fastdeploy-linux-x64-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.1.tgz) | g++ 8.2 |
|
||||
| Windows x64 | [fastdeploy-win-x64-1.0.1.zip](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-1.0.1.zip) | Visual Studio 16 2019 |
|
||||
| Mac OSX x64 | [fastdeploy-osx-x86_64-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-1.0.1.tgz) | clang++ 10.0.0|
|
||||
| Mac OSX arm64 | [fastdeploy-osx-arm64-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-1.0.1.tgz) | clang++ 13.0.0 |
|
||||
| Linux aarch64 | [fastdeploy-linux-aarch64-1.0.1.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-1.0.1.tgz) | gcc 6.3 |
|
||||
| Android armv7&v8 | [fastdeploy-android-1.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.0-shared.tgz)| NDK 25, clang++, supports arm64-v8a and armeabi-v7a |
|
||||
|
||||
## Java SDK
|
||||
|
@@ -34,6 +34,11 @@ Prerequisite for Compiling on Linux:
|
||||
- cuda >= 11.2
|
||||
- cudnn >= 8.2
|
||||
|
||||
It is recommended to install the OpenCV library manually and define `-DOPENCV_DIRECTORY` to set the path of the OpenCV library (if the flag is not defined, a prebuilt OpenCV library will be downloaded automatically while building FastDeploy, but the prebuilt OpenCV cannot read video files or support functions such as `imshow`).
|
||||
```
|
||||
sudo apt-get install libopencv-dev
|
||||
```
|
||||
|
||||
```
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
@@ -46,7 +51,8 @@ cmake .. -DENABLE_ORT_BACKEND=ON \
|
||||
-DTRT_DIRECTORY=/Paddle/TensorRT-8.4.1.5 \
|
||||
-DCUDA_DIRECTORY=/usr/local/cuda \
|
||||
-DCMAKE_INSTALL_PREFIX=${PWD}/compiled_fastdeploy_sdk \
|
||||
-DENABLE_VISION=ON
|
||||
-DENABLE_VISION=ON \
|
||||
-DOPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4
|
||||
make -j12
|
||||
make install
|
||||
```
|
||||
@@ -106,6 +112,11 @@ Prerequisite for Compiling on Linux:
|
||||
|
||||
All compilation options are imported via environment variables
|
||||
|
||||
It is recommended to install the OpenCV library manually and define `-DOPENCV_DIRECTORY` to set the path of the OpenCV library (if the flag is not defined, a prebuilt OpenCV library will be downloaded automatically while building FastDeploy, but the prebuilt OpenCV cannot read video files or support functions such as `imshow`).
|
||||
```
|
||||
sudo apt-get install libopencv-dev
|
||||
```
|
||||
|
||||
```
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy/python
|
||||
@@ -117,6 +128,8 @@ export ENABLE_TRT_BACKEND=ON
|
||||
export WITH_GPU=ON
|
||||
export TRT_DIRECTORY=/Paddle/TensorRT-8.4.1.5
|
||||
export CUDA_DIRECTORY=/usr/local/cuda
|
||||
# The OPENCV_DIRECTORY is optional, if not exported, a prebuilt OpenCV library will be downloaded
|
||||
export OPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4
|
||||
|
||||
python setup.py build
|
||||
python setup.py bdist_wheel
|
||||
|
@@ -1,7 +1,7 @@
|
||||
|
||||
# How to Build FastDeploy Library on Nvidia Jetson Platform
|
||||
|
||||
FastDeploy supports CPU inference with ONNX Runtime and GPU inference with Nvidia TensorRT on Nvidia Jetson platform
|
||||
FastDeploy supports CPU inference with ONNX Runtime and GPU inference with Nvidia TensorRT/Paddle Inference on Nvidia Jetson platform
|
||||
|
||||
## How to Build and Install FastDeploy C++ Library
|
||||
|
||||
@@ -11,12 +11,16 @@ Prerequisite for Compiling on NVIDIA Jetson:
|
||||
- cmake >= 3.10.0
|
||||
- jetpack >= 4.6.1
|
||||
|
||||
If you need to integrate the Paddle Inference backend (supports CPU/GPU), please download and decompress the corresponding Jetpack C++ package from [Paddle Inference prebuilt libraries](https://www.paddlepaddle.org.cn/inference/v2.4/guides/install/download_lib.html#c) according to your development environment.
|
||||
|
||||
```
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
cmake .. -DBUILD_ON_JETSON=ON \
|
||||
-DENABLE_VISION=ON \
|
||||
-DENABLE_PADDLE_BACKEND=ON \ # This is optional, can be OFF if you don't need
|
||||
-DPADDLEINFERENCE_DIRECTORY=/Download/paddle_inference_jetson \
|
||||
-DCMAKE_INSTALL_PREFIX=${PWD}/installed_fastdeploy
|
||||
make -j8
|
||||
make install
|
||||
@@ -35,6 +39,8 @@ Prerequisite for Compiling on NVIDIA Jetson:
|
||||
|
||||
Note that the `wheel` package is required to build a wheel; execute `pip install wheel` first.
|
||||
|
||||
If you need to integrate the Paddle Inference backend (supports CPU/GPU), please download and decompress the corresponding Jetpack C++ package from [Paddle Inference prebuilt libraries](https://www.paddlepaddle.org.cn/inference/v2.4/guides/install/download_lib.html#c) according to your development environment.
|
||||
|
||||
All compilation options are imported via environment variables
|
||||
|
||||
```
|
||||
@@ -43,6 +49,10 @@ cd FastDeploy/python
|
||||
export BUILD_ON_JETSON=ON
|
||||
export ENABLE_VISION=ON
|
||||
|
||||
# ENABLE_PADDLE_BACKEND & PADDLEINFERENCE_DIRECTORY are optional
|
||||
export ENABLE_PADDLE_BACKEND=ON
|
||||
export PADDLEINFERENCE_DIRECTORY=/Download/paddle_inference_jetson
|
||||
|
||||
python setup.py build
|
||||
python setup.py bdist_wheel
|
||||
```
|
||||
|
37
docs/en/faq/custom_opencv.md
Normal file
@@ -0,0 +1,37 @@
|
||||
English | [中文](../../cn/faq/custom_opencv.md)
|
||||
|
||||
# Use Own OpenCV Library
|
||||
|
||||
The prebuilt FastDeploy library ships with a built-in OpenCV that cannot read video files or call `imshow`, because the prebuilt FastDeploy has to be built in a manylinux environment. If you need to read video or use other functions provided by OpenCV, this document shows how to build FastDeploy with your own OpenCV in your environment.
|
||||
|
||||
FastDeploy provides the flag `-DOPENCV_DIRECTORY` to set the path of the OpenCV library; the following steps show how to build the CPU C++ SDK on Ubuntu.
|
||||
|
||||
## CPU C++ SDK
|
||||
|
||||
### 1. Install OpenCV
|
||||
|
||||
```
|
||||
sudo apt-get install libopencv-dev
|
||||
```
|
||||
|
||||
### 2. Build FastDeploy
|
||||
|
||||
```
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
cmake .. -DENABLE_ORT_BACKEND=ON \
|
||||
-DENABLE_PADDLE_BACKEND=ON \
|
||||
-DENABLE_OPENVINO_BACKEND=ON \
|
||||
-DENABLE_VISION=ON \
|
||||
-DCMAKE_INSTALL_PREFIX=${PWD}/installed_fastdeploy \
|
||||
-DOPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4
|
||||
make -j8
|
||||
make install
|
||||
```
|
||||
|
||||
The C++ SDK is now available in the `installed_fastdeploy` directory; it can use all the functions provided by your own OpenCV library.
|
||||
|
||||
This document also applies to deployment on other hardware (GPU/IPU/XPU, etc.) on the Linux platform.
|
||||
|
||||
- [More Options to build FastDeploy](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/README_EN.md)
|
27
examples/multimodal/stable_diffusion/cpp/CMakeLists.txt
Normal file
@@ -0,0 +1,27 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
PROJECT(main C CXX)
|
||||
CMAKE_MINIMUM_REQUIRED (VERSION 3.10)
|
||||
|
||||
option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.")
|
||||
set(THIRD_LIBS "")
|
||||
include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
|
||||
|
||||
include_directories(${FASTDEPLOY_INCS})
|
||||
|
||||
file(GLOB_RECURSE ALL_SRCS ${PROJECT_SOURCE_DIR}/*.cc)
|
||||
|
||||
add_executable(main ${ALL_SRCS})
|
||||
target_link_libraries(main ${FASTDEPLOY_LIBS} ${THIRD_LIBS})
|
12
examples/multimodal/stable_diffusion/cpp/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# StableDiffusion C++部署示例
|
||||
|
||||
在部署前,需确认以下两个步骤
|
||||
|
||||
- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
|
||||
- 2. 根据开发环境,下载预编译部署库和samples代码,参考[FastDeploy预编译库](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
|
||||
|
||||
本目录下提供`*_infer.cc`快速完成StableDiffusion各任务的C++部署示例。
|
||||
|
||||
## Inpaint任务
|
||||
|
||||
StableDiffusion Inpaint任务是一个根据提示文本补全图片的任务,具体而言就是用户给定提示文本,原始图片以及原始图片的mask图片,该任务输出补全后的图片。
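
The C++ pipeline in this directory implements that flow. Purely to illustrate the task's inputs and outputs (this is not the API of the C++ example; the library and pretrained model name below are assumptions), the equivalent call with the Python `ppdiffusers` library looks roughly like this:

```python
# Illustrative sketch; assumes `pip install ppdiffusers` and that the pretrained model is available.
from PIL import Image
from ppdiffusers import StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting")
init_image = Image.open("input.png").convert("RGB")   # original image
mask_image = Image.open("mask.png").convert("RGB")    # white pixels mark the region to repaint

result = pipe(prompt="Face of a yellow cat, high resolution",
              image=init_image, mask_image=mask_image).images[0]
result.save("output.png")
```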
|
@@ -0,0 +1,398 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dpm_solver_multistep_scheduler.h"
|
||||
#include "fastdeploy/core/fd_scalar.h"
|
||||
#include "fastdeploy/function/functions.h"
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
namespace fastdeploy {
|
||||
|
||||
void DPMSolverMultistepScheduler::BetaForAlphaBar(FDTensor* out,
|
||||
int num_diffusion_timesteps,
|
||||
float max_beta) {
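// Cosine ("squaredcos_cap_v2") schedule: alpha_bar(t) = cos^2(((t + 0.008) / 1.008) * pi / 2),
// and each beta_i = min(1 - alpha_bar(t_{i+1}) / alpha_bar(t_i), max_beta).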
|
||||
auto alpha_bar = [](float time_step) -> float {
|
||||
constexpr float pi = 3.14159265358979323846;
|
||||
return std::pow(std::cos((time_step + 0.008) / 1.008 * pi / 2), 2);
|
||||
};
|
||||
std::vector<FDTensor> betas;
|
||||
for (int i = 0; i < num_diffusion_timesteps; ++i) {
|
||||
// Cast to float so the division is not truncated to zero by integer arithmetic.
float t1 = static_cast<float>(i) / num_diffusion_timesteps;
float t2 = static_cast<float>(i + 1) / num_diffusion_timesteps;
|
||||
float beta_val = (std::min)(1 - alpha_bar(t1) / alpha_bar(t2), max_beta);
|
||||
betas.emplace_back(Scalar(beta_val));
|
||||
}
|
||||
function::Concat(betas, out);
|
||||
}
|
||||
|
||||
DPMSolverMultistepScheduler::DPMSolverMultistepScheduler(
|
||||
int num_train_timesteps, float beta_start, float beta_end,
|
||||
const std::string& beta_schedule, const std::vector<float>& trained_betas,
|
||||
int solver_order, bool predict_epsilon, bool thresholding,
|
||||
float dynamic_thresholding_ratio, float sample_max_value,
|
||||
const std::string& algorithm_type, const std::string& solver_type,
|
||||
bool lower_order_final)
|
||||
: config({num_train_timesteps, beta_start, beta_end, beta_schedule,
|
||||
solver_order, predict_epsilon, thresholding,
|
||||
dynamic_thresholding_ratio, sample_max_value, algorithm_type,
|
||||
solver_type, lower_order_final}) {
|
||||
int beta_size = trained_betas.size();
|
||||
if (beta_size > 0) {
|
||||
betas_.Allocate({beta_size}, FDDataType::FP32);
|
||||
std::copy(trained_betas.data(), trained_betas.data() + beta_size,
|
||||
reinterpret_cast<float*>(betas_.Data()));
|
||||
} else if (beta_schedule == "linear") {
|
||||
function::Linspace(beta_start, beta_end, num_train_timesteps, &betas_,
|
||||
FDDataType::FP32);
|
||||
} else if (beta_schedule == "scaled_linear") {
|
||||
function::Linspace(std::sqrt(beta_start), std::sqrt(beta_end),
|
||||
num_train_timesteps, &betas_, FDDataType::FP32);
|
||||
betas_ = betas_ * betas_;
|
||||
} else if (beta_schedule == "squaredcos_cap_v2") {
|
||||
BetaForAlphaBar(&betas_, num_train_timesteps);
|
||||
} else {
|
||||
FDASSERT(false, "%s is not implemented for DPMSolverMultistepScheduler",
|
||||
beta_schedule.c_str());
|
||||
}
|
||||
|
||||
alphas_ = 1.0f - betas_;
|
||||
function::Cumprod(alphas_, &alphas_cumprod_);
|
||||
function::Sqrt(alphas_cumprod_, &alpha_t_);
|
||||
function::Sqrt(1.0f - alphas_cumprod_, &sigma_t_);
|
||||
FDTensor alpha_t_log, sigma_t_log;
|
||||
function::Log(alpha_t_, &alpha_t_log);
|
||||
function::Log(sigma_t_, &sigma_t_log);
|
||||
lambda_t_ = alpha_t_log - sigma_t_log;
|
||||
|
||||
FDASSERT(config.algorithm_type_ == "dpmsolver" ||
|
||||
config.algorithm_type_ == "dpmsolver++",
|
||||
"%s does is not implemented for DPMSolverMultistepScheduler",
|
||||
config.algorithm_type_.c_str());
|
||||
FDASSERT(config.solver_type_ == "midpoint" || config.solver_type_ == "heun",
|
||||
"%s does is not implemented for DPMSolverMultistepScheduler",
|
||||
config.solver_type_.c_str());
|
||||
num_inference_steps_ = -1;
|
||||
|
||||
function::Linspace(0, config.num_train_timesteps_ - 1,
|
||||
config.num_train_timesteps_, ×teps_);
|
||||
function::Cast(timesteps_, ×teps_, FDDataType::INT64);
|
||||
// Reverse timesteps
|
||||
int64_t* timesteps_data = reinterpret_cast<int64_t*>(timesteps_.Data());
|
||||
std::reverse(timesteps_data, timesteps_data + timesteps_.Numel());
|
||||
|
||||
model_outputs_.resize(config.solver_order_);
|
||||
lower_order_nums_ = 0;
|
||||
}
|
||||
|
||||
float DPMSolverMultistepScheduler::InitNoiseSigma() { return 1.0; }
|
||||
|
||||
void DPMSolverMultistepScheduler::ConvertModelOutput(
|
||||
const FDTensor& model_output, int timestep, const FDTensor& sample,
|
||||
FDTensor* out) {
|
||||
if (config.algorithm_type_ == "dpmsolver++") {
|
||||
FDTensor x0_pred;
|
||||
if (config.predict_epsilon_) {
|
||||
FDTensor alpha_t, sigma_t;
|
||||
function::Slice(alpha_t_, {0}, {timestep}, &alpha_t);
|
||||
function::Slice(sigma_t_, {0}, {timestep}, &sigma_t);
|
||||
x0_pred = (sample - sigma_t * model_output) / alpha_t;
|
||||
} else {
|
||||
x0_pred = model_output;
|
||||
}
|
||||
if (config.thresholding_) {
|
||||
FDTensor dynamic_max_val, x0_pred_abs;
|
||||
function::Abs(x0_pred, &x0_pred_abs);
|
||||
x0_pred_abs.Reshape({x0_pred_abs.Shape()[0], -1});
|
||||
function::Quantile(x0_pred_abs, {config.dynamic_thresholding_ratio_}, {1},
|
||||
&dynamic_max_val);
|
||||
|
||||
FDTensor max_value, dy_max_val;
|
||||
function::FullLike(dynamic_max_val, config.sample_max_value_, &max_value,
|
||||
dynamic_max_val.Dtype());
|
||||
function::Maximum(dynamic_max_val, max_value, &dy_max_val);
|
||||
int expand_dims = x0_pred.Shape().size() - 1;
|
||||
for (int i = 0; i < expand_dims; ++i) {
|
||||
dy_max_val.ExpandDim(dy_max_val.Shape().size());
|
||||
}
|
||||
float clip_max = reinterpret_cast<float*>(dy_max_val.Data())[0];
|
||||
function::Clip(x0_pred, -clip_max, clip_max, &x0_pred);
|
||||
x0_pred = x0_pred / dy_max_val;
|
||||
}
|
||||
*out = std::move(x0_pred);
|
||||
} else if (config.algorithm_type_ == "dpmsolver") {
|
||||
if (config.predict_epsilon_) {
|
||||
*out = model_output;
|
||||
} else {
|
||||
FDTensor alpha_t, sigma_t;
|
||||
function::Slice(alpha_t_, {0}, {timestep}, &alpha_t);
|
||||
function::Slice(sigma_t_, {0}, {timestep}, &sigma_t);
|
||||
*out = (sample - (alpha_t * model_output)) / sigma_t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DPMSolverMultistepScheduler::DPMSolverFirstOrderUpdate(
|
||||
const FDTensor& model_output, int timestep, int prev_timestep,
|
||||
const FDTensor& sample, FDTensor* out) {
|
||||
FDTensor lambda_t, lambda_s;
|
||||
function::Slice(lambda_t_, {0}, {prev_timestep}, &lambda_t);
|
||||
function::Slice(lambda_t_, {0}, {timestep}, &lambda_s);
|
||||
|
||||
FDTensor alpha_t, alpha_s;
|
||||
function::Slice(alpha_t_, {0}, {prev_timestep}, &alpha_t);
|
||||
function::Slice(alpha_t_, {0}, {timestep}, &alpha_s);
|
||||
|
||||
FDTensor sigma_t, sigma_s;
|
||||
function::Slice(sigma_t_, {0}, {prev_timestep}, &sigma_t);
|
||||
function::Slice(sigma_t_, {0}, {timestep}, &sigma_s);
|
||||
|
||||
FDTensor h = lambda_t - lambda_s;
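// First-order DPM-Solver step with h = lambda_t - lambda_s:
//   dpmsolver++: x_t = (sigma_t / sigma_s) * x_s - alpha_t * (exp(-h) - 1) * model_output
//   dpmsolver  : x_t = (alpha_t / alpha_s) * x_s - sigma_t * (exp(h) - 1) * model_output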
|
||||
if (config.algorithm_type_ == "dpmsolver++") {
|
||||
function::Exp(0.0f - h, &h);
|
||||
*out = (sigma_t / sigma_s) * sample - (alpha_t * (h - 1.0f)) * model_output;
|
||||
} else if (config.algorithm_type_ == "dpmsolver") {
|
||||
function::Exp(h, &h);
|
||||
*out = (alpha_t / alpha_s) * sample - (sigma_t * (h - 1.0f)) * model_output;
|
||||
}
|
||||
}
|
||||
|
||||
void DPMSolverMultistepScheduler::MultiStepDPMSolverSecondOrderUpdate(
|
||||
const std::vector<FDTensor>& model_output_list,
|
||||
const std::vector<int>& timestep_list, int prev_timestep,
|
||||
const FDTensor& sample, FDTensor* out) {
|
||||
int timestep_size = timestep_list.size();
|
||||
int model_output_size = model_output_list.size();
|
||||
int t = prev_timestep;
|
||||
int s0 = timestep_list[timestep_size - 1];
|
||||
int s1 = timestep_list[timestep_size - 2];
|
||||
const FDTensor& m0 = model_output_list[model_output_size - 1];
|
||||
const FDTensor& m1 = model_output_list[model_output_size - 2];
|
||||
FDTensor lambda_t, lambda_s0, lambda_s1;
|
||||
function::Slice(lambda_t_, {0}, {t}, &lambda_t);
|
||||
function::Slice(lambda_t_, {0}, {s0}, &lambda_s0);
|
||||
function::Slice(lambda_t_, {0}, {s1}, &lambda_s1);
|
||||
|
||||
FDTensor alpha_t, alpha_s0, sigma_t, sigma_s0;
|
||||
function::Slice(alpha_t_, {0}, {t}, &alpha_t);
|
||||
function::Slice(alpha_t_, {0}, {s0}, &alpha_s0);
|
||||
function::Slice(sigma_t_, {0}, {t}, &sigma_t);
|
||||
function::Slice(sigma_t_, {0}, {s0}, &sigma_s0);
|
||||
|
||||
FDTensor h = lambda_t - lambda_s0;
|
||||
FDTensor h0 = lambda_s0 - lambda_s1;
|
||||
FDTensor r0 = h0 / h;
|
||||
FDTensor D0 = m0;
|
||||
FDTensor D1 = (1.0f / r0) * (m0 - m1);
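// D0 is the most recent model output and D1 approximates its first derivative with respect
// to lambda; the midpoint/heun branches below combine them into a second-order update.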
|
||||
if (config.algorithm_type_ == "dpmsolver++") {
|
||||
if (config.solver_type_ == "midpoint") {
|
||||
function::Exp(0.0f - h, &h);
|
||||
*out = (sigma_t / sigma_s0 * sample) - (alpha_t * (h - 1.0f) * D0) -
|
||||
(0.5f * alpha_t * (h - 1.0f) * D1);
|
||||
} else if (config.solver_type_ == "heun") {
|
||||
FDTensor h_exp;
|
||||
function::Exp(0.0f - h, &h_exp);
|
||||
*out = (sigma_t / sigma_s0 * sample) - (alpha_t * (h_exp - 1.0f) * D0) +
|
||||
(alpha_t * ((h_exp - 1.0f) / h + 1.0f) * D1);
|
||||
}
|
||||
} else if (config.algorithm_type_ == "dpmsolver") {
|
||||
FDTensor h_exp;
|
||||
function::Exp(h, &h_exp);
|
||||
if (config.solver_type_ == "midpoint") {
|
||||
*out = alpha_t / alpha_s0 * sample - sigma_t * (h_exp - 1.0f) * D0 -
|
||||
0.5 * (sigma_t * (h_exp - 1.0f) * D1);
|
||||
} else if (config.solver_type_ == "heun") {
|
||||
*out = alpha_t / alpha_s0 * sample - sigma_t * (h_exp - 1.0f) * D0 -
|
||||
(sigma_t * ((h_exp - 1.0f) / h - 1.0f) * D1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DPMSolverMultistepScheduler::MultiStepDPMSolverThirdOrderUpdate(
|
||||
const std::vector<FDTensor>& model_output_list,
|
||||
const std::vector<int>& timestep_list, int prev_timestep,
|
||||
const FDTensor& sample, FDTensor* out) {
|
||||
int timestep_size = timestep_list.size();
|
||||
int model_output_size = model_output_list.size();
|
||||
int t = prev_timestep;
|
||||
|
||||
int s0 = timestep_list[timestep_size - 1];
|
||||
int s1 = timestep_list[timestep_size - 2];
|
||||
int s2 = timestep_list[timestep_size - 3];
|
||||
const FDTensor& m0 = model_output_list[model_output_size - 1];
|
||||
const FDTensor& m1 = model_output_list[model_output_size - 2];
|
||||
const FDTensor& m2 = model_output_list[model_output_size - 3];
|
||||
|
||||
FDTensor lambda_t, lambda_s0, lambda_s1, lambda_s2;
|
||||
function::Slice(lambda_t_, {0}, {t}, &lambda_t);
|
||||
function::Slice(lambda_t_, {0}, {s0}, &lambda_s0);
|
||||
function::Slice(lambda_t_, {0}, {s1}, &lambda_s1);
|
||||
function::Slice(lambda_t_, {0}, {s2}, &lambda_s2);
|
||||
|
||||
FDTensor alpha_t, alpha_s0, sigma_t, sigma_s0;
|
||||
function::Slice(alpha_t_, {0}, {t}, &alpha_t);
|
||||
function::Slice(alpha_t_, {0}, {s0}, &alpha_s0);
|
||||
function::Slice(sigma_t_, {0}, {t}, &sigma_t);
|
||||
function::Slice(sigma_t_, {0}, {s0}, &sigma_s0);
|
||||
|
||||
FDTensor h = lambda_t - lambda_s0;
|
||||
FDTensor h0 = lambda_s0 - lambda_s1;
|
||||
FDTensor h1 = lambda_s1 - lambda_s2;
|
||||
|
||||
FDTensor r0 = h0 / h;
|
||||
FDTensor r1 = h1 / h;
|
||||
FDTensor D0 = m0;
|
||||
FDTensor D1_0 = (1.0f / r0) * (m0 - m1);
|
||||
FDTensor D1_1 = (1.0f / r1) * (m1 - m2);
|
||||
FDTensor D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1);
|
||||
FDTensor D2 = (1.0f / (r0 + r1)) * (D1_0 - D1_1);
|
||||
|
||||
if (config.algorithm_type_ == "dpmsolver++") {
|
||||
FDTensor h_exp;
|
||||
function::Exp(0.0f - h, &h_exp);
|
||||
*out = (sigma_t / sigma_s0) * sample - (alpha_t * (h_exp - 1.0f)) * D0 +
|
||||
(alpha_t * ((h_exp - 1.0) / h + 1.0)) * D1 -
|
||||
(alpha_t * ((h_exp - 1.0 + h) / (h * h) - 0.5)) * D2;
|
||||
|
||||
} else if (config.algorithm_type_ == "dpmsolver") {
|
||||
FDTensor h_exp;
|
||||
function::Exp(h, &h_exp);
|
||||
*out = (alpha_t / alpha_s0) * sample - (sigma_t * (h_exp - 1.0f)) * D0 +
|
||||
(sigma_t * ((h_exp - 1.0) / h - 1.0)) * D1 -
|
||||
(sigma_t * ((h_exp - 1.0 - h) / (h * h) - 0.5)) * D2;
|
||||
}
|
||||
}
|
||||
|
||||
void DPMSolverMultistepScheduler::ScaleModelInput(
|
||||
const FDTensor& sample, FDTensor* out,
|
||||
const std::vector<FDTensor>& timesteps) {
|
||||
*out = sample;
|
||||
}
|
||||
|
||||
void DPMSolverMultistepScheduler::SetTimesteps(int num_inference_steps) {
|
||||
num_inference_steps_ = num_inference_steps;
|
||||
function::Linspace(0, config.num_train_timesteps_ - 1,
|
||||
num_inference_steps + 1, ×teps_);
|
||||
function::Round(timesteps_, ×teps_);
|
||||
// Reverse timesteps
|
||||
float* timesteps_data = reinterpret_cast<float*>(timesteps_.Data());
|
||||
std::reverse(timesteps_data, timesteps_data + timesteps_.Numel());
|
||||
FDTensor timestep_tmp;
|
||||
timestep_tmp.Allocate({num_inference_steps}, timesteps_.Dtype());
|
||||
float* timestep_tmp_data = reinterpret_cast<float*>(timestep_tmp.Data());
|
||||
std::copy(timesteps_data, timesteps_data + num_inference_steps,
|
||||
timestep_tmp_data);
|
||||
timesteps_ = std::move(timestep_tmp);
|
||||
|
||||
function::Cast(timesteps_, ×teps_, FDDataType::INT64);
|
||||
|
||||
model_outputs_.clear();
|
||||
model_outputs_.resize(config.solver_order_);
|
||||
|
||||
lower_order_nums_ = 0;
|
||||
}
|
||||
|
||||
void DPMSolverMultistepScheduler::Step(const FDTensor& model_output,
|
||||
int timestep, const FDTensor& sample,
|
||||
FDTensor* prev_sample) {
|
||||
FDASSERT(num_inference_steps_ > -1,
|
||||
"Number of inference steps is -1, you need to run SetTimesteps "
|
||||
"after creating the scheduler");
|
||||
int64_t step_index = timesteps_.Numel() - 1;
|
||||
int64_t* timesteps_data = reinterpret_cast<int64_t*>(timesteps_.Data());
|
||||
int64_t* timesteps_iter =
|
||||
std::find(timesteps_data, timesteps_data + timesteps_.Numel(), timestep);
|
||||
if (timesteps_iter - timesteps_data < timesteps_.Numel()) {
|
||||
step_index = timesteps_iter - timesteps_data;
|
||||
}
|
||||
int64_t prev_timestep = 0;
|
||||
if (step_index != timesteps_.Numel() - 1) {
|
||||
prev_timestep = timesteps_data[step_index + 1];
|
||||
}
|
||||
bool lower_order_final = (step_index == timesteps_.Numel() - 1) &&
|
||||
config.lower_order_final_ &&
|
||||
(timesteps_.Numel() < 15);
|
||||
bool lower_order_second = (step_index == timesteps_.Numel() - 2) &&
|
||||
config.lower_order_final_ &&
|
||||
(timesteps_.Numel() < 15);
|
||||
FDTensor model_out;
|
||||
ConvertModelOutput(model_output, timestep, sample, &model_out);
|
||||
for (int i = 0; i < config.solver_order_ - 1; ++i) {
|
||||
model_outputs_[i] = std::move(model_outputs_[i + 1]);
|
||||
}
|
||||
model_outputs_[config.solver_order_ - 1] = std::move(model_out);
|
||||
|
||||
if (config.solver_order_ == 1 || lower_order_nums_ < 1 || lower_order_final) {
|
||||
DPMSolverFirstOrderUpdate(model_outputs_[config.solver_order_ - 1],
|
||||
timestep, prev_timestep, sample, prev_sample);
|
||||
} else if (config.solver_order_ == 2 || lower_order_nums_ < 2 ||
|
||||
lower_order_second) {
|
||||
int t0 = reinterpret_cast<int64_t*>(timesteps_.Data())[step_index - 1];
|
||||
std::vector<int> timestep_list = {t0, timestep};
|
||||
MultiStepDPMSolverSecondOrderUpdate(model_outputs_, timestep_list,
|
||||
prev_timestep, sample, prev_sample);
|
||||
} else {
|
||||
int t0 = reinterpret_cast<int64_t*>(timesteps_.Data())[step_index - 1];
|
||||
int t1 = reinterpret_cast<int64_t*>(timesteps_.Data())[step_index - 2];
|
||||
std::vector<int> timestep_list = {t1, t0, timestep};
|
||||
MultiStepDPMSolverThirdOrderUpdate(model_outputs_, timestep_list,
|
||||
prev_timestep, sample, prev_sample);
|
||||
}
|
||||
|
||||
if (lower_order_nums_ < config.solver_order_) {
|
||||
lower_order_nums_ += 1;
|
||||
}
|
||||
}
|
||||
|
||||
void DPMSolverMultistepScheduler::AddNoise(const FDTensor& original_samples,
|
||||
const FDTensor& noise,
|
||||
const FDTensor& timesteps,
|
||||
FDTensor* out) {
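// Forward diffusion q(x_t | x_0):
//   out = sqrt(alpha_bar_t) * original_samples + sqrt(1 - alpha_bar_t) * noise,
// with alpha_bar_t gathered at the requested timesteps.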
|
||||
function::Cast(alphas_cumprod_, &alphas_cumprod_, original_samples.Dtype());
|
||||
|
||||
const int64_t* timesteps_data =
|
||||
reinterpret_cast<const int64_t*>(timesteps.Data());
|
||||
std::vector<int64_t> timesteps_vec;
|
||||
for (int i = 0; i < timesteps.Numel(); ++i) {
|
||||
timesteps_vec.push_back(timesteps_data[i]);
|
||||
}
|
||||
FDTensor sqrt_alpha_prod;
|
||||
function::Slice(alphas_cumprod_, {0}, timesteps_vec, &sqrt_alpha_prod);
|
||||
function::Sqrt(sqrt_alpha_prod, &sqrt_alpha_prod);
|
||||
sqrt_alpha_prod.Reshape({-1});
|
||||
int rank_diff =
|
||||
original_samples.Shape().size() - sqrt_alpha_prod.Shape().size();
|
||||
for (int i = 0; i < rank_diff; ++i) {
|
||||
int curr_rank = sqrt_alpha_prod.Shape().size();
|
||||
sqrt_alpha_prod.ExpandDim(curr_rank - 1);
|
||||
}
|
||||
|
||||
FDTensor sqrt_one_minus_alpha_prod;
|
||||
function::Slice(alphas_cumprod_, {0}, timesteps_vec,
|
||||
&sqrt_one_minus_alpha_prod);
|
||||
sqrt_one_minus_alpha_prod = 1.0f - sqrt_one_minus_alpha_prod;
|
||||
function::Sqrt(sqrt_one_minus_alpha_prod, &sqrt_one_minus_alpha_prod);
|
||||
sqrt_one_minus_alpha_prod.Reshape({-1});
|
||||
rank_diff = original_samples.Shape().size() -
|
||||
sqrt_one_minus_alpha_prod.Shape().size();
|
||||
for (int i = 0; i < rank_diff; ++i) {
|
||||
int curr_rank = sqrt_one_minus_alpha_prod.Shape().size();
|
||||
sqrt_one_minus_alpha_prod.ExpandDim(curr_rank - 1);
|
||||
}
|
||||
*out = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise;
|
||||
}
|
||||
|
||||
FDTensor DPMSolverMultistepScheduler::GetTimesteps() { return timesteps_; }
|
||||
|
||||
} // namespace fastdeploy
|
@@ -0,0 +1,87 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "./scheduler.h"
|
||||
#include "fastdeploy/core/fd_tensor.h"
|
||||
|
||||
namespace fastdeploy {
|
||||
|
||||
class DPMSolverMultistepScheduler : public Scheduler {
|
||||
public:
|
||||
DPMSolverMultistepScheduler(int num_train_timesteps = 1000,
|
||||
float beta_start = 0.0001, float beta_end = 0.02,
|
||||
const std::string& beta_schedule = "linear",
|
||||
const std::vector<float>& trained_betas = {},
|
||||
int solver_order = 2, bool predict_epsilon = true,
|
||||
bool thresholding = false,
|
||||
float dynamic_thresholding_ratio = 0.995,
|
||||
float sample_max_value = 1.0,
|
||||
const std::string& algorithm_type = "dpmsolver++",
|
||||
const std::string& solver_type = "midpoint",
|
||||
bool lower_order_final = true);
|
||||
void BetaForAlphaBar(FDTensor* out, int num_diffusion_timesteps,
|
||||
float max_beta = 0.999);
|
||||
void ConvertModelOutput(const FDTensor& model_output, int timestep,
|
||||
const FDTensor& sample, FDTensor* out);
|
||||
void DPMSolverFirstOrderUpdate(const FDTensor& model_output, int timestep,
|
||||
int prev_timestep, const FDTensor& sample,
|
||||
FDTensor* out);
|
||||
void MultiStepDPMSolverSecondOrderUpdate(
|
||||
const std::vector<FDTensor>& model_output_list,
|
||||
const std::vector<int>& timestep_list, int prev_timestep,
|
||||
const FDTensor& sample, FDTensor* out);
|
||||
void MultiStepDPMSolverThirdOrderUpdate(
|
||||
const std::vector<FDTensor>& model_output_list,
|
||||
const std::vector<int>& timestep_list, int prev_timestep,
|
||||
const FDTensor& sample, FDTensor* out);
|
||||
void SetTimesteps(int num_inference_steps) override;
|
||||
void Step(const FDTensor& model_output, int timestep, const FDTensor& sample,
|
||||
FDTensor* prev_sample) override;
|
||||
void ScaleModelInput(const FDTensor& sample, FDTensor* out,
|
||||
const std::vector<FDTensor>& timesteps = {}) override;
|
||||
void AddNoise(const FDTensor& original_samples, const FDTensor& noise,
|
||||
const FDTensor& timesteps, FDTensor* out) override;
|
||||
float InitNoiseSigma() override;
|
||||
FDTensor GetTimesteps() override;
|
||||
struct Config {
|
||||
int num_train_timesteps_;
|
||||
float beta_start_;
|
||||
float beta_end_;
|
||||
std::string beta_schedule_;
|
||||
int solver_order_;
|
||||
bool predict_epsilon_;
|
||||
bool thresholding_;
|
||||
float dynamic_thresholding_ratio_;
|
||||
float sample_max_value_;
|
||||
std::string algorithm_type_;
|
||||
std::string solver_type_;
|
||||
bool lower_order_final_;
|
||||
} config;
|
||||
|
||||
private:
|
||||
FDTensor betas_;
|
||||
FDTensor alphas_;
|
||||
FDTensor alphas_cumprod_;
|
||||
FDTensor alpha_t_;
|
||||
FDTensor sigma_t_;
|
||||
FDTensor lambda_t_;
|
||||
int num_inference_steps_;
|
||||
FDTensor timesteps_;
|
||||
int lower_order_nums_;
|
||||
std::vector<FDTensor> model_outputs_;
|
||||
};
|
||||
|
||||
} // namespace fastdeploy
|
239
examples/multimodal/stable_diffusion/cpp/main.cc
Normal file
@@ -0,0 +1,239 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "./dpm_solver_multistep_scheduler.h"
|
||||
#include "./pipeline_stable_diffusion_inpaint.h"
|
||||
#include "fastdeploy/utils/perf.h"
|
||||
#include "fastdeploy/vision/common/processors/mat.h"
|
||||
#include "opencv2/highgui/highgui.hpp"
|
||||
#include "opencv2/imgproc/imgproc.hpp"
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#ifdef WIN32
|
||||
const char sep = '\\';
|
||||
#else
|
||||
const char sep = '/';
|
||||
#endif
|
||||
|
||||
template <typename T> std::string Str(const T* value, int size) {
|
||||
std::ostringstream oss;
|
||||
oss << "[ " << value[0];
|
||||
for (int i = 1; i < size; ++i) {
|
||||
oss << ", " << value[i];
|
||||
}
|
||||
oss << " ]";
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
std::unique_ptr<fastdeploy::Runtime> CreateRuntime(
|
||||
const std::string& model_file, const std::string& params_file,
|
||||
bool use_trt_backend = false, bool use_fp16 = false,
|
||||
const std::unordered_map<std::string, std::vector<std::vector<int>>>&
|
||||
dynamic_shapes = {},
|
||||
const std::vector<std::string>& disable_paddle_trt_ops = {}) {
|
||||
fastdeploy::RuntimeOption runtime_option;
|
||||
runtime_option.SetModelPath(model_file, params_file,
|
||||
fastdeploy::ModelFormat::PADDLE);
|
||||
runtime_option.UseGpu();
|
||||
if (!use_trt_backend) {
|
||||
runtime_option.UsePaddleBackend();
|
||||
} else {
|
||||
runtime_option.UseTrtBackend();
|
||||
runtime_option.EnablePaddleToTrt();
|
||||
for (auto it = dynamic_shapes.begin(); it != dynamic_shapes.end(); ++it) {
|
||||
if (it->second.size() != 3) {
|
||||
std::cerr << "The size of dynamic_shapes of input `" << it->first
|
||||
<< "` should be 3, but received " << it->second.size()
|
||||
<< std::endl;
|
||||
continue;
|
||||
}
|
||||
std::vector<int> min_shape = (it->second)[0];
|
||||
std::vector<int> opt_shape = (it->second)[1];
|
||||
std::vector<int> max_shape = (it->second)[2];
|
||||
runtime_option.SetTrtInputShape(it->first, min_shape, opt_shape,
|
||||
max_shape);
|
||||
}
|
||||
runtime_option.SetTrtCacheFile("paddle.trt");
|
||||
runtime_option.EnablePaddleTrtCollectShape();
|
||||
runtime_option.DisablePaddleTrtOPs(disable_paddle_trt_ops);
|
||||
if (use_fp16) {
|
||||
runtime_option.EnableTrtFP16();
|
||||
}
|
||||
}
|
||||
std::unique_ptr<fastdeploy::Runtime> runtime =
|
||||
std::unique_ptr<fastdeploy::Runtime>(new fastdeploy::Runtime());
|
||||
if (!runtime->Init(runtime_option)) {
|
||||
std::cerr << "--- Init FastDeploy Runtime Failed! "
|
||||
<< "\n--- Model: " << model_file << std::endl;
|
||||
return nullptr;
|
||||
} else {
|
||||
std::cout << "--- Init FastDeploy Runtime Done! "
|
||||
<< "\n--- Model: " << model_file << std::endl;
|
||||
}
|
||||
return runtime;
|
||||
}
|
||||
|
||||
int main() {
|
||||
// 0. Init all configs
|
||||
std::string model_dir = "sd15_inpaint";
|
||||
int max_length = 77;
|
||||
bool use_trt_backend = true;
|
||||
bool use_fp16 = true;
|
||||
int batch_size = 1;
|
||||
int num_images_per_prompt = 1;
|
||||
int num_inference_steps = 50;
|
||||
|
||||
int height = 512;
|
||||
int width = 512;
|
||||
constexpr int unet_inpaint_channels = 9;
|
||||
constexpr int latents_channels = 4;
|
||||
|
||||
// 1. Init scheduler
|
||||
std::unique_ptr<fastdeploy::Scheduler> dpm(
|
||||
new fastdeploy::DPMSolverMultistepScheduler(
|
||||
/* num_train_timesteps = */ 1000,
|
||||
/* beta_start = */ 0.00085,
|
||||
/* beta_end = */ 0.012,
|
||||
/* beta_schedule = */ "scaled_linear",
|
||||
/* trained_betas = */ {},
|
||||
/* solver_order = */ 2,
|
||||
/* predict_epsilon = */ true,
|
||||
/* thresholding = */ false,
|
||||
/* dynamic_thresholding_ratio = */ 0.995,
|
||||
/* sample_max_value = */ 1.0,
|
||||
/* algorithm_type = */ "dpmsolver++",
|
||||
/* solver_type = */ "midpoint",
|
||||
/* lower_order_final = */ true));
|
||||
|
||||
// 2. Init text encoder runtime
|
||||
std::unordered_map<std::string, std::vector<std::vector<int>>>
|
||||
text_dynamic_shape = {{"input_ids",
|
||||
{/* min_shape */ {1, max_length},
|
||||
/* opt_shape */ {batch_size, max_length},
|
||||
/* max_shape */ {2 * batch_size, max_length}}}};
|
||||
std::string text_model_dir = model_dir + sep + "text_encoder";
|
||||
std::string text_model_file = text_model_dir + sep + "inference.pdmodel";
|
||||
std::string text_params_file = text_model_dir + sep + "inference.pdiparams";
|
||||
std::unique_ptr<fastdeploy::Runtime> text_encoder_runtime =
|
||||
CreateRuntime(text_model_file, text_params_file, use_trt_backend,
|
||||
use_fp16, text_dynamic_shape);
|
||||
|
||||
// 3. Init vae encoder runtime
|
||||
std::unordered_map<std::string, std::vector<std::vector<int>>>
|
||||
vae_encoder_dynamic_shape = {
|
||||
{"sample",
|
||||
{/* min_shape */ {1, 3, height, width},
|
||||
/* opt_shape */ {2 * batch_size, 3, height, width},
|
||||
/* max_shape */ {2 * batch_size, 3, height, width}}}};
|
||||
std::string vae_encoder_model_dir = model_dir + sep + "vae_encoder";
|
||||
std::string vae_encoder_model_file =
|
||||
vae_encoder_model_dir + sep + "inference.pdmodel";
|
||||
std::string vae_encoder_params_file =
|
||||
vae_encoder_model_dir + sep + "inference.pdiparams";
|
||||
std::unique_ptr<fastdeploy::Runtime> vae_encoder_runtime =
|
||||
CreateRuntime(vae_encoder_model_file, vae_encoder_params_file,
|
||||
use_trt_backend, use_fp16, vae_encoder_dynamic_shape);
|
||||
|
||||
// 4. Init vae decoder runtime
|
||||
std::unordered_map<std::string, std::vector<std::vector<int>>>
|
||||
vae_decoder_dynamic_shape = {
|
||||
{"latent_sample",
|
||||
{/* min_shape */ {1, latents_channels, height / 8, width / 8},
|
||||
/* opt_shape */
|
||||
{2 * batch_size, latents_channels, height / 8, width / 8},
|
||||
/* max_shape */
|
||||
{2 * batch_size, latents_channels, height / 8, width / 8}}}};
|
||||
std::string vae_decoder_model_dir = model_dir + sep + "vae_decoder";
|
||||
std::string vae_decoder_model_file =
|
||||
vae_decoder_model_dir + sep + "inference.pdmodel";
|
||||
std::string vae_decoder_params_file =
|
||||
vae_decoder_model_dir + sep + "inference.pdiparams";
|
||||
std::unique_ptr<fastdeploy::Runtime> vae_decoder_runtime =
|
||||
CreateRuntime(vae_decoder_model_file, vae_decoder_params_file,
|
||||
use_trt_backend, use_fp16, vae_decoder_dynamic_shape);
|
||||
|
||||
// 5. Init unet runtime
|
||||
std::unordered_map<std::string, std::vector<std::vector<int>>>
|
||||
unet_dynamic_shape = {
|
||||
{"sample",
|
||||
{/* min_shape */ {1, unet_inpaint_channels, height / 8, width / 8},
|
||||
/* opt_shape */
|
||||
{2 * batch_size, unet_inpaint_channels, height / 8, width / 8},
|
||||
/* max_shape */
|
||||
{2 * batch_size, unet_inpaint_channels, height / 8, width / 8}}},
|
||||
{"timesteps", {{1}, {1}, {1}}},
|
||||
{"encoder_hidden_states",
|
||||
{{1, max_length, 768},
|
||||
{2 * batch_size, max_length, 768},
|
||||
{2 * batch_size, max_length, 768}}}};
|
||||
std::vector<std::string> unet_disable_paddle_trt_ops = {"sin", "cos"};
|
||||
std::string unet_model_dir = model_dir + sep + "unet";
|
||||
std::string unet_model_file = unet_model_dir + sep + "inference.pdmodel";
|
||||
std::string unet_params_file = unet_model_dir + sep + "inference.pdiparams";
|
||||
std::unique_ptr<fastdeploy::Runtime> unet_runtime =
|
||||
CreateRuntime(unet_model_file, unet_params_file, use_trt_backend,
|
||||
use_fp16, unet_dynamic_shape, unet_disable_paddle_trt_ops);
|
||||
|
||||
// 6. Init fast tokenizer
|
||||
paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer tokenizer(
|
||||
"clip/vocab.json", "clip/merges.txt", /* max_length = */ max_length);
|
||||
fastdeploy::StableDiffusionInpaintPipeline pipe(
|
||||
/* vae_encoder = */ std::move(vae_encoder_runtime),
|
||||
/* vae_decoder = */ std::move(vae_decoder_runtime),
|
||||
/* text_encoder = */ std::move(text_encoder_runtime),
|
||||
/* unet = */ std::move(unet_runtime),
|
||||
/* scheduler = */ std::move(dpm),
|
||||
/* tokenizer = */ tokenizer);
|
||||
|
||||
// 7. Read images
|
||||
auto image = cv::imread("overture-creations.png");
|
||||
auto mask_image = cv::imread("overture-creations-mask.png");
|
||||
|
||||
// 8. Predict
|
||||
/*
|
||||
* One may need to pass the initial noise to predict api.
|
||||
* Here is an example:
|
||||
* std::vector<float> latents_data = {xxxx};
|
||||
* fastdeploy::FDTensor latents;
|
||||
* latents.SetExternalData({batch_size * num_images_per_prompt, latents_channels, height / 8, width / 8},fastdeploy::FDDataType::FP32, latents_data.data());
|
||||
* pipe.Predict(..., /* latents = *\/ &latents, ....);
|
||||
*/
|
||||
std::vector<std::string> prompts = {
|
||||
"Face of a yellow cat, high resolution, sitting on a park bench"};
|
||||
std::vector<fastdeploy::FDTensor> outputs;
|
||||
fastdeploy::TimeCounter tc;
|
||||
tc.Start();
|
||||
pipe.Predict(prompts, image, mask_image, &outputs,
|
||||
/* height = */ height,
|
||||
/* width = */ width,
|
||||
/* num_inference_steps = */ num_inference_steps,
|
||||
/* guidance_scale = */ 7.5,
|
||||
/* negative_prompt = */ {},
|
||||
/* num_images_per_prompt = */ num_images_per_prompt,
|
||||
/* eta = */ 1.0,
|
||||
/* max_length = */ max_length,
|
||||
/* latents = */ nullptr,
|
||||
/* output_cv_mat = */ true,
|
||||
/* callback = */ nullptr,
|
||||
/* callback_steps = */ 1);
|
||||
tc.End();
|
||||
tc.PrintInfo();
|
||||
fastdeploy::vision::FDMat mat = fastdeploy::vision::FDMat::Create(outputs[0]);
|
||||
cv::imwrite("cat_on_bench_new.png", *mat.GetOpenCVMat());
|
||||
return 0;
|
||||
}
|
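The `Predict` call above passes `nullptr` for `callback`, so no per-step progress is reported. As a minimal sketch (not part of the original example; it assumes the `callback_ptr` typedef `void (*)(int, int, FDTensor*)` declared in `pipeline_stable_diffusion_inpaint.h` and the headers already included by `main.cc`), a progress callback could look like this:

```c++
// Hypothetical progress callback: the pipeline invokes it every `callback_steps`
// denoising iterations with the step index, the scheduler timestep and the
// current latents tensor.
static void PrintProgress(int step, int timestep, fastdeploy::FDTensor* latents) {
  std::cout << "step " << step << ", timestep " << timestep
            << ", latents batch = " << latents->Shape()[0] << std::endl;
}
```

It would then replace the `/* callback = */ nullptr` argument, for example `/* callback = */ PrintProgress, /* callback_steps = */ 5`.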
@@ -0,0 +1,320 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "pipeline_stable_diffusion_inpaint.h"
|
||||
#include "fastdeploy/function/functions.h"
|
||||
#include "fastdeploy/vision/common/processors/color_space_convert.h"
|
||||
#include "fastdeploy/vision/common/processors/mat.h"
|
||||
#include "fastdeploy/vision/common/processors/resize.h"
|
||||
#include <algorithm>
|
||||
|
||||
using namespace paddlenlp;
|
||||
|
||||
namespace fastdeploy {
|
||||
|
||||
static constexpr int NUM_LATENT_CHANNELS = 4;
|
||||
static constexpr int NUM_UNET_INPUT_CHANNELS = 9;
|
||||
|
||||
void StableDiffusionInpaintPipeline::PrepareMaskAndMaskedImage(
|
||||
const cv::Mat& image, const cv::Mat& mask_mat,
|
||||
const std::vector<int64_t>& shape, FDTensor* mask, FDTensor* mask_image) {
|
||||
vision::FDMat image_fdmat(image);
|
||||
vision::BGR2RGB::Run(&image_fdmat, vision::ProcLib::OPENCV);
|
||||
vision::Resize::Run(&image_fdmat, shape[1] * 8, shape[0] * 8, -1.0f, -1.0f,
|
||||
cv::INTER_NEAREST, false, vision::ProcLib::OPENCV);
|
||||
image_fdmat.ShareWithTensor(mask_image);
|
||||
|
||||
vision::FDMat mask_fdmat(mask_mat);
|
||||
vision::BGR2GRAY::Run(&mask_fdmat, vision::ProcLib::OPENCV);
|
||||
vision::Resize::Run(&mask_fdmat, shape[1] * 8, shape[0] * 8, -1.0f, -1.0f,
|
||||
cv::INTER_NEAREST, false, vision::ProcLib::OPENCV);
|
||||
FDTensor image_mask;
|
||||
mask_fdmat.ShareWithTensor(&image_mask);
|
||||
function::Cast(image_mask, &image_mask, FDDataType::FP32);
|
||||
std::vector<float> float_mask(image_mask.Numel(), 0);
|
||||
float* image_mask_ptr = reinterpret_cast<float*>(image_mask.Data());
|
||||
for (int i = 0; i < image_mask.Numel(); ++i) {
|
||||
if (image_mask_ptr[i] < 127.5) {
|
||||
float_mask[i] = 1;
|
||||
}
|
||||
}
|
||||
// NCHW format
|
||||
image_mask.SetExternalData({1, 1, shape[0] * 8, shape[1] * 8},
|
||||
FDDataType::FP32, float_mask.data());
|
||||
|
||||
// Set mask_image
|
||||
mask_image->ExpandDim();
|
||||
function::Transpose(*mask_image, mask_image, {0, 3, 1, 2});
|
||||
function::Cast(*mask_image, mask_image, FDDataType::FP32);
|
||||
*mask_image = *mask_image / 127.5f - 1.0f;
|
||||
*mask_image = *mask_image * image_mask;
|
||||
|
||||
// Set mask
|
||||
vision::FDMat mask_fdmat_t(mask_mat);
|
||||
vision::BGR2GRAY::Run(&mask_fdmat_t, vision::ProcLib::OPENCV);
|
||||
vision::Resize::Run(&mask_fdmat_t, shape[1], shape[0], -1.0f, -1.0f,
|
||||
cv::INTER_NEAREST, false, vision::ProcLib::OPENCV);
|
||||
mask_fdmat_t.ShareWithTensor(mask);
|
||||
function::Cast(*mask, mask, FDDataType::FP32);
|
||||
*mask = *mask / 255.0f;
|
||||
mask->ExpandDim();
|
||||
function::Transpose(*mask, mask, {0, 3, 1, 2});
|
||||
float* mask_data = reinterpret_cast<float*>(mask->Data());
|
||||
for (int i = 0; i < mask->Numel(); ++i) {
|
||||
if (mask_data[i] < 0.5) {
|
||||
mask_data[i] = 0;
|
||||
} else {
|
||||
mask_data[i] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
StableDiffusionInpaintPipeline::StableDiffusionInpaintPipeline(
|
||||
std::unique_ptr<Runtime> vae_encoder, std::unique_ptr<Runtime> vae_decoder,
|
||||
std::unique_ptr<Runtime> text_encoder, std::unique_ptr<Runtime> unet,
|
||||
std::unique_ptr<Scheduler> scheduler,
|
||||
const paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer&
|
||||
tokenizer)
|
||||
: vae_encoder_(std::move(vae_encoder)),
|
||||
vae_decoder_(std::move(vae_decoder)),
|
||||
text_encoder_(std::move(text_encoder)), unet_(std::move(unet)),
|
||||
scheduler_(std::move(scheduler)), tokenizer_(tokenizer) {}
|
||||
|
||||
void StableDiffusionInpaintPipeline::Predict(
|
||||
const std::vector<std::string>& prompts, const cv::Mat& image,
|
||||
const cv::Mat& mask_image, std::vector<FDTensor>* output_images, int height,
|
||||
int width, int num_inference_steps, float guidance_scale,
|
||||
const std::vector<std::string>& negative_prompt, int num_images_per_prompt,
|
||||
float eta, uint32_t max_length, const FDTensor* latents, bool output_cv_mat,
|
||||
callback_ptr callback, int callback_steps) {
|
||||
int batch_size = prompts.size();
|
||||
FDASSERT(batch_size >= 1, "prompts should not be empty");
|
||||
FDASSERT(
|
||||
height % 8 == 0 && width % 8 == 0,
|
||||
"`height` and `width` have to be divisible by 8 but are {%d} and {%d}.",
|
||||
height, width);
|
||||
FDASSERT(callback_steps > 0,
|
||||
"`callback_steps` has to be a positive integer but is {%d}",
|
||||
callback_steps);
|
||||
|
||||
// Setting tokenizer attr
|
||||
if (max_length == 0) {
|
||||
tokenizer_.EnablePadMethod(fast_tokenizer::core::RIGHT,
|
||||
tokenizer_.GetPadTokenId(), 0,
|
||||
tokenizer_.GetPadToken(), nullptr, nullptr);
|
||||
tokenizer_.DisableTruncMethod();
|
||||
} else {
|
||||
tokenizer_.EnablePadMethod(fast_tokenizer::core::RIGHT,
|
||||
tokenizer_.GetPadTokenId(), 0,
|
||||
tokenizer_.GetPadToken(), &max_length, nullptr);
|
||||
tokenizer_.EnableTruncMethod(max_length, 0, fast_tokenizer::core::RIGHT,
|
||||
fast_tokenizer::core::LONGEST_FIRST);
|
||||
}
|
||||
std::vector<fast_tokenizer::core::Encoding> encodings;
|
||||
tokenizer_.EncodeBatchStrings(prompts, &encodings);
|
||||
|
||||
std::vector<int64_t> input_ids;
|
||||
for (auto& encoding : encodings) {
|
||||
auto curr_ids = encoding.GetIds();
|
||||
input_ids.insert(input_ids.end(), curr_ids.begin(), curr_ids.end());
|
||||
}
|
||||
encodings.clear();
|
||||
// Get text encoder output
|
||||
FDTensor text_input_ids;
|
||||
std::vector<FDTensor> inputs(1);
|
||||
inputs[0].SetExternalData({batch_size, max_length}, FDDataType::INT64,
|
||||
input_ids.data());
|
||||
|
||||
TensorInfo text_info = text_encoder_->GetInputInfo(0);
|
||||
inputs[0].name = text_info.name;
|
||||
int output_size = text_encoder_->GetOutputInfos().size();
|
||||
std::vector<FDTensor> outputs(output_size);
|
||||
text_encoder_->Infer(inputs, &outputs);
|
||||
|
||||
FDTensor text_embeddings;
|
||||
function::Tile(outputs[0], {num_images_per_prompt, 1, 1}, &text_embeddings);
|
||||
|
||||
// here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
// of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
// corresponds to doing no classifier free guidance.
|
||||
bool do_classifier_free_guidance = guidance_scale > 1.0;
|
||||
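// Concretely, with eps_uncond and eps_text denoting the unconditional and the
// text-conditioned halves of the UNet output, the update applied after the UNet
// call below is:
//   noise_pred = eps_uncond + guidance_scale * (eps_text - eps_uncond)
// so guidance_scale == 1 falls back to the purely text-conditioned prediction.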
if (do_classifier_free_guidance) {
|
||||
std::vector<std::string> uncond_tokens;
|
||||
if (negative_prompt.size() == 0) {
|
||||
uncond_tokens = {""};
|
||||
} else if (negative_prompt.size() != batch_size) {
|
||||
FDASSERT(false,
|
||||
"negative_prompt has batch size %d, but prompt has batch size "
|
||||
"%d. Please make sure that passed `negative_prompt` matches the "
|
||||
"batch size of `prompt`.",
|
||||
negative_prompt.size(), prompts.size());
|
||||
} else {
|
||||
uncond_tokens = negative_prompt;
|
||||
}
|
||||
tokenizer_.EncodeBatchStrings(uncond_tokens, &encodings);
|
||||
input_ids.clear();
|
||||
for (auto& encoding : encodings) {
|
||||
auto curr_ids = encoding.GetIds();
|
||||
input_ids.insert(input_ids.end(), curr_ids.begin(), curr_ids.end());
|
||||
}
|
||||
inputs[0].SetExternalData({batch_size, max_length}, FDDataType::INT64,
|
||||
input_ids.data());
|
||||
text_encoder_->Infer(inputs, &outputs);
|
||||
FDTensor uncond_embeddings;
|
||||
function::Tile(outputs[0], {num_images_per_prompt, 1, 1},
|
||||
&uncond_embeddings);
|
||||
function::Concat({uncond_embeddings, text_embeddings}, &text_embeddings);
|
||||
}
|
||||
std::vector<int64_t> latents_shape = {batch_size * num_images_per_prompt,
|
||||
NUM_LATENT_CHANNELS, height / 8,
|
||||
width / 8};
|
||||
auto latents_dtype = text_embeddings.Dtype();
|
||||
FDTensor actual_latents;
|
||||
if (latents == nullptr) {
|
||||
function::GaussianRandom(latents_shape, &actual_latents, latents_dtype);
|
||||
} else {
|
||||
bool result = std::equal(latents_shape.begin(), latents_shape.end(),
|
||||
latents->Shape().begin());
|
||||
FDASSERT(result, "Unexpected latents shape, got %s, expected %s",
|
||||
Str(latents->Shape()).c_str(), Str(latents_shape).c_str());
|
||||
actual_latents = *latents;
|
||||
}
|
||||
FDTensor mask_t, mask_image_t;
|
||||
PrepareMaskAndMaskedImage(image, mask_image, {height / 8, width / 8}, &mask_t,
|
||||
&mask_image_t);
|
||||
function::Cast(mask_t, &mask_t, actual_latents.Dtype());
|
||||
function::Cast(mask_image_t, &mask_image_t, actual_latents.Dtype());
|
||||
|
||||
// Get vae encoder output
|
||||
TensorInfo vae_encoder_info = vae_encoder_->GetInputInfo(0);
|
||||
mask_image_t.name = vae_encoder_info.name;
|
||||
outputs.resize(vae_encoder_->GetOutputInfos().size());
|
||||
inputs = {mask_image_t};
|
||||
vae_encoder_->Infer(inputs, &outputs);
|
||||
FDTensor masked_image_latents = 0.18215f * outputs[0];
|
||||
|
||||
std::vector<int64_t> mask_shape(mask_t.Shape().size(), 1);
|
||||
mask_shape[0] = batch_size * num_images_per_prompt;
|
||||
function::Tile(mask_t, mask_shape, &mask_t);
|
||||
|
||||
std::vector<int64_t> mask_image_shape(masked_image_latents.Shape().size(), 1);
|
||||
mask_image_shape[0] = batch_size * num_images_per_prompt;
|
||||
function::Tile(masked_image_latents, mask_image_shape, &masked_image_latents);
|
||||
|
||||
if (do_classifier_free_guidance) {
|
||||
function::Concat({mask_t, mask_t}, &mask_t);
|
||||
function::Concat({masked_image_latents, masked_image_latents},
|
||||
&masked_image_latents);
|
||||
}
|
||||
int num_channels_mask = mask_t.Shape()[1];
|
||||
int num_channels_masked_image = masked_image_latents.Shape()[1];
|
||||
FDASSERT(
|
||||
NUM_LATENT_CHANNELS + num_channels_mask + num_channels_masked_image ==
|
||||
NUM_UNET_INPUT_CHANNELS,
|
||||
"Incorrect configuration settings! The config of `pipeline.unet` expects"
|
||||
" %d but received `num_channels_latents`: %d + `num_channels_mask`: %d "
|
||||
"+ `num_channels_masked_image`: %d"
|
||||
" = %d. Please verify the config of `pipeline.unet` or your `mask_image` "
|
||||
"or `image` input.",
|
||||
NUM_UNET_INPUT_CHANNELS, NUM_LATENT_CHANNELS, num_channels_mask,
|
||||
num_channels_masked_image,
|
||||
NUM_LATENT_CHANNELS + num_channels_mask + num_channels_masked_image);
|
||||
|
||||
// set timesteps
|
||||
scheduler_->SetTimesteps(num_inference_steps);
|
||||
|
||||
// scale the initial noise by the standard deviation required by the scheduler
|
||||
actual_latents = actual_latents * scheduler_->InitNoiseSigma();
|
||||
|
||||
auto timestep = scheduler_->GetTimesteps();
|
||||
int64_t* timestep_data = reinterpret_cast<int64_t*>(timestep.Data());
|
||||
outputs.resize(unet_->GetOutputInfos().size());
|
||||
inputs.resize(unet_->GetInputInfos().size());
|
||||
inputs[2] = std::move(text_embeddings);
|
||||
auto unet_infos = unet_->GetInputInfos();
|
||||
for (int i = 0; i < timestep.Numel(); ++i) {
|
||||
FDTensor t;
|
||||
function::Slice(timestep, {0}, {i}, &t);
|
||||
inputs[1] = t;
|
||||
// expand the latents if we are doing classifier free guidance
|
||||
FDTensor latent_model_input;
|
||||
if (do_classifier_free_guidance) {
|
||||
function::Concat({actual_latents, actual_latents}, &latent_model_input);
|
||||
} else {
|
||||
latent_model_input = actual_latents;
|
||||
}
|
||||
// concat latents, mask, masked_image_latents in the channel dimension
|
||||
function::Concat({latent_model_input, mask_t, masked_image_latents},
|
||||
&latent_model_input, 1);
|
||||
scheduler_->ScaleModelInput(latent_model_input, &latent_model_input, {t});
|
||||
inputs[0] = std::move(latent_model_input);
|
||||
// predict the noise residual
|
||||
for (size_t j = 0; j < unet_infos.size(); ++j) {
|
||||
inputs[j].name = unet_infos[j].name;
|
||||
}
|
||||
unet_->Infer(inputs, &outputs);
|
||||
FDTensor noise_pred = std::move(outputs[0]);
|
||||
// perform guidance
|
||||
if (do_classifier_free_guidance) {
|
||||
std::vector<FDTensor> noise_preds;
|
||||
int dim0 = noise_pred.Shape()[0];
|
||||
function::Split(noise_pred, {dim0 - dim0 / 2, dim0 / 2}, &noise_preds);
|
||||
noise_pred =
|
||||
noise_preds[0] + guidance_scale * (noise_preds[1] - noise_preds[0]);
|
||||
}
|
||||
|
||||
// compute the previous noisy sample x_t -> x_t-1
|
||||
int64_t time = reinterpret_cast<int64_t*>(t.Data())[0];
|
||||
scheduler_->Step(noise_pred, time, actual_latents, &actual_latents);
|
||||
|
||||
// call the callback, if provided
|
||||
if (callback != nullptr && i % callback_steps == 0) {
|
||||
callback(i, time, &actual_latents);
|
||||
}
|
||||
}
|
||||
actual_latents = (1.0f / 0.18215f) * actual_latents;
|
||||
|
||||
// Get vae decoder output
|
||||
int actual_latents_bs = actual_latents.Shape()[0];
|
||||
TensorInfo vae_decoder_info = vae_decoder_->GetInputInfo(0);
|
||||
inputs.resize(1);
|
||||
outputs.resize(vae_decoder_->GetOutputInfos().size());
|
||||
std::vector<FDTensor> decoder_result;
|
||||
for (int i = 0; i < actual_latents_bs; ++i) {
|
||||
function::Slice(actual_latents, {0}, {i}, {i + 1}, &inputs[0]);
|
||||
inputs[0].name = vae_decoder_info.name;
|
||||
vae_decoder_->Infer(inputs, &outputs);
|
||||
decoder_result.emplace_back(std::move(outputs[0]));
|
||||
}
|
||||
FDTensor output_image;
|
||||
function::Concat(decoder_result, &output_image);
|
||||
|
||||
function::Clip(output_image / 2.0f + 0.5f, 0, 1, &output_image);
|
||||
function::Transpose(output_image, &output_image, {0, 2, 3, 1});
|
||||
|
||||
if (output_cv_mat) {
|
||||
output_image = output_image * 255.0f;
|
||||
function::Round(output_image, &output_image);
|
||||
function::Cast(output_image, &output_image, FDDataType::UINT8);
|
||||
}
|
||||
int output_batch_size = output_image.Shape()[0];
|
||||
output_images->resize(output_batch_size);
|
||||
for (int i = 0; i < output_batch_size; ++i) {
|
||||
function::Slice(output_image, {0}, {i}, &(*output_images)[i]);
|
||||
vision::FDMat mask_fdmat_t = vision::FDMat::Create((*output_images)[i]);
|
||||
vision::RGB2BGR::Run(&mask_fdmat_t, vision::ProcLib::OPENCV);
|
||||
mask_fdmat_t.CopyToTensor(&(*output_images)[i]);
|
||||
}
|
||||
}
|
||||
} // namespace fastdeploy
|
@@ -0,0 +1,61 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "./scheduler.h"
|
||||
#include "fast_tokenizer/tokenizers/clip_fast_tokenizer.h"
|
||||
#include "fastdeploy/core/fd_tensor.h"
|
||||
#include "fastdeploy/runtime.h"
|
||||
#include "opencv2/core/core.hpp"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace fastdeploy {
|
||||
|
||||
class StableDiffusionInpaintPipeline {
|
||||
public:
|
||||
typedef void (*callback_ptr)(int, int, FDTensor*);
|
||||
|
||||
StableDiffusionInpaintPipeline(
|
||||
std::unique_ptr<Runtime> vae_encoder,
|
||||
std::unique_ptr<Runtime> vae_decoder,
|
||||
std::unique_ptr<Runtime> text_encoder, std::unique_ptr<Runtime> unet,
|
||||
std::unique_ptr<Scheduler> scheduler,
|
||||
const paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer&
|
||||
tokenizer);
|
||||
void Predict(const std::vector<std::string>& prompts, const cv::Mat& image,
|
||||
const cv::Mat& mask_image, std::vector<FDTensor>* output_images,
|
||||
int height = 512, int width = 512, int num_inference_steps = 50,
|
||||
float guidance_scale = 7.5,
|
||||
const std::vector<std::string>& negative_prompt = {},
|
||||
int num_images_per_prompt = 1, float eta = 0.0,
|
||||
uint32_t max_length = 77, const FDTensor* latents = nullptr,
|
||||
bool output_cv_mat = true, callback_ptr callback = nullptr,
|
||||
int callback_steps = 1);
|
||||
|
||||
private:
|
||||
void PrepareMaskAndMaskedImage(const cv::Mat& image, const cv::Mat& mask_mat,
|
||||
const std::vector<int64_t>& shape,
|
||||
FDTensor* mask, FDTensor* mask_image);
|
||||
std::unique_ptr<Runtime> vae_encoder_;
|
||||
std::unique_ptr<Runtime> vae_decoder_;
|
||||
std::unique_ptr<Runtime> text_encoder_;
|
||||
std::unique_ptr<Runtime> unet_;
|
||||
std::unique_ptr<Scheduler> scheduler_;
|
||||
paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer tokenizer_;
|
||||
};
|
||||
|
||||
} // namespace fastdeploy
|
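Since every argument after `output_images` has a default, a sketch of the shortest possible call (assuming `pipe`, `prompts`, `image`, `mask_image` and `outputs` are set up as in `main.cc` above) is:

```c++
// 512x512 output, 50 inference steps, guidance_scale 7.5, cv::Mat-compatible
// UINT8 output tensors -- all taken from the declared defaults.
pipe.Predict(prompts, image, mask_image, &outputs);
```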
34
examples/multimodal/stable_diffusion/cpp/scheduler.h
Normal file
@@ -0,0 +1,34 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "fastdeploy/core/fd_tensor.h"
|
||||
|
||||
namespace fastdeploy {
|
||||
|
||||
class Scheduler {
|
||||
public:
|
||||
virtual void SetTimesteps(int num_inference_steps) = 0;
|
||||
virtual FDTensor GetTimesteps() = 0;
|
||||
virtual void Step(const FDTensor& model_output, int timestep,
|
||||
const FDTensor& sample, FDTensor* prev_sample) = 0;
|
||||
virtual void ScaleModelInput(const FDTensor& sample, FDTensor* out,
|
||||
const std::vector<FDTensor>& timesteps = {}) = 0;
|
||||
virtual void AddNoise(const FDTensor& original_samples, const FDTensor& noise,
|
||||
const FDTensor& timesteps, FDTensor* out) = 0;
|
||||
virtual float InitNoiseSigma() = 0;
|
||||
};
|
||||
|
||||
} // namespace fastdeploy
|
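To make the contract of this abstract class concrete, here is a minimal sketch (not part of FastDeploy, assuming only `scheduler.h` and `fastdeploy/core/fd_tensor.h` above) of a subclass that satisfies every pure-virtual method while leaving samples untouched. It only illustrates how the pipeline's calls fit together; a real scheduler such as `DPMSolverMultistepScheduler` implements the actual update rules in `Step` and `AddNoise`.

```c++
#include <cstdint>
#include <numeric>
#include <vector>

// Illustrative only: a scheduler that returns its inputs unchanged.
class IdentityScheduler : public fastdeploy::Scheduler {
 public:
  void SetTimesteps(int num_inference_steps) override {
    // Timesteps run from high to low; the pipeline iterates over this tensor.
    steps_.resize(num_inference_steps);
    std::iota(steps_.rbegin(), steps_.rend(), static_cast<int64_t>(0));
    timesteps_.SetExternalData({num_inference_steps},
                               fastdeploy::FDDataType::INT64, steps_.data());
  }
  fastdeploy::FDTensor GetTimesteps() override { return timesteps_; }
  void Step(const fastdeploy::FDTensor& model_output, int timestep,
            const fastdeploy::FDTensor& sample,
            fastdeploy::FDTensor* prev_sample) override {
    *prev_sample = sample;  // a real scheduler applies the denoising update here
  }
  void ScaleModelInput(const fastdeploy::FDTensor& sample, fastdeploy::FDTensor* out,
                       const std::vector<fastdeploy::FDTensor>& timesteps) override {
    *out = sample;  // DPM-Solver++-style schedulers also need no input scaling
  }
  void AddNoise(const fastdeploy::FDTensor& original_samples,
                const fastdeploy::FDTensor& noise,
                const fastdeploy::FDTensor& timesteps,
                fastdeploy::FDTensor* out) override {
    *out = original_samples;  // a real scheduler mixes in `noise` per timestep
  }
  float InitNoiseSigma() override { return 1.0f; }

 private:
  std::vector<int64_t> steps_;
  fastdeploy::FDTensor timesteps_;
};
```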
@@ -51,10 +51,10 @@ models
|
||||
# GPU镜像
|
||||
docker pull paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10
|
||||
# CPU镜像
|
||||
docker pull paddlepaddle/fastdeploy:z.y.z-cpu-only-21.10
|
||||
docker pull paddlepaddle/fastdeploy:x.y.z-cpu-only-21.10
|
||||
|
||||
# 运行
|
||||
docker run -it --net=host --name fastdeploy_server --shm-size="1g" -v /path/serving/models:/models paddlepaddle/fastdeploy:0.6.0-cpu-only-21.10 bash
|
||||
docker run -it --net=host --name fastdeploy_server --shm-size="1g" -v /path/serving/models:/models paddlepaddle/fastdeploy:x.y.z-cpu-only-21.10 bash
|
||||
```
|
||||
|
||||
## 部署模型
|
||||
@@ -67,7 +67,7 @@ token_cls_rpc_client.py # 序列标注任务发送pipeline预测请求的脚
|
||||
```
|
||||
|
||||
*注意*:启动服务时,Server的每个python后端进程默认申请`64M`内存,默认启动的docker无法启动多个python后端节点。有两个解决方案:
|
||||
- 1.启动容器时设置`shm-size`参数, 比如:`docker run -it --net=host --name fastdeploy_server --shm-size="1g" -v /path/serving/models:/models paddlepaddle/fastdeploy:0.6.0-gpu-cuda11.4-trt8.4-21.10 bash`
|
||||
- 1.启动容器时设置`shm-size`参数, 比如:`docker run -it --net=host --name fastdeploy_server --shm-size="1g" -v /path/serving/models:/models paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10 bash`
|
||||
- 2.启动服务时设置python后端的`shm-default-byte-size`参数, 设置python后端的默认内存为10M: `tritonserver --model-repository=/models --backend-config=python,shm-default-byte-size=10485760`
|
||||
|
||||
### 分类任务
|
||||
|
@@ -34,10 +34,10 @@ models
|
||||
# GPU镜像
|
||||
docker pull paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10
|
||||
# CPU镜像
|
||||
docker pull paddlepaddle/fastdeploy:z.y.z-cpu-only-21.10
|
||||
docker pull paddlepaddle/fastdeploy:x.y.z-cpu-only-21.10
|
||||
|
||||
# 运行容器.容器名字为 fd_serving, 并挂载当前目录为容器的 /uie_serving 目录
|
||||
docker run -it --net=host --name fastdeploy_server --shm-size="1g" -v `pwd`/:/uie_serving paddlepaddle/fastdeploy:0.6.0-gpu-cuda11.4-trt8.4-21.10 bash
|
||||
docker run -it --net=host --name fastdeploy_server --shm-size="1g" -v `pwd`/:/uie_serving paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10 bash
|
||||
|
||||
# 启动服务(不设置CUDA_VISIBLE_DEVICES环境变量,会拥有所有GPU卡的调度权限)
|
||||
CUDA_VISIBLE_DEVICES=0 fastdeployserver --model-repository=/uie_serving/models --backend-config=python,shm-default-byte-size=10485760
|
||||
|
@@ -125,7 +125,7 @@ option.enableLiteFp16();
|
||||
model.init(modelFile, paramFile, configFile, option);
|
||||
// Bitmap读取、模型预测、资源释放 同上 ...
|
||||
```
|
||||
更详细的用法请参考 [MainActivity](./app/src/main/java/com/baidu/paddle/fastdeploy/examples/MainActivity.java#L207) 中的用法
|
||||
更详细的用法请参考 [MainActivity](./app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/classification/ClassificationMainActivity.java) 中的用法
|
||||
|
||||
## 替换 FastDeploy 预测库和模型
|
||||
替换FastDeploy预测库和模型的步骤非常简单。预测库所在的位置为 `app/libs/fastdeploy-android-xxx-shared`,其中 `xxx` 表示当前您使用的预测库版本号。模型所在的位置为,`app/src/main/assets/models/MobileNetV1_x0_25_infer`。
|
||||
|
6
examples/vision/classification/paddleclas/quantize/README.md
Normal file → Executable file
@@ -4,14 +4,14 @@ FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工
|
||||
|
||||
## FastDeploy一键模型自动化压缩工具
|
||||
FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化.
|
||||
详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/)
|
||||
详细教程请见: [一键模型自动化压缩工具](../../../../../tools/common_tools/auto_compression/)
|
||||
注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可。
|
||||
|
||||
## 下载量化完成的PaddleClas模型
|
||||
用户也可以直接下载下表中的量化模型进行部署.
|
||||
|
||||
Benchmark表格说明:
|
||||
- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间.
|
||||
- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间.
|
||||
- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理.
|
||||
- 所测时延均为推理1000次后求得的平均值, 单位是毫秒.
|
||||
- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项
|
||||
@@ -33,7 +33,7 @@ Benchmark表格说明:
|
||||
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle Inference | CPU | 12.29 | 4.68 | None|None|2.62 |77.89 | 71.36 |离线量化 |
|
||||
|
||||
### 端到端 Benchmark
|
||||
| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 |
|
||||
| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 |
|
||||
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
|
||||
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 4.92| 2.28|2.24|2.23 | 2.21 | 79.12 | 79.06 | 离线量化 |
|
||||
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle-TensorRT | GPU | 4.48|None |2.09|2.10 | 2.14 | 79.12 | 79.06 | 离线量化 |
|
||||
|
2
examples/vision/classification/paddleclas/quantize/cpp/README.md
Normal file → Executable file
@@ -8,7 +8,7 @@
|
||||
|
||||
### 量化模型准备
|
||||
- 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署.
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.)
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/common_tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.)
|
||||
|
||||
## 以量化后的ResNet50_Vd模型为例, 进行部署,支持此模型需保证FastDeploy版本0.7.0以上(x.x.x>=0.7.0)
|
||||
在本目录执行如下命令即可完成编译,以及量化模型部署.
|
||||
|
2
examples/vision/classification/paddleclas/quantize/python/README.md
Normal file → Executable file
@@ -8,7 +8,7 @@
|
||||
|
||||
### 量化模型准备
|
||||
- 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署.
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.)
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/common_tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.)
|
||||
|
||||
|
||||
## 以量化后的ResNet50_Vd模型为例, 进行部署
|
||||
|
@@ -1,11 +1,11 @@
|
||||
# PaddleClas 量化模型在 RK1126 上的部署
|
||||
目前 FastDeploy 已经支持基于 PaddleLite 部署 PaddleClas 量化模型到 RK1126 上。
|
||||
# PaddleClas 量化模型在 RV1126 上的部署
|
||||
目前 FastDeploy 已经支持基于 PaddleLite 部署 PaddleClas 量化模型到 RV1126 上。
|
||||
|
||||
模型的量化和量化模型的下载请参考:[模型量化](../quantize/README.md)
|
||||
|
||||
|
||||
## 详细部署文档
|
||||
|
||||
在 RK1126 上只支持 C++ 的部署。
|
||||
在 RV1126 上只支持 C++ 的部署。
|
||||
|
||||
- [C++部署](cpp)
|
@@ -1,22 +1,22 @@
|
||||
# PaddleClas RK1126开发板 C++ 部署示例
|
||||
本目录下提供的 `infer.cc`,可以帮助用户快速完成 PaddleClas 量化模型在 RK1126 上的部署推理加速。
|
||||
# PaddleClas RV1126 开发板 C++ 部署示例
|
||||
本目录下提供的 `infer.cc`,可以帮助用户快速完成 PaddleClas 量化模型在 RV1126 上的部署推理加速。
|
||||
|
||||
## 部署准备
|
||||
### FastDeploy 交叉编译环境准备
|
||||
- 1. 软硬件环境满足要求,以及交叉编译环境的准备,请参考:[FastDeploy 交叉编译环境准备](../../../../../../docs/cn/build_and_install/rk1126.md#交叉编译环境搭建)
|
||||
- 1. 软硬件环境满足要求,以及交叉编译环境的准备,请参考:[FastDeploy 交叉编译环境准备](../../../../../../docs/cn/build_and_install/rv1126.md#交叉编译环境搭建)
|
||||
|
||||
### 量化模型准备
|
||||
- 1. 用户可以直接使用由 FastDeploy 提供的量化模型进行部署。
|
||||
- 2. 用户可以使用 FastDeploy 提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署。(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此 yaml 文件, 用户从 FP32 模型文件夹下复制此 yaml 文件到量化后的模型文件夹内即可.)
|
||||
- 2. 用户可以使用 FastDeploy 提供的[一键模型自动化压缩工具](../../../../../../tools/common_tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署。(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此 yaml 文件, 用户从 FP32 模型文件夹下复制此 yaml 文件到量化后的模型文件夹内即可.)
|
||||
- 更多量化相关相关信息可查阅[模型量化](../../quantize/README.md)
|
||||
|
||||
## 在 RK1126 上部署量化后的 ResNet50_Vd 分类模型
|
||||
请按照以下步骤完成在 RK1126 上部署 ResNet50_Vd 量化模型:
|
||||
1. 交叉编译编译 FastDeploy 库,具体请参考:[交叉编译 FastDeploy](../../../../../../docs/cn/build_and_install/rk1126.md#基于-paddlelite-的-fastdeploy-交叉编译库编译)
|
||||
## 在 RV1126 上部署量化后的 ResNet50_Vd 分类模型
|
||||
请按照以下步骤完成在 RV1126 上部署 ResNet50_Vd 量化模型:
|
||||
1. 交叉编译编译 FastDeploy 库,具体请参考:[交叉编译 FastDeploy](../../../../../../docs/cn/build_and_install/rv1126.md#基于-paddlelite-的-fastdeploy-交叉编译库编译)
|
||||
|
||||
2. 将编译后的库拷贝到当前目录,可使用如下命令:
|
||||
```bash
|
||||
cp -r FastDeploy/build/fastdeploy-tmivx/ FastDeploy/examples/vision/classification/paddleclas/rk1126/cpp/
|
||||
cp -r FastDeploy/build/fastdeploy-tmivx/ FastDeploy/examples/vision/classification/paddleclas/rv1126/cpp/
|
||||
```
|
||||
|
||||
3. 在当前路径下载部署所需的模型和示例图片:
|
||||
@@ -32,7 +32,7 @@ cp -r ILSVRC2012_val_00000010.jpeg images
|
||||
4. 编译部署示例,可使用如下命令:
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=../fastdeploy-tmivx/timvx.cmake -DFASTDEPLOY_INSTALL_DIR=fastdeploy-tmivx ..
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=${PWD}/../fastdeploy-tmivx/timvx.cmake -DFASTDEPLOY_INSTALL_DIR=${PWD}/../fastdeploy-tmivx ..
|
||||
make -j8
|
||||
make install
|
||||
# 成功编译之后,会生成 install 文件夹,里面有一个运行 demo 和部署所需的库
|
||||
@@ -41,7 +41,7 @@ make install
|
||||
5. 基于 adb 工具部署 ResNet50_vd 分类模型到 Rockchip RV1126,可使用如下命令:
|
||||
```bash
|
||||
# 进入 install 目录
|
||||
cd FastDeploy/examples/vision/classification/paddleclas/rk1126/cpp/build/install/
|
||||
cd FastDeploy/examples/vision/classification/paddleclas/rv1126/cpp/build/install/
|
||||
# 如下命令表示:bash run_with_adb.sh 需要运行的demo 模型路径 图片路径 设备的DEVICE_ID
|
||||
bash run_with_adb.sh infer_demo ResNet50_vd_infer ILSVRC2012_val_00000010.jpeg $DEVICE_ID
|
||||
```
|
||||
@@ -50,4 +50,4 @@ bash run_with_adb.sh infer_demo ResNet50_vd_infer ILSVRC2012_val_00000010.jpeg $
|
||||
|
||||
<img width="640" src="https://user-images.githubusercontent.com/30516196/200767389-26519e50-9e4f-4fe1-8d52-260718f73476.png">
|
||||
|
||||
需要特别注意的是,在 RK1126 上部署的模型需要是量化后的模型,模型的量化请参考:[模型量化](../../../../../../docs/cn/quantize.md)
|
||||
需要特别注意的是,在 RV1126 上部署的模型需要是量化后的模型,模型的量化请参考:[模型量化](../../../../../../docs/cn/quantize.md)
|
@@ -28,10 +28,10 @@ mv ResNet50_vd_infer/inference.pdiparams models/runtime/1/model.pdiparams
|
||||
# GPU镜像
|
||||
docker pull paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10
|
||||
# CPU镜像
|
||||
docker pull paddlepaddle/fastdeploy:z.y.z-cpu-only-21.10
|
||||
docker pull paddlepaddle/fastdeploy:x.y.z-cpu-only-21.10
|
||||
|
||||
# 运行容器.容器名字为 fd_serving, 并挂载当前目录为容器的 /serving 目录
|
||||
nvidia-docker run -it --net=host --name fd_serving -v `pwd`/:/serving paddlepaddle/fastdeploy:0.6.0-gpu-cuda11.4-trt8.4-21.10 bash
|
||||
nvidia-docker run -it --net=host --name fd_serving -v `pwd`/:/serving paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10 bash
|
||||
|
||||
# 启动服务(不设置CUDA_VISIBLE_DEVICES环境变量,会拥有所有GPU卡的调度权限)
|
||||
CUDA_VISIBLE_DEVICES=0 fastdeployserver --model-repository=/serving/models --backend-config=python,shm-default-byte-size=10485760
|
||||
|
@@ -305,6 +305,8 @@ public class DetectionMainActivity extends Activity implements View.OnClickListe
|
||||
// Open camera until the permissions have been granted
|
||||
if (!checkAllPermissions()) {
|
||||
svPreview.disableCamera();
|
||||
} else {
|
||||
svPreview.enableCamera();
|
||||
}
|
||||
svPreview.onResume();
|
||||
}
|
||||
|
@@ -278,6 +278,10 @@ public class CameraSurfaceView extends GLSurfaceView implements Renderer,
|
||||
disableCamera = true;
|
||||
}
|
||||
|
||||
public void enableCamera() {
|
||||
disableCamera = false;
|
||||
}
|
||||
|
||||
public void switchCamera() {
|
||||
releaseCamera();
|
||||
selectedCameraId = (selectedCameraId + 1) % numberOfCameras;
|
||||
|
@@ -1,80 +0,0 @@
|
||||
person
|
||||
bicycle
|
||||
car
|
||||
motorcycle
|
||||
airplane
|
||||
bus
|
||||
train
|
||||
truck
|
||||
boat
|
||||
traffic light
|
||||
fire hydrant
|
||||
stop sign
|
||||
parking meter
|
||||
bench
|
||||
bird
|
||||
cat
|
||||
dog
|
||||
horse
|
||||
sheep
|
||||
cow
|
||||
elephant
|
||||
bear
|
||||
zebra
|
||||
giraffe
|
||||
backpack
|
||||
umbrella
|
||||
handbag
|
||||
tie
|
||||
suitcase
|
||||
frisbee
|
||||
skis
|
||||
snowboard
|
||||
sports ball
|
||||
kite
|
||||
baseball bat
|
||||
baseball glove
|
||||
skateboard
|
||||
surfboard
|
||||
tennis racket
|
||||
bottle
|
||||
wine glass
|
||||
cup
|
||||
fork
|
||||
knife
|
||||
spoon
|
||||
bowl
|
||||
banana
|
||||
apple
|
||||
sandwich
|
||||
orange
|
||||
broccoli
|
||||
carrot
|
||||
hot dog
|
||||
pizza
|
||||
donut
|
||||
cake
|
||||
chair
|
||||
couch
|
||||
potted plant
|
||||
bed
|
||||
dining table
|
||||
toilet
|
||||
tv
|
||||
laptop
|
||||
mouse
|
||||
remote
|
||||
keyboard
|
||||
cell phone
|
||||
microwave
|
||||
oven
|
||||
toaster
|
||||
sink
|
||||
refrigerator
|
||||
book
|
||||
clock
|
||||
vase
|
||||
scissors
|
||||
teddy bear
|
||||
hair drier
|
||||
toothbrush
|
0
examples/vision/detection/paddledetection/cpp/infer_ppyoloe.cc
Normal file → Executable file
6
examples/vision/detection/paddledetection/quantize/README.md
Normal file → Executable file
@@ -4,14 +4,14 @@ FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工
|
||||
|
||||
## FastDeploy一键模型自动化压缩工具
|
||||
FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化.
|
||||
详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/)
|
||||
详细教程请见: [一键模型自动化压缩工具](../../../../../tools/common_tools/auto_compression/)
|
||||
|
||||
## 下载量化完成的PP-YOLOE-l模型
|
||||
用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载)
|
||||
|
||||
|
||||
Benchmark表格说明:
|
||||
- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间.
|
||||
- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间.
|
||||
- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理.
|
||||
- 所测时延均为推理1000次后求得的平均值, 单位是毫秒.
|
||||
- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项
|
||||
@@ -32,7 +32,7 @@ NOTE:
|
||||
- TensorRT比Paddle-TensorRT快的原因是在runtime移除了multiclass_nms3算子
|
||||
|
||||
#### 端到端 Benchmark
|
||||
| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 |
|
||||
| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 |
|
||||
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
|
||||
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | TensorRT | GPU | 35.75 | 15.42 |20.70|20.85 | 2.32 | 51.4 | 50.7 | 量化蒸馏训练 |
|
||||
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | Paddle-TensorRT | GPU | 33.48 |None | 18.47 |18.03 | 1.81 | 51.4 | 50.5 | 量化蒸馏训练 |
|
||||
|
2
examples/vision/detection/paddledetection/quantize/cpp/README.md
Normal file → Executable file
@@ -9,7 +9,7 @@
|
||||
|
||||
### 量化模型准备
|
||||
- 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署.
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.)
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/common_tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的检测模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.)
|
||||
|
||||
## 以量化后的PP-YOLOE-l模型为例, 进行部署。支持此模型需保证FastDeploy版本0.7.0以上(x.x.x>=0.7.0)
|
||||
在本目录执行如下命令即可完成编译,以及量化模型部署.
|
||||
|
2
examples/vision/detection/paddledetection/quantize/python/README.md
Normal file → Executable file
@@ -8,7 +8,7 @@
|
||||
|
||||
### 量化模型准备
|
||||
- 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署.
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.)
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/common_tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.)
|
||||
|
||||
|
||||
## 以量化后的PP-YOLOE-l模型为例, 进行部署
|
||||
|
@@ -1,25 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
DETECTION_MODEL_DIR="$(pwd)/picodet_detection/models"
|
||||
LIBS_DIR="$(pwd)"
|
||||
|
||||
DETECTION_MODEL_URL="https://paddlelite-demo.bj.bcebos.com/Paddle-Lite-Demo/models/picodetv2_relu6_coco_no_fuse.tar.gz"
|
||||
LIBS_URL="https://paddlelite-demo.bj.bcebos.com/Paddle-Lite-Demo/Paddle-Lite-libs.tar.gz"
|
||||
|
||||
download_and_uncompress() {
|
||||
local url="$1"
|
||||
local dir="$2"
|
||||
|
||||
echo "Start downloading ${url}"
|
||||
curl -L ${url} > ${dir}/download.tar.gz
|
||||
cd ${dir}
|
||||
tar -zxvf download.tar.gz
|
||||
rm -f download.tar.gz
|
||||
}
|
||||
|
||||
download_and_uncompress "${DETECTION_MODEL_URL}" "${DETECTION_MODEL_DIR}"
|
||||
download_and_uncompress "${LIBS_URL}" "${LIBS_DIR}"
|
||||
|
||||
echo "Download successful!"
|
@@ -1,62 +0,0 @@
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
set(CMAKE_SYSTEM_NAME Linux)
|
||||
if(TARGET_ARCH_ABI STREQUAL "armv8")
|
||||
set(CMAKE_SYSTEM_PROCESSOR aarch64)
|
||||
set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc")
|
||||
set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++")
|
||||
elseif(TARGET_ARCH_ABI STREQUAL "armv7hf")
|
||||
set(CMAKE_SYSTEM_PROCESSOR arm)
|
||||
set(CMAKE_C_COMPILER "arm-linux-gnueabihf-gcc")
|
||||
set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++")
|
||||
else()
|
||||
message(FATAL_ERROR "Unknown arch abi ${TARGET_ARCH_ABI}, only support armv8 and armv7hf.")
|
||||
return()
|
||||
endif()
|
||||
|
||||
project(object_detection_demo)
|
||||
|
||||
message(STATUS "TARGET ARCH ABI: ${TARGET_ARCH_ABI}")
|
||||
message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}")
|
||||
include_directories(${PADDLE_LITE_DIR}/include)
|
||||
link_directories(${PADDLE_LITE_DIR}/libs/${TARGET_ARCH_ABI})
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
|
||||
if(TARGET_ARCH_ABI STREQUAL "armv8")
|
||||
set(CMAKE_CXX_FLAGS "-march=armv8-a ${CMAKE_CXX_FLAGS}")
|
||||
set(CMAKE_C_FLAGS "-march=armv8-a ${CMAKE_C_FLAGS}")
|
||||
elseif(TARGET_ARCH_ABI STREQUAL "armv7hf")
|
||||
set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}")
|
||||
set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" )
|
||||
endif()
|
||||
|
||||
include_directories(${PADDLE_LITE_DIR}/libs/${TARGET_ARCH_ABI}/third_party/yaml-cpp/include)
|
||||
link_directories(${PADDLE_LITE_DIR}/libs/${TARGET_ARCH_ABI}/third_party/yaml-cpp)
|
||||
|
||||
find_package(OpenMP REQUIRED)
|
||||
if(OpenMP_FOUND OR OpenMP_CXX_FOUND)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
message(STATUS "Found OpenMP ${OpenMP_VERSION} ${OpenMP_CXX_VERSION}")
|
||||
message(STATUS "OpenMP C flags: ${OpenMP_C_FLAGS}")
|
||||
message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}")
|
||||
message(STATUS "OpenMP OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
|
||||
message(STATUS "OpenMP OpenMP_CXX_LIBRARIES: ${OpenMP_CXX_LIBRARIES}")
|
||||
else()
|
||||
message(FATAL_ERROR "Could not found OpenMP!")
|
||||
return()
|
||||
endif()
|
||||
find_package(OpenCV REQUIRED)
|
||||
if(OpenCV_FOUND OR OpenCV_CXX_FOUND)
|
||||
include_directories(${OpenCV_INCLUDE_DIRS})
|
||||
message(STATUS "OpenCV library status:")
|
||||
message(STATUS " version: ${OpenCV_VERSION}")
|
||||
message(STATUS " libraries: ${OpenCV_LIBS}")
|
||||
message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}")
|
||||
else()
|
||||
message(FATAL_ERROR "Could not found OpenCV!")
|
||||
return()
|
||||
endif()
|
||||
|
||||
|
||||
add_executable(object_detection_demo object_detection_demo.cc)
|
||||
|
||||
target_link_libraries(object_detection_demo paddle_full_api_shared dl ${OpenCV_LIBS} yaml-cpp)
|
@@ -1,343 +0,0 @@
|
||||
# 目标检测 C++ API Demo 使用指南
|
||||
|
||||
在 ARMLinux 上实现实时的目标检测功能,此 Demo 有较好的的易用性和扩展性,如在 Demo 中跑自己训练好的模型等。
|
||||
- 如果该开发板使用搭载了芯原 NPU (瑞芯微、晶晨、JQL、恩智浦)的 Soc,将有更好的加速效果。
|
||||
|
||||
## 如何运行目标检测 Demo
|
||||
|
||||
### 环境准备
|
||||
|
||||
* 准备 ARMLiunx 开发版,将系统刷为 Ubuntu,用于 Demo 编译和运行。请注意,本 Demo 是使用板上编译,而非交叉编译,因此需要图形界面的开发板操作系统。
|
||||
* 如果需要使用 芯原 NPU 的计算加速,对 NPU 驱动版本有严格要求,请务必注意事先参考 [芯原 TIM-VX 部署示例](https://paddle-lite.readthedocs.io/zh/develop/demo_guides/verisilicon_timvx.html#id6),将 NPU 驱动改为要求的版本。
|
||||
* Paddle Lite 当前已验证的开发板为 Khadas VIM3(芯片为 Amlogic A311d)、荣品 RV1126、荣品RV1109,其它平台用户可自行尝试;
|
||||
- Khadas VIM3:由于 VIM3 出厂自带 Android 系统,请先刷成 Ubuntu 系统,在此提供刷机教程:[VIM3/3L Linux 文档](https://docs.khadas.com/linux/zh-cn/vim3),其中有详细描述刷机方法。以及系统镜像:VIM3 Linux:VIM3_Ubuntu-gnome-focal_Linux-4.9_arm64_EMMC_V1.0.7-210625:[官方链接](http://dl.khadas.com/firmware/VIM3/Ubuntu/EMMC/VIM3_Ubuntu-gnome-focal_Linux-4.9_arm64_EMMC_V1.0.7-210625.img.xz);[百度云备用链接](https://paddlelite-demo.bj.bcebos.com/devices/verisilicon/firmware/khadas/vim3/VIM3_Ubuntu-gnome-focal_Linux-4.9_arm64_EMMC_V1.0.7-210625.img.xz)
|
||||
- 荣品 RV1126、1109:由于出场自带 buildroot 系统,如果使用 GUI 界面的 demo,请先刷成 Ubuntu 系统,在此提供刷机教程:[RV1126/1109 教程](https://paddlelite-demo.bj.bcebos.com/Paddle-Lite-Demo/os_img/rockchip/RV1126-RV1109%E4%BD%BF%E7%94%A8%E6%8C%87%E5%AF%BC%E6%96%87%E6%A1%A3-V3.0.pdf),[刷机工具](https://paddlelite-demo.bj.bcebos.com/Paddle-Lite-Demo/os_img/rockchip/RKDevTool_Release.zip),以及镜像:[1126镜像](https://paddlelite-demo.bj.bcebos.com/Paddle-Lite-Demo/os_img/update-pro-rv1126-ubuntu20.04-5-720-1280-v2-20220505.img),[1109镜像](https://paddlelite-demo.bj.bcebos.com/Paddle-Lite-Demo/os_img/update-pro-rv1109-ubuntu20.04-5.5-720-1280-v2-20220429.img)。完整的文档和各种镜像请参考[百度网盘链接](https://pan.baidu.com/s/1Id0LMC0oO2PwR2YcYUAaiQ#list/path=%2F&parentPath=%2Fsharelink2521613171-184070898837664),密码:2345。
|
||||
* 准备 usb camera,注意使用 openCV capture 图像时,请注意 usb camera 的 video序列号作为入参。
|
||||
* 请注意,瑞芯微芯片不带有 HDMI 接口,图像显示是依赖 MIPI DSI,所以请准备好 MIPI 显示屏(我们提供的镜像是 720*1280 分辨率,网盘中有更多分辨率选择,注意:请选择 camera-gc2093x2 的镜像)。
|
||||
* 配置开发板的网络。如果是办公网络红区,可以将开发板和PC用以太网链接,然后PC共享网络给开发板。
|
||||
* gcc g++ opencv cmake 的安装(以下所有命令均在设备上操作)
|
||||
|
||||
```bash
|
||||
$ sudo apt-get update
|
||||
$ sudo apt-get install gcc g++ make wget unzip libopencv-dev pkg-config
|
||||
$ wget https://www.cmake.org/files/v3.10/cmake-3.10.3.tar.gz
|
||||
$ tar -zxvf cmake-3.10.3.tar.gz
|
||||
$ cd cmake-3.10.3
|
||||
$ ./configure
|
||||
$ make
|
||||
$ sudo make install
|
||||
```
|
||||
|
||||
### 部署步骤
|
||||
|
||||
1. 将本 repo 上传至 VIM3 开发板,或者直接开发板上下载或者 git clone 本 repo
|
||||
2. 目标检测 Demo 位于 `Paddle-Lite-Demo/object_detection/linux/picodet_detection` 目录
|
||||
3. 进入 `Paddle-Lite-Demo/object_detection/linux` 目录, 终端中执行 `download_models_and_libs.sh` 脚本自动下载模型和 Paddle Lite 预测库
|
||||
|
||||
```shell
|
||||
cd Paddle-Lite-Demo/object_detection/linux # 1. 终端中进入 Paddle-Lite-Demo/object_detection/linux
|
||||
sh download_models_and_libs.sh # 2. 执行脚本下载依赖项 (需要联网)
|
||||
```
|
||||
|
||||
下载完成后会出现提示: `Download successful!`
|
||||
4. 执行用例(保证 ARMLinux 环境准备完成)
|
||||
|
||||
```shell
|
||||
cd picodet_detection # 1. 终端中进入
|
||||
sh build.sh armv8 # 2. 编译 Demo 可执行程序,默认编译 armv8,如果是 32bit 环境,则改成 sh build.sh armv7hf。
|
||||
sh run.sh armv8 # 3. 执行物体检测(picodet 模型) demo,会直接开启摄像头,启动图形界面并呈现检测结果。如果是 32bit 环境,则改成 sh run.sh armv7hf
|
||||
```
|
||||
|
||||
### Demo 结果如下:(注意,示例的 picodet 仅使用 coco 数据集,在实际场景中效果一般,请使用实际业务场景重新训练)
|
||||
|
||||
<img src="https://paddlelite-demo.bj.bcebos.com/Paddle-Lite-Demo/demo_view.jpg" alt="demo_view" style="zoom: 10%;" />
|
||||
|
||||
## 更新预测库
|
||||
|
||||
* Paddle Lite 项目:https://github.com/PaddlePaddle/Paddle-Lite
|
||||
* 参考 [芯原 TIM-VX 部署示例](https://paddle-lite.readthedocs.io/zh/develop/demo_guides/verisilicon_timvx.html#tim-vx),编译预测库
|
||||
* 编译最终产物位于 `build.lite.xxx.xxx.xxx` 下的 `inference_lite_lib.xxx.xxx`
|
||||
* 替换 c++ 库
|
||||
* 头文件
|
||||
将生成的 `build.lite.linux.armv8.gcc/inference_lite_lib.armlinux.armv8.nnadapter/cxx/include` 文件夹替换 Demo 中的 `Paddle-Lite-Demo/object_detection/linux/Paddle-Lite/include`
|
||||
* armv8
|
||||
将生成的 `build.lite.linux.armv8.gcc/inference_lite_lib.armlinux.armv8.nnadapter/cxx/libs/libpaddle_full_api_shared.so、libnnadapter.so、libtim-vx.so、libverisilicon_timvx.so` 库替换 Demo 中的 `Paddle-Lite-Demo/object_detection/linux/Paddle-Lite/libs/armv8/` 目录下同名 so
|
||||
* armv7hf
|
||||
将生成的 `build.lite.linux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.nnadapter/cxx/libs/libpaddle_full_api_shared.so、libnnadapter.so、libtim-vx.so、libverisilicon_timvx.so` 库替换 Demo 中的 `Paddle-Lite-Demo/object_detection/linux/Paddle-Lite/libs/armv7hf/` 目录下同名 so
|
||||
|
||||
## Demo 内容介绍
|
||||
|
||||
先整体介绍下目标检测 Demo 的代码结构,然后再简要地介绍 Demo 每部分功能.
|
||||
|
||||
1. `object_detection_demo.cc`: C++ 预测代码
|
||||
|
||||
```shell
|
||||
# 位置:
|
||||
Paddle-Lite-Demo/object_detection/linux/picodet_detection/object_detection_demo.cc
|
||||
```
|
||||
|
||||
2. `models` : 模型文件夹 (执行 download_models_and_libs.sh 后会下载 picodet Paddle 模型), label 使用 Paddle-Lite-Demo/object_detection/assets/labels 目录下 coco_label_list.txt
|
||||
|
||||
```shell
|
||||
# 位置:
|
||||
Paddle-Lite-Demo/object_detection/linux/picodet_detection/models/picodetv2_relu6_coco_no_fuse
|
||||
Paddle-Lite-Demo/object_detection/assets/labels/coco_label_list.txt
|
||||
```
|
||||
|
||||
3. `Paddle-Lite`:内含 Paddle-Lite 头文件和 动态库,默认带有 timvx 加速库,以及第三方库 yaml-cpp 用于解析 yml 配置文件(执行 download_models_and_libs.sh 后会下载)
|
||||
|
||||
```shell
|
||||
# 位置
|
||||
# 如果要替换动态库 so,则将新的动态库 so 更新到此目录下
|
||||
Paddle-Lite-Demo/object_detection/linux/Paddle-Lite/libs/armv8
|
||||
Paddle-Lite-Demo/object_detection/linux/Paddle-Lite/include
|
||||
```
|
||||
|
||||
4. `CMakeLists.txt` : C++ 预测代码的编译脚本,用于生成可执行文件
|
||||
|
||||
```shell
|
||||
# 位置
|
||||
Paddle-Lite-Demo/object_detection/linux/picodet_detection/CMakeLists.txt
|
||||
# 如果有cmake 编译选项更新,可以在 CMakeLists.txt 进行修改即可,默认编译 armv8 可执行文件;
|
||||
```
|
||||
|
||||
5. `build.sh` : 编译脚本
|
||||
|
||||
```shell
|
||||
# 位置
|
||||
Paddle-Lite-Demo/object_detection/linux/picodet_detection/build.sh
|
||||
```
|
||||
|
||||
6. `run.sh` : 运行脚本,请注意设置 arm-aarch,armv8 或者 armv7hf。默认为armv8
|
||||
|
||||
```shell
|
||||
# 位置
|
||||
Paddle-Lite-Demo/object_detection/linux/picodet_detection/run.sh
|
||||
```
|
||||
- 请注意,运行需要5个元素:测试程序、模型、label 文件、异构配置、yaml 文件。
|
||||
|
||||
## 代码讲解 (使用 Paddle Lite `C++ API` 执行预测)
|
||||
|
||||
ARMLinux 示例基于 C++ API 开发,调用 Paddle Lite `C++s API` 包括以下五步。更详细的 `API` 描述参考:[Paddle Lite C++ API ](https://paddle-lite.readthedocs.io/zh/latest/api_reference/cxx_api_doc.html)。
|
||||
|
||||
```c++
|
||||
#include <iostream>
|
||||
// 引入 C++ API
|
||||
#include "include/paddle_api.h"
|
||||
#include "include/paddle_use_ops.h"
|
||||
#include "include/paddle_use_kernels.h"
|
||||
|
||||
// 使用在线编译模型的方式(等价于使用 opt 工具)
|
||||
|
||||
// 1. 设置 CxxConfig
|
||||
paddle::lite_api::CxxConfig cxx_config;
|
||||
std::vector<paddle::lite_api::Place> valid_places;
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt8)});
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)});
|
||||
// 如果只需要 cpu 计算,那到此结束即可,下面是设置 NPU 的代码段
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kNNAdapter), PRECISION(kInt8)});
|
||||
cxx_config.set_valid_places(valid_places);
|
||||
std::string device = "verisilicon_timvx";
|
||||
cxx_config.set_nnadapter_device_names({device});
|
||||
// 设置定制化的异构策略 (如需要)
|
||||
cxx_config.set_nnadapter_subgraph_partition_config_buffer(
|
||||
nnadapter_subgraph_partition_config_string);
|
||||
|
||||
// 2. 生成 nb 模型 (等价于 opt 工具的产出)
|
||||
std::shared_ptr<paddle::lite_api::PaddlePredictor> predictor = nullptr;
|
||||
predictor = paddle::lite_api::CreatePaddlePredictor(cxx_config);
|
||||
predictor->SaveOptimizedModel(
|
||||
model_path, paddle::lite_api::LiteModelType::kNaiveBuffer);
|
||||
|
||||
// 3. 设置 MobileConfig
|
||||
MobileConfig config;
|
||||
config.set_model_from_file(model_path + ".nb"); // 设置 NaiveBuffer 格式模型路径(SaveOptimizedModel 会生成 model_path.nb)
|
||||
config.set_power_mode(LITE_POWER_NO_BIND); // 设置 CPU 运行模式
|
||||
config.set_threads(4); // 设置工作线程数
|
||||
|
||||
// 4. 创建 PaddlePredictor
|
||||
predictor = CreatePaddlePredictor<MobileConfig>(config);
|
||||
|
||||
// 5. 设置输入数据,注意,如果是带后处理的 picodet ,则是有两个输入
|
||||
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
|
||||
input_tensor->Resize({1, 3, 416, 416});
|
||||
auto* data = input_tensor->mutable_data<float>();
|
||||
// scale_factor tensor
|
||||
auto scale_factor_tensor = predictor->GetInput(1);
|
||||
scale_factor_tensor->Resize({1, 2});
|
||||
auto scale_factor_data = scale_factor_tensor->mutable_data<float>();
|
||||
scale_factor_data[0] = 1.0f;
|
||||
scale_factor_data[1] = 1.0f;
|
||||
|
||||
// 6. 执行预测
|
||||
predictor->run();
|
||||
|
||||
// 7. 获取输出数据
|
||||
std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
|
||||
|
||||
```
|
||||
|
||||
## 如何更新模型和输入/输出预处理
|
||||
|
||||
### 更新模型
|
||||
1. 请参考 PaddleDetection 中 [picodet 重训和全量化文档](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/configs/picodet/FULL_QUANTIZATION.md),基于用户自己数据集重训并且重新全量化
|
||||
2. 将模型存放到目录 `object_detection_demo/models/` 下;
|
||||
3. 模型文件名需与工程中保持一致,即均使用 `model`、`params`;
|
||||
|
||||
```shell
|
||||
# shell 脚本 `object_detection_demo/run.sh`
|
||||
TARGET_ABI=armv8 # for 64bit, such as Amlogic A311D
|
||||
#TARGET_ABI=armv7hf # for 32bit, such as Rockchip 1109/1126
|
||||
if [ -n "$1" ]; then
|
||||
TARGET_ABI=$1
|
||||
fi
|
||||
export LD_LIBRARY_PATH=../Paddle-Lite/libs/$TARGET_ABI/
|
||||
export GLOG_v=0 # Paddle-Lite 日志等级
|
||||
export VSI_NN_LOG_LEVEL=0 # TIM-VX 日志等级
|
||||
export VIV_VX_ENABLE_GRAPH_TRANSFORM=-pcq:1 # NPU 开启 perchannel 量化模型
|
||||
export VIV_VX_SET_PER_CHANNEL_ENTROPY=100 # 同上
|
||||
build/object_detection_demo models/picodetv2_relu6_coco_no_fuse ../../assets/labels/coco_label_list.txt models/picodetv2_relu6_coco_no_fuse/subgraph.txt models/picodetv2_relu6_coco_no_fuse/picodet.yml # 执行 Demo 程序,4个 arg 分别为:模型、 label 文件、 自定义异构配置、 yaml
|
||||
```
|
||||
|
||||
- 如果需要更新 `label_list` 或者 `yaml` 文件,则修改 `object_detection_demo/run.sh` 中执行命令的第二个和第四个 arg 指定为新的 label 文件和 yaml 配置文件;
|
||||
|
||||
```shell
|
||||
# 代码文件 `object_detection_demo/run.sh`
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE_LITE_DIR}/libs/${TARGET_ARCH_ABI}
|
||||
build/object_detection_demo {模型} {label} {自定义异构配置文件} {yaml}
|
||||
```
|
||||
|
||||
### 更新输入/输出预处理
|
||||
|
||||
1. 更新输入预处理
|
||||
预处理完全由 yaml 文件决定,如果完全按照 PaddleDetection 中 picodet 的流程重训,只需要替换 yaml 文件即可;预处理参数的解析方式可参考下面的示意代码。
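下面是一个解析预处理参数的最小示意(假设 yaml 结构与示例的 picodet.yml 一致,解析逻辑与 Demo 中的 `load_yaml_config` 相同):

```c++
#include <iostream>
#include <string>
#include <vector>
#include "yaml-cpp/yaml.h"

// 从 yaml 的 TestReader.sample_transforms 中读取 NormalizeImage 的 mean/std/scale
int main() {
  YAML::Node cfg =
      YAML::LoadFile("models/picodetv2_relu6_coco_no_fuse/picodet.yml");
  std::vector<float> mean = {0.f, 0.f, 0.f};
  std::vector<float> stdv = {1.f, 1.f, 1.f};
  float scale = 1.f / 255.f;
  for (const auto &op : cfg["TestReader"]["sample_transforms"]) {
    if (op.begin()->first.as<std::string>() == "NormalizeImage") {
      mean = op.begin()->second["mean"].as<std::vector<float>>();
      stdv = op.begin()->second["std"].as<std::vector<float>>();
      scale = op.begin()->second["scale"].as<float>();
    }
  }
  std::cout << "mean[0]=" << mean[0] << " std[0]=" << stdv[0]
            << " scale=" << scale << std::endl;
  return 0;
}
```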
|
||||
|
||||
2. 更新输出预处理
|
||||
此处需要更新 `object_detection_demo/object_detection_demo.cc` 中的 `postprocess` 方法
|
||||
|
||||
```c++
|
||||
std::vector<RESULT> postprocess(const float *output_data, int64_t output_size,
|
||||
const std::vector<std::string> &word_labels,
|
||||
const float score_threshold,
|
||||
cv::Mat &output_image, double time) {
|
||||
std::vector<RESULT> results;
|
||||
std::vector<cv::Scalar> colors = {
|
||||
cv::Scalar(237, 189, 101), cv::Scalar(0, 0, 255),
|
||||
cv::Scalar(102, 153, 153), cv::Scalar(255, 0, 0),
|
||||
cv::Scalar(9, 255, 0), cv::Scalar(0, 0, 0),
|
||||
cv::Scalar(51, 153, 51)};
|
||||
for (int64_t i = 0; i < output_size; i += 6) {
|
||||
if (output_data[i + 1] < score_threshold) {
|
||||
continue;
|
||||
}
|
||||
int class_id = static_cast<int>(output_data[i]);
|
||||
float score = output_data[i + 1];
|
||||
RESULT result;
|
||||
std::string class_name = "Unknown";
|
||||
if (word_labels.size() > 0 && class_id >= 0 &&
|
||||
class_id < word_labels.size()) {
|
||||
class_name = word_labels[class_id];
|
||||
}
|
||||
result.class_name = class_name;
|
||||
result.score = score;
|
||||
result.left = output_data[i + 2] / 416; // 此处416根据输入的 HW 得来
|
||||
result.top = output_data[i + 3] / 416;
|
||||
result.right = output_data[i + 4] / 416;
|
||||
result.bottom = output_data[i + 5] / 416;
|
||||
int lx = static_cast<int>(result.left * output_image.cols);
|
||||
int ly = static_cast<int>(result.top * output_image.rows);
|
||||
int w = static_cast<int>(result.right * output_image.cols) - lx;
|
||||
int h = static_cast<int>(result.bottom * output_image.rows) - ly;
|
||||
cv::Rect bounding_box =
|
||||
cv::Rect(lx, ly, w, h) &
|
||||
cv::Rect(0, 0, output_image.cols, output_image.rows);
|
||||
if (w > 0 && h > 0 && score <= 1) {
|
||||
cv::Scalar color = colors[results.size() % colors.size()];
|
||||
cv::rectangle(output_image, bounding_box, color);
|
||||
cv::rectangle(output_image, cv::Point2d(lx, ly),
|
||||
cv::Point2d(lx + w, ly - 10), color, -1);
|
||||
cv::putText(output_image, std::to_string(results.size()) + "." +
|
||||
class_name + ":" + std::to_string(score),
|
||||
cv::Point2d(lx, ly), cv::FONT_HERSHEY_PLAIN, 1,
|
||||
cv::Scalar(255, 255, 255));
|
||||
results.push_back(result);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
```
|
||||
|
||||
## 更新模型后,自定义 NPU-CPU 异构配置(如需使用 NPU 加速)
|
||||
由于使用芯原 NPU 在 8bit 量化的情况下有最优的性能,因此部署时,我们往往会考虑量化
|
||||
- 由于量化可能会引入一定程度的精度问题,所以我们可以通过自定义的异构定制,来将部分有精度问题的 layer 异构至cpu,从而达到最优的精度
|
||||
|
||||
### 第一步,确定模型量化后在 arm cpu 上的精度
|
||||
如果在 arm cpu 上,精度都无法满足,那量化本身就是失败的,此时可以考虑修改训练集或者预处理。
|
||||
- 修改 Demo 程序,仅用 arm cpu 计算
|
||||
```c++
|
||||
paddle::lite_api::CxxConfig cxx_config;
|
||||
std::vector<paddle::lite_api::Place> valid_places;
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt8)});
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)});
|
||||
// 仅用 arm cpu 计算, 注释如下代码即可
|
||||
/*
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kNNAdapter), PRECISION(kInt8)});
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kNNAdapter), PRECISION(kFloat)});
|
||||
*/
|
||||
```
|
||||
如果 arm cpu 计算结果精度达标,则继续
|
||||
|
||||
### 第二步,验证开启 NPU 加速后的精度
|
||||
- 回退第一步的修改,不再注释,使用 NPU 加速
|
||||
- 运行 Demo,如果此时精度良好,则无需参考后面步骤,模型部署和替换非常顺利,enjoy it。
|
||||
- 如果精度不行,请参考后续步骤。
|
||||
|
||||
### 第三步,获取整网拓扑信息
|
||||
- 保持使用 NPU 加速(即回退第一步的修改)
|
||||
- 修改 run.sh,将其中的 `export GLOG_v=0` 改为 `export GLOG_v=5`
|
||||
- 运行 Demo,等摄像头启动,即可 ctrl+c 关闭 Demo
|
||||
- 收集日志,搜索关键字 "subgraph operators" 随后那一段,便是整个模型的拓扑信息,其格式如下:
|
||||
- 每行记录由『算子类型:输入张量名列表:输出张量名列表』组成(即以冒号分隔算子类型、输入和输出张量名列表),以逗号分隔输入、输出张量名列表中的每个张量名;
|
||||
- 示例说明:
|
||||
```
|
||||
op_type0:var_name0,var_name1:var_name2 表示将算子类型为 op_type0、输入张量为var_name0 和 var_name1、输出张量为 var_name2 的节点强制运行在 ARM CPU 上
|
||||
```
|
||||
|
||||
### 第四步,修改异构配置文件
|
||||
- 首先查看示例 Demo 中 Paddle-Lite-Demo/object_detection/linux/picodet_detection/models/picodetv2_relu6_coco_no_fuse 目录下的 subgraph.txt 文件。(feed 和 fetch 分别代表整个模型的输入和输出)
|
||||
```
|
||||
feed:feed:scale_factor
|
||||
feed:feed:image
|
||||
|
||||
sqrt:tmp_3:sqrt_0.tmp_0
|
||||
reshape2:sqrt_0.tmp_0:reshape2_0.tmp_0,reshape2_0.tmp_1
|
||||
|
||||
matmul_v2:softmax_0.tmp_0,auto_113_:linear_0.tmp_0
|
||||
reshape2:linear_0.tmp_0:reshape2_2.tmp_0,reshape2_2.tmp_1
|
||||
|
||||
sqrt:tmp_6:sqrt_1.tmp_0
|
||||
reshape2:sqrt_1.tmp_0:reshape2_3.tmp_0,reshape2_3.tmp_1
|
||||
|
||||
matmul_v2:softmax_1.tmp_0,auto_113_:linear_1.tmp_0
|
||||
reshape2:linear_1.tmp_0:reshape2_5.tmp_0,reshape2_5.tmp_1
|
||||
|
||||
sqrt:tmp_9:sqrt_2.tmp_0
|
||||
reshape2:sqrt_2.tmp_0:reshape2_6.tmp_0,reshape2_6.tmp_1
|
||||
|
||||
matmul_v2:softmax_2.tmp_0,auto_113_:linear_2.tmp_0
|
||||
...
|
||||
```
|
||||
- txt 中列出的都是需要异构至 cpu 计算的 layer。在示例 Demo 中,我们把 picodet 后处理部分异构至 arm cpu 计算;Paddle-Lite 的 arm kernel 性能足够,通常不会成为瓶颈。
|
||||
- 如果新训练的模型没有额外修改 layer,则直接复制使用示例 Demo 中的 subgraph.txt 即可
|
||||
- 此时 ./run.sh 看看精度是否符合预期,如果精度符合预期,恭喜,可以跳过本章节,enjoy it。
|
||||
- 如果精度不符合预期,则将上文『第三步,获取整网拓扑信息』中获取的拓扑信息,从 "feed" 之后第一行,直到 "sqrt" 之前,都复制进 subgraph.txt。这一步代表了将大量的 backbone 部分算子放到 arm cpu 计算。
|
||||
- 此时 ./run.sh 看看精度是否符合预期,如果精度达标,那说明在 backbone 中确实存在引入 NPU 精度异常的层(再次重申,写入 subgraph.txt 的 layer 代表强制在 arm cpu 上计算)。
|
||||
- 可以逐行删除、成片删除或用二分法排查,找到引入 NPU 精度异常的 layer,将其留在 subgraph.txt 中;按照经验,如果存在 NPU 精度问题,通常会有 1~5 层 conv layer 需要异构。
|
||||
- 剩余没有精度问题的 layer 从 subgraph.txt 中删除即可;修改后的 subgraph.txt 如何传入 `CxxConfig`,可参考下面的代码示意。
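完成 subgraph.txt 的调整后,可按如下方式将其内容传入 `CxxConfig`(做法与 Demo 源码一致,此处仅为节选示意):

```c++
#include <fstream>
#include <sstream>
#include <string>
#include "paddle_api.h"  // Paddle Lite C++ API

// 将修改后的 subgraph.txt 内容读入字符串,并设置到 CxxConfig
void set_subgraph_config(paddle::lite_api::CxxConfig &cxx_config,
                         const std::string &subgraph_txt_path) {
  std::ifstream fin(subgraph_txt_path);
  std::stringstream ss;
  ss << fin.rdbuf();
  std::string config_str = ss.str();
  if (!config_str.empty()) {
    // subgraph.txt 中的每一行算子都会被强制留在 arm cpu 上计算
    cxx_config.set_nnadapter_subgraph_partition_config_buffer(config_str);
  }
}
```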
|
@@ -1,17 +0,0 @@
|
||||
#!/bin/bash
|
||||
USE_FULL_API=TRUE
|
||||
# configure
|
||||
TARGET_ARCH_ABI=armv8 # for RK3399, set to default arch abi
|
||||
#TARGET_ARCH_ABI=armv7hf # for Raspberry Pi 3B
|
||||
PADDLE_LITE_DIR=../Paddle-Lite
|
||||
THIRD_PARTY_DIR=./third_party
|
||||
if [ "x$1" != "x" ]; then
|
||||
TARGET_ARCH_ABI=$1
|
||||
fi
|
||||
|
||||
# build
|
||||
rm -rf build
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DPADDLE_LITE_DIR=${PADDLE_LITE_DIR} -DTARGET_ARCH_ABI=${TARGET_ARCH_ABI} -DTHIRD_PARTY_DIR=${THIRD_PARTY_DIR} ..
|
||||
make
|
@@ -1,411 +0,0 @@
|
||||
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle_api.h"
|
||||
#include "yaml-cpp/yaml.h"
|
||||
#include <arm_neon.h>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <opencv2/core.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include <stdio.h>
|
||||
#include <sys/time.h>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
|
||||
int WARMUP_COUNT = 0;
|
||||
int REPEAT_COUNT = 1;
|
||||
const int CPU_THREAD_NUM = 2;
|
||||
const paddle::lite_api::PowerMode CPU_POWER_MODE =
|
||||
paddle::lite_api::PowerMode::LITE_POWER_HIGH;
|
||||
const std::vector<int64_t> INPUT_SHAPE = {1, 3, 416, 416};
|
||||
std::vector<float> INPUT_MEAN = {0.f, 0.f, 0.f};
|
||||
std::vector<float> INPUT_STD = {1.f, 1.f, 1.f};
|
||||
float INPUT_SCALE = 1 / 255.f;
|
||||
const float SCORE_THRESHOLD = 0.35f;
|
||||
|
||||
struct RESULT {
|
||||
std::string class_name;
|
||||
float score;
|
||||
float left;
|
||||
float top;
|
||||
float right;
|
||||
float bottom;
|
||||
};
|
||||
|
||||
inline int64_t get_current_us() {
|
||||
struct timeval time;
|
||||
gettimeofday(&time, NULL);
|
||||
return 1000000LL * (int64_t)time.tv_sec + (int64_t)time.tv_usec;
|
||||
}
|
||||
|
||||
bool read_file(const std::string &filename, std::vector<char> *contents,
|
||||
bool binary = true) {
|
||||
FILE *fp = fopen(filename.c_str(), binary ? "rb" : "r");
|
||||
if (!fp)
|
||||
return false;
|
||||
fseek(fp, 0, SEEK_END);
|
||||
size_t size = ftell(fp);
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
contents->clear();
|
||||
contents->resize(size);
|
||||
size_t offset = 0;
|
||||
char *ptr = reinterpret_cast<char *>(&(contents->at(0)));
|
||||
while (offset < size) {
|
||||
size_t already_read = fread(ptr, 1, size - offset, fp);
|
||||
offset += already_read;
|
||||
ptr += already_read;
|
||||
}
|
||||
fclose(fp);
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<std::string> load_labels(const std::string &path) {
|
||||
std::ifstream file;
|
||||
std::vector<std::string> labels;
|
||||
file.open(path);
|
||||
while (file) {
|
||||
std::string line;
|
||||
std::getline(file, line);
|
||||
labels.push_back(line);
|
||||
}
|
||||
file.clear();
|
||||
file.close();
|
||||
return labels;
|
||||
}
|
||||
|
||||
bool load_yaml_config(std::string yaml_path) {
|
||||
YAML::Node cfg;
|
||||
try {
|
||||
std::cout << "before loadFile" << std::endl;
|
||||
cfg = YAML::LoadFile(yaml_path);
|
||||
} catch (YAML::BadFile &e) {
|
||||
std::cout << "Failed to load yaml file " << yaml_path
|
||||
<< ", maybe you should check this file." << std::endl;
|
||||
return false;
|
||||
}
|
||||
auto preprocess_cfg = cfg["TestReader"]["sample_transforms"];
|
||||
for (const auto &op : preprocess_cfg) {
|
||||
if (!op.IsMap()) {
|
||||
std::cout << "Require the transform information in yaml be Map type."
|
||||
<< std::endl;
|
||||
std::abort();
|
||||
}
|
||||
auto op_name = op.begin()->first.as<std::string>();
|
||||
if (op_name == "NormalizeImage") {
|
||||
INPUT_MEAN = op.begin()->second["mean"].as<std::vector<float>>();
|
||||
INPUT_STD = op.begin()->second["std"].as<std::vector<float>>();
|
||||
INPUT_SCALE = op.begin()->second["scale"].as<float>();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void preprocess(cv::Mat &input_image, std::vector<float> &input_mean,
|
||||
std::vector<float> &input_std, float input_scale,
|
||||
int input_width, int input_height, float *input_data) {
|
||||
cv::Mat resize_image;
|
||||
cv::resize(input_image, resize_image, cv::Size(input_width, input_height), 0,
|
||||
0);
|
||||
if (resize_image.channels() == 4) {
|
||||
cv::cvtColor(resize_image, resize_image, cv::COLOR_BGRA2RGB);
|
||||
}
|
||||
cv::Mat norm_image;
|
||||
resize_image.convertTo(norm_image, CV_32FC3, input_scale);
|
||||
// NHWC->NCHW
|
||||
int image_size = input_height * input_width;
|
||||
const float *image_data = reinterpret_cast<const float *>(norm_image.data);
|
||||
float32x4_t vmean0 = vdupq_n_f32(input_mean[0]);
|
||||
float32x4_t vmean1 = vdupq_n_f32(input_mean[1]);
|
||||
float32x4_t vmean2 = vdupq_n_f32(input_mean[2]);
|
||||
float32x4_t vscale0 = vdupq_n_f32(1.0f / input_std[0]);
|
||||
float32x4_t vscale1 = vdupq_n_f32(1.0f / input_std[1]);
|
||||
float32x4_t vscale2 = vdupq_n_f32(1.0f / input_std[2]);
|
||||
float *input_data_c0 = input_data;
|
||||
float *input_data_c1 = input_data + image_size;
|
||||
float *input_data_c2 = input_data + image_size * 2;
|
||||
int i = 0;
|
||||
for (; i < image_size - 3; i += 4) {
|
||||
float32x4x3_t vin3 = vld3q_f32(image_data);
|
||||
float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
|
||||
float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
|
||||
float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
|
||||
float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
|
||||
float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
|
||||
float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
|
||||
vst1q_f32(input_data_c0, vs0);
|
||||
vst1q_f32(input_data_c1, vs1);
|
||||
vst1q_f32(input_data_c2, vs2);
|
||||
image_data += 12;
|
||||
input_data_c0 += 4;
|
||||
input_data_c1 += 4;
|
||||
input_data_c2 += 4;
|
||||
}
|
||||
for (; i < image_size; i++) {
|
||||
*(input_data_c0++) = (*(image_data++) - input_mean[0]) / input_std[0];
|
||||
*(input_data_c1++) = (*(image_data++) - input_mean[1]) / input_std[1];
|
||||
*(input_data_c2++) = (*(image_data++) - input_mean[2]) / input_std[2];
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<RESULT> postprocess(const float *output_data, int64_t output_size,
|
||||
const std::vector<std::string> &word_labels,
|
||||
const float score_threshold,
|
||||
cv::Mat &output_image, double time) {
|
||||
std::vector<RESULT> results;
|
||||
std::vector<cv::Scalar> colors = {
|
||||
cv::Scalar(237, 189, 101), cv::Scalar(0, 0, 255),
|
||||
cv::Scalar(102, 153, 153), cv::Scalar(255, 0, 0),
|
||||
cv::Scalar(9, 255, 0), cv::Scalar(0, 0, 0),
|
||||
cv::Scalar(51, 153, 51)};
|
||||
for (int64_t i = 0; i < output_size; i += 6) {
|
||||
if (output_data[i + 1] < score_threshold) {
|
||||
continue;
|
||||
}
|
||||
int class_id = static_cast<int>(output_data[i]);
|
||||
float score = output_data[i + 1];
|
||||
RESULT result;
|
||||
std::string class_name = "Unknown";
|
||||
if (word_labels.size() > 0 && class_id >= 0 &&
|
||||
class_id < word_labels.size()) {
|
||||
class_name = word_labels[class_id];
|
||||
}
|
||||
result.class_name = class_name;
|
||||
result.score = score;
|
||||
result.left = output_data[i + 2] / 416;
|
||||
result.top = output_data[i + 3] / 416;
|
||||
result.right = output_data[i + 4] / 416;
|
||||
result.bottom = output_data[i + 5] / 416;
|
||||
int lx = static_cast<int>(result.left * output_image.cols);
|
||||
int ly = static_cast<int>(result.top * output_image.rows);
|
||||
int w = static_cast<int>(result.right * output_image.cols) - lx;
|
||||
int h = static_cast<int>(result.bottom * output_image.rows) - ly;
|
||||
cv::Rect bounding_box =
|
||||
cv::Rect(lx, ly, w, h) &
|
||||
cv::Rect(0, 0, output_image.cols, output_image.rows);
|
||||
if (w > 0 && h > 0 && score <= 1) {
|
||||
cv::Scalar color = colors[results.size() % colors.size()];
|
||||
cv::rectangle(output_image, bounding_box, color);
|
||||
cv::rectangle(output_image, cv::Point2d(lx, ly),
|
||||
cv::Point2d(lx + w, ly - 10), color, -1);
|
||||
cv::putText(output_image, std::to_string(results.size()) + "." +
|
||||
class_name + ":" + std::to_string(score),
|
||||
cv::Point2d(lx, ly), cv::FONT_HERSHEY_PLAIN, 1,
|
||||
cv::Scalar(255, 255, 255));
|
||||
results.push_back(result);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
cv::Mat process(cv::Mat &input_image, std::vector<std::string> &word_labels,
|
||||
std::shared_ptr<paddle::lite_api::PaddlePredictor> &predictor) {
|
||||
// Preprocess image and fill the data of input tensor
|
||||
std::unique_ptr<paddle::lite_api::Tensor> input_tensor(
|
||||
std::move(predictor->GetInput(0)));
|
||||
input_tensor->Resize(INPUT_SHAPE);
|
||||
int input_width = INPUT_SHAPE[3];
|
||||
int input_height = INPUT_SHAPE[2];
|
||||
auto *input_data = input_tensor->mutable_data<float>();
|
||||
#if 1
|
||||
// scale_factor tensor
|
||||
auto scale_factor_tensor = predictor->GetInput(1);
|
||||
scale_factor_tensor->Resize({1, 2});
|
||||
auto scale_factor_data = scale_factor_tensor->mutable_data<float>();
|
||||
scale_factor_data[0] = 1.0f;
|
||||
scale_factor_data[1] = 1.0f;
|
||||
#endif
|
||||
|
||||
double preprocess_start_time = get_current_us();
|
||||
preprocess(input_image, INPUT_MEAN, INPUT_STD, INPUT_SCALE, input_width,
|
||||
input_height, input_data);
|
||||
double preprocess_end_time = get_current_us();
|
||||
double preprocess_time =
|
||||
(preprocess_end_time - preprocess_start_time) / 1000.0f;
|
||||
|
||||
double prediction_time;
|
||||
// Run predictor
|
||||
// warm up to skip the first inference and get more stable time, remove it in
|
||||
// actual products
|
||||
for (int i = 0; i < WARMUP_COUNT; i++) {
|
||||
predictor->Run();
|
||||
}
|
||||
// repeat to obtain the average time, set REPEAT_COUNT=1 in actual products
|
||||
double max_time_cost = 0.0f;
|
||||
double min_time_cost = std::numeric_limits<float>::max();
|
||||
double total_time_cost = 0.0f;
|
||||
for (int i = 0; i < REPEAT_COUNT; i++) {
|
||||
auto start = get_current_us();
|
||||
predictor->Run();
|
||||
auto end = get_current_us();
|
||||
double cur_time_cost = (end - start) / 1000.0f;
|
||||
if (cur_time_cost > max_time_cost) {
|
||||
max_time_cost = cur_time_cost;
|
||||
}
|
||||
if (cur_time_cost < min_time_cost) {
|
||||
min_time_cost = cur_time_cost;
|
||||
}
|
||||
total_time_cost += cur_time_cost;
|
||||
prediction_time = total_time_cost / REPEAT_COUNT;
|
||||
printf("iter %d cost: %f ms\n", i, cur_time_cost);
|
||||
}
|
||||
printf("warmup: %d repeat: %d, average: %f ms, max: %f ms, min: %f ms\n",
|
||||
WARMUP_COUNT, REPEAT_COUNT, prediction_time, max_time_cost,
|
||||
min_time_cost);
|
||||
|
||||
// Get the data of output tensor and postprocess to output detected objects
|
||||
std::unique_ptr<const paddle::lite_api::Tensor> output_tensor(
|
||||
std::move(predictor->GetOutput(0)));
|
||||
const float *output_data = output_tensor->mutable_data<float>();
|
||||
int64_t output_size = 1;
|
||||
for (auto dim : output_tensor->shape()) {
|
||||
output_size *= dim;
|
||||
}
|
||||
cv::Mat output_image = input_image.clone();
|
||||
double postprocess_start_time = get_current_us();
|
||||
std::vector<RESULT> results =
|
||||
postprocess(output_data, output_size, word_labels, SCORE_THRESHOLD,
|
||||
output_image, prediction_time);
|
||||
double postprocess_end_time = get_current_us();
|
||||
double postprocess_time =
|
||||
(postprocess_end_time - postprocess_start_time) / 1000.0f;
|
||||
|
||||
printf("results: %d\n", results.size());
|
||||
for (int i = 0; i < results.size(); i++) {
|
||||
printf("[%d] %s - %f %f,%f,%f,%f\n", i, results[i].class_name.c_str(),
|
||||
results[i].score, results[i].left, results[i].top, results[i].right,
|
||||
results[i].bottom);
|
||||
}
|
||||
printf("Preprocess time: %f ms\n", preprocess_time);
|
||||
printf("Prediction time: %f ms\n", prediction_time);
|
||||
printf("Postprocess time: %f ms\n\n", postprocess_time);
|
||||
|
||||
return output_image;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 5 || argc == 6) {
|
||||
printf("Usage: \n"
|
||||
"./object_detection_demo model_dir label_path [input_image_path] "
|
||||
"[output_image_path]"
|
||||
"use images from camera if input_image_path and input_image_path "
|
||||
"isn't provided.");
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::string model_path = argv[1];
|
||||
std::string label_path = argv[2];
|
||||
std::vector<std::string> word_labels = load_labels(label_path);
|
||||
std::string nnadapter_subgraph_partition_config_path = argv[3];
|
||||
|
||||
std::string yaml_path = argv[4];
|
||||
if (yaml_path != "null") {
|
||||
load_yaml_config(yaml_path);
|
||||
}
|
||||
|
||||
// Run inference by using full api with CxxConfig
|
||||
paddle::lite_api::CxxConfig cxx_config;
|
||||
if (1) { // combined model
|
||||
cxx_config.set_model_file(model_path + "/model");
|
||||
cxx_config.set_param_file(model_path + "/params");
|
||||
} else {
|
||||
cxx_config.set_model_dir(model_path);
|
||||
}
|
||||
cxx_config.set_threads(CPU_THREAD_NUM);
|
||||
cxx_config.set_power_mode(CPU_POWER_MODE);
|
||||
|
||||
std::shared_ptr<paddle::lite_api::PaddlePredictor> predictor = nullptr;
|
||||
std::vector<paddle::lite_api::Place> valid_places;
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt8)});
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)});
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kNNAdapter), PRECISION(kInt8)});
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kNNAdapter), PRECISION(kFloat)});
|
||||
cxx_config.set_valid_places(valid_places);
|
||||
std::string device = "verisilicon_timvx";
|
||||
cxx_config.set_nnadapter_device_names({device});
|
||||
// cxx_config.set_nnadapter_context_properties(nnadapter_context_properties);
|
||||
|
||||
// cxx_config.set_nnadapter_model_cache_dir(nnadapter_model_cache_dir);
|
||||
// Set the subgraph custom partition configuration file
|
||||
|
||||
if (!nnadapter_subgraph_partition_config_path.empty()) {
|
||||
std::vector<char> nnadapter_subgraph_partition_config_buffer;
|
||||
if (read_file(nnadapter_subgraph_partition_config_path,
|
||||
&nnadapter_subgraph_partition_config_buffer, false)) {
|
||||
if (!nnadapter_subgraph_partition_config_buffer.empty()) {
|
||||
std::string nnadapter_subgraph_partition_config_string(
|
||||
nnadapter_subgraph_partition_config_buffer.data(),
|
||||
nnadapter_subgraph_partition_config_buffer.size());
|
||||
cxx_config.set_nnadapter_subgraph_partition_config_buffer(
|
||||
nnadapter_subgraph_partition_config_string);
|
||||
}
|
||||
} else {
|
||||
printf("Failed to load the subgraph custom partition configuration file "
|
||||
"%s\n",
|
||||
nnadapter_subgraph_partition_config_path.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
predictor = paddle::lite_api::CreatePaddlePredictor(cxx_config);
|
||||
predictor->SaveOptimizedModel(
|
||||
model_path, paddle::lite_api::LiteModelType::kNaiveBuffer);
|
||||
} catch (std::exception e) {
|
||||
printf("An internal error occurred in PaddleLite(cxx config).\n");
|
||||
}
|
||||
|
||||
paddle::lite_api::MobileConfig config;
|
||||
config.set_model_from_file(model_path + ".nb");
|
||||
config.set_threads(CPU_THREAD_NUM);
|
||||
config.set_power_mode(CPU_POWER_MODE);
|
||||
config.set_nnadapter_device_names({device});
|
||||
predictor =
|
||||
paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(
|
||||
config);
|
||||
if (argc > 5) {
|
||||
WARMUP_COUNT = 1;
|
||||
REPEAT_COUNT = 5;
|
||||
std::string input_image_path = argv[5];
|
||||
std::string output_image_path = argv[6];
|
||||
cv::Mat input_image = cv::imread(input_image_path);
|
||||
cv::Mat output_image = process(input_image, word_labels, predictor);
|
||||
cv::imwrite(output_image_path, output_image);
|
||||
cv::imshow("Object Detection Demo", output_image);
|
||||
cv::waitKey(0);
|
||||
} else {
|
||||
cv::VideoCapture cap(1);
|
||||
cap.set(cv::CAP_PROP_FRAME_WIDTH, 640);
|
||||
cap.set(cv::CAP_PROP_FRAME_HEIGHT, 480);
|
||||
if (!cap.isOpened()) {
|
||||
return -1;
|
||||
}
|
||||
while (1) {
|
||||
cv::Mat input_image;
|
||||
cap >> input_image;
|
||||
cv::Mat output_image = process(input_image, word_labels, predictor);
|
||||
cv::imshow("Object Detection Demo", output_image);
|
||||
if (cv::waitKey(1) == char('q')) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
cap.release();
|
||||
cv::destroyAllWindows();
|
||||
}
|
||||
return 0;
|
||||
}
|
@@ -1,15 +0,0 @@
|
||||
#!/bin/bash
|
||||
#run
|
||||
|
||||
TARGET_ABI=armv8 # for 64bit
|
||||
#TARGET_ABI=armv7hf # for 32bit
|
||||
if [ -n "$1" ]; then
|
||||
TARGET_ABI=$1
|
||||
fi
|
||||
export LD_LIBRARY_PATH=../Paddle-Lite/libs/$TARGET_ABI/
|
||||
export GLOG_v=0
|
||||
export VSI_NN_LOG_LEVEL=0
|
||||
export VIV_VX_ENABLE_GRAPH_TRANSFORM=-pcq:1
|
||||
export VIV_VX_SET_PER_CHANNEL_ENTROPY=100
|
||||
export TIMVX_BATCHNORM_FUSION_MAX_ALLOWED_QUANT_SCALE_DEVIATION=30000
|
||||
build/object_detection_demo models/picodetv2_relu6_coco_no_fuse ../../assets/labels/coco_label_list.txt models/picodetv2_relu6_coco_no_fuse/subgraph.txt models/picodetv2_relu6_coco_no_fuse/picodet.yml
|
@@ -113,5 +113,7 @@ Preprocess:
|
||||
type: Resize
|
||||
```
|
||||
|
||||
## 其他链接
|
||||
- [Cpp部署](./cpp)
|
||||
- [Python部署](./python)
|
||||
- [视觉模型预测结果](../../../../../docs/api/vision_results/)
|
||||
|
11
examples/vision/detection/paddledetection/rv1126/README.md
Executable file
11
examples/vision/detection/paddledetection/rv1126/README.md
Executable file
@@ -0,0 +1,11 @@
|
||||
# PP-YOLOE 量化模型在 RV1126 上的部署
|
||||
目前 FastDeploy 已经支持基于 PaddleLite 部署 PP-YOLOE 量化模型到 RV1126 上。
|
||||
|
||||
模型的量化和量化模型的下载请参考:[模型量化](../quantize/README.md)
|
||||
|
||||
|
||||
## 详细部署文档
|
||||
|
||||
在 RV1126 上只支持 C++ 的部署。
|
||||
|
||||
- [C++部署](cpp)
|
38
examples/vision/detection/paddledetection/rv1126/cpp/CMakeLists.txt
Executable file
38
examples/vision/detection/paddledetection/rv1126/cpp/CMakeLists.txt
Executable file
@@ -0,0 +1,38 @@
|
||||
PROJECT(infer_demo C CXX)
|
||||
CMAKE_MINIMUM_REQUIRED (VERSION 3.10)
|
||||
|
||||
# 指定下载解压后的fastdeploy库路径
|
||||
option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.")
|
||||
|
||||
include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
|
||||
|
||||
# 添加FastDeploy依赖头文件
|
||||
include_directories(${FASTDEPLOY_INCS})
|
||||
include_directories(${FastDeploy_INCLUDE_DIRS})
|
||||
|
||||
add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer_ppyoloe.cc)
|
||||
# 添加FastDeploy库依赖
|
||||
target_link_libraries(infer_demo ${FASTDEPLOY_LIBS})
|
||||
|
||||
set(CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR}/build/install)
|
||||
|
||||
install(TARGETS infer_demo DESTINATION ./)
|
||||
|
||||
install(DIRECTORY models DESTINATION ./)
|
||||
install(DIRECTORY images DESTINATION ./)
|
||||
# install(DIRECTORY run_with_adb.sh DESTINATION ./)
|
||||
|
||||
file(GLOB FASTDEPLOY_LIBS ${FASTDEPLOY_INSTALL_DIR}/lib/*)
|
||||
install(PROGRAMS ${FASTDEPLOY_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB OPENCV_LIBS ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/opencv/lib/lib*)
|
||||
install(PROGRAMS ${OPENCV_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB PADDLELITE_LIBS ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/paddlelite/lib/lib*)
|
||||
install(PROGRAMS ${PADDLELITE_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB TIMVX_LIBS ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/paddlelite/lib/verisilicon_timvx/*)
|
||||
install(PROGRAMS ${TIMVX_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB ADB_TOOLS run_with_adb.sh)
|
||||
install(PROGRAMS ${ADB_TOOLS} DESTINATION ./)
|
55
examples/vision/detection/paddledetection/rv1126/cpp/README.md
Executable file
55
examples/vision/detection/paddledetection/rv1126/cpp/README.md
Executable file
@@ -0,0 +1,55 @@
|
||||
# PP-YOLOE 量化模型 C++ 部署示例
|
||||
|
||||
本目录下提供的 `infer_ppyoloe.cc`,可以帮助用户快速完成 PP-YOLOE 量化模型在 RV1126 上的部署推理加速。
|
||||
|
||||
## 部署准备
|
||||
### FastDeploy 交叉编译环境准备
|
||||
- 1. 软硬件环境满足要求,以及交叉编译环境的准备,请参考:[FastDeploy 交叉编译环境准备](../../../../../../docs/cn/build_and_install/rv1126.md#交叉编译环境搭建)
|
||||
|
||||
### 模型准备
|
||||
- 1. 用户可以直接使用由 FastDeploy 提供的量化模型进行部署。
|
||||
- 2. 用户可以先使用 PaddleDetection 自行导出 Float32 模型,注意导出模型时设置参数:use_shared_conv=False,更多细节请参考:[PP-YOLOE](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/configs/ppyoloe)
|
||||
- 3. 用户可以使用 FastDeploy 提供的[一键模型自动化压缩工具](../../../../../../tools/common_tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署。(注意: 推理量化后的检测模型仍然需要FP32模型文件夹下的 infer_cfg.yml 文件,自行量化的模型文件夹内不包含此 yaml 文件,用户从 FP32 模型文件夹下复制此yaml文件到量化后的模型文件夹内即可。)
|
||||
- 更多量化相关信息可查阅[模型量化](../../quantize/README.md)
|
||||
|
||||
## 在 RV1126 上部署量化后的 PP-YOLOE 检测模型
|
||||
请按照以下步骤完成在 RV1126 上部署 PP-YOLOE 量化模型:
|
||||
1. 交叉编译 FastDeploy 库,具体请参考:[交叉编译 FastDeploy](../../../../../../docs/cn/build_and_install/rv1126.md#基于-paddlelite-的-fastdeploy-交叉编译库编译)
|
||||
|
||||
2. 将编译后的库拷贝到当前目录,可使用如下命令:
|
||||
```bash
|
||||
cp -r FastDeploy/build/fastdeploy-tmivx/ FastDeploy/examples/vision/detection/yolov5/rv1126/cpp
|
||||
```
|
||||
|
||||
3. 在当前路径下载部署所需的模型和示例图片:
|
||||
```bash
|
||||
mkdir models && mkdir images
|
||||
wget https://bj.bcebos.com/fastdeploy/models/ppyoloe_noshare_qat.tar.gz
|
||||
tar -xvf ppyoloe_noshare_qat.tar.gz
|
||||
cp -r ppyoloe_noshare_qat models
|
||||
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
|
||||
cp -r 000000014439.jpg images
|
||||
```
|
||||
|
||||
4. 编译部署示例,可使用如下命令:
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=${PWD}/../fastdeploy-tmivx/timvx.cmake -DFASTDEPLOY_INSTALL_DIR=${PWD}/../fastdeploy-tmivx ..
|
||||
make -j8
|
||||
make install
|
||||
# 成功编译之后,会生成 install 文件夹,里面有一个运行 demo 和部署所需的库
|
||||
```
|
||||
|
||||
5. 基于 adb 工具部署 PP-YOLOE 检测模型到 Rockchip RV1126,可使用如下命令:
|
||||
```bash
|
||||
# 进入 install 目录
|
||||
cd FastDeploy/examples/vision/detection/paddledetection/rv1126/cpp/build/install/
|
||||
# 如下命令表示:bash run_with_adb.sh 需要运行的demo 模型路径 图片路径 设备的DEVICE_ID
|
||||
bash run_with_adb.sh infer_demo ppyoloe_noshare_qat 000000014439.jpg $DEVICE_ID
|
||||
```
|
||||
|
||||
部署成功后运行结果如下:
|
||||
|
||||
<img width="640" src="https://user-images.githubusercontent.com/30516196/203708564-43c49485-9b48-4eb2-8fe7-0fa517979fff.png">
|
||||
|
||||
需要特别注意的是,在 RV1126 上部署的模型需要是量化后的模型,模型的量化请参考:[模型量化](../../../../../../docs/cn/quantize.md)
|
66
examples/vision/detection/paddledetection/rv1126/cpp/infer_ppyoloe.cc
Executable file
66
examples/vision/detection/paddledetection/rv1126/cpp/infer_ppyoloe.cc
Executable file
@@ -0,0 +1,66 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "fastdeploy/vision.h"
|
||||
#ifdef WIN32
|
||||
const char sep = '\\';
|
||||
#else
|
||||
const char sep = '/';
|
||||
#endif
|
||||
|
||||
void InitAndInfer(const std::string& model_dir, const std::string& image_file) {
|
||||
auto model_file = model_dir + sep + "model.pdmodel";
|
||||
auto params_file = model_dir + sep + "model.pdiparams";
|
||||
auto config_file = model_dir + sep + "infer_cfg.yml";
|
||||
auto subgraph_file = model_dir + sep + "subgraph.txt";
|
||||
|
||||
fastdeploy::RuntimeOption option;
|
||||
option.UseTimVX();
|
||||
option.SetLiteSubgraphPartitionPath(subgraph_file);
|
||||
|
||||
auto model = fastdeploy::vision::detection::PPYOLOE(model_file, params_file,
|
||||
config_file, option);
|
||||
assert(model.Initialized());
|
||||
|
||||
auto im = cv::imread(image_file);
|
||||
|
||||
fastdeploy::vision::DetectionResult res;
|
||||
if (!model.Predict(im, &res)) {
|
||||
std::cerr << "Failed to predict." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << res.Str() << std::endl;
|
||||
|
||||
auto vis_im = fastdeploy::vision::VisDetection(im, res, 0.5);
|
||||
cv::imwrite("vis_result.jpg", vis_im);
|
||||
std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
|
||||
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
if (argc < 3) {
|
||||
std::cout << "Usage: infer_demo path/to/quant_model "
|
||||
"path/to/image "
|
||||
"run_option, "
|
||||
"e.g ./infer_demo ./PPYOLOE_L_quant ./test.jpeg"
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::string model_dir = argv[1];
|
||||
std::string test_image = argv[2];
|
||||
InitAndInfer(model_dir, test_image);
|
||||
return 0;
|
||||
}
|
47
examples/vision/detection/paddledetection/rv1126/cpp/run_with_adb.sh
Executable file
47
examples/vision/detection/paddledetection/rv1126/cpp/run_with_adb.sh
Executable file
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
|
||||
HOST_SPACE=${PWD}
|
||||
echo ${HOST_SPACE}
|
||||
WORK_SPACE=/data/local/tmp/test
|
||||
|
||||
# The first parameter represents the demo name
|
||||
DEMO_NAME=image_classification_demo
|
||||
if [ -n "$1" ]; then
|
||||
DEMO_NAME=$1
|
||||
fi
|
||||
|
||||
# The second parameter represents the model name
|
||||
MODEL_NAME=mobilenet_v1_fp32_224
|
||||
if [ -n "$2" ]; then
|
||||
MODEL_NAME=$2
|
||||
fi
|
||||
|
||||
# The third parameter indicates the name of the image to be tested
|
||||
IMAGE_NAME=0001.jpg
|
||||
if [ -n "$3" ]; then
|
||||
IMAGE_NAME=$3
|
||||
fi
|
||||
|
||||
# The fourth parameter represents the ID of the device
|
||||
ADB_DEVICE_NAME=
|
||||
if [ -n "$4" ]; then
|
||||
ADB_DEVICE_NAME="-s $4"
|
||||
fi
|
||||
|
||||
# Set the environment variables required during the running process
|
||||
EXPORT_ENVIRONMENT_VARIABLES="export GLOG_v=5; export SUBGRAPH_ONLINE_MODE=true; export RKNPU_LOGLEVEL=5; export RKNN_LOG_LEVEL=5; ulimit -c unlimited; export VIV_VX_ENABLE_GRAPH_TRANSFORM=-pcq:1; export VIV_VX_SET_PER_CHANNEL_ENTROPY=100; export TIMVX_BATCHNORM_FUSION_MAX_ALLOWED_QUANT_SCALE_DEVIATION=300000; export VSI_NN_LOG_LEVEL=5;"
|
||||
|
||||
EXPORT_ENVIRONMENT_VARIABLES="${EXPORT_ENVIRONMENT_VARIABLES}export LD_LIBRARY_PATH=${WORK_SPACE}/lib:\$LD_LIBRARY_PATH;"
|
||||
|
||||
# Please install adb, and DON'T run this in the docker.
|
||||
set -e
|
||||
adb $ADB_DEVICE_NAME shell "rm -rf $WORK_SPACE"
|
||||
adb $ADB_DEVICE_NAME shell "mkdir -p $WORK_SPACE"
|
||||
|
||||
# Upload the demo, librarys, model and test images to the device
|
||||
adb $ADB_DEVICE_NAME push ${HOST_SPACE}/lib $WORK_SPACE
|
||||
adb $ADB_DEVICE_NAME push ${HOST_SPACE}/${DEMO_NAME} $WORK_SPACE
|
||||
adb $ADB_DEVICE_NAME push models $WORK_SPACE
|
||||
adb $ADB_DEVICE_NAME push images $WORK_SPACE
|
||||
|
||||
# Execute the deployment demo
|
||||
adb $ADB_DEVICE_NAME shell "cd $WORK_SPACE; ${EXPORT_ENVIRONMENT_VARIABLES} chmod +x ./${DEMO_NAME}; ./${DEMO_NAME} ./models/${MODEL_NAME} ./images/$IMAGE_NAME"
|
18
examples/vision/detection/rkyolo/README.md
Normal file
18
examples/vision/detection/rkyolo/README.md
Normal file
@@ -0,0 +1,18 @@
|
||||
# RKYOLO准备部署模型
|
||||
|
||||
RKYOLO参考[rknn_model_zoo](https://github.com/airockchip/rknn_model_zoo/tree/main/models/CV/object_detection/yolo)的代码
|
||||
对RKYOLO系列模型进行了封装,目前支持RKYOLOV5系列模型的部署。
|
||||
|
||||
## 支持模型列表
|
||||
|
||||
* RKYOLOV5
|
||||
|
||||
## 模型转换example
|
||||
|
||||
请参考[RKNN_model_convert](https://github.com/airockchip/rknn_model_zoo/tree/main/models/CV/object_detection/yolo/RKNN_model_convert)
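转换得到 RKNN 模型后,可用 FastDeploy 的 C++ API 加载并推理,下面是一个最小调用示意(模型与图片路径仅为示例假设,完整代码见 cpp 目录下的 `infer_rkyolo.cc`):

```c++
#include <iostream>
#include "fastdeploy/vision.h"

int main() {
  auto option = fastdeploy::RuntimeOption();
  option.UseRKNPU2();  // 使用 RK 二代 NPU 推理

  // 加载转换后的 RKNN 模型(路径仅为示例)
  auto model = fastdeploy::vision::detection::RKYOLOV5(
      "model/yolov5s.rknn", option, fastdeploy::ModelFormat::RKNN);

  auto im = cv::imread("images/000000014439.jpg");
  fastdeploy::vision::DetectionResult res;
  if (!model.Predict(im, &res)) {
    std::cerr << "Failed to predict." << std::endl;
    return -1;
  }
  std::cout << res.Str() << std::endl;
  return 0;
}
```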
|
||||
|
||||
|
||||
## 其他链接
|
||||
- [Cpp部署](./cpp)
|
||||
- [Python部署](./python)
|
||||
- [视觉模型预测结果](../../../../docs/api/vision_results/)
|
37
examples/vision/detection/rkyolo/cpp/CMakeLists.txt
Normal file
37
examples/vision/detection/rkyolo/cpp/CMakeLists.txt
Normal file
@@ -0,0 +1,37 @@
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 3.10)
|
||||
project(rknpu2_test)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
|
||||
# 指定下载解压后的fastdeploy库路径
|
||||
set(FASTDEPLOY_INSTALL_DIR "thirdpartys/fastdeploy-0.0.3")
|
||||
|
||||
include(${FASTDEPLOY_INSTALL_DIR}/FastDeployConfig.cmake)
|
||||
include_directories(${FastDeploy_INCLUDE_DIRS})
|
||||
|
||||
add_executable(infer_rkyolo infer_rkyolo.cc)
|
||||
target_link_libraries(infer_rkyolo ${FastDeploy_LIBS})
|
||||
|
||||
|
||||
|
||||
set(CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR}/build/install)
|
||||
|
||||
install(TARGETS infer_rkyolo DESTINATION ./)
|
||||
|
||||
install(DIRECTORY model DESTINATION ./)
|
||||
install(DIRECTORY images DESTINATION ./)
|
||||
|
||||
file(GLOB FASTDEPLOY_LIBS ${FASTDEPLOY_INSTALL_DIR}/lib/*)
|
||||
message("${FASTDEPLOY_LIBS}")
|
||||
install(PROGRAMS ${FASTDEPLOY_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB ONNXRUNTIME_LIBS ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/onnxruntime/lib/*)
|
||||
install(PROGRAMS ${ONNXRUNTIME_LIBS} DESTINATION lib)
|
||||
|
||||
install(DIRECTORY ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/opencv/lib DESTINATION ./)
|
||||
|
||||
file(GLOB PADDLETOONNX_LIBS ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/paddle2onnx/lib/*)
|
||||
install(PROGRAMS ${PADDLETOONNX_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB RKNPU2_LIBS ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/rknpu2_runtime/${RKNN2_TARGET_SOC}/lib/*)
|
||||
install(PROGRAMS ${RKNPU2_LIBS} DESTINATION lib)
|
69
examples/vision/detection/rkyolo/cpp/README.md
Normal file
69
examples/vision/detection/rkyolo/cpp/README.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# RKYOLO C++部署示例
|
||||
|
||||
本目录下提供`infer_xxxxx.cc`快速完成RKYOLO模型在Rockchip板子上通过二代NPU加速部署的示例。
|
||||
|
||||
在部署前,需确认以下两个步骤:
|
||||
|
||||
1. 软硬件环境满足要求
|
||||
2. 根据开发环境,下载预编译部署库或者从头编译FastDeploy仓库
|
||||
|
||||
以上步骤请参考[RK2代NPU部署库编译](../../../../../docs/cn/build_and_install/rknpu2.md)实现
|
||||
|
||||
## 生成基本目录文件
|
||||
|
||||
该例程由以下几个部分组成
|
||||
```text
|
||||
.
|
||||
├── CMakeLists.txt
|
||||
├── build # 编译文件夹
|
||||
├── image # 存放图片的文件夹
|
||||
├── infer_rkyolo.cc
|
||||
├── model # 存放模型文件的文件夹
|
||||
└── thirdpartys # 存放sdk的文件夹
|
||||
```
|
||||
|
||||
首先需要先生成目录结构
|
||||
```bash
|
||||
mkdir build
|
||||
mkdir images
|
||||
mkdir model
|
||||
mkdir thirdpartys
|
||||
```
|
||||
|
||||
## 编译
|
||||
|
||||
### 编译并拷贝SDK到thirdpartys文件夹
|
||||
|
||||
请参考[RK2代NPU部署库编译](../../../../../../docs/cn/build_and_install/rknpu2.md)仓库编译SDK,编译完成后,将在build目录下生成
|
||||
fastdeploy-0.0.3目录,请移动它至thirdpartys目录下.
|
||||
|
||||
### 拷贝模型文件,以及配置文件至model文件夹
|
||||
在Paddle动态图模型 -> Paddle静态图模型 -> ONNX模型的过程中,将生成ONNX文件以及对应的yaml配置文件,请将配置文件存放到model文件夹内。
|
||||
转换为RKNN后的模型文件也需要拷贝至model。
|
||||
|
||||
### 准备测试图片至image文件夹
|
||||
```bash
|
||||
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
|
||||
cp 000000014439.jpg ./images
|
||||
```
|
||||
|
||||
### 编译example
|
||||
|
||||
```bash
|
||||
cd build
|
||||
cmake ..
|
||||
make -j8
|
||||
make install
|
||||
```
|
||||
|
||||
## 运行例程
|
||||
|
||||
```bash
|
||||
cd ./build/install
|
||||
./infer_rkyolo model/ images/000000014439.jpg
|
||||
```
|
||||
|
||||
|
||||
- [模型介绍](../../)
|
||||
- [Python部署](../python)
|
||||
- [视觉模型预测结果](../../../../../../docs/api/vision_results/)
|
53
examples/vision/detection/rkyolo/cpp/infer_rkyolo.cc
Normal file
53
examples/vision/detection/rkyolo/cpp/infer_rkyolo.cc
Normal file
@@ -0,0 +1,53 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "fastdeploy/vision.h"
|
||||
|
||||
void RKNPU2Infer(const std::string& model_file, const std::string& image_file) {
|
||||
struct timeval start_time, stop_time;
|
||||
|
||||
auto option = fastdeploy::RuntimeOption();
|
||||
option.UseRKNPU2();
|
||||
|
||||
auto format = fastdeploy::ModelFormat::RKNN;
|
||||
|
||||
auto model = fastdeploy::vision::detection::RKYOLOV5(
|
||||
model_file, option, format);
|
||||
|
||||
auto im = cv::imread(image_file);
|
||||
|
||||
fastdeploy::vision::DetectionResult res;
|
||||
if (!model.Predict(im, &res)) {
|
||||
std::cerr << "Failed to predict." << std::endl;
|
||||
return;
|
||||
}
|
||||
std::cout << res.Str() << std::endl;
|
||||
auto vis_im = fastdeploy::vision::VisDetection(im, res, 0.5);
|
||||
cv::imwrite("vis_result.jpg", vis_im);
|
||||
std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
if (argc < 3) {
|
||||
std::cout
|
||||
<< "Usage: infer_demo path/to/model_dir path/to/image run_option, "
|
||||
"e.g ./infer_model ./picodet_model_dir ./test.jpeg"
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
RKNPU2Infer(argv[1], argv[2]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
34
examples/vision/detection/rkyolo/python/README.md
Normal file
34
examples/vision/detection/rkyolo/python/README.md
Normal file
@@ -0,0 +1,34 @@
|
||||
# RKYOLO Python部署示例
|
||||
|
||||
在部署前,需确认以下两个步骤
|
||||
|
||||
- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../../docs/cn/build_and_install/rknpu2.md)
|
||||
|
||||
本目录下提供`infer.py`快速完成RKYOLO在RKNPU上部署的示例。执行如下脚本即可完成
|
||||
|
||||
```bash
|
||||
# 下载部署示例代码
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy/examples/vision/detection/rkyolo/python
|
||||
|
||||
# 下载图片
|
||||
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
|
||||
|
||||
# copy model
|
||||
cp -r ./model /path/to/FastDeploy/examples/vision/detection/rkyolo/python
|
||||
|
||||
# 推理
|
||||
python3 infer.py --model_file ./model/ \
|
||||
--image 000000014439.jpg
|
||||
```
|
||||
|
||||
|
||||
## 注意事项
|
||||
RKNPU上对模型的输入要求是使用NHWC格式,且图片归一化操作会在转RKNN模型时内嵌到模型中,因此我们在使用FastDeploy部署时,通常无需再对输入做额外的归一化处理。
|
||||
|
||||
## 其它文档
|
||||
|
||||
- [PaddleDetection 模型介绍](..)
|
||||
- [PaddleDetection C++部署](../cpp)
|
||||
- [模型预测结果说明](../../../../../../docs/api/vision_results/)
|
||||
- [转换PaddleDetection RKNN模型文档](../README.md)
|
53
examples/vision/detection/rkyolo/python/infer.py
Normal file
53
examples/vision/detection/rkyolo/python/infer.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import fastdeploy as fd
|
||||
import cv2
|
||||
import os
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
import argparse
|
||||
import ast
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--model_file", required=True, help="Path of rknn model.")
|
||||
parser.add_argument(
|
||||
"--image", type=str, required=True, help="Path of test image file.")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
|
||||
model_file = args.model_file
|
||||
params_file = ""
|
||||
|
||||
# 配置runtime,加载模型
|
||||
runtime_option = fd.RuntimeOption()
|
||||
runtime_option.use_rknpu2()
|
||||
|
||||
model = fd.vision.detection.RKYOLOV5(
|
||||
model_file,
|
||||
runtime_option=runtime_option,
|
||||
model_format=fd.ModelFormat.RKNN)
|
||||
|
||||
# 预测图片分割结果
|
||||
im = cv2.imread(args.image)
|
||||
result = model.predict(im)
|
||||
print(result)
|
||||
|
||||
# 可视化结果
|
||||
vis_im = fd.vision.vis_detection(im, result, score_threshold=0.5)
|
||||
cv2.imwrite("visualized_result.jpg", vis_im)
|
||||
print("Visualized result save in ./visualized_result.jpg")
|
6
examples/vision/detection/yolov5/quantize/README.md
Normal file → Executable file
6
examples/vision/detection/yolov5/quantize/README.md
Normal file → Executable file
@@ -4,13 +4,13 @@ FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工
|
||||
|
||||
## FastDeploy一键模型自动化压缩工具
|
||||
FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化.
|
||||
详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/)
|
||||
详细教程请见: [一键模型自动化压缩工具](../../../../../tools/common_tools/auto_compression/)
|
||||
|
||||
## 下载量化完成的YOLOv5s模型
|
||||
用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载)
|
||||
|
||||
Benchmark表格说明:
|
||||
- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间.
|
||||
- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间.
|
||||
- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理.
|
||||
- 所测时延均为推理1000次后求得的平均值, 单位是毫秒.
|
||||
- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项
|
||||
@@ -29,7 +29,7 @@ Benchmark表格说明:
|
||||
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference| CPU | 213.73 | 130.19 | None | None | 1.64 |37.6 | 35.2 | 量化蒸馏训练 |
|
||||
|
||||
#### 端到端 Benchmark
|
||||
| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 |
|
||||
| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 |
|
||||
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
|
||||
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | 量化蒸馏训练 |
|
||||
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | 量化蒸馏训练 |
|
||||
|
2
examples/vision/detection/yolov5/quantize/README_EN.md
Normal file → Executable file
2
examples/vision/detection/yolov5/quantize/README_EN.md
Normal file → Executable file
@@ -6,7 +6,7 @@ Users can use the one-click model quantization tool to quantize and deploy the m
|
||||
## FastDeploy One-Click Model Quantization Tool
|
||||
|
||||
FastDeploy provides a one-click quantization tool that allows users to quantize a model simply with a configuration file.
|
||||
For a detailed tutorial, please refer to: [One-Click Model Quantization Tool](../../../../../tools/auto_compression/)
|
||||
For a detailed tutorial, please refer to: [One-Click Model Quantization Tool](../../../../../tools/common_tools/auto_compression/)
|
||||
|
||||
## Download Quantized YOLOv5s Model
|
||||
|
||||
|
2
examples/vision/detection/yolov5/quantize/cpp/README.md
Normal file → Executable file
2
examples/vision/detection/yolov5/quantize/cpp/README.md
Normal file → Executable file
@@ -9,7 +9,7 @@
|
||||
|
||||
### 量化模型准备
|
||||
- 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署.
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/common_tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.
|
||||
|
||||
## 以量化后的YOLOv5s模型为例, 进行部署
|
||||
在本目录执行如下命令即可完成编译,以及量化模型部署.支持此模型需保证FastDeploy版本0.7.0以上(x.x.x>=0.7.0)
|
||||
|
2
examples/vision/detection/yolov5/quantize/python/README.md
Normal file → Executable file
2
examples/vision/detection/yolov5/quantize/python/README.md
Normal file → Executable file
@@ -8,7 +8,7 @@
|
||||
|
||||
### 量化模型准备
|
||||
- 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署.
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.
|
||||
- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/common_tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.
|
||||
|
||||
|
||||
## 以量化后的YOLOv5s模型为例, 进行部署
|
||||
|
11
examples/vision/detection/yolov5/rv1126/README.md
Executable file
11
examples/vision/detection/yolov5/rv1126/README.md
Executable file
@@ -0,0 +1,11 @@
|
||||
# YOLOv5 量化模型在 RV1126 上的部署
|
||||
目前 FastDeploy 已经支持基于 PaddleLite 部署 YOLOv5 量化模型到 RV1126 上。
|
||||
|
||||
模型的量化和量化模型的下载请参考:[模型量化](../quantize/README.md)
|
||||
|
||||
|
||||
## 详细部署文档
|
||||
|
||||
在 RV1126 上只支持 C++ 的部署。
|
||||
|
||||
- [C++部署](cpp)
|
37
examples/vision/detection/yolov5/rv1126/cpp/CMakeLists.txt
Executable file
37
examples/vision/detection/yolov5/rv1126/cpp/CMakeLists.txt
Executable file
@@ -0,0 +1,37 @@
|
||||
PROJECT(infer_demo C CXX)
|
||||
CMAKE_MINIMUM_REQUIRED (VERSION 3.10)
|
||||
|
||||
# 指定下载解压后的fastdeploy库路径
|
||||
option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.")
|
||||
|
||||
include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
|
||||
|
||||
# 添加FastDeploy依赖头文件
|
||||
include_directories(${FASTDEPLOY_INCS})
|
||||
include_directories(${FastDeploy_INCLUDE_DIRS})
|
||||
|
||||
add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc)
|
||||
# 添加FastDeploy库依赖
|
||||
target_link_libraries(infer_demo ${FASTDEPLOY_LIBS})
|
||||
|
||||
set(CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR}/build/install)
|
||||
|
||||
install(TARGETS infer_demo DESTINATION ./)
|
||||
|
||||
install(DIRECTORY models DESTINATION ./)
|
||||
install(DIRECTORY images DESTINATION ./)
|
||||
|
||||
file(GLOB FASTDEPLOY_LIBS ${FASTDEPLOY_INSTALL_DIR}/lib/*)
|
||||
install(PROGRAMS ${FASTDEPLOY_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB OPENCV_LIBS ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/opencv/lib/lib*)
|
||||
install(PROGRAMS ${OPENCV_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB PADDLELITE_LIBS ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/paddlelite/lib/lib*)
|
||||
install(PROGRAMS ${PADDLELITE_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB TIMVX_LIBS ${FASTDEPLOY_INSTALL_DIR}/third_libs/install/paddlelite/lib/verisilicon_timvx/*)
|
||||
install(PROGRAMS ${TIMVX_LIBS} DESTINATION lib)
|
||||
|
||||
file(GLOB ADB_TOOLS run_with_adb.sh)
|
||||
install(PROGRAMS ${ADB_TOOLS} DESTINATION ./)
|
54
examples/vision/detection/yolov5/rv1126/cpp/README.md
Executable file
54
examples/vision/detection/yolov5/rv1126/cpp/README.md
Executable file
@@ -0,0 +1,54 @@
|
||||
# YOLOv5 量化模型 C++ 部署示例
|
||||
|
||||
本目录下提供的 `infer.cc`,可以帮助用户快速完成 YOLOv5 量化模型在 RV1126 上的部署推理加速。
|
||||
|
||||
## 部署准备
|
||||
### FastDeploy 交叉编译环境准备
|
||||
- 1. 软硬件环境满足要求,以及交叉编译环境的准备,请参考:[FastDeploy 交叉编译环境准备](../../../../../../docs/cn/build_and_install/rv1126.md#交叉编译环境搭建)
|
||||
|
||||
### 量化模型准备
|
||||
- 1. 用户可以直接使用由 FastDeploy 提供的量化模型进行部署。
|
||||
- 2. 用户可以使用 FastDeploy 提供的[一键模型自动化压缩工具](../../../../../../tools/common_tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署。
|
||||
- 更多量化相关信息可查阅[模型量化](../../quantize/README.md)
|
||||
|
||||
## 在 RV1126 上部署量化后的 YOLOv5 检测模型
|
||||
请按照以下步骤完成在 RV1126 上部署 YOLOv5 量化模型:
|
||||
1. 交叉编译 FastDeploy 库,具体请参考:[交叉编译 FastDeploy](../../../../../../docs/cn/build_and_install/rv1126.md#基于-paddlelite-的-fastdeploy-交叉编译库编译)
|
||||
|
||||
2. 将编译后的库拷贝到当前目录,可使用如下命令:
|
||||
```bash
|
||||
cp -r FastDeploy/build/fastdeploy-tmivx/ FastDeploy/examples/vision/detection/yolov5/rv1126/cpp
|
||||
```
|
||||
|
||||
3. 在当前路径下载部署所需的模型和示例图片:
|
||||
```bash
|
||||
mkdir models && mkdir images
|
||||
wget https://bj.bcebos.com/fastdeploy/models/yolov5s_ptq_model.tar.gz
|
||||
tar -xvf yolov5s_ptq_model.tar.gz
|
||||
cp -r yolov5s_ptq_model models
|
||||
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
|
||||
cp -r 000000014439.jpg images
|
||||
```
|
||||
|
||||
4. 编译部署示例,可使用如下命令:
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=${PWD}/../fastdeploy-tmivx/timvx.cmake -DFASTDEPLOY_INSTALL_DIR=${PWD}/../fastdeploy-tmivx ..
|
||||
make -j8
|
||||
make install
|
||||
# 成功编译之后,会生成 install 文件夹,里面有一个运行 demo 和部署所需的库
|
||||
```
|
||||
|
||||
5. 基于 adb 工具部署 YOLOv5 检测模型到 Rockchip RV1126,可使用如下命令:
|
||||
```bash
|
||||
# 进入 install 目录
|
||||
cd FastDeploy/examples/vision/detection/yolov5/rv1126/cpp/build/install/
|
||||
# 如下命令表示:bash run_with_adb.sh 需要运行的demo 模型路径 图片路径 设备的DEVICE_ID
|
||||
bash run_with_adb.sh infer_demo yolov5s_ptq_model 000000014439.jpg $DEVICE_ID
|
||||
```
|
||||
|
||||
部署成功后,vis_result.jpg 保存的结果如下:
|
||||
|
||||
<img width="640" src="https://user-images.githubusercontent.com/30516196/203706969-dd58493c-6635-4ee7-9421-41c2e0c9524b.png">
|
||||
|
||||
需要特别注意的是,在 RV1126 上部署的模型需要是量化后的模型,模型的量化请参考:[模型量化](../../../../../../docs/cn/quantize.md)
|
64
examples/vision/detection/yolov5/rv1126/cpp/infer.cc
Executable file
64
examples/vision/detection/yolov5/rv1126/cpp/infer.cc
Executable file
@@ -0,0 +1,64 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy/vision.h"

#ifdef WIN32
const char sep = '\\';
#else
const char sep = '/';
#endif

void InitAndInfer(const std::string& model_dir, const std::string& image_file) {
  auto model_file = model_dir + sep + "model.pdmodel";
  auto params_file = model_dir + sep + "model.pdiparams";
  auto subgraph_file = model_dir + sep + "subgraph.txt";

  // Run on the RV1126 NPU via TIM-VX, using the subgraph partition file
  // shipped alongside the quantized model.
  fastdeploy::RuntimeOption option;
  option.UseTimVX();
  option.SetLiteSubgraphPartitionPath(subgraph_file);

  auto model = fastdeploy::vision::detection::YOLOv5(
      model_file, params_file, option, fastdeploy::ModelFormat::PADDLE);
  assert(model.Initialized());

  auto im = cv::imread(image_file);

  fastdeploy::vision::DetectionResult res;
  if (!model.Predict(im, &res)) {
    std::cerr << "Failed to predict." << std::endl;
    return;
  }

  std::cout << res.Str() << std::endl;

  // Draw the detections on the image and save the visualization.
  auto vis_im = fastdeploy::vision::VisDetection(im, res);
  cv::imwrite("vis_result.jpg", vis_im);
  std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
}

int main(int argc, char* argv[]) {
  if (argc < 3) {
    std::cout << "Usage: infer_demo path/to/quant_model path/to/image, "
                 "e.g ./infer_demo ./yolov5s_quant ./000000014439.jpg"
              << std::endl;
    return -1;
  }

  std::string model_dir = argv[1];
  std::string test_image = argv[2];
  InitAndInfer(model_dir, test_image);
  return 0;
}
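For reference, the model directory passed as the first argument is expected to hold the three files the code above opens; whether the downloaded `yolov5s_ptq_model` archive already ships `subgraph.txt` is an assumption, so adjust if your own model is partitioned differently:

```bash
tree yolov5s_ptq_model
# yolov5s_ptq_model/
# ├── model.pdmodel     <- Paddle inference graph
# ├── model.pdiparams   <- weights
# └── subgraph.txt      <- Paddle Lite subgraph partition list read by SetLiteSubgraphPartitionPath
```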
47
examples/vision/detection/yolov5/rv1126/cpp/run_with_adb.sh
Executable file
47
examples/vision/detection/yolov5/rv1126/cpp/run_with_adb.sh
Executable file
@@ -0,0 +1,47 @@
#!/bin/bash
HOST_SPACE=${PWD}
echo ${HOST_SPACE}
WORK_SPACE=/data/local/tmp/test

# The first parameter represents the demo name
DEMO_NAME=image_classification_demo
if [ -n "$1" ]; then
  DEMO_NAME=$1
fi

# The second parameter represents the model name
MODEL_NAME=mobilenet_v1_fp32_224
if [ -n "$2" ]; then
  MODEL_NAME=$2
fi

# The third parameter indicates the name of the image to be tested
IMAGE_NAME=0001.jpg
if [ -n "$3" ]; then
  IMAGE_NAME=$3
fi

# The fourth parameter represents the ID of the device
ADB_DEVICE_NAME=
if [ -n "$4" ]; then
  ADB_DEVICE_NAME="-s $4"
fi

# Set the environment variables required during the running process
EXPORT_ENVIRONMENT_VARIABLES="export GLOG_v=5; export VIV_VX_ENABLE_GRAPH_TRANSFORM=-pcq:1; export VIV_VX_SET_PER_CHANNEL_ENTROPY=100; export TIMVX_BATCHNORM_FUSION_MAX_ALLOWED_QUANT_SCALE_DEVIATION=300000; export VSI_NN_LOG_LEVEL=5;"

EXPORT_ENVIRONMENT_VARIABLES="${EXPORT_ENVIRONMENT_VARIABLES}export LD_LIBRARY_PATH=${WORK_SPACE}/lib:\$LD_LIBRARY_PATH;"

# Please install adb, and DON'T run this in the docker.
set -e
adb $ADB_DEVICE_NAME shell "rm -rf $WORK_SPACE"
adb $ADB_DEVICE_NAME shell "mkdir -p $WORK_SPACE"

# Upload the demo, libraries, model and test images to the device
adb $ADB_DEVICE_NAME push ${HOST_SPACE}/lib $WORK_SPACE
adb $ADB_DEVICE_NAME push ${HOST_SPACE}/${DEMO_NAME} $WORK_SPACE
adb $ADB_DEVICE_NAME push models $WORK_SPACE
adb $ADB_DEVICE_NAME push images $WORK_SPACE

# Execute the deployment demo
adb $ADB_DEVICE_NAME shell "cd $WORK_SPACE; ${EXPORT_ENVIRONMENT_VARIABLES} chmod +x ./${DEMO_NAME}; ./${DEMO_NAME} ./models/${MODEL_NAME} ./images/$IMAGE_NAME"
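Since the script runs the demo inside $WORK_SPACE on the device, the `vis_result.jpg` it produces stays on the board; it can be pulled back to the host afterwards, for example:

```bash
# add "-s <DEVICE_ID>" if more than one device is attached
adb pull /data/local/tmp/test/vis_result.jpg .
```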
@@ -22,10 +22,10 @@ mv yolov5s.onnx models/runtime/1/model.onnx
# GPU image
docker pull paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10
# CPU image
docker pull paddlepaddle/fastdeploy:z.y.z-cpu-only-21.10
docker pull paddlepaddle/fastdeploy:x.y.z-cpu-only-21.10

# Run the container. The container is named fd_serving, and the current directory is mounted as the container's /yolov5_serving directory
nvidia-docker run -it --net=host --name fd_serving -v `pwd`/:/yolov5_serving paddlepaddle/fastdeploy:0.6.0-gpu-cuda11.4-trt8.4-21.10 bash
nvidia-docker run -it --net=host --name fd_serving -v `pwd`/:/yolov5_serving paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10 bash

# Start the service (if the CUDA_VISIBLE_DEVICES environment variable is not set, the service has scheduling permission over all GPU cards)
CUDA_VISIBLE_DEVICES=0 fastdeployserver --model-repository=/yolov5_serving/models --backend-config=python,shm-default-byte-size=10485760
@@ -49,7 +49,7 @@ I0928 04:51:15.826578 206 http_server.cc:167] Started Metrics Service at 0.0.0.0
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg

# Install client dependencies
python3 -m pip install tritonclient\[all\]
python3 -m pip install tritonclient[all]

# Send the request
python3 yolov5_grpc_client.py
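Before running the client it can help to confirm the server actually came up and finished loading; `fastdeployserver` is built on Triton Inference Server, so with `--net=host` and the default ports the standard readiness and metrics endpoints are reachable from the host (a sketch, assuming no port overrides):

```bash
curl -v localhost:8000/v2/health/ready   # expect HTTP 200 once model loading has finished
curl -s localhost:8002/metrics | head    # Prometheus metrics from the Metrics Service logged above
```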
@@ -9,11 +9,11 @@ wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s.onnx
# Save the model under models/infer/1 and rename it as model.onnx
mv yolov5s.onnx models/infer/1/

# Pull fastdeploy image
docker pull paddlepaddle/fastdeploy:0.6.0-gpu-cuda11.4-trt8.4-21.10
# Pull fastdeploy image, x.y.z is FastDeploy version, example 1.0.0.
docker pull paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10

# Run the docker. The docker name is fd_serving, and the current directory is mounted as the docker's /yolov5_serving directory
nvidia-docker run -it --net=host --name fd_serving -v `pwd`/:/yolov5_serving paddlepaddle/fastdeploy:0.6.0-gpu-cuda11.4-trt8.4-21.10 bash
nvidia-docker run -it --net=host --name fd_serving -v `pwd`/:/yolov5_serving paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10 bash

# Start the service (Without setting the CUDA_VISIBLE_DEVICES environment variable, it will have scheduling privileges for all GPU cards)
CUDA_VISIBLE_DEVICES=0 fastdeployserver --model-repository=models --backend-config=python,shm-default-byte-size=10485760
6
examples/vision/detection/yolov6/quantize/README.md
Normal file → Executable file
6
examples/vision/detection/yolov6/quantize/README.md
Normal file → Executable file
@@ -4,12 +4,12 @@ FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工

## FastDeploy One-Click Model Auto-Compression Tool
FastDeploy provides a one-click model auto-compression tool that can quantize a model simply from a configuration file.
For a detailed tutorial, see: [One-Click Model Auto-Compression Tool](../../../../../tools/auto_compression/)
For a detailed tutorial, see: [One-Click Model Auto-Compression Tool](../../../../../tools/common_tools/auto_compression/)
## Download the Quantized YOLOv6s Model
Users can also directly download the quantized models in the table below for deployment. (Click the model name to download.)

Notes on the benchmark table:
- Rtuntime latency is the model's inference latency on the various runtimes, including CPU->GPU data copy, GPU inference, and GPU->CPU data copy time. It does not include the model's own pre/post-processing time.
- Runtime latency is the model's inference latency on the various runtimes, including CPU->GPU data copy, GPU inference, and GPU->CPU data copy time. It does not include the model's own pre/post-processing time.
- End-to-end latency is the model's latency in an actual inference scenario, including its pre/post-processing.
- All measured latencies are averages over 1000 inference runs, in milliseconds.
- INT8 + FP16 means the FP16 inference option is enabled on the runtime while running the INT8 quantized model.
@@ -28,7 +28,7 @@ Benchmark表格说明:

#### End-to-End Benchmark
| Model | Inference Backend | Deployment Hardware | FP32 Runtime Latency | INT8 Runtime Latency | INT8 + FP16 Runtime Latency | INT8+FP16+PM Runtime Latency | Max Speedup | FP32 mAP | INT8 mAP | Quantization Method |
| Model | Inference Backend | Deployment Hardware | FP32 End2End Latency | INT8 End2End Latency | INT8 + FP16 End2End Latency | INT8+FP16+PM End2End Latency | Max Speedup | FP32 mAP | INT8 mAP | Quantization Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | TensorRT | GPU | 15.66 | 11.30 | 10.25 | 9.59 | 1.63 | 42.5 | 40.7 | Quantized distillation training |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle-TensorRT | GPU | 15.03 | None | 11.36 | 9.32 | 1.61 | 42.5 | 40.7 | Quantized distillation training |
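To try either row of the table locally, the quantized model linked in the first column can be fetched directly; the URL is taken from the table, while the extracted directory name is an assumption:

```bash
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar
tar -xvf yolov6s_ptq_model.tar    # expected to unpack into ./yolov6s_ptq_model
```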
2
examples/vision/detection/yolov6/quantize/README_EN.md
Normal file → Executable file
2
examples/vision/detection/yolov6/quantize/README_EN.md
Normal file → Executable file
@@ -6,7 +6,7 @@ Users can use the one-click model quantization tool to quantize and deploy the m
## FastDeploy One-Click Model Quantization Tool

FastDeploy provides a one-click quantization tool that allows users to quantize a model simply with a configuration file.
For a detailed tutorial, please refer to: [One-Click Model Quantization Tool](../../../../../tools/auto_compression/)
For a detailed tutorial, please refer to: [One-Click Model Quantization Tool](../../../../../tools/common_tools/auto_compression/)

## Download Quantized YOLOv6s Model
2
examples/vision/detection/yolov6/quantize/cpp/README.md
Normal file → Executable file
2
examples/vision/detection/yolov6/quantize/cpp/README.md
Normal file → Executable file
@@ -9,7 +9,7 @@

### Preparing the Quantized Model
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize their own models with FastDeploy's [one-click model auto-compression tool](../../../../../../tools/auto_compression/) and deploy the resulting quantized model.
- 2. Users can quantize their own models with FastDeploy's [one-click model auto-compression tool](../../../../../../tools/common_tools/auto_compression/) and deploy the resulting quantized model.

## Deploying the Quantized YOLOv6s Model as an Example
Run the following commands in this directory to build and deploy the quantized model. FastDeploy 0.7.0 or above is required to support this model (x.x.x>=0.7.0).
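Since the example requires FastDeploy 0.7.0 or newer, it can help to confirm the installed version first. For the Python wheel this is a one-liner, assuming the package exposes `__version__`; C++ SDK users can instead check the version string in the name of the downloaded SDK package:

```bash
python3 -c "import fastdeploy; print(fastdeploy.__version__)"   # assumes the fastdeploy wheel defines __version__
```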
2
examples/vision/detection/yolov6/quantize/python/README.md
Normal file → Executable file
2
examples/vision/detection/yolov6/quantize/python/README.md
Normal file → Executable file
@@ -8,7 +8,7 @@

### Preparing the Quantized Model
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize their own models with FastDeploy's [one-click model auto-compression tool](../../../../../../tools/auto_compression/) and deploy the resulting quantized model.
- 2. Users can quantize their own models with FastDeploy's [one-click model auto-compression tool](../../../../../../tools/common_tools/auto_compression/) and deploy the resulting quantized model.

## Deploying the Quantized YOLOv6s Model as an Example
```bash