diff --git a/CMakeLists.txt b/CMakeLists.txt index 1335c2865..51ba10c83 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/utils.cmake) # Set C++11 as standard for the whole project if(NOT MSVC) set(CMAKE_CXX_STANDARD 11) - set(CMAKE_CXX_FLAGS "-Wno-format") + set(CMAKE_CXX_FLAGS "-Wno-format -g0 -O3") add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) endif(NOT MSVC) @@ -68,10 +68,12 @@ option(ENABLE_TEXT "Whether to enable text models usage." OFF) option(ENABLE_FLYCV "Whether to enable flycv to boost image preprocess." OFF) option(ENABLE_CVCUDA "Whether to enable NVIDIA CV-CUDA to boost image preprocess." OFF) option(ENABLE_ENCRYPTION "Whether to enable ENCRYPTION." OFF) +option(ENABLE_BENCHMARK "Whether to enable Benchmark mode." OFF) option(WITH_ASCEND "Whether to compile for Huawei Ascend deploy." OFF) option(WITH_TIMVX "Whether to compile for TIMVX deploy." OFF) option(WITH_KUNLUNXIN "Whether to compile for KunlunXin XPU deploy." OFF) option(WITH_TESTING "Whether to compile with unittest." OFF) +option(WITH_CAPI "Whether to compile with c api." OFF) ############################# Options for Android cross compiling ######################### if(ANDROID) @@ -153,6 +155,8 @@ get_osx_architecture() ##################################### Building: FastDeploy C++ SDK ####################################### add_definitions(-DFASTDEPLOY_LIB) +# set CMAKE_BUILD_TYPE to Release +add_definitions(-DCMAKE_BUILD_TYPE=Release) # configure files before glob sources. configure_file(${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/core/config.h.in ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/core/config.h) configure_file(${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/pybind/main.cc.in ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/pybind/main.cc) @@ -413,6 +417,14 @@ if(ENABLE_PADDLE2ONNX) list(APPEND DEPEND_LIBS external_paddle2onnx) endif(ENABLE_PADDLE2ONNX) +if(WITH_CAPI) + include(${PROJECT_SOURCE_DIR}/c_api/CMakeLists.txt) + if(MSVC) + add_definitions(-DFD_CAPI) + endif() +endif() + + configure_file(${PROJECT_SOURCE_DIR}/FastDeploy.cmake.in ${PROJECT_SOURCE_DIR}/FastDeploy.cmake @ONLY) configure_file(${PROJECT_SOURCE_DIR}/python/fastdeploy/c_lib_wrap.py.in ${PROJECT_SOURCE_DIR}/python/fastdeploy/c_lib_wrap.py) configure_file(${PROJECT_SOURCE_DIR}/python/scripts/process_libraries.py.in ${PROJECT_SOURCE_DIR}/python/scripts/process_libraries.py) @@ -466,7 +478,7 @@ if(ANDROID) list(APPEND DEPEND_LIBS ${log-lib}) if(WITH_LITE_STATIC) # need omp for static Paddle Lite lib - set(WITH_OPENMP ON CACHE BOOL "Force WITH_OPENMP=ON while WITH_LITE_STATIC=ON" FORCE) + set(WITH_OPENMP ON CACHE BOOL "Force WITH_OPENMP=ON while WITH_LITE_STATIC=ON" FORCE) message(STATUS "Force WITH_OPENMP=${WITH_OPENMP} while WITH_LITE_STATIC=ON") endif() if(WITH_OPENMP) @@ -482,13 +494,13 @@ if(ANDROID AND WITH_JAVA) endif() if(ANDROID AND WITH_STATIC_LIB) - # Here, we use a dummy target (fastdelpoy_dummy) + # Here, we use a dummy target (fastdelpoy_dummy) # to form a build dependency tree for fastdeploy_static lib. add_library(fastdelpoy_dummy STATIC ${ALL_DEPLOY_SRCS}) - # Still add ${DEPEND_LIBS} for cmake to form link_libraries - # property tree for a static library. + # Still add ${DEPEND_LIBS} for cmake to form link_libraries + # property tree for a static library. target_link_libraries(fastdelpoy_dummy ${DEPEND_LIBS}) - # Build fastdelpoy_dummy when the third-party + # Build fastdelpoy_dummy when the third-party # libraries (opencv, paddle lite, flycv) are ready. 
add_dependencies(fastdelpoy_dummy ${LIBRARY_NAME}) # Add WITH_STATIC_LIB compile definitions, see lite_backend.cc. @@ -541,9 +553,9 @@ if(WIN32) RUNTIME DESTINATION lib ) elseif(ANDROID) - if(WITH_STATIC_LIB) + if(WITH_STATIC_LIB) install( - FILES + FILES ${CMAKE_CURRENT_BINARY_DIR}/libfastdeploy_static.a DESTINATION lib/${ANDROID_ABI} ) @@ -553,11 +565,11 @@ elseif(ANDROID) LIBRARY DESTINATION lib/${ANDROID_ABI} ) endif() - # Install omp into fastdeploy lib dir if WITH_OPENMP=ON + # Install omp into fastdeploy lib dir if WITH_OPENMP=ON # and WITH_LITE_STATIC=OFF. if(WITH_OPENMP AND (NOT WITH_LITE_STATIC) AND OpenMP_CXX_FOUND AND ENABLE_OPENMP_SHARED) install( - FILES + FILES ${OpenMP_CXX_LIBRARIES} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI} ) @@ -594,7 +606,7 @@ else() # The headers and libs of opencv must be install. if(ENABLE_VISION) if(WITH_OPENCV_STATIC AND WITH_STATIC_LIB) - # Only need to install headers while building + # Only need to install headers while building # FastDeploy static lib. (TODO:qiuyanjun) install( DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/opencv/sdk/native/jni/include @@ -606,29 +618,29 @@ else() DESTINATION ${CMAKE_INSTALL_PREFIX}/third_libs/install ) endif() - endif() - # only need flycv's headers (may also install libs? TODO:qiuyanjun) - if(ENABLE_FLYCV) - if(WITH_FLYCV_STATIC) - install( - DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/flycv/include - DESTINATION ${CMAKE_INSTALL_PREFIX}/third_libs/install/flycv - ) - else() - install( - DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/flycv - DESTINATION ${CMAKE_INSTALL_PREFIX}/third_libs/install - ) + # only need flycv's headers (may also install libs? TODO:qiuyanjun) + if(ENABLE_FLYCV) + if(WITH_FLYCV_STATIC) + install( + DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/flycv/include + DESTINATION ${CMAKE_INSTALL_PREFIX}/third_libs/install/flycv + ) + else() + install( + DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/flycv + DESTINATION ${CMAKE_INSTALL_PREFIX}/third_libs/install + ) + endif() endif() - endif() - # fast_tokenizer's static lib is not avaliable now! + endif(ENABLE_VISION) + # fast_tokenizer's static lib is not avaliable now! # may support some days later(TODO:qiuyanjun) if(ENABLE_TEXT) install( DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/fast_tokenizer DESTINATION ${CMAKE_INSTALL_PREFIX}/third_libs/install ) - endif() + endif() # some libs may not to install while in static mode if(ENABLE_LITE_BACKEND) if(WITH_LITE_STATIC) diff --git a/FastDeploy.cmake.in b/FastDeploy.cmake.in index f8285e36a..d622660f4 100644 --- a/FastDeploy.cmake.in +++ b/FastDeploy.cmake.in @@ -33,8 +33,8 @@ set(ORT_DIRECTORY "@ORT_DIRECTORY@") set(OPENVINO_DIRECTORY "@OPENVINO_DIRECTORY@") set(RKNN2_TARGET_SOC "@RKNN2_TARGET_SOC@") set(WITH_KUNLUNXIN @WITH_KUNLUNXIN@) -# Whether to use FastDeploy static lib. The default -# value for this option is determined by the SDK +# Whether to use FastDeploy static lib. The default +# value for this option is determined by the SDK # build-time options. set(WITH_STATIC_LIB @WITH_STATIC_LIB@) @@ -62,8 +62,8 @@ if(WITH_STATIC_LIB) # add_definitions(-DWITH_STATIC_WARNING) endif() -# Still need omp while using FastDeploy static lib. -# This is due to the use of openmp for Paddle Lite's +# Still need omp while using FastDeploy static lib. +# This is due to the use of openmp for Paddle Lite's # static library. 
if(ANDROID AND WITH_STATIC_LIB AND WITH_LITE_STATIC) include(${CMAKE_CURRENT_LIST_DIR}/openmp.cmake) @@ -72,10 +72,10 @@ endif() if(ANDROID) add_library(fastdeploy STATIC IMPORTED GLOBAL) if(WITH_STATIC_LIB) - set_property(TARGET fastdeploy PROPERTY IMPORTED_LOCATION + set_property(TARGET fastdeploy PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_LIST_DIR}/lib/${ANDROID_ABI}/lib${LIBRARY_NAME}_static.a) else() - set_property(TARGET fastdeploy PROPERTY IMPORTED_LOCATION + set_property(TARGET fastdeploy PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_LIST_DIR}/lib/${ANDROID_ABI}/lib${LIBRARY_NAME}.so) endif() list(APPEND FASTDEPLOY_LIBS fastdeploy) @@ -226,7 +226,7 @@ if(ENABLE_VISION) find_package(OpenCV REQUIRED PATHS ${OpenCV_DIR}) list(APPEND FASTDEPLOY_INCS ${OpenCV_INCLUDE_DIRS}) # For now, we still need to link OpenCV static libs. - # Users may use some of opencv's apis, but they may + # Users may use some of opencv's apis, but they may # not have been compiled into fastdeploy. # list(APPEND FASTDEPLOY_LIBS ${OpenCV_LIBS}) list(APPEND FASTDEPLOY_LIBS opencv_core opencv_video opencv_highgui opencv_imgproc opencv_imgcodecs) @@ -264,8 +264,8 @@ if(ENABLE_VISION) add_library(flycv_shared STATIC IMPORTED GLOBAL) set_property(TARGET flycv_shared PROPERTY IMPORTED_LOCATION ${FLYCV_LIB_DIR}/${ANDROID_ABI}/libflycv_shared.so) list(APPEND FASTDEPLOY_LIBS flycv_shared) - else() - # This code may be needed later. Therefore, I choose to + else() + # This code may be needed later. Therefore, I choose to # comment it rather than delete it. (TODO:qiuyanjun) # add_library(flycv_static STATIC IMPORTED GLOBAL) # add_library(flycv_png16 STATIC IMPORTED GLOBAL) @@ -273,25 +273,25 @@ if(ENABLE_VISION) # add_library(flycv_z STATIC IMPORTED GLOBAL) # set_property(TARGET flycv_static PROPERTY IMPORTED_LOCATION ${FLYCV_LIB_DIR}/${ANDROID_ABI}/libflycv_static.a) # set_property(TARGET flycv_png16 PROPERTY IMPORTED_LOCATION ${FLYCV_LIB_DIR}/${ANDROID_ABI}/libpng16.a) - # set_property(TARGET flycv_turbojpeg PROPERTY IMPORTED_LOCATION ${FLYCV_LIB_DIR}/${ANDROID_ABI}/libturbojpeg.a) - # set_property(TARGET flycv_z PROPERTY IMPORTED_LOCATION ${FLYCV_LIB_DIR}/${ANDROID_ABI}/libz.a) + # set_property(TARGET flycv_turbojpeg PROPERTY IMPORTED_LOCATION ${FLYCV_LIB_DIR}/${ANDROID_ABI}/libturbojpeg.a) + # set_property(TARGET flycv_z PROPERTY IMPORTED_LOCATION ${FLYCV_LIB_DIR}/${ANDROID_ABI}/libz.a) # list(APPEND FASTDEPLOY_LIBS flycv_static) - # list(APPEND FASTDEPLOY_LIBS flycv_png16) - # list(APPEND FASTDEPLOY_LIBS flycv_turbojpeg) - # list(APPEND FASTDEPLOY_LIBS flycv_z) + # list(APPEND FASTDEPLOY_LIBS flycv_png16) + # list(APPEND FASTDEPLOY_LIBS flycv_turbojpeg) + # list(APPEND FASTDEPLOY_LIBS flycv_z) endif() else() find_library(FLYCV_LIB flycv_shared ${FLYCV_LIB_DIR} NO_DEFAULT_PATH) list(APPEND FASTDEPLOY_LIBS ${FLYCV_LIB}) endif() endif() - + if(ENABLE_CVCUDA) find_library(CVCUDA_LIB cvcuda ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/cvcuda/lib NO_DEFAULT_PATH) find_library(NVCV_TYPES_LIB nvcv_types ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/cvcuda/lib NO_DEFAULT_PATH) list(APPEND FASTDEPLOY_LIBS ${CVCUDA_LIB} ${NVCV_TYPES_LIB}) endif() - + endif() if (ENABLE_TEXT) @@ -404,7 +404,7 @@ if(ANDROID) endif() message(STATUS " WITH_OPENMP: : ${WITH_OPENMP}") message(STATUS " WITH_JAVA: : ${WITH_JAVA}") -endif() +endif() message(STATUS " DEPENDENCY_LIBS : ${FASTDEPLOY_LIBS}") if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -415,7 +415,7 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") endif() function(install_fastdeploy_libraries 
DESTINATION_DIR) - # No dynamic libs need to install while using + # No dynamic libs need to install while using # FastDeploy static lib. if(WITH_STATIC_LIB) return() @@ -442,9 +442,9 @@ function(install_fastdeploy_libraries DESTINATION_DIR) file(GLOB_RECURSE ALL_OPENCV_DYN_LIBS ${OpenCV_NATIVE_DIR}/libs/${DYN_LIB_SUFFIX}) else() file(GLOB_RECURSE ALL_OPENCV_DYN_LIBS ${OpenCV_DIR}/${DYN_LIB_SUFFIX}) - endif() + endif() list(REMOVE_ITEM ALL_DEPS_DYN_LIBS ${ALL_OPENCV_DYN_LIBS}) - + if(NOT WITH_OPENCV_STATIC) if(WIN32) file(GLOB OPENCV_DYN_LIBS ${OpenCV_DIR}/x64/vc15/bin/${DYN_LIB_SUFFIX}) diff --git a/README_CN.md b/README_CN.md index 88d0fea59..c03ee7b54 100755 --- a/README_CN.md +++ b/README_CN.md @@ -70,7 +70,7 @@ - **微信**:扫描二维码,填写问卷加入技术社区,与社区开发者交流部署产业落地痛点问题
- +
diff --git a/README_EN.md b/README_EN.md index 8390d948e..48e66e506 100644 --- a/README_EN.md +++ b/README_EN.md @@ -67,7 +67,7 @@ Including [image classification](examples/vision/classification), [object detect - **Wechat**:Scan the QR code below using WeChat, follow the PaddlePaddle official account and fill out the questionnaire to join the WeChat group, and share the deployment industry implementation pain points with the community developers
- +
## 🌌 Inference Backend and Abilities diff --git a/benchmark/cpp/CMakeLists.txt b/benchmark/cpp/CMakeLists.txt new file mode 100755 index 000000000..c79e679c3 --- /dev/null +++ b/benchmark/cpp/CMakeLists.txt @@ -0,0 +1,20 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +# specify the decompress directory of FastDeploy SDK +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") +include(${FASTDEPLOY_INSTALL_DIR}/utils/gflags.cmake) +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +include_directories(${FASTDEPLOY_INCS}) + +add_executable(benchmark_yolov5 ${PROJECT_SOURCE_DIR}/benchmark_yolov5.cc) +add_executable(benchmark_ppyolov8 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov8.cc) + +if(UNIX AND (NOT APPLE) AND (NOT ANDROID)) + target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags pthread) + target_link_libraries(benchmark_ppyolov8 ${FASTDEPLOY_LIBS} gflags pthread) +else() + target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags) + target_link_libraries(benchmark_ppyolov8 ${FASTDEPLOY_LIBS} gflags) +endif() diff --git a/benchmark/cpp/benchmark_ppyolov8.cc b/benchmark/cpp/benchmark_ppyolov8.cc new file mode 100644 index 000000000..4bd6e0df4 --- /dev/null +++ b/benchmark/cpp/benchmark_ppyolov8.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/benchmark/utils.h" +#include "fastdeploy/vision.h" +#include "flags.h" + +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +bool RunModel(std::string model_dir, std::string image_file, size_t warmup, + size_t repeats, size_t dump_period, std::string cpu_mem_file_name, + std::string gpu_mem_file_name) { + // Initialization + auto option = fastdeploy::RuntimeOption(); + if (!CreateRuntimeOption(&option)) { + PrintUsage(); + return false; + } + auto model_file = model_dir + sep + "model.pdmodel"; + auto params_file = model_dir + sep + "model.pdiparams"; + auto config_file = model_dir + sep + "infer_cfg.yml"; + + if (FLAGS_profile_mode == "runtime") { + option.EnableProfiling(FLAGS_include_h2d_d2h, repeats, warmup); + } + auto model = fastdeploy::vision::detection::PaddleYOLOv8( + model_file, params_file, config_file, option); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return false; + } + auto im = cv::imread(image_file); + // For Runtime + if (FLAGS_profile_mode == "runtime") { + fastdeploy::vision::DetectionResult res; + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return false; + } + double profile_time = model.GetProfileTime() * 1000; + std::cout << "Runtime(ms): " << profile_time << "ms." 
<< std::endl; + auto vis_im = fastdeploy::vision::VisDetection(im, res); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; + } else { + // For End2End + // Step1: warm up for warmup times + std::cout << "Warmup " << warmup << " times..." << std::endl; + for (int i = 0; i < warmup; i++) { + fastdeploy::vision::DetectionResult res; + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return false; + } + } + std::vector<float> end2end_statis; + // Step2: repeat for repeats times + std::cout << "Counting time..." << std::endl; + fastdeploy::TimeCounter tc; + fastdeploy::vision::DetectionResult res; + for (int i = 0; i < repeats; i++) { + if (FLAGS_collect_memory_info && i % dump_period == 0) { + fastdeploy::benchmark::DumpCurrentCpuMemoryUsage(cpu_mem_file_name); +#if defined(WITH_GPU) + fastdeploy::benchmark::DumpCurrentGpuMemoryUsage(gpu_mem_file_name, + FLAGS_device_id); +#endif + } + tc.Start(); + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return false; + } + tc.End(); + end2end_statis.push_back(tc.Duration() * 1000); + } + float end2end = std::accumulate(end2end_statis.end() - repeats, + end2end_statis.end(), 0.f) / + repeats; + std::cout << "End2End(ms): " << end2end << "ms." << std::endl; + auto vis_im = fastdeploy::vision::VisDetection(im, res); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; + } + + return true; +} + +int main(int argc, char* argv[]) { + google::ParseCommandLineFlags(&argc, &argv, true); + int repeats = FLAGS_repeat; + int warmup = FLAGS_warmup; + int dump_period = FLAGS_dump_period; + std::string cpu_mem_file_name = "result_cpu.txt"; + std::string gpu_mem_file_name = "result_gpu.txt"; + // Run model + if (RunModel(FLAGS_model, FLAGS_image, warmup, repeats, dump_period, + cpu_mem_file_name, gpu_mem_file_name) != true) { + exit(1); + } + if (FLAGS_collect_memory_info) { + float cpu_mem = fastdeploy::benchmark::GetCpuMemoryUsage(cpu_mem_file_name); + std::cout << "cpu_pss_mb: " << cpu_mem << "MB." << std::endl; +#if defined(WITH_GPU) + float gpu_mem = fastdeploy::benchmark::GetGpuMemoryUsage(gpu_mem_file_name); + std::cout << "gpu_pss_mb: " << gpu_mem << "MB." << std::endl; +#endif + } + return 0; +} \ No newline at end of file diff --git a/benchmark/cpp/benchmark_yolov5.cc b/benchmark/cpp/benchmark_yolov5.cc new file mode 100644 index 000000000..ae16dd8d8 --- /dev/null +++ b/benchmark/cpp/benchmark_yolov5.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "fastdeploy/benchmark/utils.h" +#include "fastdeploy/vision.h" +#include "flags.h" + +bool RunModel(std::string model_file, std::string image_file, size_t warmup, + size_t repeats, size_t dump_period, std::string cpu_mem_file_name, + std::string gpu_mem_file_name) { + // Initialization + auto option = fastdeploy::RuntimeOption(); + if (!CreateRuntimeOption(&option)) { + PrintUsage(); + return false; + } + if (FLAGS_profile_mode == "runtime") { + option.EnableProfiling(FLAGS_include_h2d_d2h, repeats, warmup); + } + auto model = fastdeploy::vision::detection::YOLOv5(model_file, "", option); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return false; + } + auto im = cv::imread(image_file); + // For Runtime + if (FLAGS_profile_mode == "runtime") { + fastdeploy::vision::DetectionResult res; + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return false; + } + double profile_time = model.GetProfileTime() * 1000; + std::cout << "Runtime(ms): " << profile_time << "ms." << std::endl; + auto vis_im = fastdeploy::vision::VisDetection(im, res); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; + } else { + // For End2End + // Step1: warm up for warmup times + std::cout << "Warmup " << warmup << " times..." << std::endl; + for (int i = 0; i < warmup; i++) { + fastdeploy::vision::DetectionResult res; + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return false; + } + } + std::vector<float> end2end_statis; + // Step2: repeat for repeats times + std::cout << "Counting time..." << std::endl; + fastdeploy::TimeCounter tc; + fastdeploy::vision::DetectionResult res; + for (int i = 0; i < repeats; i++) { + if (FLAGS_collect_memory_info && i % dump_period == 0) { + fastdeploy::benchmark::DumpCurrentCpuMemoryUsage(cpu_mem_file_name); +#if defined(WITH_GPU) + fastdeploy::benchmark::DumpCurrentGpuMemoryUsage(gpu_mem_file_name, + FLAGS_device_id); +#endif + } + tc.Start(); + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return false; + } + tc.End(); + end2end_statis.push_back(tc.Duration() * 1000); + } + float end2end = std::accumulate(end2end_statis.end() - repeats, + end2end_statis.end(), 0.f) / + repeats; + std::cout << "End2End(ms): " << end2end << "ms." << std::endl; + auto vis_im = fastdeploy::vision::VisDetection(im, res); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; + } + + return true; +} + +int main(int argc, char* argv[]) { + google::ParseCommandLineFlags(&argc, &argv, true); + int repeats = FLAGS_repeat; + int warmup = FLAGS_warmup; + int dump_period = FLAGS_dump_period; + std::string cpu_mem_file_name = "result_cpu.txt"; + std::string gpu_mem_file_name = "result_gpu.txt"; + // Run model + if (RunModel(FLAGS_model, FLAGS_image, warmup, repeats, dump_period, + cpu_mem_file_name, gpu_mem_file_name) != true) { + exit(1); + } + if (FLAGS_collect_memory_info) { + float cpu_mem = fastdeploy::benchmark::GetCpuMemoryUsage(cpu_mem_file_name); + std::cout << "cpu_pss_mb: " << cpu_mem << "MB." << std::endl; +#if defined(WITH_GPU) + float gpu_mem = fastdeploy::benchmark::GetGpuMemoryUsage(gpu_mem_file_name); + std::cout << "gpu_pss_mb: " << gpu_mem << "MB." 
<< std::endl; +#endif + } + return 0; +} \ No newline at end of file diff --git a/benchmark/cpp/flags.h b/benchmark/cpp/flags.h new file mode 100755 index 000000000..c9a8e8d91 --- /dev/null +++ b/benchmark/cpp/flags.h @@ -0,0 +1,104 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "gflags/gflags.h" +#include "fastdeploy/utils/perf.h" + +DEFINE_string(model, "", "Directory of the inference model."); +DEFINE_string(image, "", "Path of the image file."); +DEFINE_string(device, "cpu", + "Type of inference device, support 'cpu' or 'gpu'."); +DEFINE_int32(device_id, 0, "device(gpu) id."); +DEFINE_int32(warmup, 200, "Number of warmup for profiling."); +DEFINE_int32(repeat, 1000, "Number of repeats for profiling."); +DEFINE_string(profile_mode, "runtime", "runtime or end2end."); +DEFINE_string(backend, "default", + "The inference runtime backend, support: ['default', 'ort', " + "'paddle', 'ov', 'trt', 'paddle_trt', 'lite']"); +DEFINE_int32(cpu_thread_nums, 8, "Set numbers of cpu thread."); +DEFINE_bool( + include_h2d_d2h, false, "Whether run profiling with h2d and d2h."); +DEFINE_bool( + use_fp16, false, + "Whether to use FP16 mode, only support 'trt', 'paddle_trt' " + "and 'lite' backend"); +DEFINE_bool( + collect_memory_info, false, "Whether to collect memory info"); +DEFINE_int32(dump_period, 100, "How often to collect memory info."); + +void PrintUsage() { + std::cout << "Usage: infer_demo --model model_path --image img_path --device " + "[cpu|gpu] --backend " + "[default|ort|paddle|ov|trt|paddle_trt] " + "--use_fp16 false" + << std::endl; + std::cout << "Default value of device: cpu" << std::endl; + std::cout << "Default value of backend: default" << std::endl; + std::cout << "Default value of use_fp16: false" << std::endl; +} + +bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) { + if (FLAGS_device == "gpu") { + option->UseGpu(); + if (FLAGS_backend == "ort") { + option->UseOrtBackend(); + } else if (FLAGS_backend == "paddle") { + option->UsePaddleInferBackend(); + } else if (FLAGS_backend == "trt" || FLAGS_backend == "paddle_trt") { + option->UseTrtBackend(); + if (FLAGS_backend == "paddle_trt") { + option->EnablePaddleToTrt(); + } + if (FLAGS_use_fp16) { + option->EnableTrtFP16(); + } + } else if (FLAGS_backend == "default") { + return true; + } else { + std::cout << "While inference with GPU, only support " + "default/ort/paddle/trt/paddle_trt now, " + << FLAGS_backend << " is not supported." 
<< std::endl; + return false; + } + } else if (FLAGS_device == "cpu") { + option->SetCpuThreadNum(FLAGS_cpu_thread_nums); + if (FLAGS_backend == "ort") { + option->UseOrtBackend(); + } else if (FLAGS_backend == "ov") { + option->UseOpenVINOBackend(); + } else if (FLAGS_backend == "paddle") { + option->UsePaddleInferBackend(); + } else if (FLAGS_backend == "lite") { + option->UsePaddleLiteBackend(); + if (FLAGS_use_fp16) { + option->EnableLiteFP16(); + } + } else if (FLAGS_backend == "default") { + return true; + } else { + std::cout << "While inference with CPU, only support " + "default/ort/ov/paddle/lite now, " + << FLAGS_backend << " is not supported." << std::endl; + return false; + } + } else { + std::cerr << "Only support device CPU/GPU now, " << FLAGS_device + << " is not supported." << std::endl; + return false; + } + + return true; +} diff --git a/benchmark/README.md b/benchmark/python/README.md similarity index 100% rename from benchmark/README.md rename to benchmark/python/README.md diff --git a/benchmark/benchmark_ernie_seq_cls.py b/benchmark/python/benchmark_ernie_seq_cls.py similarity index 100% rename from benchmark/benchmark_ernie_seq_cls.py rename to benchmark/python/benchmark_ernie_seq_cls.py diff --git a/benchmark/benchmark_ppcls.py b/benchmark/python/benchmark_ppcls.py similarity index 86% rename from benchmark/benchmark_ppcls.py rename to benchmark/python/benchmark_ppcls.py index 6b88658ee..20a62c9fc 100755 --- a/benchmark/benchmark_ppcls.py +++ b/benchmark/python/benchmark_ppcls.py @@ -17,6 +17,7 @@ import cv2 import os import numpy as np import time +from tqdm import tqdm def parse_arguments(): @@ -35,11 +36,22 @@ def parse_arguments(): parser.add_argument( "--device_id", type=int, default=0, help="device(gpu) id") parser.add_argument( - "--iter_num", + "--profile_mode", + type=str, + default="runtime", + help="runtime or end2end.") + parser.add_argument( + "--repeat", required=True, type=int, - default=300, - help="number of iterations for computing performace.") + default=1000, + help="number of repeats for profiling.") + parser.add_argument( + "--warmup", + required=True, + type=int, + default=50, + help="number of warmup for profiling.") parser.add_argument( "--device", default="cpu", @@ -59,6 +71,11 @@ def parse_arguments(): type=ast.literal_eval, default=False, help="whether enable collect memory info") + parser.add_argument( + "--include_h2d_d2h", + type=ast.literal_eval, + default=False, + help="whether run profiling with h2d and d2h") args = parser.parse_args() return args @@ -68,6 +85,8 @@ def build_option(args): device = args.device backend = args.backend enable_trt_fp16 = args.enable_trt_fp16 + if args.profile_mode == "runtime": + option.enable_profiling(args.include_h2d_d2h, args.repeat, args.warmup) option.set_cpu_thread_num(args.cpu_num_thread) if device == "gpu": option.use_gpu() @@ -229,7 +248,6 @@ if __name__ == '__main__': gpu_id = args.device_id enable_collect_memory_info = args.enable_collect_memory_info dump_result = dict() - end2end_statis = list() cpu_mem = list() gpu_mem = list() gpu_util = list() @@ -258,18 +276,28 @@ if __name__ == '__main__': monitor = Monitor(enable_gpu, gpu_id) monitor.start() - model.enable_record_time_of_runtime() im_ori = cv2.imread(args.image) - for i in range(args.iter_num): - im = im_ori + if args.profile_mode == "runtime": + result = model.predict(im_ori) + profile_time = model.get_profile_time() + dump_result["runtime"] = profile_time * 1000 + f.writelines("Runtime(ms): {} \n".format( + 
str(dump_result["runtime"]))) + print("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) + else: + # end2end + for i in range(args.warmup): + result = model.predict(im_ori) + start = time.time() - result = model.predict(im) - end2end_statis.append(time.time() - start) + for i in tqdm(range(args.repeat)): + result = model.predict(im_ori) + end = time.time() + dump_result["end2end"] = ((end - start) / args.repeat) * 1000.0 + f.writelines("End2End(ms): {} \n".format( + str(dump_result["end2end"]))) + print("End2End(ms): {} \n".format(str(dump_result["end2end"]))) - runtime_statis = model.print_statis_info_of_runtime() - - warmup_iter = args.iter_num // 5 - end2end_statis_repeat = end2end_statis[warmup_iter:] if enable_collect_memory_info: monitor.stop() mem_info = monitor.output() @@ -280,13 +308,6 @@ if __name__ == '__main__': dump_result["gpu_util"] = mem_info['gpu'][ 'utilization.gpu'] if 'gpu' in mem_info else 0 - dump_result["runtime"] = runtime_statis["avg_time"] * 1000 - dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000 - - f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) - f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"]))) - print("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) - print("End2End(ms): {} \n".format(str(dump_result["end2end"]))) if enable_collect_memory_info: f.writelines("cpu_rss_mb: {} \n".format( str(dump_result["cpu_rss_mb"]))) @@ -297,7 +318,8 @@ if __name__ == '__main__': print("cpu_rss_mb: {} \n".format(str(dump_result["cpu_rss_mb"]))) print("gpu_rss_mb: {} \n".format(str(dump_result["gpu_rss_mb"]))) print("gpu_util: {} \n".format(str(dump_result["gpu_util"]))) - except: + except Exception as e: f.writelines("!!!!!Infer Failed\n") + raise e f.close() diff --git a/benchmark/benchmark_ppdet.py b/benchmark/python/benchmark_ppdet.py similarity index 86% rename from benchmark/benchmark_ppdet.py rename to benchmark/python/benchmark_ppdet.py index 9133122b1..c2b1da6b1 100755 --- a/benchmark/benchmark_ppdet.py +++ b/benchmark/python/benchmark_ppdet.py @@ -17,14 +17,16 @@ import cv2 import os import numpy as np import time +from sympy import EX from tqdm import tqdm + def parse_arguments(): import argparse import ast parser = argparse.ArgumentParser() parser.add_argument( - "--model", required=True, help="Path of PaddleDetection model.") + "--model", required=True, help="Path of PaddleClas model.") parser.add_argument( "--image", type=str, required=False, help="Path of test image file.") parser.add_argument( @@ -35,20 +37,31 @@ def parse_arguments(): parser.add_argument( "--device_id", type=int, default=0, help="device(gpu) id") parser.add_argument( - "--iter_num", + "--profile_mode", + type=str, + default="runtime", + help="runtime or end2end.") + parser.add_argument( + "--repeat", required=True, type=int, - default=300, - help="number of iterations for computing performace.") + default=1000, + help="number of repeats for profiling.") + parser.add_argument( + "--warmup", + required=True, + type=int, + default=50, + help="number of warmup for profiling.") parser.add_argument( "--device", default="cpu", - help="Type of inference device, support 'cpu', 'gpu', 'kunlunxin', 'ascend' etc.") + help="Type of inference device, support 'cpu' or 'gpu'.") parser.add_argument( "--backend", type=str, default="default", - help="inference backend, default, ort, ov, trt, paddle, paddle_trt, lite.") + help="inference backend, default, ort, ov, trt, paddle, paddle_trt.") parser.add_argument( "--enable_trt_fp16", 
type=ast.literal_eval, @@ -58,12 +71,17 @@ def parse_arguments(): "--enable_lite_fp16", type=ast.literal_eval, default=False, - help="whether enable fp16 in lite backend") + help="whether enable fp16 in Paddle Lite backend") parser.add_argument( "--enable_collect_memory_info", type=ast.literal_eval, default=False, help="whether enable collect memory info") + parser.add_argument( + "--include_h2d_d2h", + type=ast.literal_eval, + default=False, + help="whether run profiling with h2d and d2h") args = parser.parse_args() return args @@ -74,6 +92,8 @@ def build_option(args): backend = args.backend enable_trt_fp16 = args.enable_trt_fp16 enable_lite_fp16 = args.enable_lite_fp16 + if args.profile_mode == "runtime": + option.enable_profiling(args.include_h2d_d2h, args.repeat, args.warmup) option.set_cpu_thread_num(args.cpu_num_thread) if device == "gpu": option.use_gpu() @@ -130,7 +150,7 @@ def build_option(args): else: raise Exception( "While inference with CPU, only support default/ort/lite/paddle now, {} is not supported.". - format(backend)) + format(backend)) elif device == "ascend": option.use_ascend() if backend == "lite": @@ -142,11 +162,11 @@ def build_option(args): else: raise Exception( "While inference with CPU, only support default/lite now, {} is not supported.". - format(backend)) + format(backend)) else: raise Exception( - "Only support device CPU/GPU/Kunlunxin/Ascend now, {} is not supported.".format( - device)) + "Only support device CPU/GPU/Kunlunxin/Ascend now, {} is not supported.". + format(device)) return option @@ -267,7 +287,6 @@ if __name__ == '__main__': gpu_id = args.device_id enable_collect_memory_info = args.enable_collect_memory_info dump_result = dict() - end2end_statis = list() cpu_mem = list() gpu_mem = list() gpu_util = list() @@ -317,18 +336,28 @@ if __name__ == '__main__': monitor = Monitor(enable_gpu, gpu_id) monitor.start() - model.enable_record_time_of_runtime() im_ori = cv2.imread(args.image) - for i in tqdm(range(args.iter_num)): - im = im_ori + if args.profile_mode == "runtime": + result = model.predict(im_ori) + profile_time = model.get_profile_time() + dump_result["runtime"] = profile_time * 1000 + f.writelines("Runtime(ms): {} \n".format( + str(dump_result["runtime"]))) + print("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) + else: + # end2end + for i in range(args.warmup): + result = model.predict(im_ori) + start = time.time() - result = model.predict(im) - end2end_statis.append(time.time() - start) + for i in tqdm(range(args.repeat)): + result = model.predict(im_ori) + end = time.time() + dump_result["end2end"] = ((end - start) / args.repeat) * 1000.0 + f.writelines("End2End(ms): {} \n".format( + str(dump_result["end2end"]))) + print("End2End(ms): {} \n".format(str(dump_result["end2end"]))) - runtime_statis = model.print_statis_info_of_runtime() - - warmup_iter = args.iter_num // 5 - end2end_statis_repeat = end2end_statis[warmup_iter:] if enable_collect_memory_info: monitor.stop() mem_info = monitor.output() @@ -339,13 +368,6 @@ if __name__ == '__main__': dump_result["gpu_util"] = mem_info['gpu'][ 'utilization.gpu'] if 'gpu' in mem_info else 0 - dump_result["runtime"] = runtime_statis["avg_time"] * 1000 - dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000 - - f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) - f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"]))) - print("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) - print("End2End(ms): {} 
\n".format(str(dump_result["end2end"]))) if enable_collect_memory_info: f.writelines("cpu_rss_mb: {} \n".format( str(dump_result["cpu_rss_mb"]))) @@ -356,7 +378,8 @@ if __name__ == '__main__': print("cpu_rss_mb: {} \n".format(str(dump_result["cpu_rss_mb"]))) print("gpu_rss_mb: {} \n".format(str(dump_result["gpu_rss_mb"]))) print("gpu_util: {} \n".format(str(dump_result["gpu_util"]))) - except: + except Exception as e: f.writelines("!!!!!Infer Failed\n") + raise e f.close() diff --git a/benchmark/benchmark_ppocr.py b/benchmark/python/benchmark_ppocr.py similarity index 100% rename from benchmark/benchmark_ppocr.py rename to benchmark/python/benchmark_ppocr.py diff --git a/benchmark/benchmark_ppseg.py b/benchmark/python/benchmark_ppseg.py similarity index 100% rename from benchmark/benchmark_ppseg.py rename to benchmark/python/benchmark_ppseg.py diff --git a/benchmark/benchmark_uie.py b/benchmark/python/benchmark_uie.py similarity index 100% rename from benchmark/benchmark_uie.py rename to benchmark/python/benchmark_uie.py diff --git a/benchmark/benchmark_yolo.py b/benchmark/python/benchmark_yolo.py similarity index 100% rename from benchmark/benchmark_yolo.py rename to benchmark/python/benchmark_yolo.py diff --git a/benchmark/convert_info.py b/benchmark/python/convert_info.py similarity index 100% rename from benchmark/convert_info.py rename to benchmark/python/convert_info.py diff --git a/benchmark/requirements.txt b/benchmark/python/requirements.txt similarity index 100% rename from benchmark/requirements.txt rename to benchmark/python/requirements.txt diff --git a/benchmark/run_benchmark_ernie_seq_cls.sh b/benchmark/python/run_benchmark_ernie_seq_cls.sh similarity index 100% rename from benchmark/run_benchmark_ernie_seq_cls.sh rename to benchmark/python/run_benchmark_ernie_seq_cls.sh diff --git a/benchmark/run_benchmark_ppcls.sh b/benchmark/python/run_benchmark_ppcls.sh similarity index 100% rename from benchmark/run_benchmark_ppcls.sh rename to benchmark/python/run_benchmark_ppcls.sh diff --git a/benchmark/run_benchmark_ppdet.sh b/benchmark/python/run_benchmark_ppdet.sh similarity index 100% rename from benchmark/run_benchmark_ppdet.sh rename to benchmark/python/run_benchmark_ppdet.sh diff --git a/benchmark/run_benchmark_ppocr.sh b/benchmark/python/run_benchmark_ppocr.sh similarity index 100% rename from benchmark/run_benchmark_ppocr.sh rename to benchmark/python/run_benchmark_ppocr.sh diff --git a/benchmark/run_benchmark_ppseg.sh b/benchmark/python/run_benchmark_ppseg.sh similarity index 100% rename from benchmark/run_benchmark_ppseg.sh rename to benchmark/python/run_benchmark_ppseg.sh diff --git a/benchmark/run_benchmark_uie.sh b/benchmark/python/run_benchmark_uie.sh similarity index 100% rename from benchmark/run_benchmark_uie.sh rename to benchmark/python/run_benchmark_uie.sh diff --git a/benchmark/run_benchmark_yolo.sh b/benchmark/python/run_benchmark_yolo.sh similarity index 100% rename from benchmark/run_benchmark_yolo.sh rename to benchmark/python/run_benchmark_yolo.sh diff --git a/c_api/CMakeLists.txt b/c_api/CMakeLists.txt new file mode 100644 index 000000000..7c7a16626 --- /dev/null +++ b/c_api/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +##################################### Building: FastDeploy C API ####################################### +message("----start--CAPI-------") + +if(NOT WITH_CAPI) + return() +endif() + +file(GLOB_RECURSE DEPLOY_CAPI_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api/fastdeploy_capi/*.cc) +if(NOT ENABLE_VISION) + file(GLOB_RECURSE DEPLOY_VISION_CAPI_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api/fastdeploy_capi/vision/*.cc) + list(REMOVE_ITEM DEPLOY_CAPI_SRCS ${DEPLOY_VISION_CAPI_SRCS}) +endif() +list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_CAPI_SRCS}) +include_directories(${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api) diff --git a/c_api/fastdeploy_capi/fd_common.h b/c_api/fastdeploy_capi/fd_common.h new file mode 100644 index 000000000..6374cf9b5 --- /dev/null +++ b/c_api/fastdeploy_capi/fd_common.h @@ -0,0 +1,100 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#if defined(_WIN32) +#ifdef FD_CAPI +#define FASTDEPLOY_CAPI_EXPORT __declspec(dllexport) +#else +#define FASTDEPLOY_CAPI_EXPORT __declspec(dllimport) +#endif // FD_CAPI +#else +#define FASTDEPLOY_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 + +/// +/// __fd_give means that a new object is returned. The user should make sure +/// that the returned pointer is used exactly once as a value for an __fd_take +/// argument. In between, it can be used as a value for as many __fd_keep +/// arguments as the user likes. +/// +#ifndef __fd_give +#define __fd_give +#endif +/// +/// __fd_take means that the object the argument points to is taken over by the +/// function and may no longer be used by the user as an argument to any other +/// function. The pointer value must be one returned by a function returning an +/// __fd_give pointer. +/// +#ifndef __fd_take +#define __fd_take +#endif +/// +/// __fd_keep means that the function will only use the object temporarily. The +/// object which the argument points to is not taken over by the function. After +/// the function has finished, the user can still use it as an argument to other +/// functions. 
+/// +#ifndef __fd_keep +#define __fd_keep +#endif + +typedef int8_t FD_C_Bool; +#define TRUE 1 +#define FALSE 0 + +#define FD_ENUM(type) \ + typedef int32_t type; \ + enum + +FD_ENUM(FD_C_ModelFormat){ + AUTOREC, ///< Auto recognize the model format by model file name + PADDLE, ///< Model with paddlepaddle format + ONNX, ///< Model with ONNX format + RKNN, ///< Model with RKNN format + TORCHSCRIPT, ///< Model with TorchScript format + SOPHGO, ///< Model with SOPHGO format +}; + +FD_ENUM(FD_C_rknpu2_CpuName){ + RK356X = 0, /* run on RK356X. */ + RK3588 = 1, /* default,run on RK3588. */ + UNDEFINED, +}; + +FD_ENUM(FD_C_rknpu2_CoreMask){ + RKNN_NPU_CORE_AUTO = 0, //< default, run on NPU core randomly. + RKNN_NPU_CORE_0 = 1, //< run on NPU core 0. + RKNN_NPU_CORE_1 = 2, //< run on NPU core 1. + RKNN_NPU_CORE_2 = 4, //< run on NPU core 2. + RKNN_NPU_CORE_0_1 = RKNN_NPU_CORE_0 | + RKNN_NPU_CORE_1, //< run on NPU core 1 and core 2. + RKNN_NPU_CORE_0_1_2 = RKNN_NPU_CORE_0_1 | + RKNN_NPU_CORE_2, //< run on NPU core 1 and core 2. + RKNN_NPU_CORE_UNDEFINED, +}; + +FD_ENUM(FD_C_LitePowerMode){ + LITE_POWER_HIGH = 0, ///< Use Lite Backend with high power mode + LITE_POWER_LOW = 1, ///< Use Lite Backend with low power mode + LITE_POWER_FULL = 2, ///< Use Lite Backend with full power mode + LITE_POWER_NO_BIND = 3, ///< Use Lite Backend with no bind power mode + LITE_POWER_RAND_HIGH = 4, ///< Use Lite Backend with rand high mode + LITE_POWER_RAND_LOW = 5 ///< Use Lite Backend with rand low power mode +}; diff --git a/c_api/fastdeploy_capi/fd_type.h b/c_api/fastdeploy_capi/fd_type.h new file mode 100644 index 000000000..75daf9db6 --- /dev/null +++ b/c_api/fastdeploy_capi/fd_type.h @@ -0,0 +1,67 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "fastdeploy_capi/fd_common.h" // NOLINT + +typedef struct FD_C_OneDimArrayUint8 { + size_t size; + uint8_t* data; +} FD_C_OneDimArrayUint8; // std::vector<uint8_t> + +typedef struct FD_C_OneDimArrayInt32 { + size_t size; + int32_t* data; +} FD_C_OneDimArrayInt32; // std::vector<int32_t> + +typedef struct FD_C_OneDimArraySize { + size_t size; + size_t* data; +} FD_C_OneDimArraySize; // std::vector<size_t> + +typedef struct FD_C_OneDimArrayInt64 { + size_t size; + int64_t* data; +} FD_C_OneDimArrayInt64; // std::vector<int64_t> + +typedef struct FD_C_OneDimArrayFloat { + size_t size; + float* data; +} FD_C_OneDimArrayFloat; // std::vector<float> + +typedef struct FD_C_Cstr { + size_t size; + char* data; +} FD_C_Cstr; // std::string + +typedef struct FD_C_OneDimArrayCstr { + size_t size; + FD_C_Cstr* data; +} FD_C_OneDimArrayCstr; // std::vector<FD_C_Cstr> + +typedef struct FD_C_TwoDimArraySize { + size_t size; + FD_C_OneDimArraySize* data; +} FD_C_TwoDimArraySize; // std::vector<std::vector<size_t>> + +typedef struct FD_C_TwoDimArrayFloat { + size_t size; + FD_C_OneDimArrayFloat* data; +} FD_C_TwoDimArrayFloat; // std::vector<std::vector<float>> + +typedef void* FD_C_Mat; diff --git a/c_api/fastdeploy_capi/runtime_option.cc b/c_api/fastdeploy_capi/runtime_option.cc new file mode 100644 index 000000000..3c9b4022d --- /dev/null +++ b/c_api/fastdeploy_capi/runtime_option.cc @@ -0,0 +1,418 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "fastdeploy_capi/runtime_option.h" + +#include "fastdeploy/utils/utils.h" +#include "fastdeploy_capi/types_internal.h" + +extern "C" { + +FD_C_RuntimeOptionWrapper* FD_C_CreateRuntimeOptionWrapper() { + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper = + new FD_C_RuntimeOptionWrapper(); + fd_c_runtime_option_wrapper->runtime_option = + std::unique_ptr<fastdeploy::RuntimeOption>( + new fastdeploy::RuntimeOption()); + return fd_c_runtime_option_wrapper; +} + +void FD_C_DestroyRuntimeOption( + __fd_take FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + delete fd_c_runtime_option_wrapper; +} + +void FD_C_RuntimeOptionWrapperSetModelPath( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* model_path, const char* params_path, + const FD_C_ModelFormat format) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetModelPath(std::string(model_path), + std::string(params_path), + static_cast<fastdeploy::ModelFormat>(format)); +} + +void FD_C_RuntimeOptionWrapperSetModelBuffer( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* model_buffer, const char* params_buffer, + const FD_C_ModelFormat format) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetModelBuffer(model_buffer, params_buffer, + static_cast<fastdeploy::ModelFormat>(format)); +} + +void FD_C_RuntimeOptionWrapperUseCpu( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseCpu(); +} + +void FD_C_RuntimeOptionWrapperUseGpu( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int gpu_id) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseGpu(gpu_id); +} + +void FD_C_RuntimeOptionWrapperUseRKNPU2( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + FD_C_rknpu2_CpuName rknpu2_name, FD_C_rknpu2_CoreMask rknpu2_core) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseRKNPU2( + static_cast<fastdeploy::rknpu2::CpuName>(rknpu2_name), + static_cast<fastdeploy::rknpu2::CoreMask>(rknpu2_core)); +} + +void FD_C_RuntimeOptionWrapperUseTimVX( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseTimVX(); +} + +void FD_C_RuntimeOptionWrapperUseAscend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseAscend(); +} + +void FD_C_RuntimeOptionWrapperUseKunlunXin( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int kunlunxin_id, int l3_workspace_size, FD_C_Bool locked, + FD_C_Bool autotune, const char* autotune_file, const char* precision, + FD_C_Bool adaptive_seqlen, FD_C_Bool enable_multi_stream) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseKunlunXin(kunlunxin_id, l3_workspace_size, bool(locked), + bool(autotune), std::string(autotune_file), + std::string(precision), bool(adaptive_seqlen), + bool(enable_multi_stream)); +} + +void FD_C_RuntimeOptionWrapperUseSophgo( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& 
runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseSophgo(); +} + +void FD_C_RuntimeOptionWrapperSetExternalStream( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + void* external_stream) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetExternalStream(external_stream); +} + +void FD_C_RuntimeOptionWrapperSetCpuThreadNum( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int thread_num) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetCpuThreadNum(thread_num); +} + +void FD_C_RuntimeOptionWrapperSetOrtGraphOptLevel( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int level) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetOrtGraphOptLevel(level); +} + +void FD_C_RuntimeOptionWrapperUsePaddleBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UsePaddleBackend(); +} + +void FD_C_RuntimeOptionWrapperUsePaddleInferBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + return FD_C_RuntimeOptionWrapperUsePaddleBackend(fd_c_runtime_option_wrapper); +} + +void FD_C_RuntimeOptionWrapperUseOrtBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseOrtBackend(); +} + +void FD_C_RuntimeOptionWrapperUseSophgoBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseSophgoBackend(); +} + +void FD_C_RuntimeOptionWrapperUseTrtBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseTrtBackend(); +} + +void FD_C_RuntimeOptionWrapperUsePorosBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UsePorosBackend(); +} + +void FD_C_RuntimeOptionWrapperUseOpenVINOBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseOpenVINOBackend(); +} + +void FD_C_RuntimeOptionWrapperUseLiteBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseLiteBackend(); +} + +void FD_C_RuntimeOptionWrapperUsePaddleLiteBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + return FD_C_RuntimeOptionWrapperUseLiteBackend(fd_c_runtime_option_wrapper); +} + +void FD_C_RuntimeOptionWrapperSetPaddleMKLDNN( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + FD_C_Bool pd_mkldnn) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + 
runtime_option->SetPaddleMKLDNN(pd_mkldnn); +} + +void FD_C_RuntimeOptionWrapperEnablePaddleToTrt( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->EnablePaddleToTrt(); +} + +void FD_C_RuntimeOptionWrapperDeletePaddleBackendPass( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* delete_pass_name) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->DeletePaddleBackendPass(std::string(delete_pass_name)); +} + +void FD_C_RuntimeOptionWrapperEnablePaddleLogInfo( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->EnablePaddleLogInfo(); +} + +void FD_C_RuntimeOptionWrapperDisablePaddleLogInfo( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->DisablePaddleLogInfo(); +} + +void FD_C_RuntimeOptionWrapperSetPaddleMKLDNNCacheSize( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int size) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetPaddleMKLDNNCacheSize(size); +} + +void FD_C_RuntimeOptionWrapperSetOpenVINODevice( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* name) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetOpenVINODevice(std::string(name)); +} + +void FD_C_RuntimeOptionWrapperSetLiteOptimizedModelDir( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* optimized_model_dir) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetLiteOptimizedModelDir(std::string(optimized_model_dir)); +} + +void FD_C_RuntimeOptionWrapperSetLiteSubgraphPartitionPath( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_subgraph_partition_config_path) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetLiteSubgraphPartitionPath( + std::string(nnadapter_subgraph_partition_config_path)); +} + +void FD_C_RuntimeOptionWrapperSetLiteSubgraphPartitionConfigBuffer( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_subgraph_partition_config_buffer) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetLiteSubgraphPartitionConfigBuffer( + std::string(nnadapter_subgraph_partition_config_buffer)); +} + +void FD_C_RuntimeOptionWrapperSetLiteContextProperties( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_context_properties) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetLiteContextProperties( + std::string(nnadapter_context_properties)); +} + +void FD_C_RuntimeOptionWrapperSetLiteModelCacheDir( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_model_cache_dir) { + auto& runtime_option = 
CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetLiteModelCacheDir(std::string(nnadapter_model_cache_dir)); +} + +void FD_C_RuntimeOptionWrapperSetLiteMixedPrecisionQuantizationConfigPath( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_mixed_precision_quantization_config_path) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); +} + +void FD_C_RuntimeOptionWrapperEnableLiteFP16( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->EnableLiteFP16(); +} + +void FD_C_RuntimeOptionWrapperDisableLiteFP16( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->DisableLiteFP16(); +} + +void FD_C_RuntimeOptionWrapperEnableLiteInt8( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->EnableLiteInt8(); +} + +void FD_C_RuntimeOptionWrapperDisableLiteInt8( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->DisableLiteInt8(); +} + +void FD_C_RuntimeOptionWrapperSetLitePowerMode( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + FD_C_LitePowerMode mode) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetLitePowerMode( + static_cast<fastdeploy::LitePowerMode>(mode)); +} + +void FD_C_RuntimeOptionWrapperEnableTrtFP16( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->EnableTrtFP16(); +} + +void FD_C_RuntimeOptionWrapperDisableTrtFP16( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->DisableTrtFP16(); +} + +void FD_C_RuntimeOptionWrapperSetTrtCacheFile( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* cache_file_path) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetTrtCacheFile(std::string(cache_file_path)); +} + +void FD_C_RuntimeOptionWrapperEnablePinnedMemory( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->EnablePinnedMemory(); +} + +void FD_C_RuntimeOptionWrapperDisablePinnedMemory( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->DisablePinnedMemory(); +} + +void FD_C_RuntimeOptionWrapperEnablePaddleTrtCollectShape( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->EnablePaddleTrtCollectShape(); +} + +void 
FD_C_RuntimeOptionWrapperDisablePaddleTrtCollectShape( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->DisablePaddleTrtCollectShape(); +} + +void FD_C_RuntimeOptionWrapperSetOpenVINOStreams( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int num_streams) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetOpenVINOStreams(num_streams); +} + +void FD_C_RuntimeOptionWrapperUseIpu( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int device_num, int micro_batch_size, FD_C_Bool enable_pipelining, + int batches_per_step) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->UseIpu(device_num, micro_batch_size, enable_pipelining, + batches_per_step); +} + +void FD_C_RuntimeOptionWrapperSetIpuConfig( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + FD_C_Bool enable_fp16, int replica_num, float available_memory_proportion, + FD_C_Bool enable_half_partial) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + runtime_option->SetIpuConfig(enable_fp16, replica_num, + available_memory_proportion, + enable_half_partial); +} + +} // extern "C" diff --git a/c_api/fastdeploy_capi/runtime_option.h b/c_api/fastdeploy_capi/runtime_option.h new file mode 100644 index 000000000..cfc087473 --- /dev/null +++ b/c_api/fastdeploy_capi/runtime_option.h @@ -0,0 +1,517 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "fastdeploy_capi/fd_common.h" + +typedef struct FD_C_RuntimeOptionWrapper FD_C_RuntimeOptionWrapper; + +#ifdef __cplusplus +extern "C" { +#endif + +/** \brief Create a new FD_C_RuntimeOptionWrapper object + * + * \return Return a pointer to FD_C_RuntimeOptionWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_RuntimeOptionWrapper* +FD_C_CreateRuntimeOptionWrapper(); + +/** \brief Destroy a FD_C_RuntimeOptionWrapper object + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyRuntimeOptionWrapper( + __fd_take FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** \brief Set path of model file and parameter file + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] model_path Path of model file, e.g ResNet50/model.pdmodel for Paddle format model / ResNet50/model.onnx for ONNX format model + * \param[in] params_path Path of parameter file, this only used when the model format is Paddle, e.g Resnet50/model.pdiparams + * \param[in] format Format of the loaded model + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetModelPath( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* model_path, const char* params_path, + const FD_C_ModelFormat format); + +/** \brief Specify the memory buffer of model and parameter. Used when model and params are loaded directly from memory + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] model_buffer The memory buffer of model + * \param[in] params_buffer The memory buffer of the combined parameters file + * \param[in] format Format of the loaded model + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetModelBuffer( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* model_buffer, const char* params_buffer, + const FD_C_ModelFormat); + +/** \brief Use cpu to inference, the runtime will inference on CPU by default + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseCpu( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** \brief Use Nvidia GPU to inference + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseGpu( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int gpu_id); + +/** \brief Use RKNPU2 to inference + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] rknpu2_name CpuName enum value + * \param[in] rknpu2_core CoreMask enum value + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseRKNPU2( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + FD_C_rknpu2_CpuName rknpu2_name, FD_C_rknpu2_CoreMask rknpu2_core); + +/** \brief Use TimVX to inference + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseTimVX( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** \brief Use Huawei Ascend to inference + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void 
FD_C_RuntimeOptionWrapperUseAscend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/// +/// \brief Turn on KunlunXin XPU. +/// +/// \param[in] fd_c_runtime_option_wrapper pointer to \ + FD_C_RuntimeOptionWrapper object +/// \param[in] kunlunxin_id the KunlunXin XPU card to use\ + (default is 0). +/// \param[in] l3_workspace_size The size of the video memory allocated\ +/// by the l3 cache, the maximum is 16M. +/// \param[in] locked Whether the allocated L3 cache can be locked. If false, +/// it means that the L3 cache is not locked, and the allocated L3 +/// cache can be shared by multiple models, and multiple models +/// sharing the L3 cache will be executed sequentially on the card. +/// \param[in] autotune Whether to autotune the conv operator in the model. If +/// true, when the conv operator of a certain dimension is executed +/// for the first time, it will automatically search for a better +/// algorithm to improve the performance of subsequent conv operators +/// of the same dimension. +/// \param[in] autotune_file Specify the path of the autotune file. If +/// autotune_file is specified, the algorithm specified in the +/// file will be used and autotune will not be performed again. +/// \param[in] precision Calculation accuracy of multi_encoder +/// \param[in] adaptive_seqlen Is the input of multi_encoder variable length +/// \param[in] enable_multi_stream Whether to enable the multi stream of +/// KunlunXin XPU. +/// +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseKunlunXin( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int kunlunxin_id, int l3_workspace_size, FD_C_Bool locked, + FD_C_Bool autotune, const char* autotune_file, const char* precision, + FD_C_Bool adaptive_seqlen, FD_C_Bool enable_multi_stream); + +/** Use Sophgo to inference + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseSophgo( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetExternalStream( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + void* external_stream); + +/** + * @brief Set number of cpu threads while inference on CPU, by default it will decided by the different backends + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] thread_num number of threads + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetCpuThreadNum( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int thread_num); + +/** + * @brief Set ORT graph opt level, default is decide by ONNX Runtime itself + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] level optimization level + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetOrtGraphOptLevel( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int level); + +/** + * @brief Set Paddle Inference as inference backend, support CPU/GPU + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUsePaddleBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Wrapper function of UsePaddleBackend() + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ 
+FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperUsePaddleInferBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set ONNX Runtime as inference backend, support CPU/GPU + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseOrtBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set SOPHGO Runtime as inference backend, support CPU/GPU + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseSophgoBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set TensorRT as inference backend, only support GPU + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseTrtBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set Poros backend as inference backend, support CPU/GPU + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUsePorosBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set OpenVINO as inference backend, only support CPU + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseOpenVINOBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set Paddle Lite as inference backend, only support arm cpu + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseLiteBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Wrapper function of UseLiteBackend() + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperUsePaddleLiteBackend( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set mkldnn switch while using Paddle Inference as inference backend + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] pd_mkldnn whether to use mkldnn + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetPaddleMKLDNN( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + FD_C_Bool pd_mkldnn); + +/** + * @brief If TensorRT backend is used, EnablePaddleToTrt will change to use Paddle Inference backend, and use its integrated TensorRT instead. 
+ * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnablePaddleToTrt( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Delete pass by name while using Paddle Inference as inference backend, this can be called multiple times to delete a set of passes + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] delete_pass_name pass name + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperDeletePaddleBackendPass( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* delete_pass_name); + +/** + * @brief Enable printing debug information while using Paddle Inference as inference backend; the backend disables the debug information by default + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnablePaddleLogInfo( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Disable printing debug information while using Paddle Inference as inference backend + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperDisablePaddleLogInfo( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set shape cache size while using Paddle Inference with mkldnn; by default it will cache all the different shapes + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] size cache size + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperSetPaddleMKLDNNCacheSize( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, int size); + +/** + * @brief Set device name for OpenVINO, default 'CPU', can also be 'AUTO', 'GPU', 'GPU.1'.... + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] name device name + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetOpenVINODevice( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* name); + +/** + * @brief Set optimized model dir for Paddle Lite backend. + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] optimized_model_dir optimized model dir + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperSetLiteOptimizedModelDir( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* optimized_model_dir); + +/** + * @brief Set subgraph partition path for Paddle Lite backend. + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] nnadapter_subgraph_partition_config_path subgraph partition path + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperSetLiteSubgraphPartitionPath( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_subgraph_partition_config_path); + +/** + * @brief Set subgraph partition config buffer for Paddle Lite backend.
+ * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] nnadapter_subgraph_partition_config_buffer subgraph partition path + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperSetLiteSubgraphPartitionConfigBuffer( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_subgraph_partition_config_buffer); + +/** + * @brief Set context properties for Paddle Lite backend. + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] nnadapter_context_properties context properties + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperSetLiteContextProperties( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_context_properties); + +/** + * @brief Set model cache dir for Paddle Lite backend. + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] nnadapter_model_cache_dir model cache dir + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperSetLiteModelCacheDir( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_model_cache_dir); + +/** + * @brief Set mixed precision quantization config path for Paddle Lite backend. + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] nnadapter_mixed_precision_quantization_config_path mixed precision quantization config path + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperSetLiteMixedPrecisionQuantizationConfigPath( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* nnadapter_mixed_precision_quantization_config_path); + +/** + * @brief enable half precision while use paddle lite backend + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnableLiteFP16( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief disable half precision, change to full precision(float32) + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperDisableLiteFP16( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief enable int8 precision while use paddle lite backend + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnableLiteInt8( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief disable int8 precision, change to full precision(float32) + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperDisableLiteInt8( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set power mode while using Paddle Lite as inference backend, mode(0: LITE_POWER_HIGH; 1: LITE_POWER_LOW; 2: LITE_POWER_FULL; 3: LITE_POWER_NO_BIND, 4: LITE_POWER_RAND_HIGH; 5: LITE_POWER_RAND_LOW, refer [paddle lite](https://paddle-lite.readthedocs.io/zh/latest/api_reference/cxx_api_doc.html#set-power-mode) for more details) + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] mode power mode + */ +FASTDEPLOY_CAPI_EXPORT extern void 
FD_C_RuntimeOptionWrapperSetLitePowerMode( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + FD_C_LitePowerMode mode); + +/** + * @brief Enable FP16 inference while using TensorRT backend. Notice: not all GPU devices support FP16; on devices that don't support FP16, FastDeploy will fall back to FP32 automatically + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnableTrtFP16( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Disable FP16 inference while using TensorRT backend + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperDisableTrtFP16( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set cache file path while using TensorRT backend. Loading a Paddle/ONNX model and initializing TensorRT takes a long time; with this interface the TensorRT engine will be saved to `cache_file_path` and loaded from it directly the next time the code runs + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] cache_file_path cache file path + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetTrtCacheFile( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const char* cache_file_path); + +/** + * @brief Enable pinned memory. Pinned memory can be utilized to speed up the data transfer between CPU and GPU. Currently it's only supported in the TRT backend and the Paddle Inference backend. + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnablePinnedMemory( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Disable pinned memory + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperDisablePinnedMemory( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Enable collecting shape information in the Paddle-TRT backend + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperEnablePaddleTrtCollectShape( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Disable collecting shape information in the Paddle-TRT backend + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + */ +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_RuntimeOptionWrapperDisablePaddleTrtCollectShape( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); + +/** + * @brief Set the number of streams for the OpenVINO backend + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] num_streams number of streams + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetOpenVINOStreams( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int num_streams); + +/** + * @brief Use Graphcore IPU for inference. + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] device_num the number of IPUs. + * \param[in] micro_batch_size the batch size in the graph, only works when the graph has no batch shape info.
+ * \param[in] enable_pipelining enable pipelining. + * \param[in] batches_per_step the number of batches per run in pipelining. + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseIpu( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int device_num, int micro_batch_size, FD_C_Bool enable_pipelining, + int batches_per_step); + +/** \brief Set IPU config. + * + * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object + * \param[in] enable_fp16 enable fp16. + * \param[in] replica_num the number of graph replication. + * \param[in] available_memory_proportion the available memory proportion for matmul/conv. + * \param[in] enable_half_partial enable fp16 partial for matmul, only work with fp16. + */ +FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetIpuConfig( + __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + FD_C_Bool enable_fp16, int replica_num, float available_memory_proportion, + FD_C_Bool enable_half_partial); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/c_api/fastdeploy_capi/types_internal.cc b/c_api/fastdeploy_capi/types_internal.cc new file mode 100644 index 000000000..807f5dd21 --- /dev/null +++ b/c_api/fastdeploy_capi/types_internal.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
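The declarations above complete the RuntimeOption C API. A minimal usage sketch in C follows; the model paths are hypothetical, the `PADDLE` constant of FD_C_ModelFormat is assumed from fd_common.h (not part of this diff), and error handling is omitted:

#include "fastdeploy_capi/runtime_option.h"

int main(void) {
  /* Create the option wrapper; the runtime defaults to CPU inference. */
  FD_C_RuntimeOptionWrapper* option = FD_C_CreateRuntimeOptionWrapper();

  /* Hypothetical Paddle-format model files. */
  FD_C_RuntimeOptionWrapperSetModelPath(option, "ResNet50/model.pdmodel",
                                        "ResNet50/model.pdiparams", PADDLE);

  /* Pick the device and backend, then tune CPU threads. */
  FD_C_RuntimeOptionWrapperUseCpu(option);
  FD_C_RuntimeOptionWrapperSetCpuThreadNum(option, 8);
  FD_C_RuntimeOptionWrapperUsePaddleInferBackend(option);

  /* ... pass `option` to a model factory such as FD_C_CreatePaddleClasModelWrapper ... */

  FD_C_DestroyRuntimeOptionWrapper(option);
  return 0;
}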
+ +#include "fastdeploy_capi/types_internal.h" + +namespace fastdeploy { + +#ifdef ENABLE_VISION + +std::unique_ptr& +FD_C_CheckAndConvertPaddleClasModelWrapper( + FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper) { + FDASSERT( + fd_c_paddleclas_model_wrapper != nullptr, + "The pointer of fd_c_paddleclas_model_wrapper shouldn't be nullptr."); + return fd_c_paddleclas_model_wrapper->paddleclas_model; +} + +std::unique_ptr& +FD_C_CheckAndConvertPPYOLOEWrapper(FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper) { + FDASSERT(fd_c_ppyoloe_wrapper != nullptr, + "The pointer of fd_c_ppyoloe_wrapper shouldn't be nullptr."); + return fd_c_ppyoloe_wrapper->ppyoloe_model; +} + +std::unique_ptr& +FD_C_CheckAndConvertClassifyResultWrapper( + FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper) { + FDASSERT(fd_c_classify_result_wrapper != nullptr, + "The pointer of fd_c_classify_result_wrapper shouldn't be nullptr."); + return fd_c_classify_result_wrapper->classify_result; +} + +std::unique_ptr& +FD_C_CheckAndConvertDetectionResultWrapper( + FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper) { + FDASSERT( + fd_c_detection_result_wrapper != nullptr, + "The pointer of fd_c_detection_result_wrapper shouldn't be nullptr."); + return fd_c_detection_result_wrapper->detection_result; +} +#endif + +std::unique_ptr& +FD_C_CheckAndConvertRuntimeOptionWrapper( + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) { + FDASSERT(fd_c_runtime_option_wrapper != nullptr, + "The pointer of fd_c_runtime_option_wrapper shouldn't be nullptr."); + return fd_c_runtime_option_wrapper->runtime_option; +} + +} // namespace fastdeploy \ No newline at end of file diff --git a/c_api/fastdeploy_capi/types_internal.h b/c_api/fastdeploy_capi/types_internal.h new file mode 100644 index 000000000..f8a2cfbe9 --- /dev/null +++ b/c_api/fastdeploy_capi/types_internal.h @@ -0,0 +1,70 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "fastdeploy/runtime/runtime_option.h" +#include "fastdeploy_capi/fd_type.h" +#include + +#ifdef ENABLE_VISION +#include "fastdeploy/vision/classification/ppcls/model.h" +#include "fastdeploy/vision/common/result.h" +#include "fastdeploy/vision/detection/ppdet/model.h" + +typedef struct FD_C_ClassifyResultWrapper { + std::unique_ptr classify_result; +} FD_C_ClassifyResultWrapper; + +typedef struct FD_C_DetectionResultWrapper { + std::unique_ptr detection_result; +} FD_C_DetectionResultWrapper; + +typedef struct FD_C_PaddleClasModelWrapper { + std::unique_ptr + paddleclas_model; +} FD_C_PaddleClasModelWrapper; + +typedef struct FD_C_PPYOLOEWrapper { + std::unique_ptr ppyoloe_model; +} FD_C_PPYOLOEWrapper; + +namespace fastdeploy { +std::unique_ptr& +FD_C_CheckAndConvertClassifyResultWrapper( + FD_C_ClassifyResultWrapper* fd_classify_result_wrapper); +std::unique_ptr& +FD_C_CheckAndConvertDetectionResultWrapper( + FD_C_DetectionResultWrapper* fd_detection_result_wrapper); +std::unique_ptr& +FD_C_CheckAndConvertPaddleClasModelWrapper( + FD_C_PaddleClasModelWrapper* fd_paddleclas_model_wrapper); +std::unique_ptr& +FD_C_CheckAndConvertPPYOLOEWrapper(FD_C_PPYOLOEWrapper* fd_ppyoloe_wrapper); +} // namespace fastdeploy + +#endif + +typedef struct FD_C_RuntimeOptionWrapper { + std::unique_ptr runtime_option; +} FD_C_RuntimeOptionWrapper; + +namespace fastdeploy { +std::unique_ptr& +FD_C_CheckAndConvertRuntimeOptionWrapper( + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper); +} + +#define CHECK_AND_CONVERT_FD_TYPE(TYPENAME, variable_name) \ + fastdeploy::FD_C_CheckAndConvert##TYPENAME(variable_name) diff --git a/c_api/fastdeploy_capi/vision/classification/ppcls/model.cc b/c_api/fastdeploy_capi/vision/classification/ppcls/model.cc new file mode 100644 index 000000000..3ed62f26a --- /dev/null +++ b/c_api/fastdeploy_capi/vision/classification/ppcls/model.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "fastdeploy_capi/vision/classification/ppcls/model.h" + +#include "fastdeploy_capi/types_internal.h" + +extern "C" { + +FD_C_PaddleClasModelWrapper* FD_C_CreatePaddleClasModelWrapper( + const char* model_file, const char* params_file, const char* config_file, + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const FD_C_ModelFormat model_format) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper = + new FD_C_PaddleClasModelWrapper(); + fd_c_paddleclas_model_wrapper->paddleclas_model = + std::unique_ptr( + new fastdeploy::vision::classification::PaddleClasModel( + std::string(model_file), std::string(params_file), + std::string(config_file), *runtime_option, + static_cast(model_format))); + return fd_c_paddleclas_model_wrapper; +} + +void FD_C_DestroyPaddleClasModelWrapper( + __fd_take FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper) { + delete fd_c_paddleclas_model_wrapper; +} + +FD_C_Bool FD_C_PaddleClasModelWrapperPredict( + __fd_take FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper, + FD_C_Mat img, FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper) { + cv::Mat* im = reinterpret_cast(img); + auto& paddleclas_model = CHECK_AND_CONVERT_FD_TYPE( + PaddleClasModelWrapper, fd_c_paddleclas_model_wrapper); + auto& classify_result = CHECK_AND_CONVERT_FD_TYPE( + ClassifyResultWrapper, fd_c_classify_result_wrapper); + return paddleclas_model->Predict(im, classify_result.get()); +} +} \ No newline at end of file diff --git a/c_api/fastdeploy_capi/vision/classification/ppcls/model.h b/c_api/fastdeploy_capi/vision/classification/ppcls/model.h new file mode 100644 index 000000000..db117e605 --- /dev/null +++ b/c_api/fastdeploy_capi/vision/classification/ppcls/model.h @@ -0,0 +1,66 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "fastdeploy_capi/fd_common.h" +#include "fastdeploy_capi/fd_type.h" +#include "fastdeploy_capi/runtime_option.h" +#include "fastdeploy_capi/vision/result.h" + +typedef struct FD_C_PaddleClasModelWrapper FD_C_PaddleClasModelWrapper; + +#ifdef __cplusplus +extern "C" { +#endif + +/** \brief Create a new FD_C_PaddleClasModelWrapper object + * + * \param[in] model_file Path of model file, e.g resnet/model.pdmodel + * \param[in] params_file Path of parameter file, e.g resnet/model.pdiparams, if the model format is ONNX, this parameter will be ignored + * \param[in] config_file Path of configuration file for deployment, e.g resnet/infer_cfg.yml + * \param[in] fd_c_runtime_option_wrapper RuntimeOption for inference, the default will use cpu, and choose the backend defined in `valid_cpu_backends` + * \param[in] model_format Model format of the loaded model, default is Paddle format + * + * \return Return a pointer to FD_C_PaddleClasModelWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_PaddleClasModelWrapper* +FD_C_CreatePaddleClasModelWrapper( + const char* model_file, const char* params_file, const char* config_file, + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const FD_C_ModelFormat model_format); + +/** \brief Destroy a FD_C_PaddleClasModelWrapper object + * + * \param[in] fd_c_paddleclas_model_wrapper pointer to FD_C_PaddleClasModelWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyPaddleClasModelWrapper( + __fd_take FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper); + +/** \brief Predict the classification result for an input image + * + * \param[in] fd_c_paddleclas_model_wrapper pointer to FD_C_PaddleClasModelWrapper object + * \param[in] img pointer to cv::Mat image + * \param[in] fd_c_classify_result_wrapper pointer to FD_C_ClassifyResultWrapper object, which stores the result. + */ + +FASTDEPLOY_CAPI_EXPORT extern FD_C_Bool FD_C_PaddleClasModelWrapperPredict( + __fd_take FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper, + FD_C_Mat img, FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/c_api/fastdeploy_capi/vision/detection/ppdet/model.cc b/c_api/fastdeploy_capi/vision/detection/ppdet/model.cc new file mode 100644 index 000000000..17a87ec8b --- /dev/null +++ b/c_api/fastdeploy_capi/vision/detection/ppdet/model.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
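With the PaddleClasModel header above, single-image classification looks roughly as follows. This is a sketch: the export paths are hypothetical, and both the `PADDLE` constant of FD_C_ModelFormat and the FD_C_Imread helper from fd_type.h are assumed, as neither appears in this diff.

#include "fastdeploy_capi/vision/classification/ppcls/model.h"

void classify_example(void) {
  FD_C_RuntimeOptionWrapper* option = FD_C_CreateRuntimeOptionWrapper();
  FD_C_RuntimeOptionWrapperUseCpu(option);

  /* Hypothetical PaddleClas export: model, params and preprocess config. */
  FD_C_PaddleClasModelWrapper* model = FD_C_CreatePaddleClasModelWrapper(
      "ResNet50_vd_infer/inference.pdmodel",
      "ResNet50_vd_infer/inference.pdiparams",
      "ResNet50_vd_infer/inference_cls.yaml", option, PADDLE);

  FD_C_Mat img = FD_C_Imread("test.jpg");  /* assumed image-loading helper */
  FD_C_ClassifyResultWrapper* result = FD_C_CreateClassifyResultWrapper();
  FD_C_PaddleClasModelWrapperPredict(model, img, result);

  /* Release in reverse order; the image handle is freed by its own helper (not shown). */
  FD_C_DestroyClassifyResultWrapper(result);
  FD_C_DestroyPaddleClasModelWrapper(model);
  FD_C_DestroyRuntimeOptionWrapper(option);
}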
+ +#include "fastdeploy_capi/vision/detection/ppdet/model.h" + +#include "fastdeploy_capi/types_internal.h" +#include "fastdeploy_capi/vision/visualize.h" + +extern "C" { + +FD_C_PPYOLOEWrapper* FD_C_CreatesPPYOLOEWrapper( + const char* model_file, const char* params_file, const char* config_file, + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const FD_C_ModelFormat model_format) { + auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, + fd_c_runtime_option_wrapper); + FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper = new FD_C_PPYOLOEWrapper(); + fd_c_ppyoloe_wrapper->ppyoloe_model = + std::unique_ptr( + new fastdeploy::vision::detection::PPYOLOE( + std::string(model_file), std::string(params_file), + std::string(config_file), *runtime_option, + static_cast(model_format))); + return fd_c_ppyoloe_wrapper; +} + +void FD_C_DestroyPPYOLOEWrapper( + __fd_take FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper) { + delete fd_c_ppyoloe_wrapper; +} + +FD_C_Bool FD_C_PPYOLOEWrapperPredict( + FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper, FD_C_Mat img, + FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper) { + cv::Mat* im = reinterpret_cast(img); + auto& ppyoloe_model = + CHECK_AND_CONVERT_FD_TYPE(PPYOLOEWrapper, fd_c_ppyoloe_wrapper); + auto& detection_result = CHECK_AND_CONVERT_FD_TYPE( + DetectionResultWrapper, fd_c_detection_result_wrapper); + return ppyoloe_model->Predict(im, detection_result.get()); +} +} \ No newline at end of file diff --git a/c_api/fastdeploy_capi/vision/detection/ppdet/model.h b/c_api/fastdeploy_capi/vision/detection/ppdet/model.h new file mode 100644 index 000000000..6dce7a64e --- /dev/null +++ b/c_api/fastdeploy_capi/vision/detection/ppdet/model.h @@ -0,0 +1,67 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "fastdeploy_capi/fd_common.h" +#include "fastdeploy_capi/fd_type.h" +#include "fastdeploy_capi/runtime_option.h" +#include "fastdeploy_capi/vision/result.h" + +typedef struct FD_C_PPYOLOEWrapper FD_C_PPYOLOEWrapper; +typedef struct FD_C_RuntimeOptionWrapper FD_C_RuntimeOptionWrapper; + +#ifdef __cplusplus +extern "C" { +#endif + +/** \brief Create a new FD_C_PPYOLOEWrapper object + * + * \param[in] model_file Path of model file, e.g resnet/model.pdmodel + * \param[in] params_file Path of parameter file, e.g resnet/model.pdiparams, if the model format is ONNX, this parameter will be ignored + * \param[in] config_file Path of configuration file for deployment, e.g resnet/infer_cfg.yml + * \param[in] fd_c_runtime_option_wrapper RuntimeOption for inference, the default will use cpu, and choose the backend defined in `valid_cpu_backends` + * \param[in] model_format Model format of the loaded model, default is Paddle format + * + * \return Return a pointer to FD_C_PPYOLOEWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_PPYOLOEWrapper* +FD_C_CreatesPPYOLOEWrapper( + const char* model_file, const char* params_file, const char* config_file, + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const FD_C_ModelFormat model_format); + +/** \brief Destroy a FD_C_PPYOLOEWrapper object + * + * \param[in] fd_c_ppyoloe_wrapper pointer to FD_C_PPYOLOEWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_DestroyPPYOLOEWrapper(__fd_take FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper); + +/** \brief Predict the detection result for an input image + * + * \param[in] fd_c_ppyoloe_wrapper pointer to FD_C_PPYOLOEWrapper object + * \param[in] img pointer to cv::Mat image + * \param[in] fd_c_detection_result_wrapper pointer to FD_C_DetectionResultWrapper object, which stores the result. + */ + +FASTDEPLOY_CAPI_EXPORT extern FD_C_Bool FD_C_PPYOLOEWrapperPredict( + __fd_take FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper, FD_C_Mat img, + FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/c_api/fastdeploy_capi/vision/result.cc b/c_api/fastdeploy_capi/vision/result.cc new file mode 100644 index 000000000..abf52ba69 --- /dev/null +++ b/c_api/fastdeploy_capi/vision/result.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
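The detection API above follows the same pattern. A sketch for PPYOLOE, with hypothetical export paths, the `PADDLE` enum constant assumed, and the image already loaded into an FD_C_Mat by the caller:

#include "fastdeploy_capi/vision/detection/ppdet/model.h"

void detect_example(FD_C_Mat img) {
  FD_C_RuntimeOptionWrapper* option = FD_C_CreateRuntimeOptionWrapper();
  FD_C_RuntimeOptionWrapperUseGpu(option, 0);

  /* The factory uses the name FD_C_CreatesPPYOLOEWrapper exactly as declared above. */
  FD_C_PPYOLOEWrapper* model = FD_C_CreatesPPYOLOEWrapper(
      "ppyoloe_crn_l_300e_coco/model.pdmodel",
      "ppyoloe_crn_l_300e_coco/model.pdiparams",
      "ppyoloe_crn_l_300e_coco/infer_cfg.yml", option, PADDLE);

  FD_C_DetectionResultWrapper* result = FD_C_CreateDetectionResultWrapper();
  FD_C_PPYOLOEWrapperPredict(model, img, result);

  FD_C_DestroyDetectionResultWrapper(result);
  FD_C_DestroyPPYOLOEWrapper(model);
  FD_C_DestroyRuntimeOptionWrapper(option);
}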
+ +#include "fastdeploy_capi/vision/result.h" + +#include "fastdeploy/utils/utils.h" +#include "fastdeploy_capi/types_internal.h" + +extern "C" { + +// Classification Results + +FD_C_ClassifyResultWrapper* FD_C_CreateClassifyResultWrapper() { + FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper = + new FD_C_ClassifyResultWrapper(); + fd_c_classify_result_wrapper->classify_result = + std::unique_ptr( + new fastdeploy::vision::ClassifyResult()); + return fd_c_classify_result_wrapper; +} + +void FD_C_DestroyClassifyResultWrapper( + __fd_take FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper) { + delete fd_c_classify_result_wrapper; +} + +void FD_C_DestroyClassifyResult( + __fd_take FD_C_ClassifyResult* fd_c_classify_result) { + if (fd_c_classify_result == nullptr) return; + // delete label_ids + delete[] fd_c_classify_result->label_ids.data; + // delete scores + delete[] fd_c_classify_result->scores.data; + delete fd_c_classify_result; +} + +FD_C_ClassifyResult* FD_C_ClassifyResultWrapperGetData( + __fd_keep FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper) { + auto& classify_result = CHECK_AND_CONVERT_FD_TYPE( + ClassifyResultWrapper, fd_c_classify_result_wrapper); + FD_C_ClassifyResult* fd_c_classify_result_data = new FD_C_ClassifyResult(); + // copy label_ids + fd_c_classify_result_data->label_ids.size = classify_result->label_ids.size(); + fd_c_classify_result_data->label_ids.data = + new int32_t[fd_c_classify_result_data->label_ids.size]; + memcpy(fd_c_classify_result_data->label_ids.data, + classify_result->label_ids.data(), + sizeof(int32_t) * fd_c_classify_result_data->label_ids.size); + // copy scores + fd_c_classify_result_data->scores.size = classify_result->scores.size(); + fd_c_classify_result_data->scores.data = + new float[fd_c_classify_result_data->scores.size]; + memcpy(fd_c_classify_result_data->scores.data, classify_result->scores.data(), + sizeof(float) * fd_c_classify_result_data->scores.size); + fd_c_classify_result_data->type = + static_cast(classify_result->type); + return fd_c_classify_result_data; +} + +FD_C_ClassifyResultWrapper* FD_C_CreateClassifyResultWrapperFromData( + __fd_keep FD_C_ClassifyResult* fd_c_classify_result) { + FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper = + FD_C_CreateClassifyResultWrapper(); + auto& classify_result = CHECK_AND_CONVERT_FD_TYPE( + ClassifyResultWrapper, fd_c_classify_result_wrapper); + // copy label_ids + classify_result->label_ids.resize(fd_c_classify_result->label_ids.size); + memcpy(classify_result->label_ids.data(), + fd_c_classify_result->label_ids.data, + sizeof(int32_t) * fd_c_classify_result->label_ids.size); + // copy scores + classify_result->scores.resize(fd_c_classify_result->scores.size); + memcpy(classify_result->scores.data(), fd_c_classify_result->scores.data, + sizeof(int32_t) * fd_c_classify_result->scores.size); + classify_result->type = + static_cast(fd_c_classify_result->type); + return fd_c_classify_result_wrapper; +} + +// Detection Results + +FD_C_DetectionResultWrapper* FD_C_CreateDetectionResultWrapper() { + FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper = + new FD_C_DetectionResultWrapper(); + fd_c_detection_result_wrapper->detection_result = + std::unique_ptr( + new fastdeploy::vision::DetectionResult()); + return fd_c_detection_result_wrapper; +} + +void FD_C_DestroyDetectionResultWrapper( + __fd_take FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper) { + delete fd_c_detection_result_wrapper; +} + +void FD_C_DestroyDetectionResult( + __fd_take 
FD_C_DetectionResult* fd_c_detection_result) { + if (fd_c_detection_result == nullptr) return; + // delete boxes + for (size_t i = 0; i < fd_c_detection_result->boxes.size; i++) { + delete[] fd_c_detection_result->boxes.data[i].data; + } + delete[] fd_c_detection_result->boxes.data; + // delete scores + delete[] fd_c_detection_result->scores.data; + // delete label_ids + delete[] fd_c_detection_result->label_ids.data; + // delete masks + for (size_t i = 0; i < fd_c_detection_result->masks.size; i++) { + delete[] fd_c_detection_result->masks.data[i].data.data; + delete[] fd_c_detection_result->masks.data[i].shape.data; + } + delete fd_c_detection_result; +} + +FD_C_DetectionResult* FD_C_DetectionResultWrapperGetData( + __fd_keep FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper) { + auto& detection_result = CHECK_AND_CONVERT_FD_TYPE( + DetectionResultWrapper, fd_c_detection_result_wrapper); + FD_C_DetectionResult* fd_c_detection_result = new FD_C_DetectionResult(); + // copy boxes + const int boxes_coordinate_dim = 4; + fd_c_detection_result->boxes.size = detection_result->boxes.size(); + fd_c_detection_result->boxes.data = + new FD_C_OneDimArrayFloat[fd_c_detection_result->boxes.size]; + for (size_t i = 0; i < detection_result->boxes.size(); i++) { + fd_c_detection_result->boxes.data[i].size = boxes_coordinate_dim; + fd_c_detection_result->boxes.data[i].data = new float[boxes_coordinate_dim]; + for (size_t j = 0; j < boxes_coordinate_dim; j++) { + fd_c_detection_result->boxes.data[i].data[j] = + detection_result->boxes[i][j]; + } + } + // copy scores + fd_c_detection_result->scores.size = detection_result->scores.size(); + fd_c_detection_result->scores.data = + new float[fd_c_detection_result->scores.size]; + memcpy(fd_c_detection_result->scores.data, detection_result->scores.data(), + sizeof(float) * fd_c_detection_result->scores.size); + // copy label_ids + fd_c_detection_result->label_ids.size = detection_result->label_ids.size(); + fd_c_detection_result->label_ids.data = + new int32_t[fd_c_detection_result->label_ids.size]; + memcpy(fd_c_detection_result->label_ids.data, + detection_result->label_ids.data(), + sizeof(int32_t) * fd_c_detection_result->label_ids.size); + // copy masks + fd_c_detection_result->masks.size = detection_result->masks.size(); + fd_c_detection_result->masks.data = + new FD_C_Mask[fd_c_detection_result->masks.size]; + for (size_t i = 0; i < detection_result->masks.size(); i++) { + // copy data in mask + fd_c_detection_result->masks.data[i].data.size = + detection_result->masks[i].data.size(); + fd_c_detection_result->masks.data[i].data.data = + new uint8_t[detection_result->masks[i].data.size()]; + memcpy(fd_c_detection_result->masks.data[i].data.data, + detection_result->masks[i].data.data(), + sizeof(uint8_t) * detection_result->masks[i].data.size()); + // copy shape in mask + fd_c_detection_result->masks.data[i].shape.size = + detection_result->masks[i].shape.size(); + fd_c_detection_result->masks.data[i].shape.data = + new int64_t[detection_result->masks[i].shape.size()]; + memcpy(fd_c_detection_result->masks.data[i].shape.data, + detection_result->masks[i].shape.data(), + sizeof(int64_t) * detection_result->masks[i].shape.size()); + fd_c_detection_result->masks.data[i].type = + static_cast(detection_result->masks[i].type); + } + fd_c_detection_result->contain_masks = detection_result->contain_masks; + fd_c_detection_result->type = + static_cast(detection_result->type); + return fd_c_detection_result; +} + +FD_C_DetectionResultWrapper* 
FD_C_CreateDetectionResultWrapperFromData( + __fd_keep FD_C_DetectionResult* fd_c_detection_result) { + FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper = + FD_C_CreateDetectionResultWrapper(); + auto& detection_result = CHECK_AND_CONVERT_FD_TYPE( + DetectionResultWrapper, fd_c_detection_result_wrapper); + + // copy boxes + const int boxes_coordinate_dim = 4; + detection_result->boxes.resize(fd_c_detection_result->boxes.size); + for (size_t i = 0; i < fd_c_detection_result->boxes.size; i++) { + for (size_t j = 0; j < boxes_coordinate_dim; j++) { + detection_result->boxes[i][j] = + fd_c_detection_result->boxes.data[i].data[j]; + } + } + // copy scores + detection_result->scores.resize(fd_c_detection_result->scores.size); + memcpy(detection_result->scores.data(), fd_c_detection_result->scores.data, + sizeof(float) * fd_c_detection_result->scores.size); + // copy label_ids + detection_result->label_ids.resize(fd_c_detection_result->label_ids.size); + memcpy(detection_result->label_ids.data(), + fd_c_detection_result->label_ids.data, + sizeof(int32_t) * fd_c_detection_result->label_ids.size); + // copy masks + detection_result->masks.resize(fd_c_detection_result->masks.size); + for (size_t i = 0; i < fd_c_detection_result->masks.size; i++) { + // copy data in mask + detection_result->masks[i].data.resize( + fd_c_detection_result->masks.data[i].data.size); + memcpy(detection_result->masks[i].data.data(), + fd_c_detection_result->masks.data[i].data.data, + sizeof(uint8_t) * fd_c_detection_result->masks.data[i].data.size); + // copy shape in mask + detection_result->masks[i].shape.resize( + fd_c_detection_result->masks.data[i].shape.size); + memcpy(detection_result->masks[i].shape.data(), + fd_c_detection_result->masks.data[i].shape.data, + sizeof(int64_t) * fd_c_detection_result->masks.data[i].shape.size); + detection_result->masks[i].type = + static_cast( + fd_c_detection_result->masks.data[i].type); + } + detection_result->contain_masks = fd_c_detection_result->contain_masks; + detection_result->type = + static_cast(fd_c_detection_result->type); + + return fd_c_detection_result_wrapper; +} +} \ No newline at end of file diff --git a/c_api/fastdeploy_capi/vision/result.h b/c_api/fastdeploy_capi/vision/result.h new file mode 100644 index 000000000..9d32052d9 --- /dev/null +++ b/c_api/fastdeploy_capi/vision/result.h @@ -0,0 +1,161 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
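As result.cc above shows, the GetData functions deep-copy the wrapped C++ result into a plain C struct that the caller owns and frees with the matching destroy function. A sketch of the intended read/free pattern, using only functions declared in this diff plus the C standard library:

#include <stdio.h>

#include "fastdeploy_capi/vision/result.h"

void print_classify_result(FD_C_ClassifyResultWrapper* result_wrapper) {
  /* Deep-copy the wrapped C++ result into a plain C struct that can be read directly. */
  FD_C_ClassifyResult* res = FD_C_ClassifyResultWrapperGetData(result_wrapper);
  for (size_t i = 0; i < res->label_ids.size; ++i) {
    printf("label %d, score %.4f\n", res->label_ids.data[i], res->scores.data[i]);
  }
  /* The copy is owned by the caller and freed independently of the wrapper. */
  FD_C_DestroyClassifyResult(res);
}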
+ +#pragma once + +#include "fastdeploy_capi/fd_common.h" +#include "fastdeploy_capi/fd_type.h" + +typedef struct FD_C_ClassifyResultWrapper FD_C_ClassifyResultWrapper; +typedef struct FD_C_DetectionResultWrapper FD_C_DetectionResultWrapper; + +#ifdef __cplusplus +extern "C" { +#endif + +FD_ENUM(FD_C_ResultType){ + UNKNOWN_RESULT, + CLASSIFY, + DETECTION, + SEGMENTATION, + OCR, + MOT, + FACE_DETECTION, + FACE_ALIGNMENT, + FACE_RECOGNITION, + MATTING, + MASK, + KEYPOINT_DETECTION, + HEADPOSE, +}; + +typedef struct FD_C_ClassifyResult { + FD_C_OneDimArrayInt32 label_ids; + FD_C_OneDimArrayFloat scores; + FD_C_ResultType type; +} FD_C_ClassifyResult; + +typedef struct FD_C_Mask { + FD_C_OneDimArrayUint8 data; + FD_C_OneDimArrayInt64 shape; + FD_C_ResultType type; +} FD_C_Mask; + +typedef struct FD_C_OneDimMask { + size_t size; + FD_C_Mask* data; +} FD_C_OneDimMask; // std::vector + +typedef struct FD_C_DetectionResult { + FD_C_TwoDimArrayFloat boxes; + FD_C_OneDimArrayFloat scores; + FD_C_OneDimArrayInt32 label_ids; + FD_C_OneDimMask masks; + FD_C_Bool contain_masks; + FD_C_ResultType type; +} FD_C_DetectionResult; + +// Classification Results + +/** \brief Create a new FD_C_ClassifyResultWrapper object + * + * \return Return a pointer to FD_C_ClassifyResultWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_ClassifyResultWrapper* +FD_C_CreateClassifyResultWrapper(); + +/** \brief Destroy a FD_C_ClassifyResultWrapper object + * + * \param[in] fd_c_classify_result_wrapper pointer to FD_C_ClassifyResultWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyClassifyResultWrapper( + __fd_take FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper); + +/** \brief Destroy a FD_C_ClassifyResult object + * + * \param[in] fd_c_classify_result pointer to FD_C_ClassifyResult object + */ + +FASTDEPLOY_CAPI_EXPORT extern void +FD_C_DestroyClassifyResult(__fd_take FD_C_ClassifyResult* fd_c_classify_result); + +/** \brief Get a FD_C_ClassifyResult object from FD_C_ClassifyResultWrapper object + * + * \param[in] fd_c_classify_result_wrapper pointer to FD_C_ClassifyResultWrapper object + * \return Return a pointer to FD_C_ClassifyResult object + */ +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_ClassifyResult* +FD_C_ClassifyResultWrapperGetData( + __fd_keep FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper); + +/** \brief Create a new FD_C_ClassifyResultWrapper object from FD_C_ClassifyResult object + * + * \param[in] fd_c_classify_result pointer to FD_C_ClassifyResult object + * \return Return a pointer to FD_C_ClassifyResultWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_ClassifyResultWrapper* +FD_C_CreateClassifyResultWrapperFromData( + __fd_keep FD_C_ClassifyResult* fd_c_classify_result); + +// Detection Results + +/** \brief Create a new FD_C_DetectionResultWrapper object + * + * \return Return a pointer to FD_C_DetectionResultWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_DetectionResultWrapper* +FD_C_CreateDetectionResultWrapper(); + +/** \brief Destroy a FD_C_DetectionResultWrapper object + * + * \param[in] fd_c_detection_result_wrapper pointer to FD_C_DetectionResultWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyDetectionResultWrapper( + __fd_take FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper); + +/** \brief Destroy a FD_C_DetectionResult object + * + * \param[in] fd_c_detection_result pointer to FD_C_DetectionResult object + */ + +FASTDEPLOY_CAPI_EXPORT extern void 
FD_C_DestroyDetectionResult( + __fd_take FD_C_DetectionResult* fd_c_detection_result); + +/** \brief Get a FD_C_DetectionResult object from FD_C_DetectionResultWrapper object + * + * \param[in] fd_c_detection_result_wrapper pointer to FD_C_DetectionResultWrapper object + * \return Return a pointer to FD_C_DetectionResult object + */ +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_DetectionResult* +FD_C_DetectionResultWrapperGetData( + __fd_keep FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper); + +/** \brief Create a new FD_C_DetectionResultWrapper object from FD_C_DetectionResult object + * + * \param[in] fd_c_detection_result pointer to FD_C_DetectionResult object + * \return Return a pointer to FD_C_DetectionResultWrapper object + */ + +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_DetectionResultWrapper* +FD_C_CreateDetectionResultWrapperFromData( + __fd_keep FD_C_DetectionResult* fd_c_detection_result); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/c_api/fastdeploy_capi/vision/visualize.cc b/c_api/fastdeploy_capi/vision/visualize.cc new file mode 100644 index 000000000..9132fe606 --- /dev/null +++ b/c_api/fastdeploy_capi/vision/visualize.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy_capi/vision/visualize.h" + +#include "fastdeploy/vision/visualize/visualize.h" +#include "fastdeploy_capi/types_internal.h" + +extern "C" { + +FD_C_Mat FD_C_VisDetection(FD_C_Mat im, + FD_C_DetectionResult* fd_c_detection_result, + float score_threshold, int line_size, + float font_size) { + FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper = + FD_C_CreateDetectionResultWrapperFromData(fd_c_detection_result); + auto& detection_result = CHECK_AND_CONVERT_FD_TYPE( + DetectionResultWrapper, fd_c_detection_result_wrapper); + cv::Mat result = fastdeploy::vision::Visualize::VisDetection( + *(reinterpret_cast(im)), *detection_result, score_threshold, + line_size, font_size); + return new cv::Mat(result); +} +} \ No newline at end of file diff --git a/c_api/fastdeploy_capi/vision/visualize.h b/c_api/fastdeploy_capi/vision/visualize.h new file mode 100644 index 000000000..43d406dab --- /dev/null +++ b/c_api/fastdeploy_capi/vision/visualize.h @@ -0,0 +1,36 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
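FD_C_VisDetection in visualize.cc above takes the plain C result struct, so a typical flow converts the wrapper first. A sketch, assuming an FD_C_Imwrite helper exists alongside FD_C_Imread in fd_type.h (not part of this diff):

#include "fastdeploy_capi/vision/visualize.h"

void draw_example(FD_C_Mat img, FD_C_DetectionResultWrapper* result_wrapper) {
  /* Convert the wrapper into the plain C struct expected by FD_C_VisDetection. */
  FD_C_DetectionResult* res = FD_C_DetectionResultWrapperGetData(result_wrapper);

  /* Draw boxes with score >= 0.5, 1-pixel lines and 0.5 font scale. */
  FD_C_Mat vis = FD_C_VisDetection(img, res, 0.5f, 1, 0.5f);

  FD_C_Imwrite("vis_result.jpg", vis);  /* assumed image-writing helper */
  FD_C_DestroyDetectionResult(res);
}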
+ +#pragma once + +#include "fastdeploy_capi/fd_common.h" +#include "fastdeploy_capi/fd_type.h" +#include "fastdeploy_capi/vision/result.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** \brief Visualize Detection + * + * \return Return a pointer to cv::Mat object + */ + +FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_Mat +FD_C_VisDetection(FD_C_Mat im, FD_C_DetectionResult* fd_detection_result, + float score_threshold, int line_size, float font_size); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/cmake/ascend.cmake b/cmake/ascend.cmake index 3e22aa92c..227c934b0 100644 --- a/cmake/ascend.cmake +++ b/cmake/ascend.cmake @@ -6,12 +6,12 @@ if(NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") if (NOT BUILD_FASTDEPLOY_PYTHON) message(STATUS "Build FastDeploy Ascend C++ library on X86 platform.") if(NOT PADDLELITE_URL) - set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux_x86_huawei_ascend_npu_0105.tgz") + set(PADDLELITE_URL "https://paddle-qa.bj.bcebos.com/Paddle-Lite/DevelopDailyBuild/FastDeploy.CPP.inference_lite_lib.ubuntu.x86.huawei_ascend_npu.CANN5.1.RC2.alpha001.tar.gz") endif() else () message(STATUS "Build FastDeploy Ascend Python library on X86 platform.") if(NOT PADDLELITE_URL) - set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux_x86_huawei_ascend_npu_python_0105.tgz") + set(PADDLELITE_URL "https://paddle-qa.bj.bcebos.com/Paddle-Lite/DevelopDailyBuild/FastDeploy.Python.inference_lite_lib.ubuntu.x86.huawei_ascend_npu.CANN5.1.RC2.alpha001.tar.gz") endif() endif() endif() @@ -21,12 +21,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") if (NOT BUILD_FASTDEPLOY_PYTHON) message(STATUS "Build FastDeploy Ascend C++ library on aarch64 platform.") if(NOT PADDLELITE_URL) - set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux_arm64_huawei_ascend_npu_0118.tgz") + set(PADDLELITE_URL "https://paddle-qa.bj.bcebos.com/Paddle-Lite/DevelopDailyBuild/FastDeploy.CPP.inference_lite_lib.ubuntu.armv8.huawei_ascend_npu.CANN5.1.RC2.alpha001.tar.gz") endif() else () message(STATUS "Build FastDeploy Ascend Python library on aarch64 platform.") if(NOT PADDLELITE_URL) - set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux_arm64_huawei_ascend_npu_python_0118.tgz") + set(PADDLELITE_URL "https://paddle-qa.bj.bcebos.com/Paddle-Lite/DevelopDailyBuild/FastDeploy.Python.inference_lite_lib.ubuntu.armv8.huawei_ascend_npu.CANN5.1.RC2.alpha001.tar.gz") endif() endif() endif() diff --git a/cmake/fast_tokenizer.cmake b/cmake/fast_tokenizer.cmake index 393952f1d..6e183dafe 100644 --- a/cmake/fast_tokenizer.cmake +++ b/cmake/fast_tokenizer.cmake @@ -83,7 +83,8 @@ elseif(ANDROID) if(NOT ANDROID_TOOLCHAIN MATCHES "clang") message(FATAL_ERROR "Currently, only support clang toolchain while cross compiling FastDeploy for Android with FastTokenizer, but found ${ANDROID_TOOLCHAIN}.") endif() - set(FASTTOKENIZER_FILE "fast_tokenizer-android-${ANDROID_ABI}-${FASTTOKENIZER_VERSION}.tgz") + # set(FASTTOKENIZER_FILE "fast_tokenizer-android-${ANDROID_ABI}-${FASTTOKENIZER_VERSION}.tgz") + set(FASTTOKENIZER_FILE "fast_tokenizer-lite-android-${ANDROID_ABI}-${FASTTOKENIZER_VERSION}.tgz") else() if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") set(FASTTOKENIZER_FILE "fast_tokenizer-linux-aarch64-${FASTTOKENIZER_VERSION}.tgz") diff --git a/cmake/paddlelite.cmake b/cmake/paddlelite.cmake index 2f8f26d9b..b255edc66 100755 --- a/cmake/paddlelite.cmake +++ b/cmake/paddlelite.cmake @@ -117,4 +117,4 @@ endif() 
add_library(external_paddle_lite STATIC IMPORTED GLOBAL) set_property(TARGET external_paddle_lite PROPERTY IMPORTED_LOCATION ${PADDLELITE_LIB}) -add_dependencies(external_paddle_lite ${PADDLELITE_PROJECT}) +add_dependencies(external_paddle_lite ${PADDLELITE_PROJECT}) \ No newline at end of file diff --git a/cmake/poros.cmake b/cmake/poros.cmake index 4b206d84a..63ec7c72f 100755 --- a/cmake/poros.cmake +++ b/cmake/poros.cmake @@ -13,6 +13,10 @@ # limitations under the License. include(ExternalProject) +if(NOT ENABLE_TRT_BACKEND) + message(FATAL_ERROR "While ENABLE_POROS_BACKEND, requires ENABLE_TRT_BACKEND=ON, but now its OFF.") +endif() + set(POROS_PROJECT "extern_poros") set(POROS_PREFIX_DIR ${THIRD_PARTY_PATH}/poros) set(POROS_SOURCE_DIR @@ -48,9 +52,10 @@ else() if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") message(FATAL_ERROR "Poros Backend doesn't support linux aarch64 now.") else() - message(FATAL_ERROR "Poros currently only provides precompiled packages for the GPU version.") if(WITH_GPU) set(POROS_FILE "poros_manylinux_torch1.12.1_cu116_trt8.4_gcc82-${POROS_VERSION}.tar.gz") + else() + message(FATAL_ERROR "Poros currently only provides precompiled packages for the GPU version.") endif() endif() endif() @@ -77,7 +82,7 @@ add_dependencies(external_poros ${POROS_PROJECT}) # Download libtorch.so with ABI=1 set(TORCH_URL_BASE "https://bj.bcebos.com/fastdeploy/third_libs/") set(TORCH_FILE "libtorch-cxx11-abi-shared-with-deps-1.12.1-cu116.zip") -set(TROCH_URL "${TORCH_URL_BASE}${TORCH_FILE}") +set(TORCH_URL "${TORCH_URL_BASE}${TORCH_FILE}") message(STATUS "Use the default Torch lib from: ${TORCH_URL}") download_and_decompress(${TORCH_URL} ${CMAKE_CURRENT_BINARY_DIR}/${TORCH_FILE} ${THIRD_PARTY_PATH}/install) if(EXISTS ${THIRD_PARTY_PATH}/install/torch) diff --git a/cmake/summary.cmake b/cmake/summary.cmake index faaacb417..1482539c1 100755 --- a/cmake/summary.cmake +++ b/cmake/summary.cmake @@ -39,10 +39,12 @@ function(fastdeploy_summary) message(STATUS " ENABLE_POROS_BACKEND : ${ENABLE_POROS_BACKEND}") message(STATUS " ENABLE_TRT_BACKEND : ${ENABLE_TRT_BACKEND}") message(STATUS " ENABLE_OPENVINO_BACKEND : ${ENABLE_OPENVINO_BACKEND}") + message(STATUS " ENABLE_BENCHMARK : ${ENABLE_BENCHMARK}") message(STATUS " WITH_GPU : ${WITH_GPU}") message(STATUS " WITH_ASCEND : ${WITH_ASCEND}") message(STATUS " WITH_TIMVX : ${WITH_TIMVX}") message(STATUS " WITH_KUNLUNXIN : ${WITH_KUNLUNXIN}") + message(STATUS " WITH_CAPI : ${WITH_CAPI}") if(ENABLE_ORT_BACKEND) message(STATUS " ONNXRuntime version : ${ONNXRUNTIME_VERSION}") endif() diff --git a/docs/cn/build_and_install/download_prebuilt_libraries.md b/docs/cn/build_and_install/download_prebuilt_libraries.md index af6c94990..903b25045 100755 --- a/docs/cn/build_and_install/download_prebuilt_libraries.md +++ b/docs/cn/build_and_install/download_prebuilt_libraries.md @@ -86,8 +86,9 @@ Release版本 | Mac OSX x64 | [fastdeploy-osx-x86_64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-1.0.3.tgz) | clang++ 10.0.0编译产出| | Mac OSX arm64 | [fastdeploy-osx-arm64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-1.0.3.tgz) | clang++ 13.0.0编译产出 | | Linux aarch64 | [fastdeploy-linux-aarch64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-1.0.3.tgz) | gcc 6.3编译产出 | -| Android armv7&v8 | [fastdeploy-android-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.3-shared.tgz) | NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a | -| Android armv7&v8 | 
[fastdeploy-android-with-text-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-1.0.3-shared.tgz) | 包含FastTokenizer、UIE等Text API,NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a | +| Android armv7&v8 | [fastdeploy-android-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.3-shared.tgz) | CV API,NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a | +| Android armv7&v8 | [fastdeploy-android-with-text-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-1.0.3-shared.tgz) | 包含 FastTokenizer、UIE 等 Text API,CV API,NDK 25 及 clang++编译产出, 支持arm64-v8a及armeabi-v7a | +| Android armv7&v8 | [fastdeploy-android-with-text-only-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-only-1.0.3-shared.tgz) | 仅包含 FastTokenizer、UIE 等 Text API,NDK 25 及 clang++ 编译产出, 不包含 OpenCV 等 CV API。 支持 arm64-v8a 及 armeabi-v7a | ## Java SDK安装 @@ -95,8 +96,8 @@ Release版本(Java SDK 目前仅支持Android,版本为1.0.3) | 平台 | 文件 | 说明 | | :--- | :--- | :---- | -| Android Java SDK | [fastdeploy-android-sdk-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-1.0.3.aar) | NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 | -| Android Java SDK | [fastdeploy-android-sdk-with-text-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-with-text-1.0.3.aar) | 包含FastTokenizer、UIE等Text API,NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 | +| Android Java SDK | [fastdeploy-android-sdk-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-1.0.3.aar) | CV API,NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 | +| Android Java SDK | [fastdeploy-android-sdk-with-text-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-with-text-1.0.3.aar) | 包含 FastTokenizer、UIE 等 Text API,CV API,NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 | Develop版本(Nightly build) @@ -108,7 +109,8 @@ Develop版本(Nightly build) | Mac OSX x64 | [fastdeploy-osx-x86_64-0.0.0.tgz](https://bj.bcebos.com/fastdeploy/dev/cpp/fastdeploy-osx-x86_64-0.0.0.tgz) | - | | Mac OSX arm64 | [fastdeploy-osx-arm64-0.0.0.tgz](https://fastdeploy.bj.bcebos.com/dev/cpp/fastdeploy-osx-arm64-0.0.0.tgz) | clang++ 13.0.0编译产出 | | Linux aarch64 | - | - | -| Android armv7&v8 | [fastdeploy-android-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-0.0.0-shared.tgz) | NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a | -| Android armv7&v8 | [fastdeploy-android-with-text-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-with-text-0.0.0-shared.tgz) | 包含FastTokenizer、UIE等Text API,NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a | -| Android Java SDK | [fastdeploy-android-sdk-0.0.0.aar](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-sdk-0.0.0.aar) | NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 | -| Android Java SDK | [fastdeploy-android-sdk-with-text-0.0.0.aar](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-sdk-with-text-0.0.0.aar) | 包含FastTokenizer、UIE等Text API,NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 | +| Android armv7&v8 | [fastdeploy-android-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-0.0.0-shared.tgz) | CV API,NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a | +| Android armv7&v8 | 
[fastdeploy-android-with-text-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-with-text-0.0.0-shared.tgz) | 包含 FastTokenizer、UIE 等 Text API,CV API,NDK 25及clang++编译产出, 支持arm64-v8a及armeabi-v7a | +| Android armv7&v8 | [fastdeploy-android-with-text-only-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-with-text-only-0.0.0-shared.tgz) | 仅包含 FastTokenizer、UIE 等 Text API,NDK 25及clang++编译产出,不包含 OpenCV 等 CV API。 支持arm64-v8a及armeabi-v7a | +| Android Java SDK | [fastdeploy-android-sdk-0.0.0.aar](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-sdk-0.0.0.aar) | CV API,NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 | +| Android Java SDK | [fastdeploy-android-sdk-with-text-0.0.0.aar](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-sdk-with-text-0.0.0.aar) | 包含 FastTokenizer、UIE 等 Text API,CV API,NDK 20 编译产出, minSdkVersion 15, targetSdkVersion 28 | diff --git a/docs/cn/build_and_install/huawei_ascend.md b/docs/cn/build_and_install/huawei_ascend.md index 3741027e2..520b23eab 100644 --- a/docs/cn/build_and_install/huawei_ascend.md +++ b/docs/cn/build_and_install/huawei_ascend.md @@ -118,5 +118,13 @@ FastDeploy现在已经集成FlyCV, 用户可以在支持的硬件平台上使用 ## 六.昇腾部署Demo参考 -- 华为昇腾NPU 上使用C++部署 PaddleClas 分类模型请参考:[PaddleClas 华为升腾NPU C++ 部署示例](../../../examples/vision/classification/paddleclas/cpp/README.md) -- 华为昇腾NPU 上使用Python部署 PaddleClas 分类模型请参考:[PaddleClas 华为升腾NPU Python 部署示例](../../../examples/vision/classification/paddleclas/python/README.md) + +| 模型系列 | C++ 部署示例 | Python 部署示例 | +| :-----------| :-------- | :--------------- | +| PaddleClas | [昇腾NPU C++ 部署示例](../../../examples/vision/classification/paddleclas/cpp/README_CN.md) | [昇腾NPU Python 部署示例](../../../examples/vision/classification/paddleclas/python/README_CN.md) | +| PaddleDetection | [昇腾NPU C++ 部署示例](../../../examples/vision/detection/paddledetection/cpp/README_CN.md) | [昇腾NPU Python 部署示例](../../../examples/vision/detection/paddledetection/python/README_CN.md) | +| PaddleSeg | [昇腾NPU C++ 部署示例](../../../examples/vision/segmentation/paddleseg/cpp/README_CN.md) | [昇腾NPU Python 部署示例](../../../examples//vision/segmentation/paddleseg/python/README_CN.md) | +| PaddleOCR | [昇腾NPU C++ 部署示例](../../../examples/vision/ocr/PP-OCRv3/cpp/README_CN.md) | [昇腾NPU Python 部署示例](../../../examples/vision//ocr/PP-OCRv3/python/README_CN.md) | +| Yolov5 | [昇腾NPU C++ 部署示例](../../../examples/vision/detection/yolov5/cpp/README_CN.md) | [昇腾NPU Python 部署示例](../../../examples/vision/detection/yolov5/python/README_CN.md) | +| Yolov6 | [昇腾NPU C++ 部署示例](../../../examples/vision/detection/yolov6/cpp/README_CN.md) | [昇腾NPU Python 部署示例](../../../examples/vision/detection/yolov6/python/README_CN.md) | +| Yolov7 | [昇腾NPU C++ 部署示例](../../../examples/vision/detection/yolov7/cpp/README_CN.md) | [昇腾NPU Python 部署示例](../../../examples/vision/detection/yolov7/python/README_CN.md) | diff --git a/docs/cn/build_and_install/rknpu2.md b/docs/cn/build_and_install/rknpu2.md index 33ee596fe..f432c94ce 100644 --- a/docs/cn/build_and_install/rknpu2.md +++ b/docs/cn/build_and_install/rknpu2.md @@ -11,5 +11,5 @@ RKNPU2指的是Rockchip推出的RK356X以及RK3588系列芯片的NPU。 * [RKNPU2开发环境搭建](../faq/rknpu2/environment.md) * [编译FastDeploy](../faq/rknpu2/build.md) * [RKNN模型导出建议](../faq/rknpu2/export.md) -* [RKNPU2模型部署demo](../faq/rknpu2/rknpu2.md) +* [RKNPU2模型速度一览表](../faq/rknpu2/rknpu2.md) * [RKNPU2 常见问题合集](../faq/rknpu2/issues.md) diff --git a/docs/cn/faq/common_faq.md b/docs/cn/faq/common_faq.md new file mode 100644 index 
000000000..70543dcb7 --- /dev/null +++ b/docs/cn/faq/common_faq.md @@ -0,0 +1,4 @@ +# 常见问题 + +1. Windows安装fastdeploy-python或fastdeploy-gpu-python后,执行`import fastdeploy`时,出现提示"DLL Load failed: 找不到指定模块" +- **解决方式** 此问题原因可能在于系统没有安装VS动态库,在此页面根据个人环境下载安装后,重新import解决 https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170 diff --git a/docs/cn/faq/develop_a_new_model.md b/docs/cn/faq/develop_a_new_model.md index 37a312f09..bb8b482e1 100644 --- a/docs/cn/faq/develop_a_new_model.md +++ b/docs/cn/faq/develop_a_new_model.md @@ -3,23 +3,22 @@ # FastDeploy集成新模型流程 -在FastDeploy里面新增一个模型,包括增加C++/Python的部署支持。 本文以torchvision v0.12.0中的ResNet50模型为例,介绍使用FastDeploy做外部[模型集成](#modelsupport),具体包括如下3步。 +在FastDeploy里面新增一个模型,包括增加C++/Python的部署支持。 本文以YOLOv7Face模型为例,介绍使用FastDeploy做外部[模型集成](#modelsupport),具体包括如下3步。 | 步骤 | 说明 | 创建或修改的文件 | |:------:|:-------------------------------------:|:---------------------------------------------:| -| [1](#step2) | 在fastdeploy/vision相应任务模块增加模型实现 | resnet.h、resnet.cc、vision.h | -| [2](#step4) | 通过pybind完成Python接口绑定 | resnet_pybind.cc、classification_pybind.cc | -| [3](#step5) | 实现Python相应调用接口 | resnet.py、\_\_init\_\_.py | +| [1](#step2) | 在fastdeploy/vision相应任务模块增加模型实现 | yolov7face.h、yolov7face.cc、preprocessor.h、preprocess.cc、postprocessor.h、postprocessor.cc、vision.h | +| [2](#step4) | 通过pybind完成Python接口绑定 | yolov7face_pybind.cc | +| [3](#step5) | 实现Python相应调用接口 | yolov7face.py、\_\_init\_\_.py | 在完成上述3步之后,一个外部模型就集成好了。
如果您想为FastDeploy贡献代码,还需要为新增模型添加测试代码、说明文档和代码注释,可在[测试](#test)中查看。 ## 模型集成 -### 模型准备 +## 1、模型准备 - -在集成外部模型之前,先要将训练好的模型(.pt,.pdparams 等)转换成FastDeploy支持部署的模型格式(.onnx,.pdmodel)。多数开源仓库会提供模型转换脚本,可以直接利用脚本做模型的转换。由于torchvision没有提供转换脚本,因此手动编写转换脚本,本文中将 `torchvison.models.resnet50` 转换为 `resnet50.onnx`, 参考代码如下: +在集成外部模型之前,先要将训练好的模型(.pt,.pdparams 等)转换成FastDeploy支持部署的模型格式(.onnx,.pdmodel)。多数开源仓库会提供模型转换脚本,可以直接利用脚本做模型的转换。例如yolov7face官方库提供的[export.py](https://github.com/derronqi/yolov7-face/blob/main/models/export.py)文件, 若官方库未提供转换导出文件,则需要手动编写转换脚本,如torchvision没有提供转换脚本,因此手动编写转换脚本,下文中将 `torchvison.models.resnet50` 转换为 `resnet50.onnx`,参考代码如下: ```python import torch @@ -41,57 +40,139 @@ torch.onnx.export(model, ``` 执行上述脚本将会得到 `resnet50.onnx` 文件。 -### C++部分 -* 创建`resnet.h`文件 +## 2、CPP代码实现 +### 2.1、前处理类实现 +* 创建`preprocessor.h`文件 * 创建位置 - * FastDeploy/fastdeploy/vision/classification/contrib/resnet.h (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名.h) + * FastDeploy/fastdeploy/vision/facedet/contrib/yolov7face/preprocess.h (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名/precessor.h) * 创建内容 - * 首先在resnet.h中创建 ResNet类并继承FastDeployModel父类,之后声明`Predict`、`Initialize`、`Preprocess`、`Postprocess`和`构造函数`,以及必要的变量,具体的代码细节请参考[resnet.h](https://github.com/PaddlePaddle/FastDeploy/pull/347/files#diff-69128489e918f305c208476ba793d8167e77de2aa7cadf5dcbac30da448bd28e)。 + * 首先在preprocess.h中创建 Yolov7FacePreprocess 类,之后声明`Run`、`preprocess`、`LetterBox`和`构造函数`,以及必要的变量及其`set`和`get`方法,具体的代码细节请参考[preprocess.h](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/vision/facedet/contrib/yolov7face/preprocessor.h)。 ```C++ -class FASTDEPLOY_DECL ResNet : public FastDeployModel { +class FASTDEPLOY_DECL Yolov7FacePreprocessor { public: - ResNet(...); - virtual bool Predict(...); - private: - bool Initialize(); + Yolov7FacePreprocessor(...); + bool Run(...); + protected: bool Preprocess(...); - bool Postprocess(...); + void LetterBox(...); }; ``` -* 创建`resnet.cc`文件 +* 创建`preprocessor.cc`文件 * 创建位置 - * FastDeploy/fastdeploy/vision/classification/contrib/resnet.cc (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名.cc) + * FastDeploy/fastdeploy/vision/facedet/contrib/yolov7face/preprocessor.cc (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名/preprocessor.cc) * 创建内容 - * 在`resnet.cc`中实现`resnet.h`中声明函数的具体逻辑,其中`PreProcess` 和 `PostProcess`需要参考源官方库的前后处理逻辑复现,ResNet每个函数具体逻辑如下,具体的代码请参考[resnet.cc](https://github.com/PaddlePaddle/FastDeploy/pull/347/files#diff-d229d702de28345253a53f2a5839fd2c638f3d32fffa6a7d04d23db9da13a871)。 + * 在`preprocessor.cc`中实现`preprocessor.h`中声明函数的具体逻辑,其中`Preprocess`需要参考源官方库的前后处理逻辑复现,preprocessor每个函数具体逻辑如下,具体的代码请参考[preprocessor.cc](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/vision/facedet/contrib/yolov7face/preprocessor.cc)。 ```C++ -ResNet::ResNet(...) { +Yolov7FacePreprocessor::Yolov7FacePreprocessor(...) { + // 构造函数逻辑 + // 全局变量赋值 +} +bool Yolov7FacePreprocessor::Run() { + // 执行前处理 + // 根据传入图片数量对每张图片进行处理,通过循环的方式将每张图片传入Preprocess函数进行预处理, + // 即Preprocess为处理单元,Run方法为每张图片调用处理单元处理 + return true; +} +bool Yolov7FacePreprocessor::Preprocess(FDMat* mat, FDTensor* output, + std::map>* im_info) { +// 前处理逻辑 +// 1. LetterBox 2. convert and permute 3. 
处理结果存入 FDTensor类中 + return true; +} +void Yolov7FacePreprocessor::LetterBox(FDMat* mat) { + //LetterBox + return true; +} +``` + +### 2.2、后处理类实现 +* 创建`postprocessor.h`文件 + * 创建位置 + * FastDeploy/fastdeploy/vision/facedet/contrib/yolov7face/postprocessor.h (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名/postprocessor.h) + * 创建内容 + * 首先在postprocess.h中创建 Yolov7FacePostprocess 类,之后声明`Run`和`构造函数`,以及必要的变量及其`set`和`get`方法,具体的代码细节请参考[postprocessor.h](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/vision/facedet/contrib/yolov7face/postprocessor.h)。 + +```C++ +class FASTDEPLOY_DECL Yolov7FacePostprocessor { + public: + Yolov7FacePostprocessor(...); + bool Run(...); +}; +``` + +* 创建`postprocessor.cc`文件 + * 创建位置 + * FastDeploy/fastdeploy/vision/facedet/contrib/yolov7face/postprocessor.cc (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名/postprocessor.cc) + * 创建内容 + * 在`postprocessor.cc`中实现`postprocessor.h`中声明函数的具体逻辑,其中`Postprocess`需要参考源官方库的前后处理逻辑复现,postprocessor每个函数具体逻辑如下,具体的代码请参考[postprocessor.cc](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/vision/facedet/contrib/yolov7face/postprocessor.cc)。 + +```C++ +Yolov7FacePostprocessor::Yolov7FacePostprocessor(...) { + // 构造函数逻辑 + // 全局变量赋值 +} +bool Yolov7FacePostprocessor::Run() { + // 后处理逻辑 + // 1. Padding 2. Choose box by conf_threshold 3. NMS 4. 结果存入 FaceDetectionResult类 + return true; +} + +``` +### 2.3、YOLOv7Face实现 +* 创建`yolov7face.h`文件 + * 创建位置 + * FastDeploy/fastdeploy/vision/facedet/contrib/yolov7face/yolov7face.h (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名/模型名.h) + * 创建内容 + * 首先在yolov7face.h中创建 YOLOv7Face 类并继承FastDeployModel父类,之后声明`Predict`、`BatchPredict`、`Initialize`和`构造函数`,以及必要的变量及其`get`方法,具体的代码细节请参考[yolov7face.h](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/vision/facedet/contrib/yolov7face/yolov7face.h)。 + +```C++ +class FASTDEPLOY_DECL YOLOv7Face : public FastDeployModel { + public: + YOLOv7Face(...); + virtual bool Predict(...); + virtual bool BatchPredict(...); + protected: + bool Initialize(); + Yolov7FacePreprocessor preprocessor_; + Yolov7FacePostprocessor postprocessor_; +}; +``` + +* 创建`yolov7face.cc`文件 + * 创建位置 + * FastDeploy/fastdeploy/vision/facedet/contrib/yolov7face/yolov7face.cc (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名/模型名.cc) + * 创建内容 + * 在`yolov7face.cc`中实现`yolov7face.h`中声明函数的具体逻辑,YOLOv7Face每个函数具体逻辑如下,具体的代码请参考[yolov7face.cc](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/vision/facedet/contrib/yolov7face/yolov7face.cc)。 + +```C++ +YOLOv7Face::YOLOv7Face(...) { // 构造函数逻辑 // 1. 指定 Backend 2. 设置RuntimeOption 3. 调用Initialize()函数 } -bool ResNet::Initialize() { +bool YOLOv7Face::Initialize() { // 初始化逻辑 // 1. 全局变量赋值 2. 调用InitRuntime()函数 return true; } -bool ResNet::Preprocess(Mat* mat, FDTensor* output) { -// 前处理逻辑 -// 1. Resize 2. BGR2RGB 3. Normalize 4. HWC2CHW 5. 处理结果存入 FDTensor类中 +bool YOLOv7Face::Predict(const cv::Mat& im, FaceDetectionResult* result) { + std::vector results; + if (!BatchPredict({im}, &results)) { + return false; + } + *result = std::move(results[0]); return true; } -bool ResNet::Postprocess(FDTensor& infer_result, ClassifyResult* result, int topk) { - //后处理逻辑 - // 1. Softmax 2. Choose topk labels 3. 结果存入 ClassifyResult类 - return true; -} -bool ResNet::Predict(cv::Mat* im, ClassifyResult* result, int topk) { +// Predict是对单张图片进行预测,通过将含有一张图片的数组送入BatchPredict实现 +bool YOLOv7Face::BatchPredict(const std::vector& images, std::vector* result) { Preprocess(...) Infer(...) Postprocess(...) 
return true; } +// BatchPredict为对批量图片进行预测,接收一个含有若干张图片的动态数组vector ``` * 在`vision.h`文件中加入新增模型文件 @@ -101,77 +182,116 @@ bool ResNet::Predict(cv::Mat* im, ClassifyResult* result, int topk) { ```C++ #ifdef ENABLE_VISION -#include "fastdeploy/vision/classification/contrib/resnet.h" +#include "fastdeploy/vision/facedet/contrib/yolov7face.h" #endif ``` +## 3、Python接口封装 -### Pybind部分 +### 3.1、Pybind部分 * 创建Pybind文件 * 创建位置 - * FastDeploy/fastdeploy/vision/classification/contrib/resnet_pybind.cc (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名_pybind.cc) + * FastDeploy/fastdeploy/vision/facedet/contrib/yolov7face/yolov7face_pybind.cc (FastDeploy/C++代码存放位置/视觉模型/任务名称/外部模型/模型名/模型名_pybind.cc) * 创建内容 - * 利用Pybind将C++中的函数变量绑定到Python中,具体代码请参考[resnet_pybind.cc](https://github.com/PaddlePaddle/FastDeploy/pull/347/files#diff-270af0d65720310e2cfbd5373c391b2110d65c0f4efa547f7b7eeffcb958bdec)。 + * 利用Pybind将C++中的函数变量绑定到Python中,具体代码请参考[yolov7face_pybind.cc](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/vision/facedet/contrib/yolov7face/yolov7face_pybind.cc)。 ```C++ -void BindResNet(pybind11::module& m) { - pybind11::class_( - m, "ResNet") +void BindYOLOv7Face(pybind11::module& m) { + pybind11::class_( + m, "YOLOv7Face") .def(pybind11::init()) .def("predict", ...) - .def_readwrite("size", &vision::classification::ResNet::size) - .def_readwrite("mean_vals", &vision::classification::ResNet::mean_vals) - .def_readwrite("std_vals", &vision::classification::ResNet::std_vals); + .def("batch_predict", ...) + .def_property_readonly("preprocessor", ...) + .def_property_readonly("postprocessor", ...); + pybind11::class_( + m, "Yolov7FacePreprocessor") + .def(pybind11::init<>()) + .def("run", ...) + .def_property("size", ...) + .def_property("padding_color_value", ...) + .def_property("is_scale_up", ...); + pybind11::class_( + m, "Yolov7FacePostprocessor") + .def(pybind11::init<>()) + .def("run", ...) + .def_property("conf_threshold", ...) 
+ .def_property("nms_threshold", ...); } ``` * 调用Pybind函数 * 修改位置 - * FastDeploy/fastdeploy/vision/classification/classification_pybind.cc (FastDeploy/C++代码存放位置/视觉模型/任务名称/任务名称}_pybind.cc) + * FastDeploy/fastdeploy/vision/facedet/facedet_pybind.cc (FastDeploy/C++代码存放位置/视觉模型/任务名称/任务名称}_pybind.cc) * 修改内容 ```C++ -void BindResNet(pybind11::module& m); -void BindClassification(pybind11::module& m) { - auto classification_module = - m.def_submodule("classification", "Image classification models."); - BindResNet(classification_module); +void BindYOLOv7Face(pybind11::module& m); +void BindFaceDet(pybind11::module& m) { + auto facedet_module = + m.def_submodule("facedet", "Face detection models."); + BindYOLOv7Face(facedet_module); } ``` - -### Python部分 - - -* 创建`resnet.py`文件 +### 3.2、python部分 +* 创建`yolov7face.py`文件 * 创建位置 - * FastDeploy/python/fastdeploy/vision/classification/contrib/resnet.py (FastDeploy/Python代码存放位置/fastdeploy/视觉模型/任务名称/外部模型/模型名.py) + * FastDeploy/python/fastdeploy/vision/facedet/contrib/yolov7face.py (FastDeploy/Python代码存放位置/fastdeploy/视觉模型/任务名称/外部模型/模型名.py) * 创建内容 - * 创建ResNet类继承自FastDeployModel,实现 `\_\_init\_\_`、Pybind绑定的函数(如`predict()`)、以及`对Pybind绑定的全局变量进行赋值和获取的函数`,具体代码请参考[resnet.py](https://github.com/PaddlePaddle/FastDeploy/pull/347/files#diff-a4dc5ec2d450e91f1c03819bf314c238b37ac678df56d7dea3aab7feac10a157)。 + * 创建YOLOv7Face类继承自FastDeployModel、preprocess以及postprocess类,实现 `\_\_init\_\_`、Pybind绑定的函数(如`predict()`)、以及`对Pybind绑定的全局变量进行赋值和获取的函数`,具体代码请参考[yolov7face.py](https://github.com/PaddlePaddle/FastDeploy/tree/develop/python/fastdeploy/vision/facedet/contrib/yolov7face.py)。 ```python -class ResNet(FastDeployModel): +class YOLOv7Face(FastDeployModel): def __init__(self, ...): - self._model = C.vision.classification.ResNet(...) - def predict(self, input_image, topk=1): - return self._model.predict(input_image, topk) + self._model = C.vision.facedet.YOLOv7Face(...) + def predict(self, input_image): + return self._model.predict(input_image) + def batch_predict(self, images): + return self._model.batch_predict(images) + @property + def preprocessor(self): + return self._model.preprocessor + @property + def postprocessor(self): + return self._model.postprocessor + +class Yolov7FacePreprocessor(): + def __init__(self, ...): + self._model = C.vision.facedet.Yolov7FacePreprocessor(...) + def run(self, input_ims): + return self._preprocessor.run(input_ims) @property def size(self): - return self._model.size - @size.setter - def size(self, wh): - ... + return self._preprocessor.size + @property + def padding_color_value(self): + return self._preprocessor.padding_color_value + ... + +class Yolov7FacePreprocessor(): + def __init__(self, ...): + self._model = C.vision.facedet.Yolov7FacePostprocessor(...) + def run(self, ...): + return self._postprocessor.run(...) + @property + def conf_threshold(self): + return self._postprocessor.conf_threshold + @property + def nms_threshold(self): + return self._postprocessor.nms_threshold + ... 
``` -* 导入ResNet类 +* 导入YOLOv7Face、Yolov7FacePreprocessor、Yolov7facePostprocessor类 * 修改位置 - * FastDeploy/python/fastdeploy/vision/classification/\_\_init\_\_.py (FastDeploy/Python代码存放位置/fastdeploy/视觉模型/任务名称/\_\_init\_\_.py) + * FastDeploy/python/fastdeploy/vision/facedet/\_\_init\_\_.py (FastDeploy/Python代码存放位置/fastdeploy/视觉模型/任务名称/\_\_init\_\_.py) * 修改内容 ```Python -from .contrib.resnet import ResNet +from .contrib.yolov7face import * ``` -## 测试 +## 4、测试 ### 编译 * C++ * 位置:FastDeploy/ @@ -203,8 +323,8 @@ cd dist pip install fastdeploy_gpu_python-版本号-cpxx-cpxxm-系统架构.whl ``` -### 编写测试代码 - * 创建位置: FastDeploy/examples/vision/classification/resnet/ (FastDeploy/示例目录/视觉模型/任务名称/模型名/) +## 5、示例代码开发 + * 创建位置: FastDeploy/examples/vision/facedet/yolov7face/ (FastDeploy/示例目录/视觉模型/任务名称/模型名/) * 创建目录结构 ``` @@ -220,9 +340,9 @@ pip install fastdeploy_gpu_python-版本号-cpxx-cpxxm-系统架构.whl ``` * C++ - * 编写CmakeLists文件、C++ 代码以及 README.md 内容请参考[cpp/](https://github.com/PaddlePaddle/FastDeploy/pull/347/files#diff-afcbe607b796509581f89e38b84190717f1eeda2df0419a2ac9034197ead5f96)。 + * 编写CmakeLists文件、C++ 代码以及 README.md 内容请参考[cpp/](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/facedet/yolov7face/cpp)。 * 编译 infer.cc - * 位置:FastDeploy/examples/vision/classification/resnet/cpp/ + * 位置:FastDeploy/examples/vision/facedet/yolov7face/cpp/ ``` mkdir build & cd build @@ -231,38 +351,36 @@ make ``` * Python - * Python 代码以及 README.md 内容请参考[python/](https://github.com/PaddlePaddle/FastDeploy/pull/347/files#diff-5a0d6be8c603a8b81454ac14c17fb93555288d9adf92bbe40454449309700135)。 + * Python 代码以及 README.md 内容请参考[python/](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/facedet/yolov7face/python)。 ### 为代码添加注释 为了方便用户理解代码,我们需要为新增代码添加注释,添加注释方法可参考如下示例。 - C++ 代码 -您需要在resnet.h文件中为函数和变量增加注释,有如下三种注释方式,具体可参考[resnet.h](https://github.com/PaddlePaddle/FastDeploy/pull/347/files#diff-69128489e918f305c208476ba793d8167e77de2aa7cadf5dcbac30da448bd28e)。 +您需要在resnet.h文件中为函数和变量增加注释,有如下三种注释方式,具体可参考[yolov7face.h](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/vision/facedet/contrib/yolov7face/yolov7face.h)。 ```C++ /** \brief Predict for the input "im", the result will be saved in "result". * * \param[in] im Input image for inference. * \param[in] result Saving the inference result. -* \param[in] topk The length of return values, e.g., if topk==2, the result will include the 2 most possible class label for input image. */ -virtual bool Predict(cv::Mat* im, ClassifyResult* result, int topk = 1); +virtual bool Predict(const cv::Mat& im, FaceDetectionResult* result); /// Tuple of (width, height) std::vector size; -/*! @brief Initialize for ResNet model, assign values to the global variables and call InitRuntime() +/*! 
@brief Initialize for YOLOv7Face model, assign values to the global variables and call InitRuntime() */ bool Initialize(); ``` - Python 代码 -你需要为resnet.py文件中的函数和变量增加适当的注释,示例如下,具体可参考[resnet.py](https://github.com/PaddlePaddle/FastDeploy/pull/347/files#diff-a4dc5ec2d450e91f1c03819bf314c238b37ac678df56d7dea3aab7feac10a157)。 +你需要为yolov7face.py文件中的函数和变量增加适当的注释,示例如下,具体可参考[yolov7face.py](https://github.com/PaddlePaddle/FastDeploy/tree/develop/python/fastdeploy/vision/facedet/contrib/yolov7face.py)。 ```python - def predict(self, input_image, topk=1): - """Classify an input image - :param input_image: (numpy.ndarray)The input image data, 3-D array with layout HWC, BGR format - :param topk: (int)The topk result by the classify confidence score, default 1 - :return: ClassifyResult - """ - return self._model.predict(input_image, topk) + def predict(self, input_image): + """Detect the location and key points of human faces from an input image + :param input_image: (numpy.ndarray)The input image data, 3-D array with layout HWC, BGR format + :return: FaceDetectionResult + """ + return self._model.predict(input_image) ``` 对于集成模型过程中的其他文件,您也可以对实现的细节添加适当的注释说明。 diff --git a/docs/cn/faq/rknpu2/build.md b/docs/cn/faq/rknpu2/build.md index 7389d2396..3334387b7 100644 --- a/docs/cn/faq/rknpu2/build.md +++ b/docs/cn/faq/rknpu2/build.md @@ -1,3 +1,4 @@ +[English](../../../en/faq/rknpu2/build.md) | 中文 # FastDeploy RKNPU2引擎编译 ## FastDeploy后端支持详情 diff --git a/docs/cn/faq/rknpu2/environment.md b/docs/cn/faq/rknpu2/environment.md index a86b19dba..5461c12f6 100644 --- a/docs/cn/faq/rknpu2/environment.md +++ b/docs/cn/faq/rknpu2/environment.md @@ -1,3 +1,4 @@ +[English](../../../en/faq/rknpu2/environment.md) | 中文 # FastDeploy RKNPU2推理环境搭建 ## 简介 diff --git a/docs/cn/faq/rknpu2/issues.md b/docs/cn/faq/rknpu2/issues.md index b0cebfa9d..b7c9fe79e 100644 --- a/docs/cn/faq/rknpu2/issues.md +++ b/docs/cn/faq/rknpu2/issues.md @@ -1,3 +1,4 @@ +[English](../../../en/faq/rknpu2/issues.md) | 中文 # RKNPU2常见问题合集 在使用FastDeploy的过程中大家可能会碰到很多的问题,这个文档用来记录已经解决的共性问题,方便大家查阅。 diff --git a/docs/cn/faq/rknpu2/rknpu2.md b/docs/cn/faq/rknpu2/rknpu2.md index 81f35bd43..99554e5ba 100644 --- a/docs/cn/faq/rknpu2/rknpu2.md +++ b/docs/cn/faq/rknpu2/rknpu2.md @@ -13,18 +13,20 @@ ONNX模型不能直接调用RK芯片中的NPU进行运算,需要把ONNX模型 * ARM CPU使用ONNX框架进行测试 * NPU均使用单核进行测试 -| 任务场景 | 模型 | 模型版本(表示已经测试的版本) | ARM CPU/RKNN速度(ms) | -|----------------------|------------------------------------------------------------------------------------------|--------------------------|--------------------| -| Detection | [Picodet](../../../../examples/vision/detection/paddledetection/rknpu2/README.md) | Picodet-s | 162/112 | -| Detection | [RKYOLOV5](../../../../examples/vision/detection/rkyolo/README.md) | YOLOV5-S-Relu(int8) | -/57 | -| Detection | [RKYOLOX](../../../../examples/vision/detection/rkyolo/README.md) | - | -/- | -| Detection | [RKYOLOV7](../../../../examples/vision/detection/rkyolo/README.md) | - | -/- | -| Segmentation | [Unet](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | Unet-cityscapes | -/- | -| Segmentation | [PP-HumanSegV2Lite](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | portrait(int8) | 133/43 | -| Segmentation | [PP-HumanSegV2Lite](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | human(int8) | 133/43 | -| Face Detection | [SCRFD](../../../../examples/vision/facedet/scrfd/rknpu2/README.md) | SCRFD-2.5G-kps-640(int8) | 108/42 | -| Face FaceRecognition | 
[InsightFace](../../../../examples/vision/faceid/insightface/rknpu2/README_CN.md) | ms1mv3_arcface_r18(int8) | 81/12 | -| Classification | [ResNet](../../../../examples/vision/classification/paddleclas/rknpu2/README.md) | ResNet50_vd | -/33 | +| 任务场景 | 模型 | 模型版本(表示已经测试的版本) | ARM CPU/RKNN速度(ms) | +|----------------------|--------------------------------------------------------------------------------------------------|--------------------------|--------------------| +| Detection | [Picodet](../../../../examples/vision/detection/paddledetection/rknpu2/README.md) | Picodet-s | 162/112 | +| Detection | [PaddleDetection Yolov8](../../../../examples/vision/detection/paddledetection/rknpu2/README.md) | yolov8-n | -/100 | +| Detection | [PPYOLOE](../../../../examples/vision/detection/paddledetection/rknpu2/README.md) | ppyoloe-s(int8) | -/77 | +| Detection | [RKYOLOV5](../../../../examples/vision/detection/rkyolo/README.md) | YOLOV5-S-Relu(int8) | -/57 | +| Detection | [RKYOLOX](../../../../examples/vision/detection/rkyolo/README.md) | - | -/- | +| Detection | [RKYOLOV7](../../../../examples/vision/detection/rkyolo/README.md) | - | -/- | +| Segmentation | [Unet](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | Unet-cityscapes | -/- | +| Segmentation | [PP-HumanSegV2Lite](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | portrait(int8) | 133/43 | +| Segmentation | [PP-HumanSegV2Lite](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | human(int8) | 133/43 | +| Face Detection | [SCRFD](../../../../examples/vision/facedet/scrfd/rknpu2/README.md) | SCRFD-2.5G-kps-640(int8) | 108/42 | +| Face FaceRecognition | [InsightFace](../../../../examples/vision/faceid/insightface/rknpu2/README_CN.md) | ms1mv3_arcface_r18(int8) | 81/12 | +| Classification | [ResNet](../../../../examples/vision/classification/paddleclas/rknpu2/README.md) | ResNet50_vd | -/33 | ## 预编译库下载 diff --git a/docs/cn/faq/usage_of_fastdeploy_init_bat.md b/docs/cn/faq/usage_of_fastdeploy_init_bat.md new file mode 100644 index 000000000..acd1d3beb --- /dev/null +++ b/docs/cn/faq/usage_of_fastdeploy_init_bat.md @@ -0,0 +1,107 @@ +# fastdeploy_init.bat工具使用方式 + +
+ +## 1 方式一:使用 fastdeploy_init.bat 进行配置(推荐) +
+ +对于版本高于0.2.1的SDK,我们提供了 **fastdeploy_init.bat** 工具来管理FastDeploy中所有的依赖库。可以通过该脚本工具查看(show)、拷贝(install) 和 设置(init and setup) SDK中所有的dll,方便用户快速完成运行时环境配置。 + +### 1.1 fastdeploy_init.bat 使用说明 +
+
+首先进入SDK的根目录,运行以下命令,可以查看 fastdeploy_init.bat 的用法说明
+```bat
+D:\path-to-your-fastdeploy-sdk-dir>fastdeploy_init.bat help
+------------------------------------------------------------------------------------------------------------------------------------------------------------
+[1] [help] print help information: fastdeploy_init.bat help
+[2] [show] show all dlls/libs/include paths: fastdeploy_init.bat show fastdeploy-sdk-dir
+[3] [init] init all dlls paths for current terminal: fastdeploy_init.bat init fastdeploy-sdk-dir [WARNING: need copy onnxruntime.dll manually]
+[4] [setup] setup path env for current terminal: fastdeploy_init.bat setup fastdeploy-sdk-dir [WARNING: need copy onnxruntime.dll manually]
+[5] [install] install all dlls to a specific dir: fastdeploy_init.bat install fastdeploy-sdk-dir another-dir-to-install-dlls **[RECOMMEND]**
+[6] [install] install all dlls with logging infos: fastdeploy_init.bat install fastdeploy-sdk-dir another-dir-to-install-dlls info
+------------------------------------------------------------------------------------------------------------------------------------------------------------
+```
+用法简要说明如下:
+- help: 打印所有的用法说明
+- show: 查看SDK中所有的 dll、lib 和 include 路径
+- init: 初始化所有dll路径信息,后续用于设置terminal环境变量(不推荐,请参考 1.4 节中关于 onnxruntime 的说明)
+- setup: 在init之后运行,设置terminal环境变量(不推荐,请参考 1.4 节中关于 onnxruntime 的说明)
+- install: 将SDK中所有的dll安装到某个指定的目录(推荐)
+
+### 1.2 fastdeploy_init.bat 查看 SDK 中所有的 dll、lib 和 include 路径
+
+ +进入SDK的根目录,运行show命令,可以查看SDK中所有的 dll、lib 和 include 路径。以下命令中 %cd% 表示当前目录(SDK的根目录)。 +```bat +D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat show %cd% +------------------------------------------------------------------------------------------------------------------------------------------------------------ +[SDK] D:\path-to-fastdeploy-sdk-dir +------------------------------------------------------------------------------------------------------------------------------------------------------------ +[DLL] D:\path-to-fastdeploy-sdk-dir\lib\fastdeploy.dll **[NEEDED]** +[DLL] D:\path-to-fastdeploy-sdk-dir\third_libs\install\faster_tokenizer\lib\core_tokenizers.dll **[NEEDED]** +[DLL] D:\path-to-fastdeploy-sdk-dir\third_libs\install\opencv\build\x64\vc15\bin\opencv_ffmpeg3416_64.dll **[NEEDED]** +...... +------------------------------------------------------------------------------------------------------------------------------------------------------------ +[Lib] D:\path-to-fastdeploy-sdk-dir\lib\fastdeploy.lib **[NEEDED][fastdeploy]** +[Lib] D:\path-to-fastdeploy-sdk-dir\third_libs\install\faster_tokenizer\lib\core_tokenizers.lib **[NEEDED][fastdeploy::text]** +[Lib] D:\path-to-fastdeploy-sdk-dir\third_libs\install\opencv\build\x64\vc15\lib\opencv_world3416.lib **[NEEDED][fastdeploy::vision]** +...... +------------------------------------------------------------------------------------------------------------------------------------------------------------ +[Include] D:\path-to-fastdeploy-sdk-dir\include **[NEEDED][fastdeploy]** +[Include] D:\path-to-fastdeploy-sdk-dir\third_libs\install\faster_tokenizer\include **[NEEDED][fastdeploy::text]** +[Include] D:\path-to-fastdeploy-sdk-dir\third_libs\install\opencv\build\include **[NEEDED][fastdeploy::vision]** +...... +------------------------------------------------------------------------------------------------------------------------------------------------------------ +[XML] D:\path-to-fastdeploy-sdk-dir\third_libs\install\openvino\runtime\bin\plugins.xml **[NEEDED]** +------------------------------------------------------------------------------------------------------------------------------------------------------------ +``` +可以看到该命令会根据您当前的SDK,输出对应的信息,包含 dll、lib 和 include 的路径信息。对于 dll,被标记为 `[NEEDED]`的,是运行时所需要的,如果包含OpenVINO后端,还需要将他的plugins.xml拷贝到exe所在的目录;对于 lib 和 include,被标记为`[NEEDED]`的,是开发时所需要配置的最小依赖。并且,我们还增加了对应的API Tag标记,如果您只使用vision API,则只需要配置标记为 `[NEEDED][fastdeploy::vision]` 的 lib 和 include 路径. + +### 1.3 fastdeploy_init.bat 安装 SDK 中所有的 dll 到指定的目录 (推荐) +
+ +进入SDK的根目录,运行install命令,可以将SDK 中所有的 dll 安装到指定的目录(如exe所在的目录)。我们推荐这种方式来配置exe运行所需要的依赖库。比如,可以在SDK根目录下创建一个临时的bin目录备份所有的dll文件。以下命令中 %cd% 表示当前目录(SDK的根目录)。 +```bat +% info参数为可选参数,添加info参数后会打印详细的安装信息 % +D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat install %cd% bin +D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat install %cd% bin info +``` +```bat +D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat install %cd% bin +[INFO] Do you want to install all FastDeploy dlls ? +[INFO] From: D:\path-to-fastdeploy-sdk-dir +[INFO] To: bin +Choose y means YES, n means NO: [y/n]y +YES. +请按任意键继续. . . +[INFO] Created bin done! +已复制 1 个文件。 +已复制 1 个文件。 +已复制 1 个文件。 +已复制 1 个文件。 +..... +已复制 1 个文件。 +已复制 1 个文件。 +已复制 1 个文件。 +已复制 1 个文件。 +..... +``` +### 1.4 fastdeploy_init.bat 配置 SDK 环境变量 +
+
+您也可以选择通过配置环境变量的方式来设置运行时的依赖库环境,这种方式只在当前的terminal有效。如果您使用的SDK中包含了onnxruntime推理后端,我们不推荐这种方式:较新的Windows在System32系统目录下自带onnxruntime.dll,即使设置了PATH也可能出现加载冲突,因此需要手动拷贝SDK中onnxruntime的所有dll到exe所在的目录。配置 SDK 环境变量的方式如下。以下命令中 %cd% 表示当前目录(SDK的根目录)。
+```bat
+% 先运行 init 初始化当前SDK所有的dll文件路径 %
+D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat init %cd%
+% 再运行 setup 完成 SDK 环境变量配置 %
+D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat setup %cd%
+```
+
+## 2 方式二:修改CMakeLists.txt,一行命令配置(推荐)
+
+ +考虑到Windows下C++开发的特殊性,如经常需要拷贝所有的lib或dll文件到某个指定的目录,FastDeploy提供了`install_fastdeploy_libraries`的cmake函数,方便用户快速配置所有的dll。修改ppyoloe的CMakeLists.txt,添加: +```cmake +install_fastdeploy_libraries(${CMAKE_CURRENT_BINARY_DIR}/Release) +``` diff --git a/docs/cn/faq/use_sdk_on_windows.md b/docs/cn/faq/use_sdk_on_windows.md index 0683e8d04..da3760fac 100644 --- a/docs/cn/faq/use_sdk_on_windows.md +++ b/docs/cn/faq/use_sdk_on_windows.md @@ -2,37 +2,9 @@ # 在 Windows 使用 FastDeploy C++ SDK -## 目录 -- [1. 环境依赖](#Environment) -- [2. 下载 FastDeploy Windows 10 C++ SDK](#Download) -- [3. Windows下多种方式使用 C++ SDK 的方式](#CommandLine) - - [3.1 命令行方式使用 C++ SDK](#CommandLine) - - [3.1.1 在 Windows 命令行终端 上编译 example](#CommandLine) - - [3.1.2 运行可执行文件获得推理结果](#CommandLine) - - [3.2 Visual Studio 2019 创建sln工程使用 C++ SDK](#VisualStudio2019Sln) - - [3.2.1 Visual Studio 2019 创建 sln 工程项目](#VisualStudio2019Sln1) - - [3.2.2 从examples中拷贝infer_ppyoloe.cc的代码到工程](#VisualStudio2019Sln2) - - [3.2.3 将工程配置设置成"Release x64"配置](#VisualStudio2019Sln3) - - [3.2.4 配置头文件include路径](#VisualStudio2019Sln4) - - [3.2.5 配置lib路径和添加库文件](#VisualStudio2019Sln5) - - [3.2.6 编译工程并运行获取结果](#VisualStudio2019Sln6) - - [3.3 Visual Studio 2019 创建CMake工程使用 C++ SDK](#VisualStudio2019) - - [3.3.1 Visual Studio 2019 创建CMake工程项目](#VisualStudio20191) - - [3.3.2 在CMakeLists中配置 FastDeploy C++ SDK](#VisualStudio20192) - - [3.3.3 生成工程缓存并修改CMakeSetting.json配置](#VisualStudio20193) - - [3.3.4 生成可执行文件,运行获取结果](#VisualStudio20194) -- [4. 多种方法配置exe运行时所需的依赖库](#CommandLineDeps1) - - [4.1 使用 fastdeploy_init.bat 进行配置(推荐)](#CommandLineDeps1) - - [4.1.1 fastdeploy_init.bat 使用说明](#CommandLineDeps11) - - [4.1.2 fastdeploy_init.bat 查看 SDK 中所有的 dll、lib 和 include 路径](#CommandLineDeps12) - - [4.1.3 fastdeploy_init.bat 安装 SDK 中所有的 dll 到指定的目录](#CommandLineDeps13) - - [4.1.4 fastdeploy_init.bat 配置 SDK 环境变量](#CommandLineDeps14) - - [4.2 修改 CMakeLists.txt,一行命令配置(推荐)](#CommandLineDeps2) - - [4.3 命令行设置环境变量](#CommandLineDeps3) - - [4.4 手动拷贝依赖库到exe的目录下](#CommandLineDeps4) +【**注意**】**编译只支持Release模型,不支持Debug模型** - -## 1. 环境依赖 +## 1. 准备环境和Windows部署库
- cmake >= 3.12 @@ -40,468 +12,50 @@ - cuda >= 11.2 (当WITH_GPU=ON) - cudnn >= 8.0 (当WITH_GPU=ON) -## 2. 下载 FastDeploy Windows 10 C++ SDK -
-### 2.1 下载预编译库或者从源码编译最新的SDK -可以从以下链接下载编译好的 FastDeploy Windows 10 C++ SDK,SDK中包含了examples代码。 -```text -https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-0.2.1.zip -``` -源码编译请参考: [build_and_install](../build_and_install) -### 2.2 准备模型文件和测试图片 -可以从以下链接下载模型文件和测试图片,并解压缩 -```text -https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco.tgz # (下载后解压缩) -https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg -``` +1. 根据需求,选择下载对应的C++(CPU/GPU)部署库,下载文档见[安装文档说明](../build_and_install) +> 假定当前下载解压后的库路径在`D:\Download\fastdeploy-win-x64-gpu-x.x.x +2. 下载如下模型文件和测试图片 +> https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco.tgz # (下载后解压缩) +> https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg -## 3. Windows下多种方式使用 C++ SDK 的方式 -### 3.1 SDK使用方式一:命令行方式使用 C++ SDK -
+## 2. 编译示例代码 + +本文档编译的示例代码可在解压的库中找到,编译工具依赖VS 2019的安装,**Windows打开x64 Native Tools Command Prompt for VS 2019命令工具**,通过如下命令开始编译 + +```shell +cd D:\Download\fastdeploy-win-x64-gpu-x.x.x\examples\vision\detection\paddledetection\cpp -#### 3.1.1 在 Windows 上编译 PPYOLOE -Windows菜单打开`x64 Native Tools Command Prompt for VS 2019`命令工具,cd到ppyoloe的demo路径 -```bat -cd fastdeploy-win-x64-gpu-0.2.1\examples\vision\detection\paddledetection\cpp -``` -```bat mkdir build && cd build -cmake .. -G "Visual Studio 16 2019" -A x64 -DFASTDEPLOY_INSTALL_DIR=%cd%\..\..\..\..\..\..\..\fastdeploy-win-x64-gpu-0.2.1 -DCUDA_DIRECTORY="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2" -``` -然后执行 -```bat +cmake .. -G "Visual Studio 16 2019" -A x64 -DFASTDEPLOY_INSTALL_DIR=D:\Download\fastdeploy-win-x64-gpu-x.x.x -DCUDA_DIRECTORY="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2" + msbuild infer_demo.sln /m:4 /p:Configuration=Release /p:Platform=x64 ``` -#### 3.1.2 运行 demo -```bat +如需使用Visual Studio 2019创建sln工程,或者CMake工程等方式编译,可参考如下文档 +- [FastDeploy C++库在Windows上的多种使用方式](./use_sdk_on_windows_build.md) + +## 3. 运行编译可执行程序 + +注意Windows上运行时,需要将FastDeploy依赖的库拷贝至可执行程序所在目录, 或者配置环境变量。FastDeploy提供了工具帮助我们快速将所有依赖库拷贝至可执行程序所在目录,通过如下命令将所有依赖的dll文件拷贝至可执行程序所在的目录 +```shell +cd D:\Download\fastdeploy-win-x64-gpu-x.x.x + +fastdeploy_init.bat install %cd% D:\Download\fastdeploy-win-x64-gpu-x.x.x\examples\vision\detection\paddledetection\cpp\build\Release +``` + +将dll拷贝到当前路径后,准备好模型和图片,使用如下命令运行可执行程序即可 +```shell cd Release infer_ppyoloe_demo.exe ppyoloe_crn_l_300e_coco 000000014439.jpg 0 # CPU infer_ppyoloe_demo.exe ppyoloe_crn_l_300e_coco 000000014439.jpg 1 # GPU infer_ppyoloe_demo.exe ppyoloe_crn_l_300e_coco 000000014439.jpg 2 # GPU + TensorRT -``` - -特别说明,exe运行时所需要的依赖库配置方法,请参考章节: [多种方法配置exe运行时所需的依赖库](#CommandLineDeps) - -### 3.2 SDK使用方式二:Visual Studio 2019 创建 sln 工程使用 C++ SDK - -本章节针对非CMake用户,介绍如何在Visual Studio 2019 中创建 sln 工程使用 FastDeploy C++ SDK. CMake用户请直接看下一章节。另外,本章节内容特别感谢“梦醒南天”同学关于FastDeploy使用的文档教程:[如何在 Windows 上使用 FastDeploy C++ 部署 PaddleDetection 目标检测模型](https://www.bilibili.com/read/cv18807232) - -
- -#### 3.2.1 步骤一:Visual Studio 2019 创建 sln 工程项目 - -
- -(1) 打开Visual Studio 2019,点击"创建新项目"->点击"控制台程序",从而创建新的sln工程项目. - -![image](https://user-images.githubusercontent.com/31974251/192813386-cf9a93e0-ee42-42b3-b8bf-d03ae7171d4e.png) - -![image](https://user-images.githubusercontent.com/31974251/192816516-a4965b9c-21c9-4a01-bbb2-c648a8256fc9.png) - -(2)点击“创建”,便创建了一个空的sln工程。我们直接从examples里面拷贝infer_ppyoloe的代码这里。 - -![image](https://user-images.githubusercontent.com/31974251/192817382-643c8ca2-1f2a-412e-954e-576c22b4ea62.png) - -#### 3.2.2 步骤二:从examples中拷贝infer_ppyoloe.cc的代码到工程 - -
- -(1)从examples中拷贝infer_ppyoloe.cc的代码到工程,直接替换即可,拷贝代码的路径为: -```bat -fastdeploy-win-x64-gpu-0.2.1\examples\vision\detection\paddledetection\cpp ``` -![image](https://user-images.githubusercontent.com/31974251/192818456-21ca846c-ab52-4001-96d2-77c8174bff6b.png) +在此步骤中使用到的`fastdeploy_init.bat`提供更多其它功能,帮忙开发者使用,包括 +- 查看SDK中所有dll, lib和include的路径 +- 安装SDK中所有dll至指定目录 +- 配置SDK环境变量 -#### 3.2.3 步骤三:将工程配置设置成"Release x64"配置 - -
- -![image](https://user-images.githubusercontent.com/31974251/192818918-98d7a54c-4a60-4760-a3cb-ecacc38b7e7a.png) - -#### 3.2.4 步骤四:配置头文件include路径 - -
- - -(1)配置头文件include路径:鼠标选择项目,然后单击右键即可弹出下来菜单,在其中单击“属性”。 - -![image](https://user-images.githubusercontent.com/31974251/192820573-23096aea-046c-4bb4-9929-c412718805cb.png) - - -(2)在弹出来的属性页中选择:C/C++ —> 常规 —> 附加包含目录,然后在添加 fastdeploy 和 opencv 的头文件路径。如: - -```bat - -D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\include -D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\third_libs\install\opencv-win-x64-3.4.16\build\include -``` -注意,如果是自行编译最新的SDK或版本>0.2.1,依赖库目录结构有所变动,opencv路径需要做出适当的修改。如: -```bat -D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\third_libs\install\opencv\build\include -``` - -![image](https://user-images.githubusercontent.com/31974251/192824445-978c06ed-cc14-4d6a-8ccf-d4594ca11533.png) - -用户需要根据自己实际的sdk路径稍作修改。 - - -#### 3.2.5 步骤五:配置lib路径和添加库文件 - -
- -(1)属性页中选择:链接器—>常规—> 附加库目录,然后在添加 fastdeploy 和 opencv 的lib路径。如: -```bat -D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\lib -D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\third_libs\install\opencv-win-x64-3.4.16\build\x64\vc15\lib -``` -注意,如果是自行编译最新的SDK或版本>0.2.1,依赖库目录结构有所变动,opencv路径需要做出适当的修改。如: -```bat -D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\third_libs\install\opencv\build\include -``` - -![image](https://user-images.githubusercontent.com/31974251/192826130-fe28791f-317c-4e66-a6a5-133e60b726f0.png) - -(2)添加库文件:只需要 fastdeploy.lib 和 opencv_world3416.lib - - ![image](https://user-images.githubusercontent.com/31974251/192826884-44fc84a1-c57a-45f1-8ee2-30b7eaa3dce9.png) - -#### 3.2.6 步骤六:编译工程并运行获取结果 - -
- - -(1)点击菜单栏“生成”->“生成解决方案” - -![image](https://user-images.githubusercontent.com/31974251/192827608-beb53685-2f94-44dc-aa28-49b09a4ab864.png) - -![image](https://user-images.githubusercontent.com/31974251/192827842-1f05d435-8a3e-492b-a3b7-d5e88f85f814.png) - -编译成功,可以看到exe保存在: -```bat -D:\qiuyanjun\fastdeploy_test\infer_ppyoloe\x64\Release\infer_ppyoloe.exe -``` - -(2)执行可执行文件,获得推理结果。 首先需要拷贝所有的dll到exe所在的目录下。同时,也需要把ppyoloe的模型文件和测试图片下载解压缩后,拷贝到exe所在的目录。 特别说明,exe运行时所需要的依赖库配置方法,请参考章节: [多种方法配置exe运行时所需的依赖库](#CommandLineDeps) - -![image](https://user-images.githubusercontent.com/31974251/192829545-3ea36bfc-9a54-492b-984b-2d5d39094d47.png) - - -### 3.3 SDK使用方式三:Visual Studio 2019 创建 CMake 工程使用 C++ SDK -
- -本章节针对CMake用户,介绍如何在Visual Studio 2019 中创建 CMake 工程使用 FastDeploy C++ SDK. - -#### 3.3.1 步骤一:Visual Studio 2019 创建“CMake”工程项目 - -
- -(1)打开Visual Studio 2019,点击"创建新项目"->点击"CMake",从而创建CMake工程项目。以PPYOLOE为例,来说明如何在Visual Studio 2019 IDE中使用FastDeploy C++ SDK. - -![image](https://user-images.githubusercontent.com/31974251/192143543-9f29e4cb-2307-45ca-a61a-bcfba5df19ff.png) - -![image](https://user-images.githubusercontent.com/31974251/192143640-39e79c65-8b50-4254-8da6-baa21bb23e3c.png) - - -![image](https://user-images.githubusercontent.com/31974251/192143713-be2e6490-4cab-4151-8463-8c367dbc451a.png) - -(2)打开工程发现,Visual Stuio 2019已经为我们生成了一些基本的文件,其中包括CMakeLists.txt。infer_ppyoloe.h头文件这里实际上用不到,我们可以直接删除。 - -![image](https://user-images.githubusercontent.com/31974251/192143930-db1655c2-66ee-448c-82cb-0103ca1ca2a0.png) - -#### 3.3.2 步骤二:在CMakeLists中配置 FastDeploy C++ SDK - -
- -(1)在工程创建完成后,我们需要添加infer_ppyoloe推理源码,并修改CMakeLists.txt,修改如下: - -![image](https://user-images.githubusercontent.com/31974251/192144782-79bccf8f-65d0-4f22-9f41-81751c530319.png) - -(2)其中infer_ppyoloe.cpp的代码可以直接从examples中的代码拷贝过来: -- [examples/vision/detection/paddledetection/cpp/infer_ppyoloe.cc](../../../examples/vision/detection/paddledetection/cpp/infer_ppyoloe.cc) - -(3)CMakeLists.txt主要包括配置FastDeploy C++ SDK的路径,如果是GPU版本的SDK,还需要配置CUDA_DIRECTORY为CUDA的安装路径,CMakeLists.txt的配置如下: - -```cmake -project(infer_ppyoloe_demo C CXX) -cmake_minimum_required(VERSION 3.12) - -# Only support "Release" mode now -set(CMAKE_BUILD_TYPE "Release") - -# Set FastDeploy install dir -set(FASTDEPLOY_INSTALL_DIR "D:/qiuyanjun/fastdeploy-win-x64-gpu-0.2.1" - CACHE PATH "Path to downloaded or built fastdeploy sdk.") - -# Set CUDA_DIRECTORY (CUDA 11.x) for GPU SDK -set(CUDA_DIRECTORY "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.7" - CACHE PATH "Path to installed CUDA Toolkit.") - -include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) - -include_directories(${FASTDEPLOY_INCS}) - -add_executable(infer_ppyoloe_demo ${PROJECT_SOURCE_DIR}/infer_ppyoloe.cpp) -target_link_libraries(infer_ppyoloe_demo ${FASTDEPLOY_LIBS}) - -# Optional: install all DLLs to binary dir. -install_fastdeploy_libraries(${CMAKE_CURRENT_BINARY_DIR}/Release) -``` -注意,`install_fastdeploy_libraries`函数仅在最新的代码编译的SDK或版本>0.2.1下有效。 - -#### 3.3.3 步骤三:生成工程缓存并修改CMakeSetting.json配置 - -
- -(1)点击"CMakeLists.txt"->右键点击"生成缓存": - -![image](https://user-images.githubusercontent.com/31974251/192145349-c78b110a-0e41-4ee5-8942-3bf70bd94a75.png) - -发现已经成功生成缓存了,但是由于打开工程时,默认是Debug模式,我们发现exe和缓存保存路径还是Debug模式下的。 我们可以先修改CMake的设置为Release. - -(2)点击"CMakeLists.txt"->右键点击"infer_ppyoloe_demo的cmake设置",进入CMakeSettings.json的设置面板,把其中的Debug设置修改为Release. - -![image](https://user-images.githubusercontent.com/31974251/192145242-01d37b44-e2fa-47df-82c1-c11c2ccbff99.png) - -同时设置CMake生成器为 "Visual Studio 16 2019 Win64" - -![image](https://user-images.githubusercontent.com/31974251/192147961-ac46d0f6-7349-4126-a123-914af2b63d95.jpg) - -(3)点击保存CMake缓存以切换为Release配置: - -![image](https://user-images.githubusercontent.com/31974251/192145974-b5a63341-9143-49a2-8bfe-94ac641b1670.png) - -(4):(4.1)点击"CMakeLists.txt"->右键"CMake缓存仅限x64-Release"->"点击删除缓存";(4.2)点击"CMakeLists.txt"->"生成缓存";(4.3)如果在步骤一发现删除缓存的选项是灰色的可以直接点击"CMakeLists.txt"->"生成",若生成失败则可以重复尝试(4.1)和(4。2) - -![image](https://user-images.githubusercontent.com/31974251/192146394-51fbf2b8-1cba-41ca-bb45-5f26890f64ce.jpg) - -最终可以看到,配置已经成功生成Relase模式下的CMake缓存了。 - -![image](https://user-images.githubusercontent.com/31974251/192146239-a1eacd9e-034d-4373-a262-65b18ce25b87.png) - - -#### 3.3.4 步骤四:生成可执行文件,运行获取结果。 - -
- -(1)点击"CMakeLists.txt"->"生成"。可以发现已经成功生成了infer_ppyoloe_demo.exe,并保存在`out/build/x64-Release/Release`目录下。 - -![image](https://user-images.githubusercontent.com/31974251/192146852-c64d2252-8c8f-4309-a950-908a5cb258b8.png) - -(2)执行可执行文件,获得推理结果。 首先需要拷贝所有的dll到exe所在的目录下,这里我们可以在CMakeLists.txt添加一下命令,可将FastDeploy中所有的dll安装到指定的目录。注意,该方式仅在最新的代码编译的SDK或版本>0.2.1下有效。其他配置方式,请参考章节: [多种方法配置exe运行时所需的依赖库](#CommandLineDeps) - -```cmake -install_fastdeploy_libraries(${CMAKE_CURRENT_BINARY_DIR}/Release) -``` -(3)同时,也需要把ppyoloe的模型文件和测试图片下载解压缩后,拷贝到exe所在的目录。 准备完成后,目录结构如下: - -![image](https://user-images.githubusercontent.com/31974251/192147505-054edb77-564b-405e-89ee-fd0d2e413e78.png) - -(4)最后,执行以下命令获得推理结果: - -```bat -D:\xxxinfer_ppyoloe\out\build\x64-Release\Release>infer_ppyoloe_demo.exe ppyoloe_crn_l_300e_coco 000000014439.jpg 0 -[INFO] fastdeploy/runtime.cc(304)::fastdeploy::Runtime::Init Runtime initialized with Backend::OPENVINO in Device::CPU. -DetectionResult: [xmin, ymin, xmax, ymax, score, label_id] -415.047180,89.311569, 506.009613, 283.863098, 0.950423, 0 -163.665710,81.914932, 198.585342, 166.760895, 0.896433, 0 -581.788635,113.027618, 612.623474, 198.521713, 0.842596, 0 -267.217224,89.777306, 298.796051, 169.361526, 0.837951, 0 -...... -153.301407,123.233757, 177.130539, 164.558350, 0.066697, 60 -505.887604,140.919601, 523.167236, 151.875336, 0.084912, 67 - -Visualized result saved in ./vis_result.jpg -``` - -打开保存的图片查看可视化结果: - -
- -
- -特别说明,exe运行时所需要的依赖库配置方法,请参考章节: [多种方法配置exe运行时所需的依赖库](#CommandLineDeps) - -## 4. 多种方法配置exe运行时所需的依赖库 -
-说明:对于使用的最新源码编译的SDK或SDK版本>0.2.1的用户,我们推荐使用(4.1)和(4.2)中的方式配置运行时的依赖库。如果使用的SDK版本<=0.2.1,请参考(4.3)和(4.4)中的方式进行配置。 - -### 4.1 方式一:使用 fastdeploy_init.bat 进行配置(推荐) -
- -对于版本高于0.2.1的SDK,我们提供了 **fastdeploy_init.bat** 工具来管理FastDeploy中所有的依赖库。可以通过该脚本工具查看(show)、拷贝(install) 和 设置(init and setup) SDK中所有的dll,方便用户快速完成运行时环境配置。 - -#### 4.1.1 fastdeploy_init.bat 使用说明 -
- -首先进入SDK的根目录,运行以下命令,可以查看 fastdeploy_init.bat 的用法说明 -```bat -D:\path-to-your-fastdeploy-sdk-dir>fastdeploy_init.bat help ------------------------------------------------------------------------------------------------------------------------------------------------------------- -[1] [help] print help information: fastdeploy_init.bat help -[2] [show] show all dlls/libs/include paths: fastdeploy_init.bat show fastdeploy-sdk-dir -[3] [init] init all dlls paths for current terminal: fastdeploy_init.bat init fastdeploy-sdk-dir [WARNING: need copy onnxruntime.dll manually] -[4] [setup] setup path env for current terminal: fastdeploy_init.bat setup fastdeploy-sdk-dir [WARNING: need copy onnxruntime.dll manually] -[5] [install] install all dlls to a specific dir: fastdeploy_init.bat install fastdeploy-sdk-dir another-dir-to-install-dlls **[RECOMMEND]** -[6] [install] install all dlls with logging infos: fastdeploy_init.bat install fastdeploy-sdk-dir another-dir-to-install-dlls info ------------------------------------------------------------------------------------------------------------------------------------------------------------- -``` -用法简要说明如下: -- help: 打印所有的用法说明 -- show: 查看SDK中所有的 dll、lib 和 include 路径 -- init: 初始化所有dll路径信息,后续用于设置terminal环境变量(不推荐,请参考4.3中关于onnxruntime的说明) -- setup: 在init之后运行,设置terminal环境便令(不推荐,请参考4.3中关于onnxruntime的说明) -- install: 将SDK中所有的dll安装到某个指定的目录(推荐) -#### 4.1.2 fastdeploy_init.bat 查看 SDK 中所有的 dll、lib 和 include 路径 -
- -进入SDK的根目录,运行show命令,可以查看SDK中所有的 dll、lib 和 include 路径。以下命令中 %cd% 表示当前目录(SDK的根目录)。 -```bat -D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat show %cd% ------------------------------------------------------------------------------------------------------------------------------------------------------------- -[SDK] D:\path-to-fastdeploy-sdk-dir ------------------------------------------------------------------------------------------------------------------------------------------------------------- -[DLL] D:\path-to-fastdeploy-sdk-dir\lib\fastdeploy.dll **[NEEDED]** -[DLL] D:\path-to-fastdeploy-sdk-dir\third_libs\install\faster_tokenizer\lib\core_tokenizers.dll **[NEEDED]** -[DLL] D:\path-to-fastdeploy-sdk-dir\third_libs\install\opencv\build\x64\vc15\bin\opencv_ffmpeg3416_64.dll **[NEEDED]** -...... ------------------------------------------------------------------------------------------------------------------------------------------------------------- -[Lib] D:\path-to-fastdeploy-sdk-dir\lib\fastdeploy.lib **[NEEDED][fastdeploy]** -[Lib] D:\path-to-fastdeploy-sdk-dir\third_libs\install\faster_tokenizer\lib\core_tokenizers.lib **[NEEDED][fastdeploy::text]** -[Lib] D:\path-to-fastdeploy-sdk-dir\third_libs\install\opencv\build\x64\vc15\lib\opencv_world3416.lib **[NEEDED][fastdeploy::vision]** -...... ------------------------------------------------------------------------------------------------------------------------------------------------------------- -[Include] D:\path-to-fastdeploy-sdk-dir\include **[NEEDED][fastdeploy]** -[Include] D:\path-to-fastdeploy-sdk-dir\third_libs\install\faster_tokenizer\include **[NEEDED][fastdeploy::text]** -[Include] D:\path-to-fastdeploy-sdk-dir\third_libs\install\opencv\build\include **[NEEDED][fastdeploy::vision]** -...... ------------------------------------------------------------------------------------------------------------------------------------------------------------- -[XML] D:\path-to-fastdeploy-sdk-dir\third_libs\install\openvino\runtime\bin\plugins.xml **[NEEDED]** ------------------------------------------------------------------------------------------------------------------------------------------------------------- -``` -可以看到该命令会根据您当前的SDK,输出对应的信息,包含 dll、lib 和 include 的路径信息。对于 dll,被标记为 `[NEEDED]`的,是运行时所需要的,如果包含OpenVINO后端,还需要将他的plugins.xml拷贝到exe所在的目录;对于 lib 和 include,被标记为`[NEEDED]`的,是开发时所需要配置的最小依赖。并且,我们还增加了对应的API Tag标记,如果您只使用vision API,则只需要配置标记为 `[NEEDED][fastdeploy::vision]` 的 lib 和 include 路径. - -#### 4.1.3 fastdeploy_init.bat 安装 SDK 中所有的 dll 到指定的目录 (推荐) -
- -进入SDK的根目录,运行install命令,可以将SDK 中所有的 dll 安装到指定的目录(如exe所在的目录)。我们推荐这种方式来配置exe运行所需要的依赖库。比如,可以在SDK根目录下创建一个临时的bin目录备份所有的dll文件。以下命令中 %cd% 表示当前目录(SDK的根目录)。 -```bat -% info参数为可选参数,添加info参数后会打印详细的安装信息 % -D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat install %cd% bin -D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat install %cd% bin info -``` -```bat -D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat install %cd% bin -[INFO] Do you want to install all FastDeploy dlls ? -[INFO] From: D:\path-to-fastdeploy-sdk-dir -[INFO] To: bin -Choose y means YES, n means NO: [y/n]y -YES. -请按任意键继续. . . -[INFO] Created bin done! -已复制 1 个文件。 -已复制 1 个文件。 -已复制 1 个文件。 -已复制 1 个文件。 -..... -已复制 1 个文件。 -已复制 1 个文件。 -已复制 1 个文件。 -已复制 1 个文件。 -..... -``` -#### 4.1.4 fastdeploy_init.bat 配置 SDK 环境变量 -
- -您也可以选择通过配置环境变量的方式来设置运行时的依赖库环境,这种方式只在当前的terminal有效。如果您使用的SDK中包含了onnxruntime推理后端,我们不推荐这种方式,详细原因请参考(4.3)中关于onnxruntime配置的说明(需要手动拷贝onnxruntime所有的dll到exe所在的目录)。配置 SDK 环境变量的方式如下。以下命令中 %cd% 表示当前目录(SDK的根目录)。 -```bat -% 先运行 init 初始化当前SDK所有的dll文件路径 % -D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat init %cd% -% 再运行 setup 完成 SDK 环境变量配置 % -D:\path-to-fastdeploy-sdk-dir>fastdeploy_init.bat setup %cd% -``` - -### 4.2 方式二:修改CMakeLists.txt,一行命令配置(推荐) -
- -考虑到Windows下C++开发的特殊性,如经常需要拷贝所有的lib或dll文件到某个指定的目录,FastDeploy提供了`install_fastdeploy_libraries`的cmake函数,方便用户快速配置所有的dll。修改ppyoloe的CMakeLists.txt,添加: -```cmake -install_fastdeploy_libraries(${CMAKE_CURRENT_BINARY_DIR}/Release) -``` -注意,该方式仅在最新的代码编译的SDK或版本>0.2.1下有效。 - -### 4.3 方式三:命令行设置环境变量 -
- -编译好的exe保存在Release目录下,在运行demo前,需要将模型和测试图片拷贝至该目录。另外,需要在终端指定DLL的搜索路径。请在build目录下执行以下命令。 -```bat -set FASTDEPLOY_HOME=%cd%\..\..\..\..\..\..\..\fastdeploy-win-x64-gpu-0.2.1 -set PATH=%FASTDEPLOY_HOME%\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\onnxruntime\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\opencv-win-x64-3.4.16\build\x64\vc15\bin;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\paddle\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\install\mkldnn\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\install\mklml\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle2onnx\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\tensorrt\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\third_party\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\yaml-cpp\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\openvino\bin;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\openvino\3rdparty\tbb\bin;%PATH% -``` -注意,需要拷贝onnxruntime.dll到exe所在的目录。 -```bat -copy /Y %FASTDEPLOY_HOME%\third_libs\install\onnxruntime\lib\onnxruntime* Release\ -``` -由于较新的Windows在System32系统目录下自带了onnxruntime.dll,因此就算设置了PATH,系统依然会出现onnxruntime的加载冲突。因此需要先拷贝demo用到的onnxruntime.dll到exe所在的目录。如下 -```bat -where onnxruntime.dll -C:\Windows\System32\onnxruntime.dll # windows自带的onnxruntime.dll -``` -另外,注意,如果是自行编译最新的SDK或版本>0.2.1,opencv和openvino目录结构有所改变,路径需要做出适当的修改。如: -```bat -set PATH=%FASTDEPLOY_HOME%\third_libs\install\opencv\build\x64\vc15\bin;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\openvino\runtime\bin;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\openvino\runtime\3rdparty\tbb\bin;%PATH% -``` -可以把上述命令拷贝并保存到build目录下的某个bat脚本文件中(包含copy onnxruntime),如`setup_fastdeploy_dll.bat`,方便多次使用。 -```bat -setup_fastdeploy_dll.bat -``` - -### 4.4 方式四:手动拷贝依赖库到exe的目录下 - -
- -手动拷贝,或者在build目录下执行以下命令: -```bat -set FASTDEPLOY_HOME=%cd%\..\..\..\..\..\..\..\fastdeploy-win-x64-gpu-0.2.1 -copy /Y %FASTDEPLOY_HOME%\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\onnxruntime\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\opencv-win-x64-3.4.16\build\x64\vc15\bin\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\paddle_inference\paddle\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\install\mkldnn\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\install\mklml\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\paddle2onnx\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\tensorrt\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\third_party\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\yaml-cpp\lib\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\openvino\bin\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\openvino\bin\*.xml Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\openvino\3rdparty\tbb\bin\*.dll Release\ -``` -另外,注意,如果是自行编译最新的SDK或版本>0.2.1,opencv和openvino目录结构有所改变,路径需要做出适当的修改。如: -```bat -copy /Y %FASTDEPLOY_HOME%\third_libs\install\opencv\build\x64\vc15\bin\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\openvino\runtime\bin\*.dll Release\ -copy /Y %FASTDEPLOY_HOME%\third_libs\install\openvino\runtime\3rdparty\tbb\bin\*.dll Release\ -``` -可以把上述命令拷贝并保存到build目录下的某个bat脚本文件中,如`copy_fastdeploy_dll.bat`,方便多次使用。 -```bat -copy_fastdeploy_dll.bat -``` -特别说明:上述的set和copy命令对应的依赖库路径,需要用户根据自己使用SDK中的依赖库进行适当地修改。比如,若是CPU版本的SDK,则不需要TensorRT相关的设置。 +具体可参考如下文档 +- [fastdeploy_init.bat工具的使用](./usage_of_fastdeploy_init_bat.md) diff --git a/docs/cn/faq/use_sdk_on_windows_build.md b/docs/cn/faq/use_sdk_on_windows_build.md new file mode 100644 index 000000000..5f5e70e62 --- /dev/null +++ b/docs/cn/faq/use_sdk_on_windows_build.md @@ -0,0 +1,312 @@ +# FastDeploy C++库在Windows上的多种使用方式 + +## 目录 +- [1. 环境依赖](#Environment) +- [2. 下载 FastDeploy Windows 10 C++ SDK](#Download) +- [3. Windows下多种方式使用 C++ SDK 的方式](#CommandLine) + - [3.1 命令行方式使用 C++ SDK](#CommandLine) + - [3.1.1 在 Windows 命令行终端 上编译 example](#CommandLine) + - [3.1.2 运行可执行文件获得推理结果](#CommandLine) + - [3.2 Visual Studio 2019 创建sln工程使用 C++ SDK](#VisualStudio2019Sln) + - [3.2.1 Visual Studio 2019 创建 sln 工程项目](#VisualStudio2019Sln1) + - [3.2.2 从examples中拷贝infer_ppyoloe.cc的代码到工程](#VisualStudio2019Sln2) + - [3.2.3 将工程配置设置成"Release x64"配置](#VisualStudio2019Sln3) + - [3.2.4 配置头文件include路径](#VisualStudio2019Sln4) + - [3.2.5 配置lib路径和添加库文件](#VisualStudio2019Sln5) + - [3.2.6 编译工程并运行获取结果](#VisualStudio2019Sln6) + - [3.3 Visual Studio 2019 创建CMake工程使用 C++ SDK](#VisualStudio2019) + - [3.3.1 Visual Studio 2019 创建CMake工程项目](#VisualStudio20191) + - [3.3.2 在CMakeLists中配置 FastDeploy C++ SDK](#VisualStudio20192) + - [3.3.3 生成工程缓存并修改CMakeSetting.json配置](#VisualStudio20193) + - [3.3.4 生成可执行文件,运行获取结果](#VisualStudio20194) + + +## 1. 环境依赖 +
+ +- cmake >= 3.12 +- Visual Studio 16 2019 +- cuda >= 11.2 (当WITH_GPU=ON) +- cudnn >= 8.0 (当WITH_GPU=ON) + +## 2. 下载 FastDeploy Windows 10 C++ SDK +
+ +### 2.1 下载预编译库或者从源码编译最新的SDK +可以从以下链接下载编译好的 FastDeploy Windows 10 C++ SDK,SDK中包含了examples代码。 +```text +https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-win-x64-gpu-0.2.1.zip +``` +源码编译请参考: [build_and_install](../build_and_install) +### 2.2 准备模型文件和测试图片 +可以从以下链接下载模型文件和测试图片,并解压缩 +```text +https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco.tgz # (下载后解压缩) +https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg +``` + +## 3. Windows下多种方式使用 C++ SDK 的方式 +### 3.1 SDK使用方式一:命令行方式使用 C++ SDK +
+ +#### 3.1.1 在 Windows 上编译 PPYOLOE +Windows菜单打开`x64 Native Tools Command Prompt for VS 2019`命令工具,cd到ppyoloe的demo路径 +```bat +cd fastdeploy-win-x64-gpu-0.2.1\examples\vision\detection\paddledetection\cpp +``` +```bat +mkdir build && cd build +cmake .. -G "Visual Studio 16 2019" -A x64 -DFASTDEPLOY_INSTALL_DIR=%cd%\..\..\..\..\..\..\..\fastdeploy-win-x64-gpu-0.2.1 -DCUDA_DIRECTORY="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2" +``` +然后执行 +```bat +msbuild infer_demo.sln /m:4 /p:Configuration=Release /p:Platform=x64 +``` + +#### 3.1.2 运行 demo +```bat +cd Release +infer_ppyoloe_demo.exe ppyoloe_crn_l_300e_coco 000000014439.jpg 0 # CPU +infer_ppyoloe_demo.exe ppyoloe_crn_l_300e_coco 000000014439.jpg 1 # GPU +infer_ppyoloe_demo.exe ppyoloe_crn_l_300e_coco 000000014439.jpg 2 # GPU + TensorRT +``` + +特别说明,exe运行时所需要的依赖库配置方法,请参考章节: [多种方法配置exe运行时所需的依赖库](#CommandLineDeps) + +### 3.2 SDK使用方式二:Visual Studio 2019 创建 sln 工程使用 C++ SDK + +本章节针对非CMake用户,介绍如何在Visual Studio 2019 中创建 sln 工程使用 FastDeploy C++ SDK. CMake用户请直接看下一章节。另外,本章节内容特别感谢“梦醒南天”同学关于FastDeploy使用的文档教程:[如何在 Windows 上使用 FastDeploy C++ 部署 PaddleDetection 目标检测模型](https://www.bilibili.com/read/cv18807232) + +
+ +#### 3.2.1 步骤一:Visual Studio 2019 创建 sln 工程项目 + +
+<div id="VisualStudio2019Sln1"></div>
+(1) 打开Visual Studio 2019,点击"创建新项目"->点击"控制台程序",从而创建新的sln工程项目.
+
+![image](https://user-images.githubusercontent.com/31974251/192813386-cf9a93e0-ee42-42b3-b8bf-d03ae7171d4e.png)
+
+![image](https://user-images.githubusercontent.com/31974251/192816516-a4965b9c-21c9-4a01-bbb2-c648a8256fc9.png)
+
+(2)点击“创建”,便创建了一个空的sln工程。我们直接从examples里面拷贝infer_ppyoloe的代码到这里。
+
+![image](https://user-images.githubusercontent.com/31974251/192817382-643c8ca2-1f2a-412e-954e-576c22b4ea62.png)
+
+#### 3.2.2 步骤二:从examples中拷贝infer_ppyoloe.cc的代码到工程
+
+ +(1)从examples中拷贝infer_ppyoloe.cc的代码到工程,直接替换即可,拷贝代码的路径为: +```bat +fastdeploy-win-x64-gpu-0.2.1\examples\vision\detection\paddledetection\cpp +``` + +![image](https://user-images.githubusercontent.com/31974251/192818456-21ca846c-ab52-4001-96d2-77c8174bff6b.png) + +#### 3.2.3 步骤三:将工程配置设置成"Release x64"配置 + +
+ +![image](https://user-images.githubusercontent.com/31974251/192818918-98d7a54c-4a60-4760-a3cb-ecacc38b7e7a.png) + +#### 3.2.4 步骤四:配置头文件include路径 + +
+<div id="VisualStudio2019Sln4"></div>
+
+
+(1)配置头文件include路径:鼠标选择项目,然后单击右键即可弹出下拉菜单,在其中单击“属性”。
+
+![image](https://user-images.githubusercontent.com/31974251/192820573-23096aea-046c-4bb4-9929-c412718805cb.png)
+
+
+(2)在弹出来的属性页中选择:C/C++ —> 常规 —> 附加包含目录,然后再添加 fastdeploy 和 opencv 的头文件路径。如:
+
+```bat
+
+D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\include
+D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\third_libs\install\opencv-win-x64-3.4.16\build\include
+```
+注意,如果是自行编译最新的SDK或版本>0.2.1,依赖库目录结构有所变动,opencv路径需要做出适当的修改。如:
+```bat
+D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\third_libs\install\opencv\build\include
+```
+
+![image](https://user-images.githubusercontent.com/31974251/192824445-978c06ed-cc14-4d6a-8ccf-d4594ca11533.png)
+
+用户需要根据自己实际的sdk路径稍作修改。
+
+
+#### 3.2.5 步骤五:配置lib路径和添加库文件
+
+<div id="VisualStudio2019Sln5"></div>
+(1)属性页中选择:链接器—>常规—> 附加库目录,然后再添加 fastdeploy 和 opencv 的lib路径。如:
+```bat
+D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\lib
+D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\third_libs\install\opencv-win-x64-3.4.16\build\x64\vc15\lib
+```
+注意,如果是自行编译最新的SDK或版本>0.2.1,依赖库目录结构有所变动,opencv的lib路径需要做出适当的修改。如:
+```bat
+D:\qiuyanjun\fastdeploy_build\built\fastdeploy-win-x64-gpu-0.2.1\third_libs\install\opencv\build\x64\vc15\lib
+```
+
+![image](https://user-images.githubusercontent.com/31974251/192826130-fe28791f-317c-4e66-a6a5-133e60b726f0.png)
+
+(2)添加库文件:只需要 fastdeploy.lib 和 opencv_world3416.lib
+
+ ![image](https://user-images.githubusercontent.com/31974251/192826884-44fc84a1-c57a-45f1-8ee2-30b7eaa3dce9.png)
+
+#### 3.2.6 步骤六:编译工程并运行获取结果
+
+ + +(1)点击菜单栏“生成”->“生成解决方案” + +![image](https://user-images.githubusercontent.com/31974251/192827608-beb53685-2f94-44dc-aa28-49b09a4ab864.png) + +![image](https://user-images.githubusercontent.com/31974251/192827842-1f05d435-8a3e-492b-a3b7-d5e88f85f814.png) + +编译成功,可以看到exe保存在: +```bat +D:\qiuyanjun\fastdeploy_test\infer_ppyoloe\x64\Release\infer_ppyoloe.exe +``` + +(2)执行可执行文件,获得推理结果。 首先需要拷贝所有的dll到exe所在的目录下。同时,也需要把ppyoloe的模型文件和测试图片下载解压缩后,拷贝到exe所在的目录。 特别说明,exe运行时所需要的依赖库配置方法,请参考章节: [多种方法配置exe运行时所需的依赖库](#CommandLineDeps) + +![image](https://user-images.githubusercontent.com/31974251/192829545-3ea36bfc-9a54-492b-984b-2d5d39094d47.png) + + +### 3.3 SDK使用方式三:Visual Studio 2019 创建 CMake 工程使用 C++ SDK +
+ +本章节针对CMake用户,介绍如何在Visual Studio 2019 中创建 CMake 工程使用 FastDeploy C++ SDK. + +#### 3.3.1 步骤一:Visual Studio 2019 创建“CMake”工程项目 + +
+<div id="VisualStudio20191"></div>
+(1)打开Visual Studio 2019,点击"创建新项目"->点击"CMake",从而创建CMake工程项目。以PPYOLOE为例,来说明如何在Visual Studio 2019 IDE中使用FastDeploy C++ SDK.
+
+![image](https://user-images.githubusercontent.com/31974251/192143543-9f29e4cb-2307-45ca-a61a-bcfba5df19ff.png)
+
+![image](https://user-images.githubusercontent.com/31974251/192143640-39e79c65-8b50-4254-8da6-baa21bb23e3c.png)
+
+
+![image](https://user-images.githubusercontent.com/31974251/192143713-be2e6490-4cab-4151-8463-8c367dbc451a.png)
+
+(2)打开工程后可以发现,Visual Studio 2019已经为我们生成了一些基本的文件,其中包括CMakeLists.txt。infer_ppyoloe.h头文件这里实际上用不到,我们可以直接删除。
+
+![image](https://user-images.githubusercontent.com/31974251/192143930-db1655c2-66ee-448c-82cb-0103ca1ca2a0.png)
+
+#### 3.3.2 步骤二:在CMakeLists中配置 FastDeploy C++ SDK
+
+ +(1)在工程创建完成后,我们需要添加infer_ppyoloe推理源码,并修改CMakeLists.txt,修改如下: + +![image](https://user-images.githubusercontent.com/31974251/192144782-79bccf8f-65d0-4f22-9f41-81751c530319.png) + +(2)其中infer_ppyoloe.cpp的代码可以直接从examples中的代码拷贝过来: +- [examples/vision/detection/paddledetection/cpp/infer_ppyoloe.cc](../../../examples/vision/detection/paddledetection/cpp/infer_ppyoloe.cc) + +(3)CMakeLists.txt主要包括配置FastDeploy C++ SDK的路径,如果是GPU版本的SDK,还需要配置CUDA_DIRECTORY为CUDA的安装路径,CMakeLists.txt的配置如下: + +```cmake +project(infer_ppyoloe_demo C CXX) +cmake_minimum_required(VERSION 3.12) + +# Only support "Release" mode now +set(CMAKE_BUILD_TYPE "Release") + +# Set FastDeploy install dir +set(FASTDEPLOY_INSTALL_DIR "D:/qiuyanjun/fastdeploy-win-x64-gpu-0.2.1" + CACHE PATH "Path to downloaded or built fastdeploy sdk.") + +# Set CUDA_DIRECTORY (CUDA 11.x) for GPU SDK +set(CUDA_DIRECTORY "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.7" + CACHE PATH "Path to installed CUDA Toolkit.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_ppyoloe_demo ${PROJECT_SOURCE_DIR}/infer_ppyoloe.cpp) +target_link_libraries(infer_ppyoloe_demo ${FASTDEPLOY_LIBS}) + +# Optional: install all DLLs to binary dir. +install_fastdeploy_libraries(${CMAKE_CURRENT_BINARY_DIR}/Release) +``` +注意,`install_fastdeploy_libraries`函数仅在最新的代码编译的SDK或版本>0.2.1下有效。 + +#### 3.3.3 步骤三:生成工程缓存并修改CMakeSetting.json配置 + +
+<div id="VisualStudio20193"></div>
+(1)点击"CMakeLists.txt"->右键点击"生成缓存":
+
+![image](https://user-images.githubusercontent.com/31974251/192145349-c78b110a-0e41-4ee5-8942-3bf70bd94a75.png)
+
+发现已经成功生成缓存了,但是由于打开工程时,默认是Debug模式,我们发现exe和缓存保存路径还是Debug模式下的。 我们可以先修改CMake的设置为Release.
+
+(2)点击"CMakeLists.txt"->右键点击"infer_ppyoloe_demo的cmake设置",进入CMakeSettings.json的设置面板,把其中的Debug设置修改为Release.
+
+![image](https://user-images.githubusercontent.com/31974251/192145242-01d37b44-e2fa-47df-82c1-c11c2ccbff99.png)
+
+同时设置CMake生成器为 "Visual Studio 16 2019 Win64"
+
+![image](https://user-images.githubusercontent.com/31974251/192147961-ac46d0f6-7349-4126-a123-914af2b63d95.jpg)
+
+(3)点击保存CMake缓存以切换为Release配置:
+
+![image](https://user-images.githubusercontent.com/31974251/192145974-b5a63341-9143-49a2-8bfe-94ac641b1670.png)
+
+(4)按以下顺序操作:(4.1)点击"CMakeLists.txt"->右键"CMake缓存仅限x64-Release"->"点击删除缓存";(4.2)点击"CMakeLists.txt"->"生成缓存";(4.3)如果在(4.1)中发现删除缓存的选项是灰色的,可以直接点击"CMakeLists.txt"->"生成",若生成失败则可以重复尝试(4.1)和(4.2)
+
+![image](https://user-images.githubusercontent.com/31974251/192146394-51fbf2b8-1cba-41ca-bb45-5f26890f64ce.jpg)
+
+最终可以看到,已经成功生成Release模式下的CMake缓存了。
+
+![image](https://user-images.githubusercontent.com/31974251/192146239-a1eacd9e-034d-4373-a262-65b18ce25b87.png)
+
+
+#### 3.3.4 步骤四:生成可执行文件,运行获取结果。
+
+ +(1)点击"CMakeLists.txt"->"生成"。可以发现已经成功生成了infer_ppyoloe_demo.exe,并保存在`out/build/x64-Release/Release`目录下。 + +![image](https://user-images.githubusercontent.com/31974251/192146852-c64d2252-8c8f-4309-a950-908a5cb258b8.png) + +(2)执行可执行文件,获得推理结果。 首先需要拷贝所有的dll到exe所在的目录下,这里我们可以在CMakeLists.txt添加一下命令,可将FastDeploy中所有的dll安装到指定的目录。注意,该方式仅在最新的代码编译的SDK或版本>0.2.1下有效。其他配置方式,请参考章节: [多种方法配置exe运行时所需的依赖库](#CommandLineDeps) + +```cmake +install_fastdeploy_libraries(${CMAKE_CURRENT_BINARY_DIR}/Release) +``` +(3)同时,也需要把ppyoloe的模型文件和测试图片下载解压缩后,拷贝到exe所在的目录。 准备完成后,目录结构如下: + +![image](https://user-images.githubusercontent.com/31974251/192147505-054edb77-564b-405e-89ee-fd0d2e413e78.png) + +(4)最后,执行以下命令获得推理结果: + +```bat +D:\xxxinfer_ppyoloe\out\build\x64-Release\Release>infer_ppyoloe_demo.exe ppyoloe_crn_l_300e_coco 000000014439.jpg 0 +[INFO] fastdeploy/runtime.cc(304)::fastdeploy::Runtime::Init Runtime initialized with Backend::OPENVINO in Device::CPU. +DetectionResult: [xmin, ymin, xmax, ymax, score, label_id] +415.047180,89.311569, 506.009613, 283.863098, 0.950423, 0 +163.665710,81.914932, 198.585342, 166.760895, 0.896433, 0 +581.788635,113.027618, 612.623474, 198.521713, 0.842596, 0 +267.217224,89.777306, 298.796051, 169.361526, 0.837951, 0 +...... +153.301407,123.233757, 177.130539, 164.558350, 0.066697, 60 +505.887604,140.919601, 523.167236, 151.875336, 0.084912, 67 + +Visualized result saved in ./vis_result.jpg +``` + +打开保存的图片查看可视化结果: + +
+ +
diff --git a/docs/en/build_and_install/download_prebuilt_libraries.md b/docs/en/build_and_install/download_prebuilt_libraries.md index 0d716e66b..56bc864a0 100644 --- a/docs/en/build_and_install/download_prebuilt_libraries.md +++ b/docs/en/build_and_install/download_prebuilt_libraries.md @@ -93,8 +93,9 @@ Install the released version(Latest 1.0.3 for now, Android is 1.0.3) | Mac OSX x64 | [fastdeploy-osx-x86_64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-x86_64-1.0.3.tgz) | clang++ 10.0.0| | Mac OSX arm64 | [fastdeploy-osx-arm64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-osx-arm64-1.0.3.tgz) | clang++ 13.0.0 | | Linux aarch64 | [fastdeploy-osx-arm64-1.0.3.tgz](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-1.0.3.tgz) | gcc 6.3 | -| Android armv7&v8 | [fastdeploy-android-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.3-shared.tgz) | NDK 25, clang++, support arm64-v8a and armeabi-v7a | -| Android armv7&v8 | [fastdeploy-android-with-text-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-1.0.3-shared.tgz) | contains Text API, such as FastTokenizer and UIE,NDK 25, clang++, support arm64-v8a and armeabi-v7a | +| Android armv7&v8 | [fastdeploy-android-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.3-shared.tgz) | CV API, NDK 25, clang++, support arm64-v8a and armeabi-v7a | +| Android armv7&v8 | [fastdeploy-android-with-text-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-1.0.3-shared.tgz) | contains Text API, such as FastTokenizer and UIE, CV API, NDK 25, clang++, support arm64-v8a and armeabi-v7a | +| Android armv7&v8 | [fastdeploy-android-with-text-only-1.0.3-shared.tgz](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-with-text-only-1.0.3-shared.tgz) | only contains Text API, such as FastTokenizer and UIE, NDK 25, clang++, does not contain CV API, support arm64-v8a and armeabi-v7a | ## Java SDK @@ -102,8 +103,8 @@ Install the released version(Android is 1.0.3 pre-release) | Platform | File | Description | | :--- | :--- | :---- | -| Android Java SDK | [fastdeploy-android-sdk-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-1.0.3.aar) | NDK 20, minSdkVersion 15, targetSdkVersion 28 | -| Android Java SDK | [fastdeploy-android-sdk-with-text-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-with-text-1.0.3.aar) | contains Text API, such as FastTokenizer and UI, NDK 20, minSdkVersion 15, targetSdkVersion 28 | +| Android Java SDK | [fastdeploy-android-sdk-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-1.0.3.aar) | CV API, NDK 20, minSdkVersion 15, targetSdkVersion 28 | +| Android Java SDK | [fastdeploy-android-sdk-with-text-1.0.3.aar](https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-sdk-with-text-1.0.3.aar) | contains Text API, such as FastTokenizer and UIE, CV API, NDK 20, minSdkVersion 15, targetSdkVersion 28 | Install the Develop version(Nightly build) @@ -114,7 +115,8 @@ Install the Develop version(Nightly build) | Mac OSX x64 | [fastdeploy-osx-arm64-0.0.0.tgz](https://bj.bcebos.com/fastdeploy/dev/cpp/fastdeploy-osx-arm64-0.0.0.tgz) | - | | Mac OSX arm64 | [fastdeploy-osx-arm64-0.0.0.tgz](https://fastdeploy.bj.bcebos.com/dev/cpp/fastdeploy-osx-arm64-0.0.0.tgz) | clang++ 13.0.0 to compile | | Linux aarch64 | - 
| - | -| Android armv7&v8 | [fastdeploy-android-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-0.0.0-shared.tgz) | NDK 25, clang++, support arm64-v8a and armeabi-v7a | -| Android armv7&v8 | [fastdeploy-android-with-text-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-with-text-0.0.0-shared.tgz) | contains Text API, such as FastTokenizer and UIE,NDK 25, clang++, support arm64-v8a and armeabi-v7a | -| Android Java SDK | [fastdeploy-android-sdk-0.0.0.aar](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-sdk-0.0.0.aar) | NDK 20, minSdkVersion 15, targetSdkVersion 28 | -| Android Java SDK | [fastdeploy-android-sdk-with-text-0.0.0.aar](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-sdk-with-text-0.0.0.aar) | contains Text API, such as FastTokenizer and UI, NDK 20, minSdkVersion 15, targetSdkVersion 28 | +| Android armv7&v8 | [fastdeploy-android-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-0.0.0-shared.tgz) | CV API, NDK 25, clang++, support arm64-v8a and armeabi-v7a | +| Android armv7&v8 | [fastdeploy-android-with-text-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-with-text-0.0.0-shared.tgz) | contains Text API, such as FastTokenizer and UIE, CV API, such as OpenCV, NDK 25, clang++, support arm64-v8a and armeabi-v7a | +| Android armv7&v8 | [fastdeploy-android-with-text-only-0.0.0-shared.tgz](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-with-text-only-0.0.0-shared.tgz) | only contains Text API, such as FastTokenizer and UIE,NDK 25, clang++, does not contain CV API, support arm64-v8a and armeabi-v7a | +| Android Java SDK | [fastdeploy-android-sdk-0.0.0.aar](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-sdk-0.0.0.aar) | CV API, NDK 20, minSdkVersion 15, targetSdkVersion 28 | +| Android Java SDK | [fastdeploy-android-sdk-with-text-0.0.0.aar](https://bj.bcebos.com/fastdeploy/dev/android/fastdeploy-android-sdk-with-text-0.0.0.aar) | contains Text API, such as FastTokenizer and UIE, CV API, such as OpenCV, NDK 20, minSdkVersion 15, targetSdkVersion 28 | diff --git a/docs/en/build_and_install/huawei_ascend.md b/docs/en/build_and_install/huawei_ascend.md index 55743ca1c..c648e2ea3 100644 --- a/docs/en/build_and_install/huawei_ascend.md +++ b/docs/en/build_and_install/huawei_ascend.md @@ -117,6 +117,12 @@ In end-to-end model inference, the pre-processing and post-processing phases are ## Deployment demo reference -- Deploying PaddleClas Classification Model on Huawei Ascend NPU using C++ please refer to: [PaddleClas Huawei Ascend NPU C++ Deployment Example](../../../examples/vision/classification/paddleclas/cpp/README.md) - -- Deploying PaddleClas classification model on Huawei Ascend NPU using Python please refer to: [PaddleClas Huawei Ascend NPU Python Deployment Example](../../../examples/vision/classification/paddleclas/python/README.md) +| Model | C++ Example | Python Example | +| :-----------| :-------- | :--------------- | +| PaddleClas | [Ascend NPU C++ Example](../../../examples/vision/classification/paddleclas/cpp/README.md) | [Ascend NPU Python Example](../../../examples/vision/classification/paddleclas/python/README.md) | +| PaddleDetection | [Ascend NPU C++ Example](../../../examples/vision/detection/paddledetection/cpp/README.md) | [Ascend NPU Python Example](../../../examples/vision/detection/paddledetection/python/README.md) | +| PaddleSeg | [Ascend NPU C++ 
Example](../../../examples/vision/segmentation/paddleseg/cpp/README.md) | [Ascend NPU Python Example](../../../examples//vision/segmentation/paddleseg/python/README.md) | +| PaddleOCR | [Ascend NPU C++ Example](../../../examples/vision/ocr/PP-OCRv3/cpp/README.md) | [Ascend NPU Python Example](../../../examples/vision//ocr/PP-OCRv3/python/README.md) | +| Yolov5 | [Ascend NPU C++ Example](../../../examples/vision/detection/yolov5/cpp/README.md) | [Ascend NPU Python Example](../../../examples/vision/detection/yolov5/python/README.md) | +| Yolov6 | [Ascend NPU C++ Example](../../../examples/vision/detection/yolov6/cpp/README.md) | [Ascend NPU Python Example](../../../examples/vision/detection/yolov6/python/README.md) | +| Yolov7 | [Ascend NPU C++ Example](../../../examples/vision/detection/yolov7/cpp/README.md) | [Ascend NPU Python Example](../../../examples/vision/detection/yolov7/python/README.md) | diff --git a/docs/en/faq/rknpu2/build.md b/docs/en/faq/rknpu2/build.md new file mode 100644 index 000000000..79f8035f9 --- /dev/null +++ b/docs/en/faq/rknpu2/build.md @@ -0,0 +1,78 @@ +English | [中文](../../../cn/faq/rknpu2/build.md) +# FastDeploy RKNPU2 Engine Compilation + +## FastDeploy supported backends +FastDeploy currently supports the following backends on the RK platform: + +| Backend | Platform | Supported model formats | Notes | +|:------------------|:---------------------|:-------|:-------------------------------------------| +| ONNX Runtime | RK356X
RK3588 | ONNX | Compile switch `ENABLE_ORT_BACKEND` is controlled by ON or OFF. Default OFF | +| RKNPU2 | RK356X
RK3588 | RKNN | Compile switch `ENABLE_RKNPU2_BACKEND` is controlled by ON or OFF. Default OFF | + +## Compile FastDeploy SDK + +### Compile FastDeploy C++ SDK on board side + +Currently, RKNPU2 is only available on linux. The following tutorial is completed on RK3568(debian 10) and RK3588(debian 11). + +```bash +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy + +# If you are using the develop branch, type the following command +git checkout develop + +mkdir build && cd build +cmake .. -DENABLE_ORT_BACKEND=ON \ + -DENABLE_RKNPU2_BACKEND=ON \ + -DENABLE_VISION=ON \ + -DRKNN2_TARGET_SOC=RK3588 \ + -DCMAKE_INSTALL_PREFIX=${PWD}/fastdeploy-0.0.0 +make -j8 +make install +``` + +### Cross-compile FastDeploy C++ SDK +```bash +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy + +# If you are using the develop branch, type the following command +git checkout develop + +mkdir build && cd build +cmake .. -DCMAKE_C_COMPILER=/home/zbc/opt/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-gcc \ + -DCMAKE_CXX_COMPILER=/home/zbc/opt/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-g++ \ + -DCMAKE_TOOLCHAIN_FILE=./../cmake/toolchain.cmake \ + -DTARGET_ABI=arm64 \ + -DENABLE_ORT_BACKEND=OFF \ + -DENABLE_RKNPU2_BACKEND=ON \ + -DENABLE_VISION=ON \ + -DRKNN2_TARGET_SOC=RK3588 \ + -DENABLE_FLYCV=ON \ + -DCMAKE_INSTALL_PREFIX=${PWD}/fastdeploy-0.0.0 +make -j8 +make install +``` + +### Compile the Python SDK on the board + +Currently, RKNPU2 is only available on linux. The following tutorial is completed on RK3568(debian 10) and RK3588(debian 11). Packing Python is dependent on `wheel`, so run `pip install wheel` before compiling. + +```bash +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy + +# If you are using the develop branch, type the following command +git checkout develop + +cd python +export ENABLE_ORT_BACKEND=ON +export ENABLE_RKNPU2_BACKEND=ON +export ENABLE_VISION=ON +export RKNN2_TARGET_SOC=RK3588 +python3 setup.py build +python3 setup.py bdist_wheel +cd dist +pip3 install fastdeploy_python-0.0.0-cp39-cp39-linux_aarch64.whl +``` diff --git a/docs/en/faq/rknpu2/environment.md b/docs/en/faq/rknpu2/environment.md new file mode 100644 index 000000000..5f8f61efd --- /dev/null +++ b/docs/en/faq/rknpu2/environment.md @@ -0,0 +1,92 @@ +English | [中文](../../../cn/faq/rknpu2/environment.md) +# FastDeploy RKNPU2 inference environment setup + +## Introduction + +We need to set up the development environment before deploying models on FastDeploy. The environment setup of FastDeploy is divided into two parts: the board-side inference environment setup and the PC-side model conversion environment setup. + +## Board-side inference environment setup + +Based on the feedback from developers, we provide two ways to set up the inference environment on the board: one-click script installation script and command line installation of development board dirver. + +### Install via script + +Most developers don't like complex command lines for installation, so FastDeploy provides a one-click way for developers to install stable RKNN. 
Refer to the following command to set up the board side environment + +```bash +# Download and unzip rknpu2_device_install_1.4.0 +wget https://bj.bcebos.com/fastdeploy/third_libs/rknpu2_device_install_1.4.0.zip +unzip rknpu2_device_install_1.4.0.zip + +cd rknpu2_device_install_1.4.0 +# RK3588 runs the following code +sudo rknn_install_rk3588.sh +# RK356X runs the following code +sudo rknn_install_rk356X.sh +``` + +### Install via the command line + +For developers who want to try out the latest RK drivers, we provide a method to install them from scratch using the following command line. + +```bash +# Install the required packages +sudo apt update -y +sudo apt install -y python3 +sudo apt install -y python3-dev +sudo apt install -y python3-pip +sudo apt install -y gcc +sudo apt install -y python3-opencv +sudo apt install -y python3-numpy +sudo apt install -y cmake + +# Download rknpu2 +# RK3588 runs the following code +git clone https://gitee.com/mirrors_rockchip-linux/rknpu2.git +sudo cp ./rknpu2/runtime/RK3588/Linux/librknn_api/aarch64/* /usr/lib +sudo cp ./rknpu2/runtime/RK3588/Linux/rknn_server/aarch64/usr/bin/* /usr/bin/ + +# RK356X runs the following code +git clone https://gitee.com/mirrors_rockchip-linux/rknpu2.git +sudo cp ./rknpu2/runtime/RK356X/Linux/librknn_api/aarch64/* /usr/lib +sudo cp ./rknpu2/runtime/RK356X/Linux/rknn_server/aarch64/usr/bin/* /usr/bin/ +``` + +## Install rknn_toolkit2 + +There are dependency issues when installing the rknn_toolkit2. Here are the installation tutorial. +rknn_toolkit2 depends on a few specific packages, so it is recommended to create a virtual environment using conda. The way to install conda is omitted and we mainly introduce how to install rknn_toolkit2. + + +### Download rknn_toolkit2 +rknn_toolkit2 can usually be downloaded from git +```bash +git clone https://github.com/rockchip-linux/rknn-toolkit2.git +``` + +### Download and install the required packages +```bash +sudo apt-get install libxslt1-dev zlib1g zlib1g-dev libglib2.0-0 \ +libsm6 libgl1-mesa-glx libprotobuf-dev gcc g++ +``` + +### Install rknn_toolkit2 environment +```bash +# Create virtual environment +conda create -n rknn2 python=3.6 +conda activate rknn2 + +# Install numpy==1.16.6 first because rknn_toolkit2 has a specific numpy dependency +pip install numpy==1.16.6 + +# Install rknn_toolkit2-1.3.0_11912b58-cp38-cp38-linux_x86_64.whl +cd ~/Download /rknn-toolkit2-master/packages +pip install rknn_toolkit2-1.3.0_11912b58-cp38-cp38-linux_x86_64.whl +``` + +## Resource links + +* [RKNPU2, rknntoolkit2 development board download Password:rknn](https://eyun.baidu.com/s/3eTDMk6Y) + +## Other documents +- [RKNN model conversion document](./export.md) diff --git a/docs/en/faq/rknpu2/issues.md b/docs/en/faq/rknpu2/issues.md new file mode 100644 index 000000000..320c57a2a --- /dev/null +++ b/docs/en/faq/rknpu2/issues.md @@ -0,0 +1,47 @@ +English | [中文](../../../cn/faq/rknpu2/issues.md) +# RKNPU2 FAQs + +This document collects the common problems when using FastDeploy. 
+ +## Navigation + +- [Link issues in dynamic link library](#动态链接库链接问题) + +## Link issues in dynamic link library + +### Association issue + +- [Issue 870](https://github.com/PaddlePaddle/FastDeploy/issues/870) + +### Problem Description + +No problem during compiling, but the following error is reported when running the program +```text +error while loading shared libraries: libfastdeploy.so.0.0.0: cannot open shared object file: No such file or directory +``` + +### Analysis + +The linker ld indicates that the library file cannot be found. The default directories for ld are /lib and /usr/lib. +Other directories are also OK, but you need to let ld know where the library files are located. + + +### Solutions + +**Temporary solution** + +This solution has no influence on the system, but it only works on the current terminal and fails when closing this terminal. + +```bash +source PathToFastDeploySDK/fastdeploy_init.sh +``` + +**Permanent solution** + +The temporary solution fails because users need to retype the command each time they reopen the terminal to run the program. If you don't want to constantly run the code, execute the following code: +```bash +source PathToFastDeploySDK/fastdeploy_init.sh +sudo cp PathToFastDeploySDK/fastdeploy_libs.conf /etc/ld.so.conf.d/ +sudo ldconfig +``` +After execution, the configuration file is written to the system. Refresh to let the system find the library location. diff --git a/examples/runtime/python/infer_torchscript_poros.py b/examples/runtime/python/infer_torchscript_poros.py index de31061f0..974d41ac0 100644 --- a/examples/runtime/python/infer_torchscript_poros.py +++ b/examples/runtime/python/infer_torchscript_poros.py @@ -51,7 +51,6 @@ if __name__ == '__main__': option.use_poros_backend() option.set_model_path( "std_resnet50_script.pt", model_format=ModelFormat.TORCHSCRIPT) - option.is_dynamic = True # compile runtime = fd.Runtime(option) runtime.compile(prewarm_datas) diff --git a/examples/vision/classification/yolov5cls/cpp/infer.cc b/examples/vision/classification/yolov5cls/cpp/infer.cc index 2920c95b0..048964f1e 100644 --- a/examples/vision/classification/yolov5cls/cpp/infer.cc +++ b/examples/vision/classification/yolov5cls/cpp/infer.cc @@ -27,10 +27,9 @@ void CpuInfer(const std::string& model_file, const std::string& image_file) { } auto im = cv::imread(image_file); - auto im_bak = im.clone(); fastdeploy::vision::ClassifyResult res; - if (!model.Predict(&im, &res)) { + if (!model.Predict(im, &res)) { std::cerr << "Failed to predict." << std::endl; return; } @@ -48,10 +47,9 @@ void GpuInfer(const std::string& model_file, const std::string& image_file) { } auto im = cv::imread(image_file); - auto im_bak = im.clone(); fastdeploy::vision::ClassifyResult res; - if (!model.Predict(&im, &res)) { + if (!model.Predict(im, &res)) { std::cerr << "Failed to predict." << std::endl; return; } @@ -71,10 +69,9 @@ void TrtInfer(const std::string& model_file, const std::string& image_file) { } auto im = cv::imread(image_file); - auto im_bak = im.clone(); fastdeploy::vision::ClassifyResult res; - if (!model.Predict(&im, &res)) { + if (!model.Predict(im, &res)) { std::cerr << "Failed to predict." 
<< std::endl; return; } diff --git a/examples/vision/classification/yolov5cls/python/infer.py b/examples/vision/classification/yolov5cls/python/infer.py index 55974a764..8be218b2e 100644 --- a/examples/vision/classification/yolov5cls/python/infer.py +++ b/examples/vision/classification/yolov5cls/python/infer.py @@ -44,8 +44,9 @@ args = parse_arguments() runtime_option = build_option(args) model = fd.vision.classification.YOLOv5Cls( args.model, runtime_option=runtime_option) +model.postprocessor.topk = args.topk # 预测图片分类结果 im = cv2.imread(args.image) -result = model.predict(im, args.topk) +result = model.predict(im) print(result) diff --git a/examples/vision/detection/paddledetection/cpp/README.md b/examples/vision/detection/paddledetection/cpp/README.md index b53d8ae48..94e73fd45 100755 --- a/examples/vision/detection/paddledetection/cpp/README.md +++ b/examples/vision/detection/paddledetection/cpp/README.md @@ -1,7 +1,7 @@ English | [简体中文](README_CN.md) # PaddleDetection C++ Deployment Example -This directory provides examples that `infer_xxx.cc` fast finishes the deployment of PaddleDetection models, including PPYOLOE/PicoDet/YOLOX/YOLOv3/PPYOLO/FasterRCNN/YOLOv5/YOLOv6/YOLOv7/RTMDet on CPU/GPU and GPU accelerated by TensorRT. +This directory provides examples that `infer_xxx.cc` fast finishes the deployment of PaddleDetection models, including PPYOLOE/PicoDet/YOLOX/YOLOv3/PPYOLO/FasterRCNN/YOLOv5/YOLOv6/YOLOv7/RTMDet on CPU/GPU and GPU accelerated by TensorRT. Before deployment, two steps require confirmation @@ -15,13 +15,13 @@ ppyoloe is taken as an example for inference deployment mkdir build cd build -# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz tar xvf fastdeploy-linux-x64-x.x.x.tgz cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x make -j -# Download the PPYOLOE model file and test images +# Download the PPYOLOE model file and test images wget https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco.tgz wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg tar xvf ppyoloe_crn_l_300e_coco.tgz @@ -33,12 +33,16 @@ tar xvf ppyoloe_crn_l_300e_coco.tgz ./infer_ppyoloe_demo ./ppyoloe_crn_l_300e_coco 000000014439.jpg 1 # TensorRT Inference on GPU ./infer_ppyoloe_demo ./ppyoloe_crn_l_300e_coco 000000014439.jpg 2 +# Kunlunxin XPU Inference +./infer_ppyoloe_demo ./ppyoloe_crn_l_300e_coco 000000014439.jpg 3 +# Huawei Ascend Inference +./infer_ppyoloe_demo ./ppyoloe_crn_l_300e_coco 000000014439.jpg 4 ``` The above command works for Linux or MacOS. For SDK use-pattern in Windows, refer to: - [How to use FastDeploy C++ SDK in Windows](../../../../../docs/en/faq/use_sdk_on_windows.md) -## PaddleDetection C++ Interface +## PaddleDetection C++ Interface ### Model Class @@ -56,7 +60,7 @@ Loading and initializing PaddleDetection PPYOLOE model, where the format of mode **Parameter** -> * **model_file**(str): Model file path +> * **model_file**(str): Model file path > * **params_file**(str): Parameter file path > * **config_file**(str): • Configuration file path, which is the deployment yaml file exported by PaddleDetection > * **runtime_option**(RuntimeOption): Backend inference configuration. 
None by default, which is the default configuration @@ -73,7 +77,7 @@ Loading and initializing PaddleDetection PPYOLOE model, where the format of mode > **Parameter** > > > * **im**: Input images in HWC or BGR format -> > * **result**: Detection result, including detection box and confidence of each box. Refer to [Vision Model Prediction Result](../../../../../docs/api/vision_results/) for DetectionResult +> > * **result**: Detection result, including detection box and confidence of each box. Refer to [Vision Model Prediction Result](../../../../../docs/api/vision_results/) for DetectionResult - [Model Description](../../) - [Python Deployment](../python) diff --git a/examples/vision/detection/paddledetection/python/README.md b/examples/vision/detection/paddledetection/python/README.md index baec5fe06..d0aa3e301 100755 --- a/examples/vision/detection/paddledetection/python/README.md +++ b/examples/vision/detection/paddledetection/python/README.md @@ -9,11 +9,11 @@ Before deployment, two steps require confirmation. This directory provides examples that `infer_xxx.py` fast finishes the deployment of PPYOLOE/PicoDet models on CPU/GPU and GPU accelerated by TensorRT. The script is as follows ```bash -# Download deployment example code +# Download deployment example code git clone https://github.com/PaddlePaddle/FastDeploy.git cd FastDeploy/examples/vision/detection/paddledetection/python/ -# Download the PPYOLOE model file and test images +# Download the PPYOLOE model file and test images wget https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco.tgz wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg tar xvf ppyoloe_crn_l_300e_coco.tgz @@ -24,6 +24,10 @@ python infer_ppyoloe.py --model_dir ppyoloe_crn_l_300e_coco --image 000000014439 python infer_ppyoloe.py --model_dir ppyoloe_crn_l_300e_coco --image 000000014439.jpg --device gpu # TensorRT inference on GPU (Attention: It is somewhat time-consuming for the operation of model serialization when running TensorRT inference for the first time. Please be patient.) python infer_ppyoloe.py --model_dir ppyoloe_crn_l_300e_coco --image 000000014439.jpg --device gpu --use_trt True +# Kunlunxin XPU Inference +python infer_ppyoloe.py --model_dir ppyoloe_crn_l_300e_coco --image 000000014439.jpg --device kunlunxin +# Huawei Ascend Inference +python infer_ppyoloe.py --model_dir ppyoloe_crn_l_300e_coco --image 000000014439.jpg --device ascend ``` The visualized result after running is as follows @@ -31,7 +35,7 @@ The visualized result after running is as follows -## PaddleDetection Python Interface +## PaddleDetection Python Interface ```python fastdeploy.vision.detection.PPYOLOE(model_file, params_file, config_file, runtime_option=None, model_format=ModelFormat.PADDLE) @@ -52,7 +56,7 @@ PaddleDetection model loading and initialization, among which model_file and par **Parameter** -> * **model_file**(str): Model file path +> * **model_file**(str): Model file path > * **params_file**(str): Parameter file path > * **config_file**(str): Inference configuration yaml file path > * **runtime_option**(RuntimeOption): Backend inference configuration. None by default. 
(use the default configuration) diff --git a/examples/vision/detection/yolov5/cpp/README.md b/examples/vision/detection/yolov5/cpp/README.md index 1b5e9ad86..74f182088 100755 --- a/examples/vision/detection/yolov5/cpp/README.md +++ b/examples/vision/detection/yolov5/cpp/README.md @@ -12,12 +12,12 @@ Taking the CPU inference on Linux as an example, the compilation test can be com ```bash mkdir build cd build -# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz tar xvf fastdeploy-linux-x64-x.x.x.tgz cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x make -j -# Download the official converted yolov5 Paddle model files and test images +# Download the official converted yolov5 Paddle model files and test images wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_infer.tar tar -xvf yolov5s_infer.tar wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg @@ -31,11 +31,13 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 ./infer_paddle_demo yolov5s_infer 000000014439.jpg 2 # KunlunXin XPU inference ./infer_paddle_demo yolov5s_infer 000000014439.jpg 3 +# Huawei Ascend Inference +./infer_paddle_demo yolov5s_infer 000000014439.jpg 4 ``` The above steps apply to the inference of Paddle models. If you want to conduct the inference of ONNX models, follow these steps: ```bash -# 1. Download the official converted yolov5 ONNX model files and test images +# 1. Download the official converted yolov5 ONNX model files and test images wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s.onnx wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg @@ -53,7 +55,7 @@ The visualized result after running is as follows The above command works for Linux or MacOS. For SDK use-pattern in Windows, refer to: - [How to use FastDeploy C++ SDK in Windows](../../../../../docs/cn/faq/use_sdk_on_windows.md) -## YOLOv5 C++ Interface +## YOLOv5 C++ Interface ### YOLOv5 Class @@ -69,7 +71,7 @@ YOLOv5 model loading and initialization, among which model_file is the exported **Parameter** -> * **model_file**(str): Model file path +> * **model_file**(str): Model file path > * **params_file**(str): Parameter file path. Merely passing an empty string when the model is in ONNX format > * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration > * **model_format**(ModelFormat): Model format. 
ONNX format by default diff --git a/examples/vision/detection/yolov5/python/README.md b/examples/vision/detection/yolov5/python/README.md index 0e815dd09..23b6665c7 100755 --- a/examples/vision/detection/yolov5/python/README.md +++ b/examples/vision/detection/yolov5/python/README.md @@ -22,17 +22,19 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 python infer.py --model yolov5s_infer --image 000000014439.jpg --device cpu # GPU inference python infer.py --model yolov5s_infer --image 000000014439.jpg --device gpu -# TensorRT inference on GPU +# TensorRT inference on GPU python infer.py --model yolov5s_infer --image 000000014439.jpg --device gpu --use_trt True # KunlunXin XPU inference python infer.py --model yolov5s_infer --image 000000014439.jpg --device kunlunxin +# Huawei Ascend Inference +python infer.py --model yolov5s_infer --image 000000014439.jpg --device ascend ``` The visualized result after running is as follows -## YOLOv5 Python Interface +## YOLOv5 Python Interface ```python fastdeploy.vision.detection.YOLOv5(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) @@ -42,7 +44,7 @@ YOLOv5 model loading and initialization, among which model_file is the exported **Parameter** -> * **model_file**(str): Model file path +> * **model_file**(str): Model file path > * **params_file**(str): Parameter file path. No need to set when the model is in ONNX format > * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration > * **model_format**(ModelFormat): Model format. ONNX format by default diff --git a/examples/vision/detection/yolov6/python/README.md b/examples/vision/detection/yolov6/python/README.md index 789df9747..04bc9f345 100755 --- a/examples/vision/detection/yolov6/python/README.md +++ b/examples/vision/detection/yolov6/python/README.md @@ -23,6 +23,9 @@ python infer_paddle_model.py --model yolov6s_infer --image 000000014439.jpg --d python infer_paddle_model.py --model yolov6s_infer --image 000000014439.jpg --device gpu # KunlunXin XPU inference python infer_paddle_model.py --model yolov6s_infer --image 000000014439.jpg --device kunlunxin +# Huawei Ascend Inference +python infer_paddle_model.py --model yolov6s_infer --image 000000014439.jpg --device ascend + ``` If you want to verify the inference of ONNX models, refer to the following command: ```bash @@ -34,7 +37,7 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 python infer.py --model yolov6s.onnx --image 000000014439.jpg --device cpu # GPU inference python infer.py --model yolov6s.onnx --image 000000014439.jpg --device gpu -# TensorRT inference on GPU +# TensorRT inference on GPU python infer.py --model yolov6s.onnx --image 000000014439.jpg --device gpu --use_trt True ``` @@ -42,7 +45,7 @@ The visualized result after running is as follows -## YOLOv6 Python Interface +## YOLOv6 Python Interface ```python fastdeploy.vision.detection.YOLOv6(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) @@ -52,7 +55,7 @@ YOLOv6 model loading and initialization, among which model_file is the exported **Parameter** -> * **model_file**(str): Model file path +> * **model_file**(str): Model file path > * **params_file**(str): Parameter file path. No need to set when the model is in ONNX format > * **runtime_option**(RuntimeOption): Backend inference configuration. 
None by default, which is the default configuration > * **model_format**(ModelFormat): Model format. ONNX format by default diff --git a/examples/vision/detection/yolov7/cpp/README.md b/examples/vision/detection/yolov7/cpp/README.md index e36875e0c..a3abd6d19 100755 --- a/examples/vision/detection/yolov7/cpp/README.md +++ b/examples/vision/detection/yolov7/cpp/README.md @@ -1,7 +1,7 @@ English | [简体中文](README_CN.md) # YOLOv7 C++ Deployment Example -This directory provides examples that `infer.cc` fast finishes the deployment of YOLOv7 on CPU/GPU and GPU accelerated by TensorRT. +This directory provides examples that `infer.cc` fast finishes the deployment of YOLOv7 on CPU/GPU and GPU accelerated by TensorRT. Before deployment, two steps require confirmation @@ -13,7 +13,7 @@ Taking the CPU inference on Linux as an example, the compilation test can be com ```bash mkdir build cd build -# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz tar xvf fastdeploy-linux-x64-x.x.x.tgz cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x @@ -29,10 +29,12 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 ./infer_paddle_model_demo yolov7_infer 000000014439.jpg 1 # KunlunXin XPU inference ./infer_paddle_model_demo yolov7_infer 000000014439.jpg 2 +# Huawei Ascend inference +./infer_paddle_model_demo yolov7_infer 000000014439.jpg 3 ``` If you want to verify the inference of ONNX models, refer to the following command: ```bash -# Download the official converted yolov7 ONNX model files and test images +# Download the official converted yolov7 ONNX model files and test images wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov7.onnx wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg @@ -52,7 +54,7 @@ The visualized result after running is as follows The above command works for Linux or MacOS. For SDK use-pattern in Windows, refer to: - [How to use FastDeploy C++ SDK in Windows](../../../../../docs/en/faq/use_sdk_on_windows.md) -## YOLOv7 C++ Interface +## YOLOv7 C++ Interface ### YOLOv7 Class @@ -68,7 +70,7 @@ YOLOv7 model loading and initialization, among which model_file is the exported **Parameter** -> * **model_file**(str): Model file path +> * **model_file**(str): Model file path > * **params_file**(str): Parameter file path. Merely passing an empty string when the model is in ONNX format > * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration > * **model_format**(ModelFormat): Model format. ONNX format by default @@ -86,7 +88,7 @@ YOLOv7 model loading and initialization, among which model_file is the exported > **Parameter** > > > * **im**: Input images in HWC or BGR format -> > * **result**: Detection results, including detection box and confidence of each box. Refer to [Vision Model Prediction Results](../../../../../docs/api/vision_results/) for DetectionResult +> > * **result**: Detection results, including detection box and confidence of each box. 
Refer to [Vision Model Prediction Results](../../../../../docs/api/vision_results/) for DetectionResult > > * **conf_threshold**: Filtering threshold of detection box confidence > > * **nms_iou_threshold**: iou threshold during NMS processing diff --git a/examples/vision/detection/yolov8/cpp/README.md b/examples/vision/detection/yolov8/cpp/README.md new file mode 100644 index 000000000..ca9462b0c --- /dev/null +++ b/examples/vision/detection/yolov8/cpp/README.md @@ -0,0 +1,90 @@ +English | [简体中文](README_CN.md) +# YOLOv8 C++ Deployment Example + +This directory provides the example that `infer.cc` fast finishes the deployment of YOLOv8 on CPU/GPU and GPU through TensorRT. + +Two steps before deployment + +- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) +- 2. Download the precompiled deployment library and samples code based on your development environment. Refer to [FastDeploy Precompiled Library](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +Taking the CPU inference on Linux as an example, FastDeploy version 1.0.3 or above (x.x.x>=1.0.3) is required to support this model. + +```bash +mkdir build +cd build +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz +tar xvf fastdeploy-linux-x64-x.x.x.tgz +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x +make -j + +# 1. Download the official converted YOLOv8 ONNX model files and test images +wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov8s.onnx +wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg + +# CPU inference +./infer_demo yolov8s.onnx 000000014439.jpg 0 +# GPU inference +./infer_demo yolov8s.onnx 000000014439.jpg 1 +# TensorRT inference on GPU +./infer_demo yolov8s.onnx 000000014439.jpg 2 +``` +The visualized result is as follows + + + +he above command works for Linux or MacOS. For SDK in Windows, refer to: +- [How to use FastDeploy C++ SDK in Windows](../../../../../docs/cn/faq/use_sdk_on_windows.md) + +If you use Huawei Ascend NPU deployment, refer to the following document to initialize the deployment environment: +- [How to use Huawei Ascend NPU deployment](../../../../../docs/cn/faq/use_sdk_on_ascend.md) + +## YOLOv8 C++ Interface + +### YOLOv8 + +```c++ +fastdeploy::vision::detection::YOLOv8( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +YOLOv8 model loading and initialization, among which model_file is the exported ONNX model format + +**Parameter** + +> * **model_file**(str): Model file path +> * **params_file**(str): Parameter file path. Merely passing an empty string when the model is in ONNX format +> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration +> * **model_format**(ModelFormat): Model format. ONNX format by default + +#### Predict function + +> ```c++ +> YOLOv8::Predict(cv::Mat* im, DetectionResult* result) +> ``` +> +> Model prediction interface. 
Input images and output detection results +> +> **Parameter** +> +> > * **im**: Input images in HWC or BGR format +> > * **result**: Detection results, including detection box and confidence of each box. Refer to [Vision Model Prediction Results](../../../../../docs/api/vision_results/) for DetectionResult. + +### Class Member Variable +#### Pre-processing Parameter +Users can modify the following preprocessing parameters based on actual needs to change the final inference and deployment results + +> > * **size**(vector<int>): This parameter changes the resize used during preprocessing, containing two integer elements for [width, height] with default value [640, 640] +> > * **padding_value**(vector<float>): This parameter is used to change the padding value of images during resize, containing three floating-point elements that represent the value of three channels. Default value [114, 114, 114] +> > * **is_no_pad**(bool): Specify whether to resize the image through padding. `is_no_pad=ture` represents no paddling. Default `is_no_pad=false` +> > * **is_mini_pad**(bool): This parameter sets the width and height of the image after resize to the value nearest to the `size` member variable and to the point where the padded pixel size is divisible by the `stride` member variable. Default `is_mini_pad=false` +> > * **stride**(int): Used with the `stris_mini_pad` member variable. Default `stride=32` + +- [Model Description](../../) +- [Python Deployment](../python) +- [Vision Model Prediction Results](../../../../../docs/api/vision_results/) +- [How to switch the backend engine](../../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/vision/detection/yolov8/cpp/README_CN.md b/examples/vision/detection/yolov8/cpp/README_CN.md index d1d87ccc5..0545df665 100644 --- a/examples/vision/detection/yolov8/cpp/README_CN.md +++ b/examples/vision/detection/yolov8/cpp/README_CN.md @@ -81,7 +81,7 @@ YOLOv8模型加载和初始化,其中model_file为导出的ONNX模型格式。 > > * **size**(vector<int>): 通过此参数修改预处理过程中resize的大小,包含两个整型元素,表示[width, height], 默认值为[640, 640] > > * **padding_value**(vector<float>): 通过此参数可以修改图片在resize时候做填充(padding)的值, 包含三个浮点型元素, 分别表示三个通道的值, 默认值为[114, 114, 114] > > * **is_no_pad**(bool): 通过此参数让图片是否通过填充的方式进行resize, `is_no_pad=ture` 表示不使用填充的方式,默认值为`is_no_pad=false` -> > * **is_mini_pad**(bool): 通过此参数可以将resize之后图像的宽高这是为最接近`size`成员变量的值, 并且满足填充的像素大小是可以被`stride`成员变量整除的。默认值为`is_mini_pad=false` +> > * **is_mini_pad**(bool): 通过此参数可以将resize之后图像的宽高设置为最接近`size`成员变量的值, 并且满足填充的像素大小是可以被`stride`成员变量整除的。默认值为`is_mini_pad=false` > > * **stride**(int): 配合`stris_mini_pad`成员变量使用, 默认值为`stride=32` - [模型介绍](../../) diff --git a/examples/vision/detection/yolov8/python/README.md b/examples/vision/detection/yolov8/python/README.md new file mode 100644 index 000000000..8922147aa --- /dev/null +++ b/examples/vision/detection/yolov8/python/README.md @@ -0,0 +1,78 @@ +English | [简体中文](README_CN.md) +# YOLOv8 Python Deployment Example + +Two steps before deployment + +- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) +- 2. Install FastDeploy Python whl. Refer to [FastDeploy Python Installation](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +This directory provides the example that `infer.py` fast finishes the deployment of YOLOv8 on CPU/GPU and GPU through TensorRT. 
The script is as follows + +```bash +# Download the example code for deployment +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd examples/vision/detection/yolov8/python/ + +# Download yolov8 model files and test images +wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov8.onnx +wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg + +# CPU inference +python infer.py --model yolov8.onnx --image 000000014439.jpg --device cpu +# GPU inference +python infer.py --model yolov8.onnx --image 000000014439.jpg --device gpu +# TensorRT inference on GPU +python infer.py --model yolov8.onnx --image 000000014439.jpg --device gpu --use_trt True +``` + +The visualized result is as follows + + + +## YOLOv8 Python Interface + +```python +fastdeploy.vision.detection.YOLOv8(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) +``` + +YOLOv8 model loading and initialization, among which model_file is the exported ONNX model format + +**Parameter** + +> * **model_file**(str): Model file path +> * **params_file**(str): Parameter file path. No need to set when the model is in ONNX format +> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration +> * **model_format**(ModelFormat): Model format. ONNX format by default + +### predict function + +> ```python +> YOLOv8.predict(image_data) +> ``` +> +> Model prediction interface. Input images and output detection results +> +> **Parameter** +> +> > * **image_data**(np.ndarray): Input data in HWC or BGR format + +> **Return** +> +> > Return the `fastdeploy.vision.DetectionResult`structure, refer to [Vision Model Prediction Results](../../../../../docs/api/vision_results/) for its description + +### Class Member Property +#### Pre-processing Parameter +Users can modify the following preprocessing parameters based on actual needs to change the final inference and deployment results + +> > * **size**(list[int]): This parameter changes the resize used during preprocessing, containing two integer elements for [width, height] with default value [640, 640] +> > * **padding_value**(list[float]): This parameter is used to change the padding value of images during resize, containing three floating-point elements that represent the value of three channels. Default value [114, 114, 114] +> > * **is_no_pad**(bool): Specify whether to resize the image through padding. `is_no_pad=True` represents no paddling. Default `is_no_pad=False` +> > * **is_mini_pad**(bool): This parameter sets the width and height of the image after resize to the value nearest to the `size` member variable and to the point where the padded pixel size is divisible by the `stride` member variable. Default `is_mini_pad=False` +> > * **stride**(int): Used with the `stris_mini_padide` member variable. Default `stride=32` + +## Other Documents + +- [YOLOv8 Model Description](..) 
+- [YOLOv8 C++ Deployment](../cpp) +- [Model Prediction Results](../../../../../docs/api/vision_results/) +- [How to switch the backend engine](../../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/vision/detection/yolov8/python/README_CN.md b/examples/vision/detection/yolov8/python/README_CN.md index ae19ec727..8b2877372 100644 --- a/examples/vision/detection/yolov8/python/README_CN.md +++ b/examples/vision/detection/yolov8/python/README_CN.md @@ -50,7 +50,7 @@ YOLOv8模型加载和初始化,其中model_file为导出的ONNX模型格式 > YOLOv8.predict(image_data) > ``` > -> 模型预测结口,输入图像直接输出检测结果。 +> 模型预测接口,输入图像直接输出检测结果。 > > **参数** > diff --git a/examples/vision/facedet/blazeface/README.md b/examples/vision/facedet/blazeface/README.md new file mode 100644 index 000000000..98c430412 --- /dev/null +++ b/examples/vision/facedet/blazeface/README.md @@ -0,0 +1,34 @@ +English | [简体中文](README_CN.md) +# BlazeFace Ready-to-deploy Model + +- BlazeFace deployment model implementation comes from [BlazeFace](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection),and [Pre-training model based on WiderFace](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection) + - (1)Provided in [Official library +](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/tools) *.params, could deploy after operation [export_model.py](#Export PADDLE model); + - (2)Developers can train BlazeFace model based on their own data according to [export_model. py](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/tools/export_model.py)After exporting the model, complete the deployment。 + +## Export PADDLE model + +Visit [BlazeFace](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection) Github library, download and install according to the instructions, download the `. yml` and `. params` model parameters, and use` export_ Model. py `gets the` pad `model file`. yml,. pdiparams,. pdmodel `. 
+ + +* Download BlazeFace model parameter file + +|Network structure | input size | number of pictures/GPU | learning rate strategy | Easy/Media/Hard Set | prediction delay (SD855) | model size (MB) | download | configuration file| +|:------------:|:--------:|:----:|:-------:|:-------:|:---------:|:----------:|:---------:|:--------:| +| BlazeFace | 640 | 8 | 1000e | 0.885 / 0.855 / 0.731 | - | 0.472 |[Download link](https://paddledet.bj.bcebos.com/models/blazeface_1000e.pdparams) | [Config file](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection/blazeface_1000e.yml) | +| BlazeFace-FPN-SSH | 640 | 8 | 1000e | 0.907 / 0.883 / 0.793 | - | 0.479 |[Download link](https://paddledet.bj.bcebos.com/models/blazeface_fpn_ssh_1000e.pdparams) | [Config file](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection/blazeface_fpn_ssh_1000e.yml) | + +* Export paddle-format file + ```bash + python tools/export_model.py -c configs/face_detection/blazeface_1000e.yml -o weights=blazeface_1000e.pdparams --export_serving_model=True + ``` + +## Detailed Deployment Tutorials + +- [Python Deployment](python) +- [C++ Deployment](cpp) + + +## Release Note + +- This tutorial and related code are written based on [BlazeFace](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection) diff --git a/examples/vision/facedet/blazeface/README_CN.md b/examples/vision/facedet/blazeface/README_CN.md new file mode 100644 index 000000000..f3957c0ca --- /dev/null +++ b/examples/vision/facedet/blazeface/README_CN.md @@ -0,0 +1,31 @@ +# BlazeFace准备部署模型 + +- BlazeFace部署模型实现来自[BlazeFace](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection),和[基于WiderFace的预训练模型](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection) + - (1)[官方库](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/tools)中提供的*.params,通过[export_model.py](#导出PADDLE模型)操作后,可进行部署; + - (2)开发者基于自己数据训练的BlazeFace模型,可按照[export_model.py](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/tools/export_model.py)导出模型后,完成部署。 + +## 导出PADDLE模型 + +访问[BlazeFace](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection)github库,按照指引下载安装,下载`.yml`和`.params` 模型参数,利用 `export_model.py` 得到`paddle`模型文件`.yml, .pdiparams, .pdmodel`。 + +* 下载BlazeFace模型参数文件 + +| 网络结构 | 输入尺寸 | 图片个数/GPU | 学习率策略 | Easy/Medium/Hard Set | 预测时延(SD855)| 模型大小(MB) | 下载 | 配置文件 | +|:------------:|:--------:|:----:|:-------:|:-------:|:---------:|:----------:|:---------:|:--------:| +| BlazeFace | 640 | 8 | 1000e | 0.885 / 0.855 / 0.731 | - | 0.472 |[下载链接](https://paddledet.bj.bcebos.com/models/blazeface_1000e.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection/blazeface_1000e.yml) | +| BlazeFace-FPN-SSH | 640 | 8 | 1000e | 0.907 / 0.883 / 0.793 | - | 0.479 |[下载链接](https://paddledet.bj.bcebos.com/models/blazeface_fpn_ssh_1000e.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection/blazeface_fpn_ssh_1000e.yml) | + +* 导出paddle格式文件 + ```bash + python tools/export_model.py -c configs/face_detection/blazeface_1000e.yml -o weights=blazeface_1000e.pdparams --export_serving_model=True + ``` + +## 详细部署文档 + +- [Python部署](python) +- [C++部署](cpp) + + +## 版本说明 + +- 本版本文档和代码基于[BlazeFace](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/face_detection) 编写 diff --git 
a/examples/vision/facedet/blazeface/cpp/CMakeLists.txt b/examples/vision/facedet/blazeface/cpp/CMakeLists.txt new file mode 100644 index 000000000..4ec242a44 --- /dev/null +++ b/examples/vision/facedet/blazeface/cpp/CMakeLists.txt @@ -0,0 +1,14 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +# Specifies the path to the fastdeploy library after you have downloaded it +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(../../../../../FastDeploy.cmake) + +# Add the FastDeploy dependency header +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) +# Add the FastDeploy library dependency +target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) diff --git a/examples/vision/facedet/blazeface/cpp/README.md b/examples/vision/facedet/blazeface/cpp/README.md new file mode 100644 index 000000000..dac9fc443 --- /dev/null +++ b/examples/vision/facedet/blazeface/cpp/README.md @@ -0,0 +1,78 @@ +English | [简体中文](README_CN.md) +# BlazeFace C++ Deployment Example + +This directory provides examples that `infer.cc` fast finishes the deployment of BlazeFace on CPU/GPU。 + +Before deployment, two steps require confirmation + +- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md) +- 2. Download the precompiled deployment library and samples code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md) + +Taking the CPU inference on Linux as an example, the compilation test can be completed by executing the following command in this directory. + +```bash +mkdir build +cd build +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz # x.x.x >= 1.0.4 +tar xvf fastdeploy-linux-x64-x.x.x.tgz # x.x.x >= 1.0.4 +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x # x.x.x >= 1.0.4 +make -j + +#Download the official converted YOLOv7Face model files and test images +wget https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg +wget https://bj.bcebos.com/paddlehub/fastdeploy/blzeface-1000e.tgz + +#Use blazeface-1000e model +# CPU inference +./infer_demo blazeface-1000e/ test_lite_face_detector_3.jpg 0 +# GPU Inference +./infer_demo blazeface-1000e/ test_lite_face_detector_3.jpg 1 +``` + +The visualized result after running is as follows + + + +The above command works for Linux or MacOS. For SDK use-pattern in Windows, refer to: +- [How to use FastDeploy C++ SDK in Windows](../../../../../docs/cn/faq/use_sdk_on_windows.md) + +## BlazeFace C++ Interface + +### BlazeFace Class + +```c++ +fastdeploy::vision::facedet::BlazeFace( + const string& model_file, + const string& params_file = "", + const string& config_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::PADDLE) +``` + +BlazeFace model loading and initialization, among which model_file is the exported PADDLE model format + +**Parameter** + +> * **model_file**(str): Model file path +> * **params_file**(str): Parameter file path. Only passing an empty string when the model is in PADDLE format +> * **config_file**(str): Config file path. 
Only passing an empty string when the model is in PADDLE format +> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration +> * **model_format**(ModelFormat): Model format. PADDLE format by default + +#### Predict Function + +> ```c++ +> BlazeFace::Predict(cv::Mat& im, FaceDetectionResult* result) +> ``` +> +> Model prediction interface. Input images and output detection results. +> +> **Parameter** +> +> > * **im**: Input images in HWC or BGR format +> > * **result**: Detection results, including detection box and confidence of each box. Refer to [Vision Model Prediction Result](../../../../../docs/api/vision_results/) for FaceDetectionResult + +- [Model Description](../../) +- [Python Deployment](../python) +- [Vision Model Prediction Results](../../../../../docs/api/vision_results/) diff --git a/examples/vision/facedet/blazeface/cpp/README_CN.md b/examples/vision/facedet/blazeface/cpp/README_CN.md new file mode 100644 index 000000000..12b67b6e5 --- /dev/null +++ b/examples/vision/facedet/blazeface/cpp/README_CN.md @@ -0,0 +1,77 @@ +[English](README.md) | 简体中文 +# BlazeFace C++部署示例 + +本目录下提供`infer.cc`快速完成BlazeFace在CPU/GPU部署的示例。 + +在部署前,需确认以下两个步骤 + +- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) +- 2. 根据开发环境,下载预编译部署库和samples代码,参考[FastDeploy预编译库](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +以Linux上CPU推理为例,在本目录执行如下命令即可完成编译测试 + +```bash +mkdir build +cd build +# 下载FastDeploy预编译库,用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用 +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz # x.x.x >= 1.0.4 +tar xvf fastdeploy-linux-x64-x.x.x.tgz # x.x.x >= 1.0.4 +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x # x.x.x >= 1.0.4 +make -j + +#下载官方转换好的BlazeFace模型文件和测试图片 +wget https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg +wget https://bj.bcebos.com/paddlehub/fastdeploy/blzeface-1000e.tgz + +#使用blazeface-1000e模型 +# CPU推理 +./infer_demo blazeface-1000e/ test_lite_face_detector_3.jpg 0 +# GPU推理 +./infer_demo blazeface-1000e/ test_lite_face_detector_3.jpg 1 + +运行完成可视化结果如下图所示 + + + +以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考: +- [如何在Windows中使用FastDeploy C++ SDK](../../../../../docs/cn/faq/use_sdk_on_windows.md) + +## BlazeFace C++接口 + +### BlazeFace类 + +```c++ +fastdeploy::vision::facedet::BlazeFace( + const string& model_file, + const string& params_file = "", + const string& config_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::PADDLE) +``` + +BlazeFace模型加载和初始化,其中model_file为导出的PADDLE模型格式。 + +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径,当模型格式为ONNX时,此参数传入空字符串即可 +> * **config_file**(str): 配置文件路径,当模型格式为ONNX时,此参数传入空字符串即可 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(ModelFormat): 模型格式,默认为PADDLE格式 + +#### Predict函数 + +> ```c++ +> BlazeFace::Predict(cv::Mat& im, FaceDetectionResult* result) +> ``` +> +> 模型预测接口,输入图像直接输出检测结果。 +> +> **参数** +> +> > * **im**: 输入图像,注意需为HWC,BGR格式 +> > * **result**: 检测结果,包括检测框,各个框的置信度, FaceDetectionResult说明参考[视觉模型预测结果](../../../../../docs/api/vision_results/) + +- [模型介绍](../../) +- [Python部署](../python) +- [视觉模型预测结果](../../../../../docs/api/vision_results/) diff --git a/examples/vision/facedet/blazeface/cpp/infer.cc b/examples/vision/facedet/blazeface/cpp/infer.cc new file mode 100644 index 
000000000..c4304f45f --- /dev/null +++ b/examples/vision/facedet/blazeface/cpp/infer.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" + +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void CpuInfer(const std::string& model_dir, const std::string& image_file) { + auto model_file = model_dir + sep + "model.pdmodel"; + auto params_file = model_dir + sep + "model.pdiparams"; + auto config_file = model_dir + sep + "infer_cfg.yml"; + auto option = fastdeploy::RuntimeOption(); + option.UseCpu(); + auto model = fastdeploy::vision::facedet::BlazeFace( + model_file, params_file, config_file, option); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::FaceDetectionResult res; + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + std::cout << res.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisFaceDetection(im, res); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +void GpuInfer(const std::string& model_dir, const std::string& image_file) { + auto model_file = model_dir + sep + "model.pdmodel"; + auto params_file = model_dir + sep + "model.pdiparams"; + auto config_file = model_dir + sep + "infer_cfg.yml"; + auto option = fastdeploy::RuntimeOption(); + option.UseGpu(); + auto model = fastdeploy::vision::facedet::BlazeFace( + model_file, params_file, config_file, option); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::FaceDetectionResult res; + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + std::cout << res.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisFaceDetection(im, res); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +int main(int argc, char* argv[]) { + if (argc < 4) { + std::cout << "Usage: infer_demo path/to/model path/to/image run_option, " + "e.g ./infer_model yolov5s-face.onnx ./test.jpeg 0" + << std::endl; + std::cout << "The data type of run_option is int, 0: run with cpu; 1: run " + "with gpu; 2: run with gpu and use tensorrt backend." 
+ << std::endl; + return -1; + } + + if (std::atoi(argv[3]) == 0) { + CpuInfer(argv[1], argv[2]); + } else if (std::atoi(argv[3]) == 1) { + GpuInfer(argv[1], argv[2]); + } + return 0; +} diff --git a/examples/vision/facedet/blazeface/python/README.md b/examples/vision/facedet/blazeface/python/README.md new file mode 100644 index 000000000..b645317cd --- /dev/null +++ b/examples/vision/facedet/blazeface/python/README.md @@ -0,0 +1,68 @@ +English | [简体中文](README_CN.md) +# BlazeFace Python Deployment Example + +Before deployment, two steps require confirmation + +- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md) +- 2. Install FastDeploy Python whl package. Refer to [FastDeploy Python Installation](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md) + +This directory provides examples that `infer.py` fast finishes the deployment of BlazeFace on CPU/GPU. + +```bash +# Download the example code for deployment +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd examples/vision/facedet/blazeface/python/ + +# Download BlazeFace model files and test images +wget https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg +wget https://bj.bcebos.com/paddlehub/fastdeploy/blazeface-1000e.tgz + +# Use blazeface-1000e model +# CPU Inference +python infer.py --model blazeface-1000e/ --image test_lite_face_detector_3.jpg --device cpu +# GPU Inference +python infer.py --model blazeface-1000e/ --image test_lite_face_detector_3.jpg --device gpu +``` + +The visualized result after running is as follows + + + +## BlazeFace Python Interface + +```python +fastdeploy.vision.facedet.BlzaeFace(model_file, params_file=None, runtime_option=None, config_file=None, model_format=ModelFormat.PADDLE) +``` + +BlazeFace model loading and initialization, among which model_file is the exported PADDLE model format + +**Parameter** + +> * **model_file**(str): Model file path +> * **params_file**(str): Parameter file path. No need to set when the model is in PADDLE format +> * **config_file**(str): config file path. No need to set when the model is in PADDLE format +> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration +> * **model_format**(ModelFormat): Model format. PADDLE format by default + +### predict function + +> ```python +> BlazeFace.predict(input_image) +> ``` +> Through let BlazeFace.postprocessor.conf_threshold = 0.2,to modify conf_threshold +> +> Model prediction interface. Input images and output detection results. +> +> **Parameter** +> +> > * **input_image**(np.ndarray): Input image in HWC or BGR format + +> **Return** +> +> > Return`fastdeploy.vision.FaceDetectionResult` structure. Refer to [Vision Model Prediction Results](../../../../../docs/api/vision_results/) for its description. + +## Other Documents + +- [BlazeFace Model Description](..) +- [BlazeFace C++ Deployment](../cpp) +- [Model Prediction Results](../../../../../docs/api/vision_results/) diff --git a/examples/vision/facedet/blazeface/python/README_CN.md b/examples/vision/facedet/blazeface/python/README_CN.md new file mode 100644 index 000000000..3bbc620e2 --- /dev/null +++ b/examples/vision/facedet/blazeface/python/README_CN.md @@ -0,0 +1,68 @@ +[English](README.md) | 简体中文 +# BlazeFace Python部署示例 + +在部署前,需确认以下两个步骤 + +- 1. 
软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) +- 2. FastDeploy Python whl包安装,参考[FastDeploy Python安装](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +本目录下提供`infer.py`快速完成BlazeFace在CPU/GPU部署的示例。执行如下脚本即可完成 + +```bash +#下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd examples/vision/facedet/blazeface/python/ + +#下载BlazeFace模型文件和测试图片 +wget https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg +wget https://bj.bcebos.com/paddlehub/fastdeploy/blazeface-1000e.tgz + +#使用blazeface-1000e模型 +# CPU推理 +python infer.py --model blazeface-1000e/ --image test_lite_face_detector_3.jpg --device cpu +# GPU推理 +python infer.py --model blazeface-1000e/ --image test_lite_face_detector_3.jpg --device gpu +``` + +运行完成可视化结果如下图所示 + + + +## BlazeFace Python接口 + +```python +fastdeploy.vision.facedet.BlzaeFace(model_file, params_file=None, runtime_option=None, config_file=None, model_format=ModelFormat.PADDLE) +``` + +BlazeFace模型加载和初始化 + +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径,当模型格式为ONNX格式时,此参数无需设定 +> * **config_file**(str): config文件路径,当模型格式为ONNX格式时,此参数无需设定 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(ModelFormat): 模型格式,默认为PADDLE + +### predict函数 + +> ```python +> BlazeFace.predict(input_image) +> ``` +> 通过BlazeFace.postprocessor.conf_threshold = 0.2,来修改conf_threshold +> +> 模型预测结口,输入图像直接输出检测结果。 +> +> **参数** +> +> > * **input_image**(np.ndarray): 输入数据,注意需为HWC,BGR格式 + +> **返回** +> +> > 返回`fastdeploy.vision.FaceDetectionResult`结构体,结构体说明参考文档[视觉模型预测结果](../../../../../docs/api/vision_results/) + +## 其它文档 + +- [BlazeFace 模型介绍](..) +- [BlazeFace C++部署](../cpp) +- [模型预测结果说明](../../../../../docs/api/vision_results/) diff --git a/examples/vision/facedet/blazeface/python/infer.py b/examples/vision/facedet/blazeface/python/infer.py new file mode 100644 index 000000000..b9904f9c0 --- /dev/null +++ b/examples/vision/facedet/blazeface/python/infer.py @@ -0,0 +1,58 @@ +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", required=True, help="Path of blazeface model dir.") + parser.add_argument( + "--image", required=True, help="Path of test image file.") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Type of inference device, support 'cpu' or 'gpu'.") + parser.add_argument( + "--use_trt", + type=ast.literal_eval, + default=False, + help="Wether to use tensorrt.") + return parser.parse_args() + + +def build_option(args): + option = fd.RuntimeOption() + + if args.device.lower() == "gpu": + option.use_gpu() + + if args.use_trt: + option.use_trt_backend() + option.set_trt_input_shape("images", [1, 3, 640, 640]) + return option + + +args = parse_arguments() + +model_dir = args.model + +model_file = os.path.join(model_dir, "model.pdmodel") +params_file = os.path.join(model_dir, "model.pdiparams") +config_file = os.path.join(model_dir, "infer_cfg.yml") + +# Configure runtime and load the model +runtime_option = build_option(args) +model = fd.vision.facedet.BlazeFace(model_file, params_file, config_file, runtime_option=runtime_option) + +# Predict image detection results +im = cv2.imread(args.image) +result = model.predict(im) +print(result) +# Visualization of prediction Results +vis_im = fd.vision.vis_face_detection(im, result) 
+cv2.imwrite("visualized_result.jpg", vis_im) +print("Visualized result save in ./visualized_result.jpg") diff --git a/examples/vision/facedet/centerface/README.md b/examples/vision/facedet/centerface/README.md new file mode 100644 index 000000000..722709671 --- /dev/null +++ b/examples/vision/facedet/centerface/README.md @@ -0,0 +1,25 @@ +English | [简体中文](README_CN.md) + +# CenterFace Ready-to-deploy Model + +- The deployment of the CenterFace model is based on [CenterFace](https://github.com/Star-Clouds/CenterFace.git) and [Pre-trained Model Based on WIDER FACE](https://github.com/Star-Clouds/CenterFace.git) + - (1)The *.onnx provided by [Official Repository](https://github.com/Star-Clouds/CenterFace.git) can be deployed directly; + - (2)The CenterFace train code is not open source and users cannot train it. + + +## Download Pre-trained ONNX Model + +For developers' testing, models exported by CenterFace are provided below. Developers can download them directly. (The accuracy in the following table is derived from the source official repository on WIDER FACE test set) +| Model | Size | Accuracy(Easy Set,Medium Set,Hard Set) | Note | +|:---------------------------------------------------------------- |:----- |:----- |:---- | +| [CenterFace](https://bj.bcebos.com/paddlehub/fastdeploy/CenterFace.onnx) | 7.2MB | 93.2%,92.1%,87.3% | This model file is sourced from [CenterFace](https://github.com/Star-Clouds/CenterFace.git),MIT license | + + +## Detailed Deployment Documents + +- [Python Deployment](python) +- [C++ Deployment](cpp) + +## Release Note + +- Document and code are based on [CenterFace](https://github.com/Star-Clouds/CenterFace.git) \ No newline at end of file diff --git a/examples/vision/facedet/centerface/README_CN.md b/examples/vision/facedet/centerface/README_CN.md new file mode 100644 index 000000000..34c996fdb --- /dev/null +++ b/examples/vision/facedet/centerface/README_CN.md @@ -0,0 +1,24 @@ +[English](README.md) | 简体中文 +# CenterFace准备部署模型 + +- CenterFace部署模型实现来自[CenterFace](https://github.com/Star-Clouds/CenterFace.git),和[基于WIDER FACE的预训练模型](https://github.com/Star-Clouds/CenterFace.git) + - (1)[官方库](https://github.com/Star-Clouds/CenterFace.git)提供的*.onnx可直接进行部署; + - (2)由于CenterFace未开放训练源代码,开发者无法基于自己的数据训练CenterFace模型 + + +## 下载预训练ONNX模型 + +为了方便开发者的测试,下面提供了CenterFace导出的模型,开发者可直接下载使用。(下表中模型的精度来源于源官方库在WIDER FACE测试集上的结果) +| 模型 | 大小 | 精度(Easy Set,Medium Set,Hard Set) | 备注 | +|:---------------------------------------------------------------- |:----- |:----- |:---- | +| [CenterFace](https://bj.bcebos.com/paddlehub/fastdeploy/CenterFace.onnx) | 7.2MB | 93.2%,92.1%,87.3% | 此模型文件来源于[CenterFace](https://github.com/Star-Clouds/CenterFace.git),MIT license | + + +## 详细部署文档 + +- [Python部署](python) +- [C++部署](cpp) + +## 版本说明 + +- 本版本文档和代码基于[CenterFace](https://github.com/Star-Clouds/CenterFace.git) 编写 \ No newline at end of file diff --git a/examples/vision/facedet/centerface/cpp/CMakeLists.txt b/examples/vision/facedet/centerface/cpp/CMakeLists.txt new file mode 100644 index 000000000..9ba668762 --- /dev/null +++ b/examples/vision/facedet/centerface/cpp/CMakeLists.txt @@ -0,0 +1,14 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +# Specifies the path to the fastdeploy library after you have downloaded it +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# Include the FastDeploy dependency header file +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_demo 
${PROJECT_SOURCE_DIR}/infer.cc) +# Add the FastDeploy library dependency +target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) diff --git a/examples/vision/facedet/centerface/cpp/README.md b/examples/vision/facedet/centerface/cpp/README.md new file mode 100644 index 000000000..7c94c573a --- /dev/null +++ b/examples/vision/facedet/centerface/cpp/README.md @@ -0,0 +1,78 @@ +English | [简体中文](README_CN.md) +# CenterFace C++ Deployment Example + +This directory provides examples that `infer.cc` fast finishes the deployment of CenterFace on CPU/GPU and GPU accelerated by TensorRT. + +Before deployment, two steps require confirmation + +- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md) +- 2. Download the precompiled deployment library and samples code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md) + +Taking the CPU inference on Linux as an example, the compilation test can be completed by executing the following command in this directory. + +```bash +mkdir build +cd build +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz # x.x.x > 1.0.4 +tar xvf fastdeploy-linux-x64-x.x.x.tgz # x.x.x > 1.0.4 +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x # x.x.x > 1.0.4 +make -j + +# Download the official converted CenterFace model files and test images +wget https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg +wget https://bj.bcebos.com/paddlehub/fastdeploy/CenterFace.onnx + +# Use CenterFace.onnx model +# CPU inference +./infer_demo CenterFace.onnx test_lite_face_detector_3.jpg 0 +# GPU inference +./infer_demo CenterFace.onnx test_lite_face_detector_3.jpg 1 +# TensorRT inference on GPU +./infer_demo CenterFace.onnx test_lite_face_detector_3.jpg 2 +``` + +The visualized result after running is as follows + + + +The above command works for Linux or MacOS. For SDK use-pattern in Windows, refer to: +- [How to use FastDeploy C++ SDK in Windows](../../../../../docs/cn/faq/use_sdk_on_windows.md) + +## CenterFace C++ Interface + +### CenterFace Class + +```c++ +fastdeploy::vision::facedet::CenterFace( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +CenterFace model loading and initialization, among which model_file is the exported ONNX model format + +**Parameter** + +> * **model_file**(str): Model file path +> * **params_file**(str): Parameter file path. Only passing an empty string when the model is in ONNX format +> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration +> * **model_format**(ModelFormat): Model format. ONNX format by default + +#### Predict Function + +> ```c++ +> CenterFace::Predict(cv::Mat* im, FaceDetectionResult* result) +> ``` +> +> Model prediction interface. Input images and output detection results. +> +> **Parameter** +> +> > * **im**: Input images in HWC or BGR format +> > * **result**: Detection results, including detection box and confidence of each box. 
Refer to [Vision Model Prediction Result](../../../../../docs/api/vision_results/) for FaceDetectionResult + +- [Model Description](../../) +- [Python Deployment](../python) +- [Vision Model Prediction Results](../../../../../docs/api/vision_results/) diff --git a/examples/vision/facedet/centerface/cpp/README_CN.md b/examples/vision/facedet/centerface/cpp/README_CN.md new file mode 100644 index 000000000..b9443271e --- /dev/null +++ b/examples/vision/facedet/centerface/cpp/README_CN.md @@ -0,0 +1,77 @@ +# CenterFace C++部署示例 + +本目录下提供`infer.cc`快速完成CenterFace在CPU/GPU,以及GPU上通过TensorRT加速部署的示例。 + +在部署前,需确认以下两个步骤 + +- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) +- 2. 根据开发环境,下载预编译部署库和samples代码,参考[FastDeploy预编译库](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +以Linux上CPU推理为例,在本目录执行如下命令即可完成编译测试 + +```bash +mkdir build +cd build +# 下载FastDeploy预编译库,用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用 +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz # x.x.x > 1.0.4 +tar xvf fastdeploy-linux-x64-x.x.x.tgz # x.x.x > 1.0.4 +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x # x.x.x > 1.0.4 +make -j + +#下载官方转换好的CenterFace模型文件和测试图片 +wget https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg +wget https://bj.bcebos.com/paddlehub/fastdeploy/CenterFace.onnx + +#使用CenterFace.onnx模型 +# CPU推理 +./infer_demo CenterFace.onnx test_lite_face_detector_3.jpg 0 +# GPU推理 +./infer_demo CenterFace.onnx test_lite_face_detector_3.jpg 1 +# GPU上TensorRT推理 +./infer_demo CenterFace.onnx test_lite_face_detector_3.jpg 2 +``` + +运行完成可视化结果如下图所示 + + + +以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考: +- [如何在Windows中使用FastDeploy C++ SDK](../../../../../docs/cn/faq/use_sdk_on_windows.md) + +## CenterFace C++接口 + +### CenterFace类 + +```c++ +fastdeploy::vision::facedet::CenterFace( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +CenterFace模型加载和初始化,其中model_file为导出的ONNX模型格式。 + +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径,当模型格式为ONNX时,此参数传入空字符串即可 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(ModelFormat): 模型格式,默认为ONNX格式 + +#### Predict函数 + +> ```c++ +> CenterFace::Predict(cv::Mat* im, FaceDetectionResult* result) +> ``` +> +> 模型预测接口,输入图像直接输出检测结果。 +> +> **参数** +> +> > * **im**: 输入图像,注意需为HWC,BGR格式 +> > * **result**: 检测结果,包括检测框,各个框的置信度, FaceDetectionResult说明参考[视觉模型预测结果](../../../../../docs/api/vision_results/) + +- [模型介绍](../../) +- [Python部署](../python) +- [视觉模型预测结果](../../../../../docs/api/vision_results/) diff --git a/examples/vision/facedet/centerface/cpp/infer.cc b/examples/vision/facedet/centerface/cpp/infer.cc new file mode 100644 index 000000000..1f4af8433 --- /dev/null +++ b/examples/vision/facedet/centerface/cpp/infer.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" + +void CpuInfer(const std::string& model_file, const std::string& image_file) { + auto model = fastdeploy::vision::facedet::CenterFace(model_file); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::FaceDetectionResult res; + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + std::cout << res.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisFaceDetection(im, res); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +void GpuInfer(const std::string& model_file, const std::string& image_file) { + auto option = fastdeploy::RuntimeOption(); + option.UseGpu(); + auto model = fastdeploy::vision::facedet::CenterFace(model_file, "", option); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::FaceDetectionResult res; + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + std::cout << res.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisFaceDetection(im, res); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +void TrtInfer(const std::string& model_file, const std::string& image_file) { + auto option = fastdeploy::RuntimeOption(); + option.UseGpu(); + option.UseTrtBackend(); + option.SetTrtInputShape("images", {1, 3, 640, 640}); + auto model = fastdeploy::vision::facedet::CenterFace(model_file, "", option); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::FaceDetectionResult res; + if (!model.Predict(im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + std::cout << res.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisFaceDetection(im, res); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +int main(int argc, char* argv[]) { + if (argc < 4) { + std::cout << "Usage: infer_demo path/to/model path/to/image run_option, " + "e.g ./infer_model yolov5s-face.onnx ./test.jpeg 0" + << std::endl; + std::cout << "The data type of run_option is int, 0: run with cpu; 1: run " + "with gpu; 2: run with gpu and use tensorrt backend." + << std::endl; + return -1; + } + + if (std::atoi(argv[3]) == 0) { + CpuInfer(argv[1], argv[2]); + } else if (std::atoi(argv[3]) == 1) { + GpuInfer(argv[1], argv[2]); + } else if (std::atoi(argv[3]) == 2) { + TrtInfer(argv[1], argv[2]); + } + return 0; +} diff --git a/examples/vision/facedet/centerface/python/README.md b/examples/vision/facedet/centerface/python/README.md new file mode 100644 index 000000000..7f7ecf97b --- /dev/null +++ b/examples/vision/facedet/centerface/python/README.md @@ -0,0 +1,75 @@ +English | [简体中文](README_CN.md) +# CenterFace Python Deployment Example + +Before deployment, two steps require confirmation + +- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md) +- 2. Install FastDeploy Python whl package. 
Refer to [FastDeploy Python Installation](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md) + +This directory provides examples that `infer.py` fast finishes the deployment of CenterFace on CPU/GPU and GPU accelerated by TensorRT. The script is as follows + +```bash +# Download the example code for deployment +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd examples/vision/facedet/CenterFace/python/ + +# Download CenterFace model files and test images +wget https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg +wget https://bj.bcebos.com/paddlehub/fastdeploy/CenterFace.onnx + +# Use CenterFace.onnx model +# CPU inference +python infer.py --model CenterFace.onnx --image test_lite_face_detector_3.jpg --device cpu +# GPU inference +python infer.py --model CenterFace.onnx --image test_lite_face_detector_3.jpg --device gpu +# TensorRT inference on GPU +python infer.py --model CenterFace.onnx --image test_lite_face_detector_3.jpg --device gpu --use_trt True +``` + +The visualized result after running is as follows + + + +## CenterFace Python Interface + +```python +fastdeploy.vision.facedet.CenterFace(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) +``` + +CenterFace model loading and initialization, among which model_file is the exported ONNX model format + +**Parameter** + +> * **model_file**(str): Model file path +> * **params_file**(str): Parameter file path. No need to set when the model is in ONNX format +> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration +> * **model_format**(ModelFormat): Model format. ONNX format by default + +### predict function + +> ```python +> CenterFace.predict(image_data) +> ``` +> +> Model prediction interface. Input images and output detection results. +> +> **Parameter** +> +> > * **image_data**(np.ndarray): Input data in HWC or BGR format + + +> **Return** +> +> > Return `fastdeploy.vision.FaceDetectionResult` structure. Refer to [Vision Model Prediction Results](../../../../../docs/api/vision_results/) for its description. + +### Class Member Property +#### Pre-processing Parameter +Users can modify the following pre-processing parameters to their needs, which affects the final inference and deployment results + +> > * **size**(list[int]): This parameter changes the size of the resize used during preprocessing, containing two integer elements for [width, height] with default value [640, 640] + +## Other Documents + +- [CenterFace Model Description](..) +- [CenterFace C++ Deployment](../cpp) +- [Model Prediction Results](../../../../../docs/api/vision_results/) diff --git a/examples/vision/facedet/centerface/python/README_CN.md b/examples/vision/facedet/centerface/python/README_CN.md new file mode 100644 index 000000000..6fb7f3909 --- /dev/null +++ b/examples/vision/facedet/centerface/python/README_CN.md @@ -0,0 +1,74 @@ +[English](README.md) | 简体中文 +# CenterFace Python部署示例 + +在部署前,需确认以下两个步骤 + +- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) +- 2. 
FastDeploy Python whl包安装,参考[FastDeploy Python安装](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +本目录下提供`infer.py`快速完成CenterFace在CPU/GPU,以及GPU上通过TensorRT加速部署的示例。执行如下脚本即可完成 + +```bash +#下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd examples/vision/facedet/CenterFace/python/ + +#下载CenterFace模型文件和测试图片 +wget https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg +wget https://bj.bcebos.com/paddlehub/fastdeploy/CenterFace.onnx + +#使用CenterFace.onnx模型 +# CPU推理 +python infer.py --model CenterFace.onnx --image test_lite_face_detector_3.jpg --device cpu +# GPU推理 +python infer.py --model CenterFace.onnx --image test_lite_face_detector_3.jpg --device gpu +# GPU上使用TensorRT推理 +python infer.py --model CenterFace.onnx --image test_lite_face_detector_3.jpg --device gpu --use_trt True +``` + +运行完成可视化结果如下图所示 + + + +## CenterFace Python接口 + +```python +fastdeploy.vision.facedet.CenterFace(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) +``` + +CenterFace模型加载和初始化,其中model_file为导出的ONNX模型格式 + +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径,当模型格式为ONNX格式时,此参数无需设定 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(ModelFormat): 模型格式,默认为ONNX + +### predict函数 + +> ```python +> CenterFace.predict(image_data) +> ``` +> +> 模型预测结口,输入图像直接输出检测结果。 +> +> **参数** +> +> > * **image_data**(np.ndarray): 输入数据,注意需为HWC,BGR格式 + +> **返回** +> +> > 返回`fastdeploy.vision.FaceDetectionResult`结构体,结构体说明参考文档[视觉模型预测结果](../../../../../docs/api/vision_results/) + +### 类成员属性 +#### 预处理参数 +用户可按照自己的实际需求,修改下列预处理参数,从而影响最终的推理和部署效果 + +> > * **size**(list[int]): 通过此参数修改预处理过程中resize的大小,包含两个整型元素,表示[width, height], 默认值为[640, 640] + +## 其它文档 + +- [CenterFace 模型介绍](..) 
+- [CenterFace C++部署](../cpp) +- [模型预测结果说明](../../../../../docs/api/vision_results/) diff --git a/examples/vision/facedet/centerface/python/infer.py b/examples/vision/facedet/centerface/python/infer.py new file mode 100644 index 000000000..39eeaf39e --- /dev/null +++ b/examples/vision/facedet/centerface/python/infer.py @@ -0,0 +1,51 @@ +import fastdeploy as fd +import cv2 + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", required=True, help="Path of CenterFace onnx model.") + parser.add_argument( + "--image", required=True, help="Path of test image file.") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Type of inference device, support 'cpu' or 'gpu'.") + parser.add_argument( + "--use_trt", + type=ast.literal_eval, + default=False, + help="Wether to use tensorrt.") + return parser.parse_args() + + +def build_option(args): + option = fd.RuntimeOption() + + if args.device.lower() == "gpu": + option.use_gpu() + + if args.use_trt: + option.use_trt_backend() + option.set_trt_input_shape("images", [1, 3, 640, 640]) + return option + + +args = parse_arguments() + +# Configure runtime and load the model +runtime_option = build_option(args) +model = fd.vision.facedet.CenterFace(args.model, runtime_option=runtime_option) + +# Predict image detection results +im = cv2.imread(args.image) +result = model.predict(im) +print(result) +# Visualization of prediction Results +vis_im = fd.vision.vis_face_detection(im, result) +cv2.imwrite("visualized_result.jpg", vis_im) +print("Visualized result save in ./visualized_result.jpg") diff --git a/examples/vision/faceid/insightface/rknpu2/cpp/README.md b/examples/vision/faceid/insightface/rknpu2/cpp/README.md index 0c09d4fbe..b2d1fe460 100644 --- a/examples/vision/faceid/insightface/rknpu2/cpp/README.md +++ b/examples/vision/faceid/insightface/rknpu2/cpp/README.md @@ -1,19 +1,19 @@ -[English](README.md) | 简体中文 -# InsightFace C++部署示例 +English | [简体中文](README_CN.md) +# InsightFace C++ Deployment Example -FastDeploy支持在RKNPU上部署包括ArcFace\CosFace\VPL\Partial_FC在内的InsightFace系列模型。 +FastDeploy supports the deployment of InsightFace models like ArcFace\CosFace\VPL\Partial_FC on RKNPU. -本目录下提供`infer_arcface.cc`快速完成InsighFace模型包括ArcFace在CPU/RKNPU加速部署的示例。 +This directoty provides the example that `infer_arcface.cc` fast finishes the deployment of InsighFace models like ArcFace on CPU/RKNPU. -在部署前,需确认以下两个步骤: +Two steps before deployment: -1. 软硬件环境满足要求 -2. 根据开发环境,下载预编译部署库或者从头编译FastDeploy仓库 +1. Software and hardware should meet the requirements. +2. Download the precompiled deployment library or deploy FastDeploy repository from scratch according to your development environment. -以上步骤请参考[RK2代NPU部署库编译](../../../../../../docs/cn/build_and_install/rknpu2.md)实现 +Refer to [RK2 generation NPU deployment library compilation](../../../../../../docs/cn/build_and_install/rknpu2.md) for the above steps -在本目录执行如下命令即可完成编译测试 +The compilation can be completed by executing the following command in this directory. ```bash mkdir build @@ -24,18 +24,18 @@ tar xvf fastdeploy-linux-x64-x.x.x.tgz cmake .. 
-DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x make -j -# 下载官方转换好的ArcFace模型文件和测试图片 +# Download the official converted ArcFace model files and test images wget https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r18.onnx wget https://bj.bcebos.com/paddlehub/fastdeploy/rknpu2/face_demo.zip unzip face_demo.zip -# CPU推理 +# CPU inference ./infer_arcface_demo ms1mv3_arcface_r100.onnx face_0.jpg face_1.jpg face_2.jpg 0 -# RKNPU推理 +# RKNPU inference ./infer_arcface_demo ms1mv3_arcface_r100.onnx face_0.jpg face_1.jpg face_2.jpg 1 ``` -运行完成可视化结果如下图所示 +The visualized result is as follows
@@ -43,12 +43,12 @@ unzip face_demo.zip
-以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考: -- [如何在Windows中使用FastDeploy C++ SDK](../../../../../../docs/cn/faq/use_sdk_on_windows.md) +The above command works for Linux or MacOS. For SDK in Windows, refer to: +- [How to use FastDeploy C++ SDK in Windows](../../../../../../docs/cn/faq/use_sdk_on_windows.md) -## InsightFace C++接口 +## InsightFace C++ Interface -### ArcFace类 +### ArcFace ```c++ fastdeploy::vision::faceid::ArcFace( @@ -58,9 +58,9 @@ fastdeploy::vision::faceid::ArcFace( const ModelFormat& model_format = ModelFormat::ONNX) ``` -ArcFace模型加载和初始化,其中model_file为导出的ONNX模型格式。 +ArcFace model loading and initialization, among which model_file is the exported ONNX model format -### CosFace类 +### CosFace ```c++ fastdeploy::vision::faceid::CosFace( @@ -70,9 +70,9 @@ fastdeploy::vision::faceid::CosFace( const ModelFormat& model_format = ModelFormat::ONNX) ``` -CosFace模型加载和初始化,其中model_file为导出的ONNX模型格式。 +CosFace model loading and initialization, among which model_file is the exported ONNX model format -### PartialFC类 +### PartialFC ```c++ fastdeploy::vision::faceid::PartialFC( @@ -82,9 +82,9 @@ fastdeploy::vision::faceid::PartialFC( const ModelFormat& model_format = ModelFormat::ONNX) ``` -PartialFC模型加载和初始化,其中model_file为导出的ONNX模型格式。 +PartialFC model loading and initialization, among which model_file is the exported ONNX model format -### VPL类 +### VPL ```c++ fastdeploy::vision::faceid::VPL( @@ -94,43 +94,43 @@ fastdeploy::vision::faceid::VPL( const ModelFormat& model_format = ModelFormat::ONNX) ``` -VPL模型加载和初始化,其中model_file为导出的ONNX模型格式。 -**参数** +VPL model loading and initialization, among which model_file is the exported ONNX model format +**Parameter** -> * **model_file**(str): 模型文件路径 -> * **params_file**(str): 参数文件路径,当模型格式为ONNX时,此参数传入空字符串即可 -> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 -> * **model_format**(ModelFormat): 模型格式,默认为ONNX格式 +> * **model_file**(str): Model file path +> * **params_file**(str): Parameter file path. Merely passing an empty string when the model is in ONNX format +> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration +> * **model_format**(ModelFormat): Model format. ONNX format by default -#### Predict函数 +#### Predict function > ```c++ > ArcFace::Predict(const cv::Mat& im, FaceRecognitionResult* result) > ``` > -> 模型预测接口,输入图像直接输出检测结果。 +> Model prediction interface. Input images and output detection results > -> **参数** +> **Parameter** > -> > * **im**: 输入图像,注意需为HWC,BGR格式 -> > * **result**: 检测结果,包括检测框,各个框的置信度, FaceRecognitionResult说明参考[视觉模型预测结果](../../../../../../docs/api/vision_results/) +> > * **im**: Input images in HWC or BGR format +> > * **result**: Detection results, including detection box and confidence of each box. 
Refer to [Vision Model Prediction Results] for the description of FaceRecognitionResult(../../../../../../docs/api/vision_results/) -### 修改预处理以及后处理的参数 -预处理和后处理的参数的需要通过修改InsightFaceRecognitionPostprocessor,InsightFaceRecognitionPreprocessor的成员变量来进行修改。 +### Change pre-processing and post-processing parameters +Pre-processing and post-processing parameters can be changed by modifying the member variables of InsightFaceRecognitionPostprocessor and InsightFaceRecognitionPreprocessor -#### InsightFaceRecognitionPreprocessor成员变量(预处理参数) -> > * **size**(vector<int>): 通过此参数修改预处理过程中resize的大小,包含两个整型元素,表示[width, height], 默认值为[112, 112], - 通过InsightFaceRecognitionPreprocessor::SetSize(std::vector& size)来进行修改 -> > * **alpha**(vector<float>): 预处理归一化的alpha值,计算公式为`x'=x*alpha+beta`,alpha默认为[1. / 127.5, 1.f / 127.5, 1. / 127.5], - 通过InsightFaceRecognitionPreprocessor::SetAlpha(std::vector& alpha)来进行修改 -> > * **beta**(vector<float>): 预处理归一化的beta值,计算公式为`x'=x*alpha+beta`,beta默认为[-1.f, -1.f, -1.f], - 通过InsightFaceRecognitionPreprocessor::SetBeta(std::vector& beta)来进行修改 +#### Member variables of InsightFaceRecognitionPreprocessor (preprocessing parameters) +> > * **size**(vector<int>): This parameter changes the resize during preprocessing, containing two integer elements for [width, height] with default value [112, 112]. + Revise through InsightFaceRecognitionPreprocessor::SetSize(std::vector& size) +> > * **alpha**(vector<float>): Preprocess normalized alpha, and calculated as `x'=x*alpha+beta`. Alpha defaults to [1. / 127.5, 1.f / 127.5, 1. / 127.5]. + Revise through InsightFaceRecognitionPreprocessor::SetAlpha(std::vector& alpha) +> > * **beta**(vector<float>): Preprocess normalized beta, and calculated as `x'=x*alpha+beta`. Alpha defaults to [-1.f, -1.f, -1.f], + Revise through InsightFaceRecognitionPreprocessor::SetBeta(std::vector& beta) -#### InsightFaceRecognitionPostprocessor成员变量(后处理参数) -> > * **l2_normalize**(bool): 输出人脸向量之前是否执行l2归一化,默认false, - InsightFaceRecognitionPostprocessor::SetL2Normalize(bool& l2_normalize)来进行修改 +#### Member variables of InsightFaceRecognitionPostprocessor(post-processing parameters) +> > * **l2_normalize**(bool): Whether to perform l2 normalization before outputting the face vector. Default false. + Revise through InsightFaceRecognitionPostprocessor::SetL2Normalize(bool& l2_normalize) -- [模型介绍](../../../) -- [Python部署](../python) -- [视觉模型预测结果](../../../../../../docs/api/vision_results/README.md) -- [如何切换模型推理后端引擎](../../../../../../docs/cn/faq/how_to_change_backend.md) +- [Model Description](../../../) +- [Python Deployemnt](../python) +- [Vision Model Prediction Results](../../../../../../docs/api/vision_results/README.md) +- [How to switch the backend engine](../../../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/vision/faceid/insightface/rknpu2/cpp/README_CN.md b/examples/vision/faceid/insightface/rknpu2/cpp/README_CN.md new file mode 100644 index 000000000..d72f356d8 --- /dev/null +++ b/examples/vision/faceid/insightface/rknpu2/cpp/README_CN.md @@ -0,0 +1,136 @@ +[English](README.md) | 简体中文 +# InsightFace C++部署示例 + +FastDeploy支持在RKNPU上部署包括ArcFace\CosFace\VPL\Partial_FC在内的InsightFace系列模型。 + +本目录下提供`infer_arcface.cc`快速完成InsighFace模型包括ArcFace在CPU/RKNPU加速部署的示例。 + + +在部署前,需确认以下两个步骤: + +1. 软硬件环境满足要求 +2. 
根据开发环境,下载预编译部署库或者从头编译FastDeploy仓库 + +以上步骤请参考[RK2代NPU部署库编译](../../../../../../docs/cn/build_and_install/rknpu2.md)实现 + +在本目录执行如下命令即可完成编译测试 + +```bash +mkdir build +cd build +# FastDeploy version need >=1.0.3 +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz +tar xvf fastdeploy-linux-x64-x.x.x.tgz +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x +make -j + +# 下载官方转换好的ArcFace模型文件和测试图片 +wget https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r18.onnx +wget https://bj.bcebos.com/paddlehub/fastdeploy/rknpu2/face_demo.zip +unzip face_demo.zip + +# CPU推理 +./infer_arcface_demo ms1mv3_arcface_r100.onnx face_0.jpg face_1.jpg face_2.jpg 0 +# RKNPU推理 +./infer_arcface_demo ms1mv3_arcface_r100.onnx face_0.jpg face_1.jpg face_2.jpg 1 +``` + +运行完成可视化结果如下图所示 + +
+ + + +
+ +以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考: +- [如何在Windows中使用FastDeploy C++ SDK](../../../../../../docs/cn/faq/use_sdk_on_windows.md) + +## InsightFace C++接口 + +### ArcFace类 + +```c++ +fastdeploy::vision::faceid::ArcFace( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +ArcFace模型加载和初始化,其中model_file为导出的ONNX模型格式。 + +### CosFace类 + +```c++ +fastdeploy::vision::faceid::CosFace( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +CosFace模型加载和初始化,其中model_file为导出的ONNX模型格式。 + +### PartialFC类 + +```c++ +fastdeploy::vision::faceid::PartialFC( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +PartialFC模型加载和初始化,其中model_file为导出的ONNX模型格式。 + +### VPL类 + +```c++ +fastdeploy::vision::faceid::VPL( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +VPL模型加载和初始化,其中model_file为导出的ONNX模型格式。 +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径,当模型格式为ONNX时,此参数传入空字符串即可 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(ModelFormat): 模型格式,默认为ONNX格式 + +#### Predict函数 + +> ```c++ +> ArcFace::Predict(const cv::Mat& im, FaceRecognitionResult* result) +> ``` +> +> 模型预测接口,输入图像直接输出检测结果。 +> +> **参数** +> +> > * **im**: 输入图像,注意需为HWC,BGR格式 +> > * **result**: 检测结果,包括检测框,各个框的置信度, FaceRecognitionResult说明参考[视觉模型预测结果](../../../../../../docs/api/vision_results/) + +### 修改预处理以及后处理的参数 +预处理和后处理的参数的需要通过修改InsightFaceRecognitionPostprocessor,InsightFaceRecognitionPreprocessor的成员变量来进行修改。 + +#### InsightFaceRecognitionPreprocessor成员变量(预处理参数) +> > * **size**(vector<int>): 通过此参数修改预处理过程中resize的大小,包含两个整型元素,表示[width, height], 默认值为[112, 112], + 通过InsightFaceRecognitionPreprocessor::SetSize(std::vector& size)来进行修改 +> > * **alpha**(vector<float>): 预处理归一化的alpha值,计算公式为`x'=x*alpha+beta`,alpha默认为[1. / 127.5, 1.f / 127.5, 1. / 127.5], + 通过InsightFaceRecognitionPreprocessor::SetAlpha(std::vector& alpha)来进行修改 +> > * **beta**(vector<float>): 预处理归一化的beta值,计算公式为`x'=x*alpha+beta`,beta默认为[-1.f, -1.f, -1.f], + 通过InsightFaceRecognitionPreprocessor::SetBeta(std::vector& beta)来进行修改 + +#### InsightFaceRecognitionPostprocessor成员变量(后处理参数) +> > * **l2_normalize**(bool): 输出人脸向量之前是否执行l2归一化,默认false, + 通过InsightFaceRecognitionPostprocessor::SetL2Normalize(bool& l2_normalize)来进行修改 + +- [模型介绍](../../../) +- [Python部署](../python) +- [视觉模型预测结果](../../../../../../docs/api/vision_results/README.md) +- [如何切换模型推理后端引擎](../../../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/vision/faceid/insightface/rknpu2/python/README.md b/examples/vision/faceid/insightface/rknpu2/python/README.md new file mode 100644 index 000000000..a05ec5351 --- /dev/null +++ b/examples/vision/faceid/insightface/rknpu2/python/README.md @@ -0,0 +1,108 @@ +English | [简体中文](README_CN.md) +# InsightFace Python Deployment Example + +FastDeploy supports the deployment of InsightFace models like ArcFace\CosFace\VPL\Partial on RKNPU. + +This directoty provides the example that `infer_arcface.py` fast finishes the deployment of InsighFace models like ArcFace on CPU/RKNPU. + + +Two steps before deployment: + +- 1. 
Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../../../docs/cn/build_and_install/rknpu2.md)
+
+```bash
+# Download the example code for deployment
+git clone https://github.com/PaddlePaddle/FastDeploy.git
+cd FastDeploy/examples/vision/faceid/insightface/rknpu2/python/
+
+# Download ArcFace model files and test images
+wget https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r100.onnx
+wget https://bj.bcebos.com/paddlehub/fastdeploy/rknpu2/face_demo.zip
+unzip face_demo.zip
+
+# CPU inference
+python infer_arcface.py --model ms1mv3_arcface_r100.onnx \
+                        --face face_0.jpg \
+                        --face_positive face_1.jpg \
+                        --face_negative face_2.jpg \
+                        --device cpu
+# GPU inference
+python infer_arcface.py --model ms1mv3_arcface_r100.onnx \
+                        --face face_0.jpg \
+                        --face_positive face_1.jpg \
+                        --face_negative face_2.jpg \
+                        --device gpu
+```
+
+The visualized result is as follows
+
+
+```bash
+Prediction Done!
+--- [Face 0]:FaceRecognitionResult: [Dim(512), Min(-2.309220), Max(2.372197), Mean(0.016987)]
+--- [Face 1]:FaceRecognitionResult: [Dim(512), Min(-2.288258), Max(1.995104), Mean(-0.003400)]
+--- [Face 2]:FaceRecognitionResult: [Dim(512), Min(-3.243411), Max(3.875866), Mean(-0.030682)]
+Detect Done! Cosine 01: 0.814385, Cosine 02:-0.059388
+
+```
+
+## InsightFace Python interface
+
+```python
+fastdeploy.vision.faceid.ArcFace(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX)
+fastdeploy.vision.faceid.CosFace(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX)
+fastdeploy.vision.faceid.PartialFC(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX)
+fastdeploy.vision.faceid.VPL(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX)
+```
+
+ArcFace model loading and initialization, where model_file is the exported ONNX model; CosFace, PartialFC and VPL are initialized in the same way
+
+**Parameter**
+
+> * **model_file**(str): Model file path
+> * **params_file**(str): Parameter file path. No need to set when the model is in ONNX format
+> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration
+> * **model_format**(ModelFormat): Model format. ONNX format by default
+
+### predict function
+
+> ```python
+> ArcFace.predict(image_data)
+> ```
+>
+> Model prediction interface. Input an image and output the face recognition result
+>
+> **Parameter**
+>
+> > * **image_data**(np.ndarray): Input image in HWC, BGR format
+
+> **Return**
+>
+> > Return the `fastdeploy.vision.FaceRecognitionResult` structure. Refer to [Vision Model Prediction Results](../../../../../../docs/api/vision_results/) for its description
+
+### Class Member Property
+#### Pre-processing Parameter
+Users can modify the following preprocessing parameters according to actual needs, which will affect the final inference and deployment results.
+
+#### Member Variables of InsightFaceRecognitionPreprocessor
+The following are the member variables of InsightFaceRecognitionPreprocessor
+> > * **size**(list[int]): This parameter changes the resize used during preprocessing, containing two integer elements for [width, height] with default value [112, 112]
+> > * **alpha**(list[float]): Preprocess normalized alpha, calculated as `x'=x*alpha+beta`. alpha defaults to [1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5]
+> > * **beta**(list[float]): Preprocess normalized beta, calculated as `x'=x*alpha+beta`. beta defaults to [-1.0, -1.0, -1.0]
+
+#### Member Variables of InsightFaceRecognitionPostprocessor
+The following are the member variables of InsightFaceRecognitionPostprocessor
+> > * **l2_normalize**(bool): Whether to perform l2 normalization before outputting the face vector. Default false.
+
+
+## Other Documents
+
+- [InsightFace Model Description](..)
+- [InsightFace C++ Deployment](../cpp) +- [Vision Model Prediction Results](../../../../../../docs/api/vision_results/) +- [How to switch the backend engine](../../../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/vision/faceid/insightface/rknpu2/python/README_CN.md b/examples/vision/faceid/insightface/rknpu2/python/README_CN.md index c45f28cfc..d154dc16c 100644 --- a/examples/vision/faceid/insightface/rknpu2/python/README_CN.md +++ b/examples/vision/faceid/insightface/rknpu2/python/README_CN.md @@ -75,7 +75,7 @@ ArcFace模型加载和初始化,其中model_file为导出的ONNX模型格式 > ArcFace.predict(image_data) > ``` > -> 模型预测结口,输入图像直接输出检测结果。 +> 模型预测接口,输入图像直接输出检测结果。 > > **参数** > diff --git a/examples/vision/ocr/PP-OCRv2/cpp/README.md b/examples/vision/ocr/PP-OCRv2/cpp/README.md index 69454b6a1..99748afba 100755 --- a/examples/vision/ocr/PP-OCRv2/cpp/README.md +++ b/examples/vision/ocr/PP-OCRv2/cpp/README.md @@ -1,7 +1,7 @@ English | [简体中文](README_CN.md) # PPOCRv2 C++ Deployment Example -This directory provides examples that `infer.cc` fast finishes the deployment of PPOCRv2 on CPU/GPU and GPU accelerated by TensorRT. +This directory provides examples that `infer.cc` fast finishes the deployment of PPOCRv2 on CPU/GPU and GPU accelerated by TensorRT. Two steps before deployment @@ -13,7 +13,7 @@ Taking the CPU inference on Linux as an example, the compilation test can be com ``` mkdir build cd build -# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz tar xvf fastdeploy-linux-x64-x.x.x.tgz cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x @@ -54,7 +54,7 @@ The visualized result after running is as follows -## PPOCRv2 C++ Interface +## PPOCRv2 C++ Interface ### PPOCRv2 Class @@ -98,7 +98,7 @@ The initialization of PPOCRv2, consisting of detection and recognition models (N > > * **result**: OCR prediction results, including the position of the detection box from the detection model, the classification of the direction from the classification model, and the recognition result from the recognition model. Refer to [Vision Model Prediction Results](../../../../../docs/api/vision_results/) for OCRResult -## DBDetector C++ Interface +## DBDetector C++ Interface ### DBDetector Class @@ -112,7 +112,7 @@ DBDetector model loading and initialization. The model is in paddle format. **Parameter** -> * **model_file**(str): Model file path +> * **model_file**(str): Model file path > * **params_file**(str): Parameter file path. Merely passing an empty string when the model is in ONNX format > * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration > * **model_format**(ModelFormat): Model format. Paddle format by default @@ -139,7 +139,7 @@ Users can modify the following pre-processing parameters to their needs, which a > > * **max_side_len**(int): The long side’s maximum size of the oriented view before detection. The long side will be resized to this size when exceeding the value. And the short side will be scaled in equal proportion. Default 960 > > * **det_db_thresh**(double): The binarization threshold of the prediction image from DB models. 
Default 0.3 -> > * **det_db_box_thresh**(double): The threshold for the output box of DB models, below which the predicted box is discarded. Default 0.6 +> > * **det_db_box_thresh**(double): The threshold for the output box of DB models, below which the predicted box is discarded. Default 0.6 > > * **det_db_unclip_ratio**(double): The expansion ratio of the DB model output box. Default 1.5 > > * **det_db_score_mode**(string): The way to calculate the average score of the text box in DB post-processing. Default slow, which is identical to the calculation of the polygon area’s average score > > * **use_dilation**(bool): Whether to expand the feature map from the detection. Default False diff --git a/examples/vision/ocr/PP-OCRv2/cpp/infer_static_shape.cc b/examples/vision/ocr/PP-OCRv2/cpp/infer_static_shape.cc old mode 100755 new mode 100644 index ba5527a2e..7a48ba879 --- a/examples/vision/ocr/PP-OCRv2/cpp/infer_static_shape.cc +++ b/examples/vision/ocr/PP-OCRv2/cpp/infer_static_shape.cc @@ -19,7 +19,12 @@ const char sep = '\\'; const char sep = '/'; #endif -void InitAndInfer(const std::string& det_model_dir, const std::string& cls_model_dir, const std::string& rec_model_dir, const std::string& rec_label_file, const std::string& image_file, const fastdeploy::RuntimeOption& option) { +void InitAndInfer(const std::string& det_model_dir, + const std::string& cls_model_dir, + const std::string& rec_model_dir, + const std::string& rec_label_file, + const std::string& image_file, + const fastdeploy::RuntimeOption& option) { auto det_model_file = det_model_dir + sep + "inference.pdmodel"; auto det_params_file = det_model_dir + sep + "inference.pdiparams"; @@ -33,33 +38,40 @@ void InitAndInfer(const std::string& det_model_dir, const std::string& cls_model auto cls_option = option; auto rec_option = option; - auto det_model = fastdeploy::vision::ocr::DBDetector(det_model_file, det_params_file, det_option); - auto cls_model = fastdeploy::vision::ocr::Classifier(cls_model_file, cls_params_file, cls_option); - auto rec_model = fastdeploy::vision::ocr::Recognizer(rec_model_file, rec_params_file, rec_label_file, rec_option); + auto det_model = fastdeploy::vision::ocr::DBDetector( + det_model_file, det_params_file, det_option); + auto cls_model = fastdeploy::vision::ocr::Classifier( + cls_model_file, cls_params_file, cls_option); + auto rec_model = fastdeploy::vision::ocr::Recognizer( + rec_model_file, rec_params_file, rec_label_file, rec_option); - // Users could enable static shape infer for rec model when deploy PP-OCR on hardware - // which can not support dynamic shape infer well, like Huawei Ascend series. + // Users could enable static shape infer for rec model when deploy PP-OCR on + // hardware + // which can not support dynamic shape infer well, like Huawei Ascend series. rec_model.GetPreprocessor().SetStaticShapeInfer(true); assert(det_model.Initialized()); assert(cls_model.Initialized()); assert(rec_model.Initialized()); - // The classification model is optional, so the PP-OCR can also be connected in series as follows + // The classification model is optional, so the PP-OCR can also be connected + // in series as follows // auto ppocr_v2 = fastdeploy::pipeline::PPOCRv2(&det_model, &rec_model); - auto ppocr_v2 = fastdeploy::pipeline::PPOCRv2(&det_model, &cls_model, &rec_model); + auto ppocr_v2 = + fastdeploy::pipeline::PPOCRv2(&det_model, &cls_model, &rec_model); - // When users enable static shape infer for rec model, the batch size of cls and rec model must to be set to 1. 
+ // When users enable static shape infer for rec model, the batch size of cls + // and rec model must to be set to 1. ppocr_v2.SetClsBatchSize(1); - ppocr_v2.SetRecBatchSize(1); + ppocr_v2.SetRecBatchSize(1); - if(!ppocr_v2.Initialized()){ + if (!ppocr_v2.Initialized()) { std::cerr << "Failed to initialize PP-OCR." << std::endl; return; } auto im = cv::imread(image_file); - + fastdeploy::vision::OCRResult result; if (!ppocr_v2.Predict(im, &result)) { std::cerr << "Failed to predict." << std::endl; @@ -92,7 +104,7 @@ int main(int argc, char* argv[]) { int flag = std::atoi(argv[6]); if (flag == 0) { - option.UseCpu(); + option.UseCpu(); } else if (flag == 1) { option.UseAscend(); } @@ -102,6 +114,7 @@ int main(int argc, char* argv[]) { std::string rec_model_dir = argv[3]; std::string rec_label_file = argv[4]; std::string test_image = argv[5]; - InitAndInfer(det_model_dir, cls_model_dir, rec_model_dir, rec_label_file, test_image, option); + InitAndInfer(det_model_dir, cls_model_dir, rec_model_dir, rec_label_file, + test_image, option); return 0; } diff --git a/examples/vision/ocr/PP-OCRv3/cpp/README.md b/examples/vision/ocr/PP-OCRv3/cpp/README.md index 79c7af33e..923bda513 100755 --- a/examples/vision/ocr/PP-OCRv3/cpp/README.md +++ b/examples/vision/ocr/PP-OCRv3/cpp/README.md @@ -1,7 +1,7 @@ English | [简体中文](README_CN.md) # PPOCRv3 C++ Deployment Example -This directory provides examples that `infer.cc` fast finishes the deployment of PPOCRv3 on CPU/GPU and GPU accelerated by TensorRT. +This directory provides examples that `infer.cc` fast finishes the deployment of PPOCRv3 on CPU/GPU and GPU accelerated by TensorRT. Two steps before deployment @@ -13,7 +13,7 @@ Taking the CPU inference on Linux as an example, the compilation test can be com ``` mkdir build cd build -# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz tar xvf fastdeploy-linux-x64-x.x.x.tgz cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x @@ -44,6 +44,8 @@ wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_ ./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 3 # KunlunXin XPU inference ./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 4 +# Huawei Ascend inference, need to use the infer_static_shape_demo, if the user needs to predict the image continuously, the input image size needs to be prepared as a uniform size. +./infer_static_shape_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 1 ``` The above command works for Linux or MacOS. For SDK in Windows, refer to: diff --git a/examples/vision/segmentation/paddleseg/cpp/README.md b/examples/vision/segmentation/paddleseg/cpp/README.md index 4c5be9f6c..572e38078 100755 --- a/examples/vision/segmentation/paddleseg/cpp/README.md +++ b/examples/vision/segmentation/paddleseg/cpp/README.md @@ -1,7 +1,7 @@ English | [简体中文](README_CN.md) # PaddleSeg C++ Deployment Example -This directory provides examples that `infer.cc` fast finishes the deployment of Unet on CPU/GPU and GPU accelerated by TensorRT. 
+This directory provides examples that `infer.cc` fast finishes the deployment of Unet on CPU/GPU and GPU accelerated by TensorRT. Before deployment, two steps require confirmation @@ -15,7 +15,7 @@ Taking the inference on Linux as an example, the compilation test can be complet ```bash mkdir build cd build -# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz tar xvf fastdeploy-linux-x64-x.x.x.tgz cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x @@ -35,6 +35,8 @@ wget https://paddleseg.bj.bcebos.com/dygraph/demo/cityscapes_demo.png ./infer_demo Unet_cityscapes_without_argmax_infer cityscapes_demo.png 2 # kunlunxin XPU inference ./infer_demo Unet_cityscapes_without_argmax_infer cityscapes_demo.png 3 +# Huawei Ascend Inference +./infer_demo Unet_cityscapes_without_argmax_infer cityscapes_demo.png 4 ``` The visualized result after running is as follows @@ -45,7 +47,7 @@ The visualized result after running is as follows The above command works for Linux or MacOS. For SDK use-pattern in Windows, refer to: - [How to use FastDeploy C++ SDK in Windows](../../../../../docs/cn/faq/use_sdk_on_windows.md) -## PaddleSeg C++ Interface +## PaddleSeg C++ Interface ### PaddleSeg Class @@ -62,7 +64,7 @@ PaddleSegModel model loading and initialization, among which model_file is the e **Parameter** -> * **model_file**(str): Model file path +> * **model_file**(str): Model file path > * **params_file**(str): Parameter file path > * **config_file**(str): Inference deployment configuration file > * **runtime_option**(RuntimeOption): Backend inference configuration. None by default, which is the default configuration diff --git a/fastdeploy/benchmark/benchmark.h b/fastdeploy/benchmark/benchmark.h new file mode 100755 index 000000000..b7463d3e9 --- /dev/null +++ b/fastdeploy/benchmark/benchmark.h @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include "fastdeploy/core/config.h" +#include "fastdeploy/utils/utils.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/benchmark/option.h" +#include "fastdeploy/benchmark/results.h" + +#ifdef ENABLE_BENCHMARK + #define __RUNTIME_PROFILE_LOOP_BEGIN(option, base_loop) \ + int __p_loop = (base_loop); \ + const bool __p_enable_profile = option.enable_profile; \ + const bool __p_include_h2d_d2h = option.include_h2d_d2h; \ + const int __p_repeats = option.repeats; \ + const int __p_warmup = option.warmup; \ + if (__p_enable_profile && (!__p_include_h2d_d2h)) { \ + __p_loop = (__p_repeats) + (__p_warmup); \ + FDINFO << option << std::endl; \ + } \ + TimeCounter __p_tc; \ + bool __p_tc_start = false; \ + for (int __p_i = 0; __p_i < __p_loop; ++__p_i) { \ + if (__p_i >= (__p_warmup) && (!__p_tc_start)) { \ + __p_tc.Start(); \ + __p_tc_start = true; \ + } \ + + #define __RUNTIME_PROFILE_LOOP_END(result) \ + } \ + if ((__p_enable_profile && (!__p_include_h2d_d2h))) { \ + if (__p_tc_start) { \ + __p_tc.End(); \ + double __p_tc_duration = __p_tc.Duration(); \ + result.time_of_runtime = \ + __p_tc_duration / static_cast(__p_repeats); \ + } \ + } + + #define __RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN(option, base_loop) \ + int __p_loop_h = (base_loop); \ + const bool __p_enable_profile_h = option.enable_profile; \ + const bool __p_include_h2d_d2h_h = option.include_h2d_d2h; \ + const int __p_repeats_h = option.repeats; \ + const int __p_warmup_h = option.warmup; \ + if (__p_enable_profile_h && __p_include_h2d_d2h_h) { \ + __p_loop_h = (__p_repeats_h) + (__p_warmup_h); \ + FDINFO << option << std::endl; \ + } \ + TimeCounter __p_tc_h; \ + bool __p_tc_start_h = false; \ + for (int __p_i_h = 0; __p_i_h < __p_loop_h; ++__p_i_h) { \ + if (__p_i_h >= (__p_warmup_h) && (!__p_tc_start_h)) { \ + __p_tc_h.Start(); \ + __p_tc_start_h = true; \ + } \ + + #define __RUNTIME_PROFILE_LOOP_H2D_D2H_END(result) \ + } \ + if ((__p_enable_profile_h && __p_include_h2d_d2h_h)) { \ + if (__p_tc_start_h) { \ + __p_tc_h.End(); \ + double __p_tc_duration_h = __p_tc_h.Duration(); \ + result.time_of_runtime = \ + __p_tc_duration_h / static_cast(__p_repeats_h); \ + } \ + } +#else + #define __RUNTIME_PROFILE_LOOP_BEGIN(option, base_loop) \ + for (int __p_i = 0; __p_i < (base_loop); ++__p_i) { + #define __RUNTIME_PROFILE_LOOP_END(result) } + #define __RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN(option, base_loop) \ + for (int __p_i_h = 0; __p_i_h < (base_loop); ++__p_i_h) { + #define __RUNTIME_PROFILE_LOOP_H2D_D2H_END(result) } +#endif diff --git a/fastdeploy/benchmark/option.h b/fastdeploy/benchmark/option.h new file mode 100755 index 000000000..5af9f1585 --- /dev/null +++ b/fastdeploy/benchmark/option.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +namespace fastdeploy { + +/** \brief All C++ FastDeploy benchmark profile APIs are defined inside this namespace +* +*/ +namespace benchmark { + +/*! 
@brief Option object used to control the behavior of the benchmark profiling. + */ +struct BenchmarkOption { + int warmup = 50; ///< Warmup for backend inference. + int repeats = 100; ///< Repeats for backend inference. + bool enable_profile = false; ///< Whether to use profile or not. + bool include_h2d_d2h = false; ///< Whether to include time of H2D_D2H for time of runtime. // NOLINT + + friend std::ostream& operator<<( + std::ostream& output, const BenchmarkOption &option) { + if (!option.include_h2d_d2h) { + output << "Running profiling for Runtime " + << "without H2D and D2H, "; + } else { + output << "Running profiling for Runtime " + << "with H2D and D2H, "; + } + output << "Repeats: " << option.repeats << ", " + << "Warmup: " << option.warmup; + return output; + } +}; + +} // namespace benchmark +} // namespace fastdeploy diff --git a/fastdeploy/benchmark/results.h b/fastdeploy/benchmark/results.h new file mode 100644 index 000000000..ed5d003e3 --- /dev/null +++ b/fastdeploy/benchmark/results.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +namespace fastdeploy { +namespace benchmark { + +/*! @brief Result object used to record the time of runtime after benchmark profiling is done. + */ +struct BenchmarkResult { + ///< Means pure_backend_time+time_of_h2d_d2h(if include_h2d_d2h=true). + double time_of_runtime = 0.0f; +}; + +} // namespace benchmark +} // namespace fastdeploy \ No newline at end of file diff --git a/fastdeploy/benchmark/utils.cc b/fastdeploy/benchmark/utils.cc new file mode 100755 index 000000000..2b0bd9df1 --- /dev/null +++ b/fastdeploy/benchmark/utils.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#if defined(__linux__) || defined(__ANDROID__) +#include +#endif +#include + +#include "fastdeploy/benchmark/utils.h" + +namespace fastdeploy { +namespace benchmark { + +// Remove the ch characters at both ends of str +static std::string strip(const std::string& str, char ch = ' ') { + int i = 0; + while (str[i] == ch) { + i++; + } + int j = str.size() - 1; + while (str[j] == ch) { + j--; + } + return str.substr(i, j + 1 - i); +} + +void DumpCurrentCpuMemoryUsage(const std::string& name) { +#if defined(__linux__) || defined(__ANDROID__) + int iPid = static_cast(getpid()); + std::string command = "pmap -x " + std::to_string(iPid) + " | grep total"; + FILE* pp = popen(command.data(), "r"); + if (!pp) return; + char tmp[1024]; + + while (fgets(tmp, sizeof(tmp), pp) != NULL) { + std::ofstream write; + write.open(name, std::ios::app); + write << tmp; + write.close(); + } + pclose(pp); +#else + FDASSERT(false, + "Currently collect cpu memory info only supports Linux and ANDROID.") +#endif + return; +} + +void DumpCurrentGpuMemoryUsage(const std::string& name, int device_id) { +#if defined(__linux__) && defined(WITH_GPU) + std::string command = "nvidia-smi --id=" + std::to_string(device_id) + + " --query-gpu=index,uuid,name,timestamp,memory.total," + "memory.free,memory.used,utilization.gpu,utilization." + "memory --format=csv,noheader,nounits"; + FILE* pp = popen(command.data(), "r"); + if (!pp) return; + char tmp[1024]; + + while (fgets(tmp, sizeof(tmp), pp) != NULL) { + std::ofstream write; + write.open(name, std::ios::app); + write << tmp; + write.close(); + } + pclose(pp); +#else + FDASSERT(false, + "Currently collect gpu memory info only supports Linux in GPU.") +#endif + return; +} + +float GetCpuMemoryUsage(const std::string& name) { + std::ifstream read(name); + std::string line; + float max_cpu_mem = -1; + while (getline(read, line)) { + std::stringstream ss(line); + std::string tmp; + std::vector nums; + while (getline(ss, tmp, ' ')) { + tmp = strip(tmp); + if (tmp.empty()) continue; + nums.push_back(tmp); + } + max_cpu_mem = std::max(max_cpu_mem, stof(nums[3])); + } + return max_cpu_mem / 1024; +} + +float GetGpuMemoryUsage(const std::string& name) { + std::ifstream read(name); + std::string line; + float max_gpu_mem = -1; + while (getline(read, line)) { + std::stringstream ss(line); + std::string tmp; + std::vector nums; + while (getline(ss, tmp, ',')) { + tmp = strip(tmp); + if (tmp.empty()) continue; + nums.push_back(tmp); + } + max_gpu_mem = std::max(max_gpu_mem, stof(nums[6])); + } + return max_gpu_mem; +} + +} // namespace benchmark +} // namespace fastdeploy diff --git a/fastdeploy/benchmark/utils.h b/fastdeploy/benchmark/utils.h new file mode 100755 index 000000000..12770f365 --- /dev/null +++ b/fastdeploy/benchmark/utils.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include "fastdeploy/utils/utils.h" + +namespace fastdeploy { +namespace benchmark { + +// Record current cpu memory usage into file +FASTDEPLOY_DECL void DumpCurrentCpuMemoryUsage(const std::string& name); + +// Record current gpu memory usage into file +FASTDEPLOY_DECL void DumpCurrentGpuMemoryUsage(const std::string& name, + int device_id); + +// Get Max cpu memory usage +FASTDEPLOY_DECL float GetCpuMemoryUsage(const std::string& name); + +// Get Max gpu memory usage +FASTDEPLOY_DECL float GetGpuMemoryUsage(const std::string& name); + +} // namespace benchmark +} // namespace fastdeploy diff --git a/fastdeploy/core/config.h.in b/fastdeploy/core/config.h.in index e6f202961..5593f9fd8 100755 --- a/fastdeploy/core/config.h.in +++ b/fastdeploy/core/config.h.in @@ -56,3 +56,7 @@ #ifndef ENABLE_TEXT #cmakedefine ENABLE_TEXT #endif + +#ifndef ENABLE_BENCHMARK +#cmakedefine ENABLE_BENCHMARK +#endif \ No newline at end of file diff --git a/fastdeploy/core/fd_tensor.cc b/fastdeploy/core/fd_tensor.cc index 533e58fd8..8b111025d 100644 --- a/fastdeploy/core/fd_tensor.cc +++ b/fastdeploy/core/fd_tensor.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "fastdeploy/core/fd_tensor.h" -#include "fastdeploy/core/float16.h" -#include "fastdeploy/utils/utils.h" #include #include + +#include "fastdeploy/core/float16.h" +#include "fastdeploy/utils/utils.h" #ifdef WITH_GPU #include #endif @@ -142,6 +143,9 @@ void FDTensor::Resize(const std::vector& new_shape, const FDDataType& data_type, const std::string& tensor_name, const Device& new_device) { + if (device != new_device) { + FreeFn(); + } external_data_ptr = nullptr; name = tensor_name; device = new_device; @@ -269,9 +273,10 @@ bool FDTensor::ReallocFn(size_t nbytes) { } return buffer_ != nullptr; #else - FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under " - "-DWITH_GPU=ON," - "so this is an unexpected problem happend."); + FDASSERT(false, + "The FastDeploy FDTensor allocator didn't compile under " + "-DWITH_GPU=ON," + "so this is an unexpected problem happend."); #endif } else { if (is_pinned_memory) { @@ -285,9 +290,10 @@ bool FDTensor::ReallocFn(size_t nbytes) { } return buffer_ != nullptr; #else - FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under " - "-DWITH_GPU=ON," - "so this is an unexpected problem happend."); + FDASSERT(false, + "The FastDeploy FDTensor allocator didn't compile under " + "-DWITH_GPU=ON," + "so this is an unexpected problem happend."); #endif } buffer_ = realloc(buffer_, nbytes); @@ -296,8 +302,7 @@ bool FDTensor::ReallocFn(size_t nbytes) { } void FDTensor::FreeFn() { - if (external_data_ptr != nullptr) - external_data_ptr = nullptr; + if (external_data_ptr != nullptr) external_data_ptr = nullptr; if (buffer_ != nullptr) { if (device == Device::GPU) { #ifdef WITH_GPU @@ -381,13 +386,16 @@ FDTensor::FDTensor(const Scalar& scalar) { (reinterpret_cast(Data()))[0] = scalar.to(); break; default: - break; + break; } } FDTensor::FDTensor(const FDTensor& other) - : shape(other.shape), name(other.name), dtype(other.dtype), - device(other.device), external_data_ptr(other.external_data_ptr), + : shape(other.shape), + name(other.name), + dtype(other.dtype), + device(other.device), + external_data_ptr(other.external_data_ptr), device_id(other.device_id) { // Copy buffer if (other.buffer_ == nullptr) { @@ -401,9 +409,12 @@ FDTensor::FDTensor(const FDTensor& other) } FDTensor::FDTensor(FDTensor&& 
other) - : buffer_(other.buffer_), shape(std::move(other.shape)), - name(std::move(other.name)), dtype(other.dtype), - external_data_ptr(other.external_data_ptr), device(other.device), + : buffer_(other.buffer_), + shape(std::move(other.shape)), + name(std::move(other.name)), + dtype(other.dtype), + external_data_ptr(other.external_data_ptr), + device(other.device), device_id(other.device_id) { other.name = ""; // Note(zhoushunjie): Avoid double free. diff --git a/fastdeploy/fastdeploy_model.cc b/fastdeploy/fastdeploy_model.cc index 9eff985fb..d909a6138 100644 --- a/fastdeploy/fastdeploy_model.cc +++ b/fastdeploy/fastdeploy_model.cc @@ -31,7 +31,8 @@ std::string Str(const std::vector& backends) { return oss.str(); } -bool IsSupported(const std::vector& backends, Backend backend) { +bool CheckBackendSupported(const std::vector& backends, + Backend backend) { for (size_t i = 0; i < backends.size(); ++i) { if (backends[i] == backend) { return true; @@ -40,6 +41,22 @@ bool IsSupported(const std::vector& backends, Backend backend) { return false; } +bool FastDeployModel::IsSupported(const std::vector& backends, + Backend backend) { +#ifdef ENABLE_BENCHMARK + if (runtime_option.benchmark_option.enable_profile) { + FDWARNING << "In benchmark mode, we don't check to see if " + << "the backend [" << backend + << "] is supported for current model!" + << std::endl; + return true; + } + return CheckBackendSupported(backends, backend); +#else + return CheckBackendSupported(backends, backend); +#endif +} + bool FastDeployModel::InitRuntimeWithSpecifiedBackend() { if (!IsBackendAvailable(runtime_option.backend)) { FDERROR << runtime_option.backend @@ -367,12 +384,13 @@ bool FastDeployModel::Infer(std::vector& input_tensors, tc.End(); if (time_of_runtime_.size() > 50000) { FDWARNING << "There are already 50000 records of runtime, will force to " - "disable record time of runtime now." + "disable record time of runtime now." 
<< std::endl; enable_record_time_of_runtime_ = false; } time_of_runtime_.push_back(tc.Duration()); } + return ret; } @@ -416,6 +434,7 @@ std::map FastDeployModel::PrintStatisInfoOfRuntime() { statis_info_of_runtime_dict["warmup_iter"] = warmup_iter; statis_info_of_runtime_dict["avg_time"] = avg_time; statis_info_of_runtime_dict["iterations"] = time_of_runtime_.size(); + return statis_info_of_runtime_dict; } } // namespace fastdeploy diff --git a/fastdeploy/fastdeploy_model.h b/fastdeploy/fastdeploy_model.h index 698827cc2..037bb2192 100755 --- a/fastdeploy/fastdeploy_model.h +++ b/fastdeploy/fastdeploy_model.h @@ -75,7 +75,7 @@ class FASTDEPLOY_DECL FastDeployModel { return runtime_initialized_ && initialized; } - /** \brief This is a debug interface, used to record the time of backend runtime + /** \brief This is a debug interface, used to record the time of runtime (backend + h2d + d2h) * * example code @code * auto model = fastdeploy::vision::PPYOLOE("model.pdmodel", "model.pdiparams", "infer_cfg.yml"); @@ -98,7 +98,7 @@ class FASTDEPLOY_DECL FastDeployModel { enable_record_time_of_runtime_ = true; } - /** \brief Disable to record the time of backend runtime, see `EnableRecordTimeOfRuntime()` for more detail + /** \brief Disable to record the time of runtime, see `EnableRecordTimeOfRuntime()` for more detail */ virtual void DisableRecordTimeOfRuntime() { enable_record_time_of_runtime_ = false; @@ -113,6 +113,11 @@ class FASTDEPLOY_DECL FastDeployModel { virtual bool EnabledRecordTimeOfRuntime() { return enable_record_time_of_runtime_; } + /** \brief Get profile time of Runtime after the profile process is done. + */ + virtual double GetProfileTime() { + return runtime_->GetProfileTime(); + } /** \brief Release reused input/output buffers */ @@ -153,13 +158,13 @@ class FASTDEPLOY_DECL FastDeployModel { bool CreateTimVXBackend(); bool CreateKunlunXinBackend(); bool CreateASCENDBackend(); + bool IsSupported(const std::vector& backends, + Backend backend); std::shared_ptr runtime_; bool runtime_initialized_ = false; // whether to record inference time bool enable_record_time_of_runtime_ = false; - - // record inference time for backend std::vector time_of_runtime_; }; diff --git a/fastdeploy/pybind/fastdeploy_model.cc b/fastdeploy/pybind/fastdeploy_model.cc index 0b138fa60..e90619e37 100644 --- a/fastdeploy/pybind/fastdeploy_model.cc +++ b/fastdeploy/pybind/fastdeploy_model.cc @@ -30,6 +30,8 @@ void BindFDModel(pybind11::module& m) { &FastDeployModel::DisableRecordTimeOfRuntime) .def("print_statis_info_of_runtime", &FastDeployModel::PrintStatisInfoOfRuntime) + .def("get_profile_time", + &FastDeployModel::GetProfileTime) .def("initialized", &FastDeployModel::Initialized) .def_readwrite("runtime_option", &FastDeployModel::runtime_option) .def_readwrite("valid_cpu_backends", &FastDeployModel::valid_cpu_backends) diff --git a/fastdeploy/pybind/runtime.cc b/fastdeploy/pybind/runtime.cc index 6c5c65bc2..ca2f4886b 100644 --- a/fastdeploy/pybind/runtime.cc +++ b/fastdeploy/pybind/runtime.cc @@ -16,104 +16,10 @@ namespace fastdeploy { +void BindOption(pybind11::module& m); + void BindRuntime(pybind11::module& m) { - pybind11::class_(m, "RuntimeOption") - .def(pybind11::init()) - .def("set_model_path", &RuntimeOption::SetModelPath) - .def("set_model_buffer", &RuntimeOption::SetModelBuffer) - .def("use_gpu", &RuntimeOption::UseGpu) - .def("use_cpu", &RuntimeOption::UseCpu) - .def("use_rknpu2", &RuntimeOption::UseRKNPU2) - .def("use_sophgo", &RuntimeOption::UseSophgo) - .def("use_ascend", 
&RuntimeOption::UseAscend) - .def("use_kunlunxin", &RuntimeOption::UseKunlunXin) - .def("set_external_stream", &RuntimeOption::SetExternalStream) - .def("set_cpu_thread_num", &RuntimeOption::SetCpuThreadNum) - .def("use_paddle_backend", &RuntimeOption::UsePaddleBackend) - .def("use_poros_backend", &RuntimeOption::UsePorosBackend) - .def("use_ort_backend", &RuntimeOption::UseOrtBackend) - .def("set_ort_graph_opt_level", &RuntimeOption::SetOrtGraphOptLevel) - .def("use_trt_backend", &RuntimeOption::UseTrtBackend) - .def("use_openvino_backend", &RuntimeOption::UseOpenVINOBackend) - .def("use_lite_backend", &RuntimeOption::UseLiteBackend) - .def("set_lite_device_names", &RuntimeOption::SetLiteDeviceNames) - .def("set_lite_context_properties", - &RuntimeOption::SetLiteContextProperties) - .def("set_lite_model_cache_dir", &RuntimeOption::SetLiteModelCacheDir) - .def("set_lite_dynamic_shape_info", - &RuntimeOption::SetLiteDynamicShapeInfo) - .def("set_lite_subgraph_partition_path", - &RuntimeOption::SetLiteSubgraphPartitionPath) - .def("set_lite_mixed_precision_quantization_config_path", - &RuntimeOption::SetLiteMixedPrecisionQuantizationConfigPath) - .def("set_lite_subgraph_partition_config_buffer", - &RuntimeOption::SetLiteSubgraphPartitionConfigBuffer) - .def("set_paddle_mkldnn", &RuntimeOption::SetPaddleMKLDNN) - .def("set_openvino_device", &RuntimeOption::SetOpenVINODevice) - .def("set_openvino_shape_info", &RuntimeOption::SetOpenVINOShapeInfo) - .def("set_openvino_cpu_operators", - &RuntimeOption::SetOpenVINOCpuOperators) - .def("enable_paddle_log_info", &RuntimeOption::EnablePaddleLogInfo) - .def("disable_paddle_log_info", &RuntimeOption::DisablePaddleLogInfo) - .def("set_paddle_mkldnn_cache_size", - &RuntimeOption::SetPaddleMKLDNNCacheSize) - .def("enable_lite_fp16", &RuntimeOption::EnableLiteFP16) - .def("disable_lite_fp16", &RuntimeOption::DisableLiteFP16) - .def("set_lite_power_mode", &RuntimeOption::SetLitePowerMode) - .def("set_trt_input_shape", &RuntimeOption::SetTrtInputShape) - .def("set_trt_max_workspace_size", &RuntimeOption::SetTrtMaxWorkspaceSize) - .def("set_trt_max_batch_size", &RuntimeOption::SetTrtMaxBatchSize) - .def("enable_paddle_to_trt", &RuntimeOption::EnablePaddleToTrt) - .def("enable_trt_fp16", &RuntimeOption::EnableTrtFP16) - .def("disable_trt_fp16", &RuntimeOption::DisableTrtFP16) - .def("set_trt_cache_file", &RuntimeOption::SetTrtCacheFile) - .def("enable_pinned_memory", &RuntimeOption::EnablePinnedMemory) - .def("disable_pinned_memory", &RuntimeOption::DisablePinnedMemory) - .def("enable_paddle_trt_collect_shape", - &RuntimeOption::EnablePaddleTrtCollectShape) - .def("disable_paddle_trt_collect_shape", - &RuntimeOption::DisablePaddleTrtCollectShape) - .def("use_ipu", &RuntimeOption::UseIpu) - .def("set_ipu_config", &RuntimeOption::SetIpuConfig) - .def("delete_paddle_backend_pass", - &RuntimeOption::DeletePaddleBackendPass) - .def("disable_paddle_trt_ops", &RuntimeOption::DisablePaddleTrtOPs) - .def_readwrite("model_file", &RuntimeOption::model_file) - .def_readwrite("params_file", &RuntimeOption::params_file) - .def_readwrite("model_format", &RuntimeOption::model_format) - .def_readwrite("backend", &RuntimeOption::backend) - .def_readwrite("external_stream", &RuntimeOption::external_stream_) - .def_readwrite("model_from_memory", &RuntimeOption::model_from_memory_) - .def_readwrite("cpu_thread_num", &RuntimeOption::cpu_thread_num) - .def_readwrite("device_id", &RuntimeOption::device_id) - .def_readwrite("device", &RuntimeOption::device) - 
.def_readwrite("trt_max_shape", &RuntimeOption::trt_max_shape) - .def_readwrite("trt_opt_shape", &RuntimeOption::trt_opt_shape) - .def_readwrite("trt_min_shape", &RuntimeOption::trt_min_shape) - .def_readwrite("trt_serialize_file", &RuntimeOption::trt_serialize_file) - .def_readwrite("trt_enable_fp16", &RuntimeOption::trt_enable_fp16) - .def_readwrite("trt_enable_int8", &RuntimeOption::trt_enable_int8) - .def_readwrite("trt_max_batch_size", &RuntimeOption::trt_max_batch_size) - .def_readwrite("trt_max_workspace_size", - &RuntimeOption::trt_max_workspace_size) - .def_readwrite("is_dynamic", &RuntimeOption::is_dynamic) - .def_readwrite("long_to_int", &RuntimeOption::long_to_int) - .def_readwrite("use_nvidia_tf32", &RuntimeOption::use_nvidia_tf32) - .def_readwrite("unconst_ops_thres", &RuntimeOption::unconst_ops_thres) - .def_readwrite("poros_file", &RuntimeOption::poros_file) - .def_readwrite("ipu_device_num", &RuntimeOption::ipu_device_num) - .def_readwrite("ipu_micro_batch_size", - &RuntimeOption::ipu_micro_batch_size) - .def_readwrite("ipu_enable_pipelining", - &RuntimeOption::ipu_enable_pipelining) - .def_readwrite("ipu_batches_per_step", - &RuntimeOption::ipu_batches_per_step) - .def_readwrite("ipu_enable_fp16", &RuntimeOption::ipu_enable_fp16) - .def_readwrite("ipu_replica_num", &RuntimeOption::ipu_replica_num) - .def_readwrite("ipu_available_memory_proportion", - &RuntimeOption::ipu_available_memory_proportion) - .def_readwrite("ipu_enable_half_partial", - &RuntimeOption::ipu_enable_half_partial); + BindOption(m); pybind11::class_(m, "TensorInfo") .def_readwrite("name", &TensorInfo::name) @@ -217,6 +123,7 @@ void BindRuntime(pybind11::module& m) { .def("num_outputs", &Runtime::NumOutputs) .def("get_input_info", &Runtime::GetInputInfo) .def("get_output_info", &Runtime::GetOutputInfo) + .def("get_profile_time", &Runtime::GetProfileTime) .def_readonly("option", &Runtime::option); pybind11::enum_(m, "Backend", pybind11::arithmetic(), diff --git a/fastdeploy/runtime/backends/backend.h b/fastdeploy/runtime/backends/backend.h index 88a8e78a0..802db6fa1 100644 --- a/fastdeploy/runtime/backends/backend.h +++ b/fastdeploy/runtime/backends/backend.h @@ -22,6 +22,7 @@ #include "fastdeploy/core/fd_tensor.h" #include "fastdeploy/core/fd_type.h" #include "fastdeploy/runtime/runtime_option.h" +#include "fastdeploy/benchmark/benchmark.h" namespace fastdeploy { @@ -79,7 +80,6 @@ class BaseBackend { virtual bool Infer(std::vector& inputs, std::vector* outputs, bool copy_to_fd = true) = 0; - // Optional: For those backends which can share memory // while creating multiple inference engines with same model file virtual std::unique_ptr Clone(RuntimeOption &runtime_option, @@ -88,6 +88,70 @@ class BaseBackend { FDERROR << "Clone no support" << std::endl; return nullptr; } + + benchmark::BenchmarkOption benchmark_option_; + benchmark::BenchmarkResult benchmark_result_; }; +/** \brief Macros for Runtime benchmark profiling. + * The param 'base_loop' for 'RUNTIME_PROFILE_LOOP_BEGIN' + * indicates that the least number of times the loop + * will repeat when profiling mode is not enabled. + * In most cases, the value should be 1, i.e., results are + * obtained by running the inference process once, when + * the profile mode is turned off, such as ONNX Runtime, + * OpenVINO, TensorRT, Paddle Inference, Paddle Lite, + * RKNPU2, SOPHGO etc. + * + * example code @code + * // OpenVINOBackend::Infer + * RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN + * // do something .... 
+ * RUNTIME_PROFILE_LOOP_BEGIN(1) + * // The codes which wrapped by 'BEGIN(1) ~ END' scope + * // will only run once when profiling mode is not enabled. + * request_.infer(); + * RUNTIME_PROFILE_LOOP_END + * // do something .... + * RUNTIME_PROFILE_LOOP_H2D_D2H_END + * + * @endcode In this case, No global variables inside a function + * are wrapped by BEGIN and END, which may be required for + * subsequent tasks. But, some times we need to set 'base_loop' + * as 0, such as POROS. + * + * * example code @code + * // PorosBackend::Infer + * RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN + * // do something .... + * RUNTIME_PROFILE_LOOP_BEGIN(0) // set 'base_loop' as 0 + * // The codes which wrapped by 'BEGIN(0) ~ END' scope + * // will not run when profiling mode is not enabled. + * auto poros_outputs = _poros_module->forward(poros_inputs); + * RUNTIME_PROFILE_LOOP_END + * // Run another inference beyond the scope of 'BEGIN ~ END' + * // to get valid outputs for subsequent tasks. + * auto poros_outputs = _poros_module->forward(poros_inputs); + * // do something .... will use 'poros_outputs' ... + * if (poros_outputs.isTensor()) { + * // ... + * } + * RUNTIME_PROFILE_LOOP_H2D_D2H_END + * + * @endcode In this case, 'poros_outputs' inside a function + * are wrapped by BEGIN and END, which may be required for + * subsequent tasks. So, we set 'base_loop' as 0 and lanuch + * another infer to get the valid outputs beyond the scope + * of 'BEGIN ~ END' for subsequent tasks. + */ + +#define RUNTIME_PROFILE_LOOP_BEGIN(base_loop) \ + __RUNTIME_PROFILE_LOOP_BEGIN(benchmark_option_, (base_loop)) +#define RUNTIME_PROFILE_LOOP_END \ + __RUNTIME_PROFILE_LOOP_END(benchmark_result_) +#define RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN \ + __RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN(benchmark_option_, 1) +#define RUNTIME_PROFILE_LOOP_H2D_D2H_END \ + __RUNTIME_PROFILE_LOOP_H2D_D2H_END(benchmark_result_) + } // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/lite/configure_hardware.cc b/fastdeploy/runtime/backends/lite/configure_hardware.cc index 7c7a9993c..7ac60383f 100644 --- a/fastdeploy/runtime/backends/lite/configure_hardware.cc +++ b/fastdeploy/runtime/backends/lite/configure_hardware.cc @@ -13,23 +13,6 @@ // limitations under the License. #include "fastdeploy/runtime/backends/lite/lite_backend.h" -// https://github.com/PaddlePaddle/Paddle-Lite/issues/8290 -// When compiling the FastDeploy dynamic library, namely, -// WITH_STATIC_LIB=OFF, and depending on the Paddle Lite -// static library, you need to include the fake registration -// codes of Paddle Lite. When you compile the FastDeploy static -// library and depends on the Paddle Lite static library, -// WITH_STATIC_LIB=ON, you do not need to include the fake -// registration codes for Paddle Lite, but wait until you -// use the FastDeploy static library. -#if (defined(WITH_LITE_STATIC) && (!defined(WITH_STATIC_LIB))) -#warning You are compiling the FastDeploy dynamic library with \ -Paddle Lite static lib We will automatically add some registration \ -codes for ops, kernels and passes for Paddle Lite. 
-#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT -#endif #include @@ -156,4 +139,5 @@ void LiteBackend::ConfigureNNAdapter(const LiteBackendOption& option) { config_.set_nnadapter_dynamic_shape_info(option.nnadapter_dynamic_shape_info); } + } // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/lite/lite_backend.cc b/fastdeploy/runtime/backends/lite/lite_backend.cc index f9d47a7a5..26fcc0acc 100644 --- a/fastdeploy/runtime/backends/lite/lite_backend.cc +++ b/fastdeploy/runtime/backends/lite/lite_backend.cc @@ -56,18 +56,39 @@ void LiteBackend::BuildOption(const LiteBackendOption& option) { } } -bool LiteBackend::InitFromPaddle(const std::string& model_file, - const std::string& params_file, - const LiteBackendOption& option) { +bool LiteBackend::Init(const RuntimeOption& runtime_option) { if (initialized_) { FDERROR << "LiteBackend is already initialized, cannot initialize again." << std::endl; return false; } - config_.set_model_file(model_file); - config_.set_param_file(params_file); - BuildOption(option); + if (runtime_option.model_format != ModelFormat::PADDLE) { + FDERROR + << "PaddleLiteBackend only supports model format PADDLE, but now it's " + << runtime_option.model_format << "." << std::endl; + return false; + } + if (runtime_option.device != Device::CPU && + runtime_option.device != Device::KUNLUNXIN && + runtime_option.device != Device::ASCEND && + runtime_option.device != Device::TIMVX) { + FDERROR << "PaddleLiteBackend only supports " + "Device::CPU/Device::TIMVX/Device::KUNLUNXIN/Device::ASCEND, " + "but now it's " + << runtime_option.device << "." << std::endl; + return false; + } + if (runtime_option.model_from_memory_) { + FDERROR << "PaddleLiteBackend doesn't support load model from memory, " + "please load model from disk." + << std::endl; + return false; + } + + config_.set_model_file(runtime_option.model_file); + config_.set_param_file(runtime_option.params_file); + BuildOption(runtime_option.paddle_lite_option); predictor_ = paddle::lite_api::CreatePaddlePredictor( config_); @@ -100,7 +121,7 @@ bool LiteBackend::InitFromPaddle(const std::string& model_file, auto shape = tensor->shape(); info.shape.assign(shape.begin(), shape.end()); info.name = output_names[i]; - if (!option_.device == Device::KUNLUNXIN) { + if (option_.device != Device::KUNLUNXIN) { info.dtype = LiteDataTypeToFD(tensor->precision()); } outputs_desc_.emplace_back(info); @@ -136,6 +157,8 @@ bool LiteBackend::Infer(std::vector& inputs, << inputs_desc_.size() << ")." << std::endl; return false; } + + RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN for (size_t i = 0; i < inputs.size(); ++i) { auto iter = inputs_order_.find(inputs[i].name); if (iter == inputs_order_.end()) { @@ -143,6 +166,7 @@ bool LiteBackend::Infer(std::vector& inputs, << " in loaded model." << std::endl; return false; } + auto tensor = predictor_->GetInput(iter->second); // Adjust dims only, allocate lazy. 
tensor->Resize(inputs[i].shape); @@ -175,7 +199,9 @@ bool LiteBackend::Infer(std::vector& inputs, } } + RUNTIME_PROFILE_LOOP_BEGIN(1) predictor_->Run(); + RUNTIME_PROFILE_LOOP_END outputs->resize(outputs_desc_.size()); for (size_t i = 0; i < outputs_desc_.size(); ++i) { @@ -188,6 +214,7 @@ bool LiteBackend::Infer(std::vector& inputs, memcpy((*outputs)[i].MutableData(), tensor->data(), (*outputs)[i].Nbytes()); } + RUNTIME_PROFILE_LOOP_H2D_D2H_END return true; } diff --git a/fastdeploy/runtime/backends/lite/lite_backend.h b/fastdeploy/runtime/backends/lite/lite_backend.h old mode 100755 new mode 100644 index bb01551a0..15e71b50a --- a/fastdeploy/runtime/backends/lite/lite_backend.h +++ b/fastdeploy/runtime/backends/lite/lite_backend.h @@ -22,6 +22,7 @@ #include "paddle_api.h" // NOLINT #include "fastdeploy/runtime/backends/backend.h" +#include "fastdeploy/runtime/runtime_option.h" #include "fastdeploy/runtime/backends/lite/option.h" namespace fastdeploy { @@ -30,11 +31,8 @@ class LiteBackend : public BaseBackend { public: LiteBackend() {} virtual ~LiteBackend() = default; - void BuildOption(const LiteBackendOption& option); - bool InitFromPaddle(const std::string& model_file, - const std::string& params_file, - const LiteBackendOption& option = LiteBackendOption()); + bool Init(const RuntimeOption& option) override; bool Infer(std::vector& inputs, std::vector* outputs, @@ -50,6 +48,8 @@ class LiteBackend : public BaseBackend { std::vector GetOutputInfos() override; private: + void BuildOption(const LiteBackendOption& option); + void ConfigureCpu(const LiteBackendOption& option); void ConfigureTimvx(const LiteBackendOption& option); void ConfigureAscend(const LiteBackendOption& option); diff --git a/fastdeploy/runtime/backends/lite/option.h b/fastdeploy/runtime/backends/lite/option.h index 879cb3472..70781d80f 100755 --- a/fastdeploy/runtime/backends/lite/option.h +++ b/fastdeploy/runtime/backends/lite/option.h @@ -21,9 +21,7 @@ // FastDepoy static library, default OFF. These messages // are only reserve for debugging. #if defined(WITH_STATIC_WARNING) -#warning You are using the FastDeploy static library. \ -We will automatically add some registration codes for \ -ops, kernels and passes for Paddle Lite. +#warning You are using the FastDeploy static library. We will automatically add some registration codes for ops, kernels and passes for Paddle Lite. // NOLINT #endif #if !defined(WITH_STATIC_LIB_AT_COMPILING) #include "paddle_use_ops.h" // NOLINT @@ -50,19 +48,29 @@ enum LitePowerMode { LITE_POWER_RAND_LOW = 5 ///< Use Lite Backend with rand low power mode }; +/*! @brief Option object to configure Paddle Lite backend + */ struct LiteBackendOption { /// Paddle Lite power mode for mobile device. 
- LitePowerMode power_mode = LITE_POWER_NO_BIND; + int power_mode = 3; /// Number of threads while use CPU int cpu_threads = 1; /// Enable use half precision bool enable_fp16 = false; - /// Enable use int8 precision for quantized model - bool enable_int8 = false; - + /// Inference device, Paddle Lite support CPU/KUNLUNXIN/TIMVX/ASCEND Device device = Device::CPU; + /// Index of inference device + int device_id = 0; - // optimized model dir for CxxConfig + int kunlunxin_l3_workspace_size = 0xfffc00; + bool kunlunxin_locked = false; + bool kunlunxin_autotune = true; + std::string kunlunxin_autotune_file = ""; + std::string kunlunxin_precision = "int16"; + bool kunlunxin_adaptive_seqlen = false; + bool kunlunxin_enable_multi_stream = false; + + /// Optimized model dir for CxxConfig std::string optimized_model_dir = ""; std::string nnadapter_subgraph_partition_config_path = ""; std::string nnadapter_subgraph_partition_config_buffer = ""; @@ -72,13 +80,5 @@ struct LiteBackendOption { std::map>> nnadapter_dynamic_shape_info = {{"", {{0}}}}; std::vector nnadapter_device_names = {}; - int device_id = 0; - int kunlunxin_l3_workspace_size = 0xfffc00; - bool kunlunxin_locked = false; - bool kunlunxin_autotune = true; - std::string kunlunxin_autotune_file = ""; - std::string kunlunxin_precision = "int16"; - bool kunlunxin_adaptive_seqlen = false; - bool kunlunxin_enable_multi_stream = false; }; } // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/lite/option_pybind.cc b/fastdeploy/runtime/backends/lite/option_pybind.cc new file mode 100644 index 000000000..0a01854ad --- /dev/null +++ b/fastdeploy/runtime/backends/lite/option_pybind.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "fastdeploy/pybind/main.h" +#include "fastdeploy/runtime/backends/lite/option.h" + +namespace fastdeploy { + +void BindLiteOption(pybind11::module& m) { + pybind11::class_(m, "LiteBackendOption") + .def(pybind11::init()) + .def_readwrite("power_mode", &LiteBackendOption::power_mode) + .def_readwrite("cpu_threads", &LiteBackendOption::cpu_threads) + .def_readwrite("enable_fp16", &LiteBackendOption::enable_fp16) + .def_readwrite("device", &LiteBackendOption::device) + .def_readwrite("optimized_model_dir", + &LiteBackendOption::optimized_model_dir) + .def_readwrite( + "nnadapter_subgraph_partition_config_path", + &LiteBackendOption::nnadapter_subgraph_partition_config_path) + .def_readwrite( + "nnadapter_subgraph_partition_config_buffer", + &LiteBackendOption::nnadapter_subgraph_partition_config_buffer) + .def_readwrite("nnadapter_context_properties", + &LiteBackendOption::nnadapter_context_properties) + .def_readwrite("nnadapter_model_cache_dir", + &LiteBackendOption::nnadapter_model_cache_dir) + .def_readwrite("nnadapter_mixed_precision_quantization_config_path", + &LiteBackendOption:: + nnadapter_mixed_precision_quantization_config_path) + .def_readwrite("nnadapter_dynamic_shape_info", + &LiteBackendOption::nnadapter_dynamic_shape_info) + .def_readwrite("nnadapter_device_names", + &LiteBackendOption::nnadapter_device_names) + .def_readwrite("device_id", &LiteBackendOption::device_id) + .def_readwrite("kunlunxin_l3_workspace_size", + &LiteBackendOption::kunlunxin_l3_workspace_size) + .def_readwrite("kunlunxin_locked", &LiteBackendOption::kunlunxin_locked) + .def_readwrite("kunlunxin_autotune", + &LiteBackendOption::kunlunxin_autotune) + .def_readwrite("kunlunxin_autotune_file", + &LiteBackendOption::kunlunxin_autotune_file) + .def_readwrite("kunlunxin_precision", + &LiteBackendOption::kunlunxin_precision) + .def_readwrite("kunlunxin_adaptive_seqlen", + &LiteBackendOption::kunlunxin_adaptive_seqlen) + .def_readwrite("kunlunxin_enable_multi_stream", + &LiteBackendOption::kunlunxin_enable_multi_stream); +} + +} // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/openvino/option.h b/fastdeploy/runtime/backends/openvino/option.h index e78a73496..1200bd9c7 100644 --- a/fastdeploy/runtime/backends/openvino/option.h +++ b/fastdeploy/runtime/backends/openvino/option.h @@ -23,10 +23,39 @@ #include namespace fastdeploy { +/*! @brief Option object to configure OpenVINO backend + */ struct OpenVINOBackendOption { std::string device = "CPU"; int cpu_thread_num = -1; + + /// Number of streams while use OpenVINO int num_streams = 0; + + /** + * @brief Set device name for OpenVINO, default 'CPU', can also be 'AUTO', 'GPU', 'GPU.1'.... 
+ */ + void SetDevice(const std::string& name = "CPU") { + device = name; + } + + /** + * @brief Set shape info for OpenVINO + */ + void SetShapeInfo( + const std::map>& _shape_infos) { + shape_infos = _shape_infos; + } + + /** + * @brief While use OpenVINO backend with intel GPU, use this interface to specify operators run on CPU + */ + void SetCpuOperators(const std::vector& operators) { + for (const auto& op : operators) { + cpu_operators.insert(op); + } + } + std::map> shape_infos; std::set cpu_operators{"MulticlassNms"}; }; diff --git a/fastdeploy/vision/classification/contrib/yolov5cls_pybind.cc b/fastdeploy/runtime/backends/openvino/option_pybind.cc old mode 100755 new mode 100644 similarity index 53% rename from fastdeploy/vision/classification/contrib/yolov5cls_pybind.cc rename to fastdeploy/runtime/backends/openvino/option_pybind.cc index 5a42dec38..ebd069576 --- a/fastdeploy/vision/classification/contrib/yolov5cls_pybind.cc +++ b/fastdeploy/runtime/backends/openvino/option_pybind.cc @@ -11,22 +11,20 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "fastdeploy/pybind/main.h" +#include "fastdeploy/runtime/backends/openvino/option.h" namespace fastdeploy { -void BindYOLOv5Cls(pybind11::module& m) { - pybind11::class_( - m, "YOLOv5Cls") - .def(pybind11::init()) - .def("predict", - [](vision::classification::YOLOv5Cls& self, pybind11::array& data, - int topk = 1) { - auto mat = PyArrayToCvMat(data); - vision::ClassifyResult res; - self.Predict(&mat, &res, topk); - return res; - }) - .def_readwrite("size", &vision::classification::YOLOv5Cls::size); + +void BindOpenVINOOption(pybind11::module& m) { + pybind11::class_(m, "OpenVINOBackendOption") + .def(pybind11::init()) + .def_readwrite("cpu_thread_num", &OpenVINOBackendOption::cpu_thread_num) + .def_readwrite("num_streams", &OpenVINOBackendOption::num_streams) + .def("set_device", &OpenVINOBackendOption::SetDevice) + .def("set_shape_info", &OpenVINOBackendOption::SetShapeInfo) + .def("set_cpu_operators", &OpenVINOBackendOption::SetCpuOperators); } + } // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/openvino/ov_backend.cc b/fastdeploy/runtime/backends/openvino/ov_backend.cc index d803dc0ab..7f569f92c 100644 --- a/fastdeploy/runtime/backends/openvino/ov_backend.cc +++ b/fastdeploy/runtime/backends/openvino/ov_backend.cc @@ -97,6 +97,33 @@ void OpenVINOBackend::InitTensorInfo( } } +bool OpenVINOBackend::Init(const RuntimeOption& option) { + if (option.model_from_memory_) { + FDERROR << "OpenVINOBackend doesn't support load model from memory, please " + "load model from disk." + << std::endl; + return false; + } + if (option.device != Device::CPU) { + FDERROR << "OpenVINOBackend only supports Device::CPU, but now its " + << option.device << "." 
<< std::endl; + return false; + } + + if (option.model_format == ModelFormat::PADDLE) { + return InitFromPaddle(option.model_file, option.params_file, + option.openvino_option); + } else if (option.model_format == ModelFormat::ONNX) { + return InitFromOnnx(option.model_file, option.openvino_option); + } else { + FDERROR << "OpenVINOBackend only supports model format Paddle/ONNX, but " + "now its " + << option.model_format << std::endl; + return false; + } + return false; +} + bool OpenVINOBackend::InitFromPaddle(const std::string& model_file, const std::string& params_file, const OpenVINOBackendOption& option) { @@ -348,6 +375,7 @@ bool OpenVINOBackend::Infer(std::vector& inputs, return false; } + RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN for (size_t i = 0; i < inputs.size(); ++i) { ov::Shape shape(inputs[i].shape.begin(), inputs[i].shape.end()); ov::Tensor ov_tensor(FDDataTypeToOV(inputs[i].dtype), shape, @@ -355,7 +383,9 @@ bool OpenVINOBackend::Infer(std::vector& inputs, request_.set_tensor(inputs[i].name, ov_tensor); } + RUNTIME_PROFILE_LOOP_BEGIN(1) request_.infer(); + RUNTIME_PROFILE_LOOP_END outputs->resize(output_infos_.size()); for (size_t i = 0; i < output_infos_.size(); ++i) { @@ -376,6 +406,7 @@ bool OpenVINOBackend::Infer(std::vector& inputs, out_tensor.data(), Device::CPU); } } + RUNTIME_PROFILE_LOOP_H2D_D2H_END return true; } @@ -392,4 +423,4 @@ std::unique_ptr OpenVINOBackend::Clone( return new_backend; } -} // namespace fastdeploy +} // namespace fastdeploy \ No newline at end of file diff --git a/fastdeploy/runtime/backends/openvino/ov_backend.h b/fastdeploy/runtime/backends/openvino/ov_backend.h index e288b8b44..a27f17480 100644 --- a/fastdeploy/runtime/backends/openvino/ov_backend.h +++ b/fastdeploy/runtime/backends/openvino/ov_backend.h @@ -32,13 +32,7 @@ class OpenVINOBackend : public BaseBackend { OpenVINOBackend() {} virtual ~OpenVINOBackend() = default; - bool - InitFromPaddle(const std::string& model_file, const std::string& params_file, - const OpenVINOBackendOption& option = OpenVINOBackendOption()); - - bool - InitFromOnnx(const std::string& model_file, - const OpenVINOBackendOption& option = OpenVINOBackendOption()); + bool Init(const RuntimeOption& option); bool Infer(std::vector& inputs, std::vector* outputs, bool copy_to_fd = true) override; @@ -55,8 +49,17 @@ class OpenVINOBackend : public BaseBackend { std::unique_ptr Clone(RuntimeOption &runtime_option, void* stream = nullptr, int device_id = -1) override; - + private: + bool + InitFromPaddle(const std::string& model_file, const std::string& params_file, + const OpenVINOBackendOption& option = OpenVINOBackendOption()); + + bool + InitFromOnnx(const std::string& model_file, + const OpenVINOBackendOption& option = OpenVINOBackendOption()); + + void InitTensorInfo(const std::vector>& ov_outputs, std::map* tensor_infos); @@ -67,4 +70,4 @@ class OpenVINOBackend : public BaseBackend { std::vector output_infos_; }; -} // namespace fastdeploy +} // namespace fastdeploy \ No newline at end of file diff --git a/fastdeploy/runtime/backends/ort/option.h b/fastdeploy/runtime/backends/ort/option.h index ca4d3254c..9487e5da9 100644 --- a/fastdeploy/runtime/backends/ort/option.h +++ b/fastdeploy/runtime/backends/ort/option.h @@ -22,20 +22,30 @@ #include namespace fastdeploy { +/*! 
 @brief Option object to configure ONNX Runtime backend
+ */
 struct OrtBackendOption {
-  // -1 means default
-  // 0: ORT_DISABLE_ALL
-  // 1: ORT_ENABLE_BASIC
-  // 2: ORT_ENABLE_EXTENDED
-  // 99: ORT_ENABLE_ALL (enable some custom optimizations e.g bert)
+  /*
+   * @brief Level of graph optimization. -1: default (enable all optimization strategies) / 0: disable all optimizations / 1: enable basic optimizations / 2: enable extended optimizations / 99: enable all optimizations
+   */
   int graph_optimization_level = -1;
+  /*
+   * @brief Number of threads used to execute an operator, -1: default
+   */
   int intra_op_num_threads = -1;
+  /*
+   * @brief Number of threads used to execute the graph, -1: default. This parameter only takes effect when `OrtBackendOption::execution_mode` is set to 1.
+   */
   int inter_op_num_threads = -1;
-  // 0: ORT_SEQUENTIAL
-  // 1: ORT_PARALLEL
+  /*
+   * @brief Execution mode for the graph. -1: default (sequential mode) / 0: sequential mode, execute the operators in the graph one by one / 1: parallel mode, execute the operators in the graph in parallel
+   */
   int execution_mode = -1;
+  /// Inference device, OrtBackend supports CPU/GPU
   Device device = Device::CPU;
+  /// Inference device id
   int device_id = 0;
+
   void* external_stream_ = nullptr;
 };
 }  // namespace fastdeploy
diff --git a/fastdeploy/runtime/backends/ort/option_pybind.cc b/fastdeploy/runtime/backends/ort/option_pybind.cc
new file mode 100644
index 000000000..4b8f47975
--- /dev/null
+++ b/fastdeploy/runtime/backends/ort/option_pybind.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+#include "fastdeploy/runtime/backends/ort/option.h"
+
+namespace fastdeploy {
+
+void BindOrtOption(pybind11::module& m) {
+  pybind11::class_<OrtBackendOption>(m, "OrtBackendOption")
+      .def(pybind11::init())
+      .def_readwrite("graph_optimization_level",
+                     &OrtBackendOption::graph_optimization_level)
+      .def_readwrite("intra_op_num_threads",
+                     &OrtBackendOption::intra_op_num_threads)
+      .def_readwrite("inter_op_num_threads",
+                     &OrtBackendOption::inter_op_num_threads)
+      .def_readwrite("execution_mode", &OrtBackendOption::execution_mode)
+      .def_readwrite("device", &OrtBackendOption::device)
+      .def_readwrite("device_id", &OrtBackendOption::device_id);
+}
+
+}  // namespace fastdeploy
diff --git a/fastdeploy/runtime/backends/ort/ort_backend.cc b/fastdeploy/runtime/backends/ort/ort_backend.cc
index 70cb18121..58c449cc6 100644
--- a/fastdeploy/runtime/backends/ort/ort_backend.cc
+++ b/fastdeploy/runtime/backends/ort/ort_backend.cc
@@ -13,9 +13,6 @@
 // limitations under the License.
#include "fastdeploy/runtime/backends/ort/ort_backend.h" - -#include - #include "fastdeploy/core/float16.h" #include "fastdeploy/runtime/backends/ort/ops/adaptive_pool2d.h" #include "fastdeploy/runtime/backends/ort/ops/multiclass_nms.h" @@ -25,6 +22,9 @@ #include "paddle2onnx/converter.h" #endif +#include + + namespace fastdeploy { std::vector OrtBackend::custom_operators_ = @@ -258,6 +258,7 @@ bool OrtBackend::Infer(std::vector& inputs, } // from FDTensor to Ort Inputs + RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN for (size_t i = 0; i < inputs.size(); ++i) { auto ort_value = CreateOrtValue(inputs[i], option_.device == Device::GPU); binding_->BindInput(inputs[i].name.c_str(), ort_value); @@ -270,12 +271,14 @@ bool OrtBackend::Infer(std::vector& inputs, } // Inference with inputs + RUNTIME_PROFILE_LOOP_BEGIN(1) try { session_.Run({}, *(binding_.get())); } catch (const std::exception& e) { FDERROR << "Failed to Infer: " << e.what() << std::endl; return false; } + RUNTIME_PROFILE_LOOP_END // Convert result after inference std::vector ort_outputs = binding_->GetOutputValues(); @@ -284,7 +287,7 @@ bool OrtBackend::Infer(std::vector& inputs, OrtValueToFDTensor(ort_outputs[i], &((*outputs)[i]), outputs_desc_[i].name, copy_to_fd); } - + RUNTIME_PROFILE_LOOP_H2D_D2H_END return true; } diff --git a/fastdeploy/runtime/backends/ort/ort_backend.h b/fastdeploy/runtime/backends/ort/ort_backend.h index 61308b9da..e0caf48a3 100644 --- a/fastdeploy/runtime/backends/ort/ort_backend.h +++ b/fastdeploy/runtime/backends/ort/ort_backend.h @@ -54,7 +54,7 @@ class OrtBackend : public BaseBackend { std::vector GetOutputInfos() override; static std::vector custom_operators_; void InitCustomOperators(); - + private: bool InitFromPaddle(const std::string& model_buffer, const std::string& params_buffer, diff --git a/fastdeploy/runtime/backends/paddle/option.h b/fastdeploy/runtime/backends/paddle/option.h index 8b311bf3f..749a35705 100644 --- a/fastdeploy/runtime/backends/paddle/option.h +++ b/fastdeploy/runtime/backends/paddle/option.h @@ -24,54 +24,81 @@ namespace fastdeploy { +/*! @brief Option object to configure GraphCore IPU + */ struct IpuOption { + /// IPU device id int ipu_device_num; + /// the batch size in the graph, only work when graph has no batch shape info int ipu_micro_batch_size; + /// enable pipelining bool ipu_enable_pipelining; + /// the number of batches per run in pipelining int ipu_batches_per_step; + /// enable fp16 bool ipu_enable_fp16; + /// the number of graph replication int ipu_replica_num; + /// the available memory proportion for matmul/conv float ipu_available_memory_proportion; + /// enable fp16 partial for matmul, only work with fp16 bool ipu_enable_half_partial; }; +/*! 
 @brief Option object to configure Paddle Inference backend
+ */
 struct PaddleBackendOption {
+  /// Print log information while initializing the Paddle Inference backend
+  bool enable_log_info = false;
+  /// Enable MKLDNN while running inference on CPU
+  bool enable_mkldnn = true;
+  /// Use Paddle Inference + TensorRT to run the model on GPU
+  bool enable_trt = false;
+
+  /*
+   * @brief IPU option, used to configure the IPU hardware when the model runs on IPU
+   */
+  IpuOption ipu_option;
+
+  /// Collect input shape information for the model while enable_trt is true
+  bool collect_trt_shape = false;
+  /// Cache size of input shapes for MKLDNN while the input shape changes dynamically
+  int mkldnn_cache_size = -1;
+  /// Initial memory size (MB) for GPU
+  int gpu_mem_init_size = 100;
+
+  void DisableTrtOps(const std::vector<std::string>& ops) {
+    trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());
+  }
+
+  void DeletePass(const std::string& pass_name) {
+    delete_pass_names.push_back(pass_name);
+  }
+
+  void SetIpuConfig(bool enable_fp16, int replica_num,
+                    float available_memory_proportion,
+                    bool enable_half_partial) {
+    ipu_option.ipu_enable_fp16 = enable_fp16;
+    ipu_option.ipu_replica_num = replica_num;
+    ipu_option.ipu_available_memory_proportion =
+        available_memory_proportion;
+    ipu_option.ipu_enable_half_partial = enable_half_partial;
+  }
+
+  // The following parameters may be removed, please do not
+  // read or write them directly
+  TrtBackendOption trt_option;
+  bool enable_pinned_memory = false;
+  void* external_stream_ = nullptr;
+  Device device = Device::CPU;
+  int device_id = 0;
+  std::vector<std::string> trt_disabled_ops_{};
+  int cpu_thread_num = 8;
+  std::vector<std::string> delete_pass_names = {};
   std::string model_file = "";   // Path of model file
   std::string params_file = "";  // Path of parameters file, can be empty
   // load model and paramters from memory
   bool model_from_memory_ = false;
-
-#ifdef WITH_GPU
-  bool use_gpu = true;
-#else
-  bool use_gpu = false;
-#endif
-  bool enable_mkldnn = true;
-
-  bool enable_log_info = false;
-
-  bool enable_trt = false;
-  TrtBackendOption trt_option;
-  bool collect_shape = false;
-  std::vector<std::string> trt_disabled_ops_{};
-
-#ifdef WITH_IPU
-  bool use_ipu = true;
-  IpuOption ipu_option;
-#else
-  bool use_ipu = false;
-#endif
-
-  int mkldnn_cache_size = 1;
-  int cpu_thread_num = 8;
-  // initialize memory size(MB) for GPU
-  int gpu_mem_init_size = 100;
-  // gpu device id
-  int gpu_id = 0;
-  bool enable_pinned_memory = false;
-  void* external_stream_ = nullptr;
-
-  std::vector<std::string> delete_pass_names = {};
 };
 }  // namespace fastdeploy
diff --git a/fastdeploy/runtime/backends/paddle/option_pybind.cc b/fastdeploy/runtime/backends/paddle/option_pybind.cc
new file mode 100644
index 000000000..50b34ca61
--- /dev/null
+++ b/fastdeploy/runtime/backends/paddle/option_pybind.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
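For reference, a minimal C++ sketch of how the reworked PaddleBackendOption above is intended to be driven through RuntimeOption; the option members come from the struct above, while the model paths, cache size, and pass name are illustrative assumptions rather than part of this patch.

  fastdeploy::RuntimeOption option;
  option.SetModelPath("model.pdmodel", "model.pdiparams");  // hypothetical paths
  option.UsePaddleBackend();
  // Tune the Paddle Inference backend directly on its option object
  option.paddle_infer_option.enable_mkldnn = true;
  option.paddle_infer_option.mkldnn_cache_size = 10;          // illustrative value
  option.paddle_infer_option.DeletePass("fc_fuse_pass");      // hypothetical pass name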
+ +#include "fastdeploy/pybind/main.h" +#include "fastdeploy/runtime/backends/paddle/option.h" + +namespace fastdeploy { + +void BindIpuOption(pybind11::module& m) { + pybind11::class_(m, "IpuOption") + .def(pybind11::init()) + .def_readwrite("ipu_device_num", &IpuOption::ipu_device_num) + .def_readwrite("ipu_micro_batch_size", &IpuOption::ipu_micro_batch_size) + .def_readwrite("ipu_enable_pipelining", &IpuOption::ipu_enable_pipelining) + .def_readwrite("ipu_batches_per_step", &IpuOption::ipu_batches_per_step) + .def_readwrite("ipu_enable_fp16", &IpuOption::ipu_enable_fp16) + .def_readwrite("ipu_replica_num", &IpuOption::ipu_replica_num) + .def_readwrite("ipu_available_memory_proportion", + &IpuOption::ipu_available_memory_proportion) + .def_readwrite("ipu_enable_half_partial", + &IpuOption::ipu_enable_half_partial); +} + +void BindPaddleOption(pybind11::module& m) { + BindIpuOption(m); + pybind11::class_(m, "PaddleBackendOption") + .def(pybind11::init()) + .def_readwrite("enable_log_info", &PaddleBackendOption::enable_log_info) + .def_readwrite("enable_mkldnn", &PaddleBackendOption::enable_mkldnn) + .def_readwrite("enable_trt", &PaddleBackendOption::enable_trt) + .def_readwrite("ipu_option", &PaddleBackendOption::ipu_option) + .def_readwrite("collect_trt_shape", + &PaddleBackendOption::collect_trt_shape) + .def_readwrite("mkldnn_cache_size", + &PaddleBackendOption::mkldnn_cache_size) + .def_readwrite("gpu_mem_init_size", + &PaddleBackendOption::gpu_mem_init_size) + .def("disable_trt_ops", &PaddleBackendOption::DisableTrtOps) + .def("delete_pass", &PaddleBackendOption::DeletePass) + .def("set_ipu_config", &PaddleBackendOption::SetIpuConfig); +} + +} // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/paddle/paddle_backend.cc b/fastdeploy/runtime/backends/paddle/paddle_backend.cc index 90bd27682..e0e908c36 100644 --- a/fastdeploy/runtime/backends/paddle/paddle_backend.cc +++ b/fastdeploy/runtime/backends/paddle/paddle_backend.cc @@ -22,8 +22,8 @@ namespace fastdeploy { void PaddleBackend::BuildOption(const PaddleBackendOption& option) { option_ = option; - if (option.use_gpu) { - config_.EnableUseGpu(option.gpu_mem_init_size, option.gpu_id); + if (option.device == Device::GPU) { + config_.EnableUseGpu(option.gpu_mem_init_size, option.device_id); if (option_.external_stream_) { config_.SetExecStream(option_.external_stream_); } @@ -50,7 +50,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) { precision, use_static); SetTRTDynamicShapeToConfig(option); } - } else if (option.use_ipu) { + } else if (option.device == Device::IPU) { #ifdef WITH_IPU config_.EnableIpu(option.ipu_option.ipu_device_num, option.ipu_option.ipu_micro_batch_size, @@ -101,14 +101,15 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_buffer, params_buffer.c_str(), params_buffer.size()); config_.EnableMemoryOptim(); BuildOption(option); - + // The input/output information get from predictor is not right, use // PaddleReader instead now - auto reader = paddle2onnx::PaddleReader(model_buffer.c_str(), model_buffer.size()); + auto reader = + paddle2onnx::PaddleReader(model_buffer.c_str(), model_buffer.size()); // If it's a quantized model, and use cpu with mkldnn, automaticaly switch to // int8 mode if (reader.is_quantize_model) { - if (option.use_gpu) { + if (option.device == Device::GPU) { FDWARNING << "The loaded model is a quantized model, while inference on " "GPU, please use TensorRT backend to get better performance." 
<< std::endl; @@ -158,7 +159,7 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_buffer, outputs_desc_[i].shape.assign(shape.begin(), shape.end()); outputs_desc_[i].dtype = ReaderDataTypeToFD(reader.outputs[i].dtype); } - if (option.collect_shape) { + if (option.collect_trt_shape) { // Set the shape info file. std::string curr_model_dir = "./"; if (!option.model_from_memory_) { @@ -222,15 +223,18 @@ bool PaddleBackend::Infer(std::vector& inputs, return false; } + RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN for (size_t i = 0; i < inputs.size(); ++i) { auto handle = predictor_->GetInputHandle(inputs[i].name); ShareTensorFromFDTensor(handle.get(), inputs[i]); } + RUNTIME_PROFILE_LOOP_BEGIN(1) predictor_->Run(); + RUNTIME_PROFILE_LOOP_END // output share backend memory only support CPU or GPU - if (option_.use_ipu) { + if (option_.device == Device::IPU) { copy_to_fd = true; } outputs->resize(outputs_desc_.size()); @@ -241,6 +245,7 @@ bool PaddleBackend::Infer(std::vector& inputs, } PaddleTensorToFDTensor(handle, &((*outputs)[i]), copy_to_fd); } + RUNTIME_PROFILE_LOOP_H2D_D2H_END return true; } @@ -249,9 +254,10 @@ std::unique_ptr PaddleBackend::Clone(RuntimeOption& runtime_option, std::unique_ptr new_backend = utils::make_unique(); auto casted_backend = dynamic_cast(new_backend.get()); - if (device_id > 0 && option_.use_gpu == true && device_id != option_.gpu_id) { + if (device_id > 0 && (option_.device == Device::GPU) && + device_id != option_.device_id) { auto clone_option = option_; - clone_option.gpu_id = device_id; + clone_option.device_id = device_id; clone_option.external_stream_ = stream; if (runtime_option.model_from_memory_) { FDASSERT( @@ -275,7 +281,7 @@ std::unique_ptr PaddleBackend::Clone(RuntimeOption& runtime_option, } FDWARNING << "The target device id:" << device_id - << " is different from current device id:" << option_.gpu_id + << " is different from current device id:" << option_.device_id << ", cannot share memory with current engine." << std::endl; return new_backend; } @@ -343,10 +349,13 @@ void PaddleBackend::CollectShapeRun( const std::map>& shape) const { auto input_names = predictor->GetInputNames(); auto input_type = predictor->GetInputTypes(); - for (auto name : input_names) { + for (const auto& name : input_names) { FDASSERT(shape.find(name) != shape.end() && input_type.find(name) != input_type.end(), - "Paddle Input name [%s] is not one of the trt dynamic shape.", + "When collect_trt_shape is true, please define max/opt/min shape " + "for model's input:[\"%s\"] by " + "(C++)RuntimeOption.trt_option.SetShape/" + "(Python)RuntimeOption.trt_option.set_shape.", name.c_str()); auto tensor = predictor->GetInputHandle(name); auto shape_value = shape.at(name); diff --git a/fastdeploy/runtime/backends/paddle/paddle_backend.h b/fastdeploy/runtime/backends/paddle/paddle_backend.h index 8cde22cfd..02c430ade 100755 --- a/fastdeploy/runtime/backends/paddle/paddle_backend.h +++ b/fastdeploy/runtime/backends/paddle/paddle_backend.h @@ -89,4 +89,4 @@ class PaddleBackend : public BaseBackend { std::vector inputs_desc_; std::vector outputs_desc_; }; -} // namespace fastdeploy +} // namespace fastdeploy \ No newline at end of file diff --git a/fastdeploy/runtime/backends/poros/option.h b/fastdeploy/runtime/backends/poros/option.h index 2b715f7dc..ebaffec09 100755 --- a/fastdeploy/runtime/backends/poros/option.h +++ b/fastdeploy/runtime/backends/poros/option.h @@ -22,13 +22,11 @@ namespace fastdeploy { +/*! 
@brief Option object to configure Poros backend + */ struct PorosBackendOption { -#ifdef WITH_GPU - bool use_gpu = true; -#else - bool use_gpu = false; -#endif - int gpu_id = 0; + Device device = Device::CPU; + int device_id = 0; bool long_to_int = true; // There is calculation precision in tf32 mode on A10, it can bring some // performance improvement, but there may be diff diff --git a/fastdeploy/runtime/backends/poros/option_pybind.cc b/fastdeploy/runtime/backends/poros/option_pybind.cc new file mode 100644 index 000000000..b545ea85c --- /dev/null +++ b/fastdeploy/runtime/backends/poros/option_pybind.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" +#include "fastdeploy/runtime/backends/poros/option.h" + +namespace fastdeploy { + +void BindPorosOption(pybind11::module& m) { + pybind11::class_(m, "PorosBackendOption") + .def(pybind11::init()) + .def_readwrite("long_to_int", &PorosBackendOption::long_to_int) + .def_readwrite("use_nvidia_tf32", &PorosBackendOption::use_nvidia_tf32) + .def_readwrite("unconst_ops_thres", + &PorosBackendOption::unconst_ops_thres) + .def_readwrite("prewarm_datatypes", + &PorosBackendOption::prewarm_datatypes) + .def_readwrite("enable_fp16", &PorosBackendOption::enable_fp16) + .def_readwrite("enable_int8", &PorosBackendOption::enable_int8) + .def_readwrite("is_dynamic", &PorosBackendOption::is_dynamic) + .def_readwrite("max_batch_size", &PorosBackendOption::max_batch_size) + .def_readwrite("max_workspace_size", + &PorosBackendOption::max_workspace_size); +} + +} // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/poros/poros_backend.cc b/fastdeploy/runtime/backends/poros/poros_backend.cc index ebe359b3d..64b07dd91 100644 --- a/fastdeploy/runtime/backends/poros/poros_backend.cc +++ b/fastdeploy/runtime/backends/poros/poros_backend.cc @@ -43,11 +43,12 @@ std::vector PorosBackend::GetOutputInfos() { } void PorosBackend::BuildOption(const PorosBackendOption& option) { - _options.device = option.use_gpu ? baidu::mirana::poros::Device::GPU - : baidu::mirana::poros::Device::CPU; + _options.device = (option.device == Device::GPU) + ? 
baidu::mirana::poros::Device::GPU + : baidu::mirana::poros::Device::CPU; _options.long_to_int = option.long_to_int; _options.use_nvidia_tf32 = option.use_nvidia_tf32; - _options.device_id = option.gpu_id; + _options.device_id = option.device_id; _options.unconst_ops_thres = option.unconst_ops_thres; _options.is_dynamic = option.is_dynamic; _options.max_workspace_size = option.max_workspace_size; @@ -67,7 +68,7 @@ bool PorosBackend::Compile(const std::string& model_file, torch::jit::Module mod; mod = torch::jit::load(model_file); mod.eval(); - if (option.use_gpu) { + if (option.device == Device::GPU) { mod.to(at::kCUDA); } else { mod.to(at::kCPU); @@ -79,7 +80,7 @@ bool PorosBackend::Compile(const std::string& model_file, _numinputs = inputs.size() - 1; // FDTensor to at::Tensor std::vector> prewarm_datas; - bool is_backend_cuda = option.use_gpu ? true : false; + bool is_backend_cuda = (option.device == Device::GPU); for (size_t i = 0; i < prewarm_tensors.size(); ++i) { std::vector prewarm_data; for (size_t j = 0; j < prewarm_tensors[i].size(); ++j) { @@ -121,73 +122,6 @@ bool PorosBackend::Compile(const std::string& model_file, return true; } -bool PorosBackend::InitFromTorchScript(const std::string& model_file, - const PorosBackendOption& option) { - if (initialized_) { - FDERROR << "PorosBackend is already initlized, cannot initialize again." - << std::endl; - return false; - } - if (option.poros_file != "") { - std::ifstream fin(option.poros_file, std::ios::binary | std::ios::in); - if (fin) { - FDINFO << "Detect compiled Poros file in " << option.poros_file - << ", will load it directly." << std::endl; - fin.close(); - return InitFromPoros(option.poros_file, option); - } - } - BuildOption(option); - torch::jit::Module mod; - mod = torch::jit::load(model_file); - mod.eval(); - if (option.use_gpu) { - mod.to(at::kCUDA); - } else { - mod.to(at::kCPU); - } - // get inputs_nums and outputs_nums - auto graph = mod.get_method("forward").graph(); - auto inputs = graph->inputs(); - // remove self node - _numinputs = inputs.size() - 1; - auto outputs = graph->outputs(); - _numoutputs = outputs.size(); - _poros_module = baidu::mirana::poros::Compile(mod, _prewarm_datas, _options); - if (_poros_module == nullptr) { - FDERROR << "PorosBackend initlize Failed, try initialize again." - << std::endl; - return false; - } - initialized_ = true; - return true; -} - -bool PorosBackend::InitFromPoros(const std::string& model_file, - const PorosBackendOption& option) { - if (initialized_) { - FDERROR << "PorosBackend is already initlized, cannot initialize again." - << std::endl; - return false; - } - BuildOption(option); - _poros_module = baidu::mirana::poros::Load(model_file, _options); - if (_poros_module == nullptr) { - FDERROR << "PorosBackend initlize Failed, try initialize again." 
- << std::endl; - return false; - } - // get inputs_nums and outputs_nums - auto graph = _poros_module->get_method("forward").graph(); - auto inputs = graph->inputs(); - // remove self node - _numinputs = inputs.size() - 1; - auto outputs = graph->outputs(); - _numoutputs = outputs.size(); - initialized_ = true; - return true; -} - bool PorosBackend::Infer(std::vector& inputs, std::vector* outputs, bool copy_to_fd) { // Convert FD Tensor to PyTorch Tensor @@ -238,4 +172,4 @@ bool PorosBackend::Infer(std::vector& inputs, return true; } -} // namespace fastdeploy \ No newline at end of file +} // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/poros/poros_backend.h b/fastdeploy/runtime/backends/poros/poros_backend.h index 5d15128cf..0d01a6884 100755 --- a/fastdeploy/runtime/backends/poros/poros_backend.h +++ b/fastdeploy/runtime/backends/poros/poros_backend.h @@ -51,13 +51,6 @@ class PorosBackend : public BaseBackend { void BuildOption(const PorosBackendOption& option); - bool - InitFromTorchScript(const std::string& model_file, - const PorosBackendOption& option = PorosBackendOption()); - - bool InitFromPoros(const std::string& model_file, - const PorosBackendOption& option = PorosBackendOption()); - bool Compile(const std::string& model_file, std::vector>& prewarm_tensors, const PorosBackendOption& option = PorosBackendOption()); diff --git a/fastdeploy/runtime/backends/sophgo/sophgo_backend.cc b/fastdeploy/runtime/backends/sophgo/sophgo_backend.cc index 8ae6e39f4..39b2770f4 100644 --- a/fastdeploy/runtime/backends/sophgo/sophgo_backend.cc +++ b/fastdeploy/runtime/backends/sophgo/sophgo_backend.cc @@ -27,19 +27,7 @@ SophgoBackend::~SophgoBackend() { bm_dev_free(handle_); } bool SophgoBackend::GetSDKAndDeviceVersion() { return true; } /*************************************************************** - * @name BuildOption - * @brief save option - * @param SOPHGOTPU2BackendOption - * @note None - ***************************************************************/ -void SophgoBackend::BuildOption(const SophgoBackendOption& option) { - // this->option_ = option; - // save cpu_name - // this->option_.cpu_name = option.cpu_name; -} - -/*************************************************************** - * @name InitFromSophgo + * @name Init * @brief Initialize Sophgo model * @param model_file: Binary data for the Sophgo model. * params_file: None @@ -47,8 +35,26 @@ void SophgoBackend::BuildOption(const SophgoBackendOption& option) { * @return bool * @note None ***************************************************************/ -bool SophgoBackend::InitFromSophgo(const std::string& model_file, - const SophgoBackendOption& option) { +bool SophgoBackend::Init(const RuntimeOption& option) { + if (option.model_from_memory_) { + FDERROR << "SophgoBackend doesn't support load model from memory, please " + "load model from disk." + << std::endl; + return false; + } + if (option.model_format != ModelFormat::SOPHGO) { + FDERROR << "SophgoBackend only supports model format SOPHGO, but now it's " + << option.model_format << "." << std::endl; + return false; + } + if (option.device != Device::SOPHGOTPUD) { + FDERROR << "SophgoBackend only supports device::SOPHGOTPUD, but now it's " + << option.device << "." 
<< std::endl; + return false; + } + + std::string model_file = option.model_file; + // LoadModel if (!this->LoadModel((char*)model_file.data())) { FDERROR << "load model failed" << std::endl; @@ -61,9 +67,6 @@ bool SophgoBackend::InitFromSophgo(const std::string& model_file, return false; } - // BuildOption - this->BuildOption(option); - // GetModelInputOutputInfos if (!this->GetModelInputOutputInfos()) { FDERROR << "get model input output infos failed" << std::endl; diff --git a/fastdeploy/runtime/backends/sophgo/sophgo_backend.h b/fastdeploy/runtime/backends/sophgo/sophgo_backend.h index 956a89cd5..34ba2838f 100644 --- a/fastdeploy/runtime/backends/sophgo/sophgo_backend.h +++ b/fastdeploy/runtime/backends/sophgo/sophgo_backend.h @@ -30,12 +30,7 @@ class SophgoBackend : public BaseBackend { public: SophgoBackend() = default; virtual ~SophgoBackend(); - bool LoadModel(void* model); - bool GetSDKAndDeviceVersion(); - bool GetModelInputOutputInfos(); - void BuildOption(const SophgoBackendOption& option); - bool InitFromSophgo(const std::string& model_file, - const SophgoBackendOption& option = SophgoBackendOption()); + bool Init(const RuntimeOption& option); int NumInputs() const override { return static_cast(inputs_desc_.size()); @@ -54,6 +49,10 @@ class SophgoBackend : public BaseBackend { bool copy_to_fd = true) override; private: + bool LoadModel(void* model); + bool GetSDKAndDeviceVersion(); + bool GetModelInputOutputInfos(); + std::vector inputs_desc_; std::vector outputs_desc_; std::string net_name_; diff --git a/fastdeploy/runtime/backends/tensorrt/option.h b/fastdeploy/runtime/backends/tensorrt/option.h index 8d4ad4aaf..5cee0a7e3 100755 --- a/fastdeploy/runtime/backends/tensorrt/option.h +++ b/fastdeploy/runtime/backends/tensorrt/option.h @@ -21,23 +21,64 @@ namespace fastdeploy { +/*! 
 @brief Option object to configure TensorRT backend
+ */
 struct TrtBackendOption {
-  std::string model_file = "";   // Path of model file
-  std::string params_file = "";  // Path of parameters file, can be empty
-
-  // format of input model
-  ModelFormat model_format = ModelFormat::AUTOREC;
-
-  int gpu_id = 0;
-  bool enable_fp16 = false;
-  bool enable_int8 = false;
+  /// `max_batch_size`, deprecated since TensorRT 8.x
   size_t max_batch_size = 32;
+
+  /// `max_workspace_size` for TensorRT
   size_t max_workspace_size = 1 << 30;
+
+  /*
+   * @brief Enable half precision inference; on devices that do not support half precision, it will fall back to float32 mode
+   */
+  bool enable_fp16 = false;
+
+  /** \brief Set the shape range of an input tensor for models with dynamic input shape while using the TensorRT backend
+   *
+   * \param[in] tensor_name The name of the dynamic-shape input of the model
+   * \param[in] min The minimum shape for the input tensor
+   * \param[in] opt The optimized shape for the input tensor, just set the most common shape; if left as the default value, it will keep the same as min_shape
+   * \param[in] max The maximum shape for the input tensor; if left as the default value, it will keep the same as min_shape
+   */
+  void SetShape(const std::string& tensor_name,
+                const std::vector<int32_t>& min,
+                const std::vector<int32_t>& opt,
+                const std::vector<int32_t>& max) {
+    min_shape[tensor_name].clear();
+    max_shape[tensor_name].clear();
+    opt_shape[tensor_name].clear();
+    min_shape[tensor_name].assign(min.begin(), min.end());
+    if (opt.size() == 0) {
+      opt_shape[tensor_name].assign(min.begin(), min.end());
+    } else {
+      opt_shape[tensor_name].assign(opt.begin(), opt.end());
+    }
+    if (max.size() == 0) {
+      max_shape[tensor_name].assign(min.begin(), min.end());
+    } else {
+      max_shape[tensor_name].assign(max.begin(), max.end());
+    }
+  }
+  /**
+   * @brief Engine cache file path while using the TensorRT backend. Loading a Paddle/ONNX model and building the TensorRT engine takes a long time; with this option the built engine is serialized to `serialize_file` and loaded directly the next time the code runs
+   */
+  std::string serialize_file = "";
+
+  // The following parameters may be removed in the next version, please do not
+  // visit or use them directly
   std::map<std::string, std::vector<int32_t>> max_shape;
   std::map<std::string, std::vector<int32_t>> min_shape;
   std::map<std::string, std::vector<int32_t>> opt_shape;
-  std::string serialize_file = "";
   bool enable_pinned_memory = false;
   void* external_stream_ = nullptr;
+  int gpu_id = 0;
+  std::string model_file = "";   // Path of model file
+  std::string params_file = "";  // Path of parameters file, can be empty
+  // format of input model
+  ModelFormat model_format = ModelFormat::AUTOREC;
 };
+
+
 }  // namespace fastdeploy
diff --git a/fastdeploy/runtime/backends/tensorrt/option_pybind.cc b/fastdeploy/runtime/backends/tensorrt/option_pybind.cc
new file mode 100644
index 000000000..d781256a5
--- /dev/null
+++ b/fastdeploy/runtime/backends/tensorrt/option_pybind.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
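A minimal sketch of how the reorganized TrtBackendOption above is meant to be used through RuntimeOption; SetShape, serialize_file, and trt_option come from this patch, while the input tensor name and shapes are illustrative assumptions.

  fastdeploy::RuntimeOption option;
  option.UseGpu(0);
  option.UseTrtBackend();
  // "x" is a hypothetical dynamic-shape input; arguments follow SetShape(name, min, opt, max)
  option.trt_option.SetShape("x", {1, 3, 224, 224}, {1, 3, 224, 224}, {8, 3, 224, 224});
  // Cache the built engine so later runs can skip the TensorRT build step
  option.trt_option.serialize_file = "./model.trt";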
+ +#include "fastdeploy/pybind/main.h" +#include "fastdeploy/runtime/backends/tensorrt/option.h" + +namespace fastdeploy { + +void BindTrtOption(pybind11::module& m) { + pybind11::class_(m, "TrtBackendOption") + .def(pybind11::init()) + .def_readwrite("enable_fp16", &TrtBackendOption::enable_fp16) + .def_readwrite("max_batch_size", &TrtBackendOption::max_batch_size) + .def_readwrite("max_workspace_size", + &TrtBackendOption::max_workspace_size) + .def_readwrite("serialize_file", &TrtBackendOption::serialize_file) + .def("set_shape", &TrtBackendOption::SetShape); +} + +} // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/tensorrt/trt_backend.cc b/fastdeploy/runtime/backends/tensorrt/trt_backend.cc index 6972cf8ed..74bd3ae4f 100644 --- a/fastdeploy/runtime/backends/tensorrt/trt_backend.cc +++ b/fastdeploy/runtime/backends/tensorrt/trt_backend.cc @@ -113,6 +113,50 @@ bool TrtBackend::LoadTrtCache(const std::string& trt_engine_file) { return true; } +bool TrtBackend::Init(const RuntimeOption& runtime_option) { + if (runtime_option.device != Device::GPU) { + FDERROR << "TrtBackend only supports Device::GPU, but now it's " + << runtime_option.device << "." << std::endl; + return false; + } + if (runtime_option.model_format != ModelFormat::PADDLE && + runtime_option.model_format != ModelFormat::ONNX) { + FDERROR + << "TrtBackend only supports model format PADDLE/ONNX, but now it's " + << runtime_option.model_format << "." << std::endl; + return false; + } + if (runtime_option.model_format == ModelFormat::PADDLE) { + if (runtime_option.model_from_memory_) { + return InitFromPaddle(runtime_option.model_file, + runtime_option.params_file, + runtime_option.trt_option); + } else { + std::string model_buffer; + std::string params_buffer; + FDASSERT(ReadBinaryFromFile(runtime_option.model_file, &model_buffer), + "Failed to read model file %s.", + runtime_option.model_file.c_str()); + FDASSERT(ReadBinaryFromFile(runtime_option.params_file, ¶ms_buffer), + "Failed to read parameters file %s.", + runtime_option.params_file.c_str()); + return InitFromPaddle(model_buffer, params_buffer, + runtime_option.trt_option); + } + } else { + if (runtime_option.model_from_memory_) { + return InitFromOnnx(runtime_option.model_file, runtime_option.trt_option); + } else { + std::string model_buffer; + FDASSERT(ReadBinaryFromFile(runtime_option.model_file, &model_buffer), + "Failed to read model file %s.", + runtime_option.model_file.c_str()); + return InitFromOnnx(model_buffer, runtime_option.trt_option); + } + } + return true; +} + bool TrtBackend::InitFromPaddle(const std::string& model_buffer, const std::string& params_buffer, const TrtBackendOption& option, bool verbose) { @@ -287,14 +331,18 @@ bool TrtBackend::Infer(std::vector& inputs, BuildTrtEngine(); } + RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN cudaSetDevice(option_.gpu_id); SetInputs(inputs); AllocateOutputsBuffer(outputs, copy_to_fd); + RUNTIME_PROFILE_LOOP_BEGIN(1) if (!context_->enqueueV2(bindings_.data(), stream_, nullptr)) { FDERROR << "Failed to Infer with TensorRT." 
<< std::endl; return false; } + RUNTIME_PROFILE_LOOP_END + for (size_t i = 0; i < outputs->size(); ++i) { // if the final output tensor's dtype is different from the model output // tensor's dtype, then we need cast the data to the final output's dtype @@ -335,7 +383,7 @@ bool TrtBackend::Infer(std::vector& inputs, FDASSERT(cudaStreamSynchronize(stream_) == cudaSuccess, "[ERROR] Error occurs while sync cuda stream."); } - + RUNTIME_PROFILE_LOOP_H2D_D2H_END return true; } diff --git a/fastdeploy/runtime/backends/tensorrt/trt_backend.h b/fastdeploy/runtime/backends/tensorrt/trt_backend.h index 84698ac9f..74d1da36f 100755 --- a/fastdeploy/runtime/backends/tensorrt/trt_backend.h +++ b/fastdeploy/runtime/backends/tensorrt/trt_backend.h @@ -70,14 +70,8 @@ FDDataType GetFDDataType(const nvinfer1::DataType& dtype); class TrtBackend : public BaseBackend { public: TrtBackend() : engine_(nullptr), context_(nullptr) {} - void BuildOption(const TrtBackendOption& option); - bool InitFromPaddle(const std::string& model_buffer, - const std::string& params_buffer, - const TrtBackendOption& option = TrtBackendOption(), - bool verbose = false); - bool InitFromOnnx(const std::string& model_buffer, - const TrtBackendOption& option = TrtBackendOption()); + bool Init(const RuntimeOption& runtime_option); bool Infer(std::vector& inputs, std::vector* outputs, bool copy_to_fd = true) override; @@ -98,6 +92,15 @@ class TrtBackend : public BaseBackend { } private: + void BuildOption(const TrtBackendOption& option); + + bool InitFromPaddle(const std::string& model_buffer, + const std::string& params_buffer, + const TrtBackendOption& option = TrtBackendOption(), + bool verbose = false); + bool InitFromOnnx(const std::string& model_buffer, + const TrtBackendOption& option = TrtBackendOption()); + TrtBackendOption option_; std::shared_ptr engine_; std::shared_ptr context_; diff --git a/fastdeploy/runtime/option_pybind.cc b/fastdeploy/runtime/option_pybind.cc new file mode 100644 index 000000000..1c786459b --- /dev/null +++ b/fastdeploy/runtime/option_pybind.cc @@ -0,0 +1,73 @@ +// Cropyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
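As a usage note, a rough end-to-end sketch of how the option objects bound below fit together with the new profiling hooks; only EnableProfiling, GetProfileTime, and the per-backend option members are taken from this patch, while Runtime::Init/Infer and the model paths are assumed from the existing FastDeploy API, and profiling may require a build with ENABLE_BENCHMARK=ON.

  fastdeploy::RuntimeOption option;
  option.SetModelPath("model.pdmodel", "model.pdiparams");  // hypothetical paths
  option.UseCpu();
  option.UseOrtBackend();
  option.ort_option.intra_op_num_threads = 4;  // per-backend options are plain members now
  option.EnableProfiling();                    // assumed to work with its default arguments
  fastdeploy::Runtime runtime;
  runtime.Init(option);                        // Runtime::Init assumed from the existing API
  // ... run runtime.Infer(...) as usual ...
  double profile_time = runtime.GetProfileTime();  // runtime time collected by the backend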
+ +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { + +void BindLiteOption(pybind11::module& m); +void BindOpenVINOOption(pybind11::module& m); +void BindOrtOption(pybind11::module& m); +void BindTrtOption(pybind11::module& m); +void BindPaddleOption(pybind11::module& m); +void BindPorosOption(pybind11::module& m); + +void BindOption(pybind11::module& m) { + BindLiteOption(m); + BindOpenVINOOption(m); + BindOrtOption(m); + BindTrtOption(m); + BindPaddleOption(m); + BindPorosOption(m); + + pybind11::class_(m, "RuntimeOption") + .def(pybind11::init()) + .def("set_model_path", &RuntimeOption::SetModelPath) + .def("set_model_buffer", &RuntimeOption::SetModelBuffer) + .def("use_gpu", &RuntimeOption::UseGpu) + .def("use_cpu", &RuntimeOption::UseCpu) + .def("use_rknpu2", &RuntimeOption::UseRKNPU2) + .def("use_sophgo", &RuntimeOption::UseSophgo) + .def("use_ascend", &RuntimeOption::UseAscend) + .def("use_kunlunxin", &RuntimeOption::UseKunlunXin) + .def_readwrite("paddle_lite_option", &RuntimeOption::paddle_lite_option) + .def_readwrite("openvino_option", &RuntimeOption::openvino_option) + .def_readwrite("ort_option", &RuntimeOption::ort_option) + .def_readwrite("trt_option", &RuntimeOption::trt_option) + .def_readwrite("poros_option", &RuntimeOption::poros_option) + .def_readwrite("paddle_infer_option", &RuntimeOption::paddle_infer_option) + .def("set_external_stream", &RuntimeOption::SetExternalStream) + .def("set_cpu_thread_num", &RuntimeOption::SetCpuThreadNum) + .def("use_paddle_backend", &RuntimeOption::UsePaddleBackend) + .def("use_poros_backend", &RuntimeOption::UsePorosBackend) + .def("use_ort_backend", &RuntimeOption::UseOrtBackend) + .def("use_trt_backend", &RuntimeOption::UseTrtBackend) + .def("use_openvino_backend", &RuntimeOption::UseOpenVINOBackend) + .def("use_lite_backend", &RuntimeOption::UseLiteBackend) + .def("enable_pinned_memory", &RuntimeOption::EnablePinnedMemory) + .def("disable_pinned_memory", &RuntimeOption::DisablePinnedMemory) + .def("use_ipu", &RuntimeOption::UseIpu) + .def("enable_profiling", &RuntimeOption::EnableProfiling) + .def("disable_profiling", &RuntimeOption::DisableProfiling) + .def_readwrite("model_file", &RuntimeOption::model_file) + .def_readwrite("params_file", &RuntimeOption::params_file) + .def_readwrite("model_format", &RuntimeOption::model_format) + .def_readwrite("backend", &RuntimeOption::backend) + .def_readwrite("external_stream", &RuntimeOption::external_stream_) + .def_readwrite("model_from_memory", &RuntimeOption::model_from_memory_) + .def_readwrite("cpu_thread_num", &RuntimeOption::cpu_thread_num) + .def_readwrite("device_id", &RuntimeOption::device_id) + .def_readwrite("device", &RuntimeOption::device); +} +} // namespace fastdeploy diff --git a/fastdeploy/runtime/runtime.cc b/fastdeploy/runtime/runtime.cc index f0347805d..70714e4f0 100644 --- a/fastdeploy/runtime/runtime.cc +++ b/fastdeploy/runtime/runtime.cc @@ -226,59 +226,24 @@ void Runtime::CreatePaddleBackend() { option.model_format == ModelFormat::PADDLE, "Backend::PDINFER only supports model format of ModelFormat::PADDLE."); #ifdef ENABLE_PADDLE_BACKEND - auto pd_option = PaddleBackendOption(); - pd_option.model_file = option.model_file; - pd_option.params_file = option.params_file; - pd_option.enable_mkldnn = option.pd_enable_mkldnn; - pd_option.enable_log_info = option.pd_enable_log_info; - pd_option.mkldnn_cache_size = option.pd_mkldnn_cache_size; - pd_option.use_gpu = (option.device == Device::GPU) ? 
true : false; - pd_option.use_ipu = (option.device == Device::IPU) ? true : false; - pd_option.gpu_id = option.device_id; - pd_option.delete_pass_names = option.pd_delete_pass_names; - pd_option.cpu_thread_num = option.cpu_thread_num; - pd_option.enable_pinned_memory = option.enable_pinned_memory; - pd_option.external_stream_ = option.external_stream_; - pd_option.model_from_memory_ = option.model_from_memory_; -#ifdef ENABLE_TRT_BACKEND - if (pd_option.use_gpu && option.pd_enable_trt) { - pd_option.enable_trt = true; - pd_option.collect_shape = option.pd_collect_shape; - auto trt_option = TrtBackendOption(); - trt_option.gpu_id = option.device_id; - trt_option.enable_fp16 = option.trt_enable_fp16; - trt_option.max_batch_size = option.trt_max_batch_size; - trt_option.max_workspace_size = option.trt_max_workspace_size; - trt_option.max_shape = option.trt_max_shape; - trt_option.min_shape = option.trt_min_shape; - trt_option.opt_shape = option.trt_opt_shape; - trt_option.serialize_file = option.trt_serialize_file; - trt_option.enable_pinned_memory = option.enable_pinned_memory; - pd_option.trt_option = trt_option; - pd_option.trt_disabled_ops_ = option.trt_disabled_ops_; - } -#endif -#ifdef WITH_IPU - if (pd_option.use_ipu) { - auto ipu_option = IpuOption(); - ipu_option.ipu_device_num = option.ipu_device_num; - ipu_option.ipu_micro_batch_size = option.ipu_micro_batch_size; - ipu_option.ipu_enable_pipelining = option.ipu_enable_pipelining; - ipu_option.ipu_batches_per_step = option.ipu_batches_per_step; - ipu_option.ipu_enable_fp16 = option.ipu_enable_fp16; - ipu_option.ipu_replica_num = option.ipu_replica_num; - ipu_option.ipu_available_memory_proportion = - option.ipu_available_memory_proportion; - ipu_option.ipu_enable_half_partial = option.ipu_enable_half_partial; - pd_option.ipu_option = ipu_option; - } -#endif + option.paddle_infer_option.model_file = option.model_file; + option.paddle_infer_option.params_file = option.params_file; + option.paddle_infer_option.model_from_memory_ = option.model_from_memory_; + option.paddle_infer_option.device = option.device; + option.paddle_infer_option.device_id = option.device_id; + option.paddle_infer_option.enable_pinned_memory = option.enable_pinned_memory; + option.paddle_infer_option.external_stream_ = option.external_stream_; + option.paddle_infer_option.trt_option = option.trt_option; + option.paddle_infer_option.trt_option.gpu_id = option.device_id; backend_ = utils::make_unique(); auto casted_backend = dynamic_cast(backend_.get()); - if (pd_option.model_from_memory_) { - FDASSERT(casted_backend->InitFromPaddle(option.model_file, - option.params_file, pd_option), - "Load model from Paddle failed while initliazing PaddleBackend."); + casted_backend->benchmark_option_ = option.benchmark_option; + + if (option.model_from_memory_) { + FDASSERT( + casted_backend->InitFromPaddle(option.model_file, option.params_file, + option.paddle_infer_option), + "Load model from Paddle failed while initliazing PaddleBackend."); ReleaseModelMemoryBuffer(); } else { std::string model_buffer = ""; @@ -287,9 +252,9 @@ void Runtime::CreatePaddleBackend() { "Fail to read binary from model file"); FDASSERT(ReadBinaryFromFile(option.params_file, ¶ms_buffer), "Fail to read binary from parameter file"); - FDASSERT( - casted_backend->InitFromPaddle(model_buffer, params_buffer, pd_option), - "Load model from Paddle failed while initliazing PaddleBackend."); + FDASSERT(casted_backend->InitFromPaddle(model_buffer, params_buffer, + option.paddle_infer_option), + "Load model 
from Paddle failed while initliazing PaddleBackend."); } #else FDASSERT(false, @@ -301,36 +266,10 @@ void Runtime::CreatePaddleBackend() { } void Runtime::CreateOpenVINOBackend() { - // TODO(huangjianhui) OpenVINO only supports to load ONNX format model from - // memory Temporarily disable this function - FDASSERT(option.model_from_memory_ == false, - "OpenVINOBackend don't support to load model from memory"); - FDASSERT(option.device == Device::CPU, - "Backend::OPENVINO only supports Device::CPU"); - FDASSERT(option.model_format == ModelFormat::PADDLE || - option.model_format == ModelFormat::ONNX, - "OpenVINOBackend only support model format of ModelFormat::PADDLE / " - "ModelFormat::ONNX."); #ifdef ENABLE_OPENVINO_BACKEND - auto ov_option = OpenVINOBackendOption(); - ov_option.cpu_thread_num = option.cpu_thread_num; - ov_option.device = option.openvino_device; - ov_option.shape_infos = option.ov_shape_infos; - ov_option.num_streams = option.ov_num_streams; - for (const auto& op : option.ov_cpu_operators) { - ov_option.cpu_operators.insert(op); - } backend_ = utils::make_unique(); - auto casted_backend = dynamic_cast(backend_.get()); - - if (option.model_format == ModelFormat::ONNX) { - FDASSERT(casted_backend->InitFromOnnx(option.model_file, ov_option), - "Load model from ONNX failed while initliazing OrtBackend."); - } else { - FDASSERT(casted_backend->InitFromPaddle(option.model_file, - option.params_file, ov_option), - "Load model from Paddle failed while initliazing OrtBackend."); - } + backend_->benchmark_option_ = option.benchmark_option; + FDASSERT(backend_->Init(option), "Failed to initialize OpenVINOBackend."); #else FDASSERT(false, "OpenVINOBackend is not available, please compiled with " @@ -343,6 +282,8 @@ void Runtime::CreateOpenVINOBackend() { void Runtime::CreateOrtBackend() { #ifdef ENABLE_ORT_BACKEND backend_ = utils::make_unique(); + backend_->benchmark_option_ = option.benchmark_option; + FDASSERT(backend_->Init(option), "Failed to initialize Backend::ORT."); #else FDASSERT(false, @@ -354,60 +295,16 @@ void Runtime::CreateOrtBackend() { } void Runtime::CreateTrtBackend() { - FDASSERT(option.device == Device::GPU, - "Backend::TRT only supports Device::GPU."); - FDASSERT(option.model_format == ModelFormat::PADDLE || - option.model_format == ModelFormat::ONNX, - "TrtBackend only support model format of ModelFormat::PADDLE / " - "ModelFormat::ONNX."); #ifdef ENABLE_TRT_BACKEND - auto trt_option = TrtBackendOption(); - trt_option.model_file = option.model_file; - trt_option.params_file = option.params_file; - trt_option.model_format = option.model_format; - trt_option.gpu_id = option.device_id; - trt_option.enable_fp16 = option.trt_enable_fp16; - trt_option.enable_int8 = option.trt_enable_int8; - trt_option.max_batch_size = option.trt_max_batch_size; - trt_option.max_workspace_size = option.trt_max_workspace_size; - trt_option.max_shape = option.trt_max_shape; - trt_option.min_shape = option.trt_min_shape; - trt_option.opt_shape = option.trt_opt_shape; - trt_option.serialize_file = option.trt_serialize_file; - trt_option.enable_pinned_memory = option.enable_pinned_memory; - trt_option.external_stream_ = option.external_stream_; + option.trt_option.model_file = option.model_file; + option.trt_option.params_file = option.params_file; + option.trt_option.model_format = option.model_format; + option.trt_option.gpu_id = option.device_id; + option.trt_option.enable_pinned_memory = option.enable_pinned_memory; + option.trt_option.external_stream_ = option.external_stream_; 
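+  // The backend-specific settings now live in option.trt_option itself, so the
+  // runtime only forwards the model/device fields here and lets the backend's
+  // Init(option) read everything else from the unified RuntimeOption.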
backend_ = utils::make_unique(); - auto casted_backend = dynamic_cast(backend_.get()); - if (option.model_format == ModelFormat::ONNX) { - if (option.model_from_memory_) { - FDASSERT(casted_backend->InitFromOnnx(option.model_file, trt_option), - "Load model from ONNX failed while initliazing TrtBackend."); - ReleaseModelMemoryBuffer(); - } else { - std::string model_buffer = ""; - FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer), - "Fail to read binary from model file"); - FDASSERT(casted_backend->InitFromOnnx(model_buffer, trt_option), - "Load model from ONNX failed while initliazing TrtBackend."); - } - } else { - if (option.model_from_memory_) { - FDASSERT(casted_backend->InitFromPaddle(option.model_file, - option.params_file, trt_option), - "Load model from Paddle failed while initliazing TrtBackend."); - ReleaseModelMemoryBuffer(); - } else { - std::string model_buffer = ""; - std::string params_buffer = ""; - FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer), - "Fail to read binary from model file"); - FDASSERT(ReadBinaryFromFile(option.params_file, ¶ms_buffer), - "Fail to read binary from parameter file"); - FDASSERT(casted_backend->InitFromPaddle(model_buffer, params_buffer, - trt_option), - "Load model from Paddle failed while initliazing TrtBackend."); - } - } + backend_->benchmark_option_ = option.benchmark_option; + FDASSERT(backend_->Init(option), "Failed to initialize TensorRT backend."); #else FDASSERT(false, "TrtBackend is not available, please compiled with " @@ -419,27 +316,18 @@ void Runtime::CreateTrtBackend() { void Runtime::CreateLiteBackend() { #ifdef ENABLE_LITE_BACKEND - FDASSERT(option.model_from_memory_ == false, - "LiteBackend don't support to load model from memory"); - FDASSERT(option.device == Device::CPU || option.device == Device::TIMVX || - option.device == Device::KUNLUNXIN || - option.device == Device::ASCEND, - "Backend::LITE only supports " - "Device::CPU/Device::TIMVX/Device::KUNLUNXIN/Device::ASCEND."); - FDASSERT(option.model_format == ModelFormat::PADDLE, - "LiteBackend only support model format of ModelFormat::PADDLE"); backend_ = utils::make_unique(); - auto casted_backend = dynamic_cast(backend_.get()); - FDASSERT(casted_backend->InitFromPaddle(option.model_file, option.params_file, - option.paddle_lite_option), + backend_->benchmark_option_ = option.benchmark_option; + + FDASSERT(backend_->Init(option), "Load model from nb file failed while initializing LiteBackend."); #else FDASSERT(false, "LiteBackend is not available, please compiled with " "ENABLE_LITE_BACKEND=ON."); #endif - FDINFO << "Runtime initialized with Backend::LITE in " << option.device << "." - << std::endl; + FDINFO << "Runtime initialized with Backend::PDLITE in " << option.device + << "." 
<< std::endl; } void Runtime::CreateRKNPU2Backend() { @@ -468,18 +356,8 @@ void Runtime::CreateRKNPU2Backend() { void Runtime::CreateSophgoNPUBackend() { #ifdef ENABLE_SOPHGO_BACKEND - auto sophgo_option = SophgoBackendOption(); - FDASSERT(option.model_from_memory_ == false, - "SophgoBackend don't support to load model from memory"); - FDASSERT(option.device == Device::SOPHGOTPUD, - "Backend::SOPHGO only supports Device::SOPHGO"); - FDASSERT(option.model_format == ModelFormat::SOPHGO, - "SophgoBackend only support model format of ModelFormat::SOPHGO"); - auto sophgo_option = SophgoBackendOption(); backend_ = utils::make_unique(); - auto casted_backend = dynamic_cast(backend_.get()); - FDASSERT(casted_backend->InitFromSophgo(option.model_file, sophgo_option), - "Load model from nb file failed while initializing LiteBackend."); + FDASSERT(backend_->Init(option), "Failed to initialize Sophgo backend."); #else FDASSERT(false, "SophgoBackend is not available, please compiled with " @@ -513,25 +391,25 @@ Runtime* Runtime::Clone(void* stream, int device_id) { bool Runtime::Compile(std::vector>& prewarm_tensors, const RuntimeOption& _option) { #ifdef ENABLE_POROS_BACKEND - option = _option; - auto poros_option = PorosBackendOption(); - poros_option.use_gpu = (option.device == Device::GPU) ? true : false; - poros_option.gpu_id = option.device_id; - poros_option.long_to_int = option.long_to_int; - poros_option.use_nvidia_tf32 = option.use_nvidia_tf32; - poros_option.unconst_ops_thres = option.unconst_ops_thres; - poros_option.poros_file = option.poros_file; - poros_option.is_dynamic = option.is_dynamic; - poros_option.enable_fp16 = option.trt_enable_fp16; - poros_option.max_batch_size = option.trt_max_batch_size; - poros_option.max_workspace_size = option.trt_max_workspace_size; FDASSERT( option.model_format == ModelFormat::TORCHSCRIPT, "PorosBackend only support model format of ModelFormat::TORCHSCRIPT."); + if (option.device != Device::CPU && option.device != Device::GPU) { + FDERROR << "PorosBackend only supports CPU/GPU, but now its " + << option.device << "." << std::endl; + return false; + } + option.poros_option.device = option.device; + option.poros_option.device_id = option.device_id; + option.poros_option.enable_fp16 = option.trt_option.enable_fp16; + option.poros_option.max_batch_size = option.trt_option.max_batch_size; + option.poros_option.max_workspace_size = option.trt_option.max_workspace_size; + backend_ = utils::make_unique(); auto casted_backend = dynamic_cast(backend_.get()); FDASSERT( - casted_backend->Compile(option.model_file, prewarm_tensors, poros_option), + casted_backend->Compile(option.model_file, prewarm_tensors, + option.poros_option), "Load model from Torchscript failed while initliazing PorosBackend."); #else FDASSERT(false, diff --git a/fastdeploy/runtime/runtime.h b/fastdeploy/runtime/runtime.h index 22a09c355..6e7dc9629 100755 --- a/fastdeploy/runtime/runtime.h +++ b/fastdeploy/runtime/runtime.h @@ -95,6 +95,11 @@ struct FASTDEPLOY_DECL Runtime { */ bool Compile(std::vector>& prewarm_tensors, const RuntimeOption& _option); + /** \brief Get profile time of Runtime after the profile process is done. 
+ */ + double GetProfileTime() { + return backend_->benchmark_result_.time_of_runtime; + } private: void CreateOrtBackend(); diff --git a/fastdeploy/runtime/runtime_option.cc b/fastdeploy/runtime/runtime_option.cc index 7dee5365a..c09352d58 100644 --- a/fastdeploy/runtime/runtime_option.cc +++ b/fastdeploy/runtime/runtime_option.cc @@ -98,9 +98,15 @@ void RuntimeOption::SetCpuThreadNum(int thread_num) { cpu_thread_num = thread_num; paddle_lite_option.cpu_threads = thread_num; ort_option.intra_op_num_threads = thread_num; + openvino_option.cpu_thread_num = thread_num; + paddle_infer_option.cpu_thread_num = thread_num; } void RuntimeOption::SetOrtGraphOptLevel(int level) { + FDWARNING << "`RuntimeOption::SetOrtGraphOptLevel` will be removed in " + "v1.2.0, please modify its member variables directly, e.g " + "`runtime_option.ort_option.graph_optimization_level = 99`." + << std::endl; std::vector supported_level{-1, 0, 1, 2}; auto valid_level = std::find(supported_level.begin(), supported_level.end(), level) != supported_level.end(); @@ -169,25 +175,47 @@ void RuntimeOption::UseLiteBackend() { } void RuntimeOption::SetPaddleMKLDNN(bool pd_mkldnn) { - pd_enable_mkldnn = pd_mkldnn; + FDWARNING << "`RuntimeOption::SetPaddleMKLDNN` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`option.paddle_infer_option.enable_mkldnn = true`" + << std::endl; + paddle_infer_option.enable_mkldnn = pd_mkldnn; } void RuntimeOption::DeletePaddleBackendPass(const std::string& pass_name) { - pd_delete_pass_names.push_back(pass_name); + FDWARNING + << "`RuntimeOption::DeletePaddleBackendPass` will be removed in v1.2.0, " + "please use `option.paddle_infer_option.DeletePass` instead." + << std::endl; + paddle_infer_option.DeletePass(pass_name); +} +void RuntimeOption::EnablePaddleLogInfo() { + FDWARNING << "`RuntimeOption::EnablePaddleLogInfo` will be removed in " + "v1.2.0, please modify its member variable directly, e.g " + "`option.paddle_infer_option.enable_log_info = true`" + << std::endl; + paddle_infer_option.enable_log_info = true; } -void RuntimeOption::EnablePaddleLogInfo() { pd_enable_log_info = true; } -void RuntimeOption::DisablePaddleLogInfo() { pd_enable_log_info = false; } +void RuntimeOption::DisablePaddleLogInfo() { + FDWARNING << "`RuntimeOption::DisablePaddleLogInfo` will be removed in " + "v1.2.0, please modify its member variable directly, e.g " + "`option.paddle_infer_option.enable_log_info = false`" + << std::endl; + paddle_infer_option.enable_log_info = false; +} void RuntimeOption::EnablePaddleToTrt() { - FDASSERT(backend == Backend::TRT, - "Should call UseTrtBackend() before call EnablePaddleToTrt()."); #ifdef ENABLE_PADDLE_BACKEND + FDWARNING << "`RuntimeOption::EnablePaddleToTrt` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`option.paddle_infer_option.enable_trt = true`" + << std::endl; FDINFO << "While using TrtBackend with EnablePaddleToTrt, FastDeploy will " "change to use Paddle Inference Backend." 
<< std::endl; backend = Backend::PDINFER; - pd_enable_trt = true; + paddle_infer_option.enable_trt = true; #else FDASSERT(false, "While using TrtBackend with EnablePaddleToTrt, require the " @@ -197,72 +225,135 @@ void RuntimeOption::EnablePaddleToTrt() { } void RuntimeOption::SetPaddleMKLDNNCacheSize(int size) { - FDASSERT(size > 0, "Parameter size must greater than 0."); - pd_mkldnn_cache_size = size; + FDWARNING << "`RuntimeOption::SetPaddleMKLDNNCacheSize` will be removed in " + "v1.2.0, please modify its member variable directly, e.g " + "`option.paddle_infer_option.mkldnn_cache_size = size`." + << std::endl; + paddle_infer_option.mkldnn_cache_size = size; } void RuntimeOption::SetOpenVINODevice(const std::string& name) { - openvino_device = name; + FDWARNING << "`RuntimeOption::SetOpenVINODevice` will be removed in v1.2.0, " + "please use `RuntimeOption.openvino_option.SetDeivce(const " + "std::string&)` instead." + << std::endl; + openvino_option.SetDevice(name); } -void RuntimeOption::EnableLiteFP16() { paddle_lite_option.enable_fp16 = true; } +void RuntimeOption::EnableLiteFP16() { + FDWARNING << "`RuntimeOption::EnableLiteFP16` will be removed in v1.2.0, " + "please modify its member variables directly, e.g " + "`runtime_option.paddle_lite_option.enable_fp16 = true`" + << std::endl; + paddle_lite_option.enable_fp16 = true; +} void RuntimeOption::DisableLiteFP16() { + FDWARNING << "`RuntimeOption::EnableLiteFP16` will be removed in v1.2.0, " + "please modify its member variables directly, e.g " + "`runtime_option.paddle_lite_option.enable_fp16 = false`" + << std::endl; paddle_lite_option.enable_fp16 = false; } -void RuntimeOption::EnableLiteInt8() { paddle_lite_option.enable_int8 = true; } +void RuntimeOption::EnableLiteInt8() { + FDWARNING << "RuntimeOption::EnableLiteInt8 is a useless api, this calling " + "will not bring any effects, and will be removed in v1.2.0. if " + "you load a quantized model, it will automatically run with " + "int8 mode; otherwise it will run with float mode." + << std::endl; +} void RuntimeOption::DisableLiteInt8() { - paddle_lite_option.enable_int8 = false; + FDWARNING << "RuntimeOption::DisableLiteInt8 is a useless api, this calling " + "will not bring any effects, and will be removed in v1.2.0. if " + "you load a quantized model, it will automatically run with " + "int8 mode; otherwise it will run with float mode." 
+ << std::endl; } void RuntimeOption::SetLitePowerMode(LitePowerMode mode) { + FDWARNING << "`RuntimeOption::SetLitePowerMode` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`runtime_option.paddle_lite_option.power_mode = 3;`" + << std::endl; paddle_lite_option.power_mode = mode; } void RuntimeOption::SetLiteOptimizedModelDir( const std::string& optimized_model_dir) { + FDWARNING + << "`RuntimeOption::SetLiteOptimizedModelDir` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`runtime_option.paddle_lite_option.optimized_model_dir = \"...\"`" + << std::endl; paddle_lite_option.optimized_model_dir = optimized_model_dir; } void RuntimeOption::SetLiteSubgraphPartitionPath( const std::string& nnadapter_subgraph_partition_config_path) { + FDWARNING << "`RuntimeOption::SetLiteSubgraphPartitionPath` will be removed " + "in v1.2.0, please modify its member variable directly, e.g " + "`runtime_option.paddle_lite_option.nnadapter_subgraph_" + "partition_config_path = \"...\";` " + << std::endl; paddle_lite_option.nnadapter_subgraph_partition_config_path = nnadapter_subgraph_partition_config_path; } void RuntimeOption::SetLiteSubgraphPartitionConfigBuffer( const std::string& nnadapter_subgraph_partition_config_buffer) { + FDWARNING + << "`RuntimeOption::SetLiteSubgraphPartitionConfigBuffer` will be " + "removed in v1.2.0, please modify its member variable directly, e.g " + "`runtime_option.paddle_lite_option.nnadapter_subgraph_partition_" + "config_buffer = ...`" + << std::endl; paddle_lite_option.nnadapter_subgraph_partition_config_buffer = nnadapter_subgraph_partition_config_buffer; } -void RuntimeOption::SetLiteDeviceNames( - const std::vector& nnadapter_device_names) { - paddle_lite_option.nnadapter_device_names = nnadapter_device_names; -} - void RuntimeOption::SetLiteContextProperties( const std::string& nnadapter_context_properties) { + FDWARNING << "`RuntimeOption::SetLiteContextProperties` will be removed in " + "v1.2.0, please modify its member variable directly, e.g " + "`runtime_option.paddle_lite_option.nnadapter_context_" + "properties = ...`" + << std::endl; paddle_lite_option.nnadapter_context_properties = nnadapter_context_properties; } void RuntimeOption::SetLiteModelCacheDir( const std::string& nnadapter_model_cache_dir) { + FDWARNING + << "`RuntimeOption::SetLiteModelCacheDir` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`runtime_option.paddle_lite_option.nnadapter_model_cache_dir = ...`" + << std::endl; paddle_lite_option.nnadapter_model_cache_dir = nnadapter_model_cache_dir; } void RuntimeOption::SetLiteDynamicShapeInfo( const std::map>>& nnadapter_dynamic_shape_info) { + FDWARNING << "`RuntimeOption::SetLiteDynamicShapeInfo` will be removed in " + "v1.2.0, please modify its member variable directly, e.g " + "`runtime_option.paddle_lite_option.paddle_lite_option." 
+ "nnadapter_dynamic_shape_info = ...`" + << std::endl; paddle_lite_option.nnadapter_dynamic_shape_info = nnadapter_dynamic_shape_info; } void RuntimeOption::SetLiteMixedPrecisionQuantizationConfigPath( const std::string& nnadapter_mixed_precision_quantization_config_path) { + FDWARNING + << "`RuntimeOption::SetLiteMixedPrecisionQuantizationConfigPath` will be " + "removed in v1.2.0, please modify its member variable directly, e.g " + "`runtime_option.paddle_lite_option.paddle_lite_option.nnadapter_" + "mixed_precision_quantization_config_path = ...`" + << std::endl; paddle_lite_option.nnadapter_mixed_precision_quantization_config_path = nnadapter_mixed_precision_quantization_config_path; } @@ -271,51 +362,85 @@ void RuntimeOption::SetTrtInputShape(const std::string& input_name, const std::vector& min_shape, const std::vector& opt_shape, const std::vector& max_shape) { - trt_min_shape[input_name].clear(); - trt_max_shape[input_name].clear(); - trt_opt_shape[input_name].clear(); - trt_min_shape[input_name].assign(min_shape.begin(), min_shape.end()); - if (opt_shape.size() == 0) { - trt_opt_shape[input_name].assign(min_shape.begin(), min_shape.end()); - } else { - trt_opt_shape[input_name].assign(opt_shape.begin(), opt_shape.end()); - } - if (max_shape.size() == 0) { - trt_max_shape[input_name].assign(min_shape.begin(), min_shape.end()); - } else { - trt_max_shape[input_name].assign(max_shape.begin(), max_shape.end()); - } + FDWARNING << "`RuntimeOption::SetTrtInputShape` will be removed in v1.2.0, " + "please use `RuntimeOption.trt_option.SetShape()` instead." + << std::endl; + trt_option.SetShape(input_name, min_shape, opt_shape, max_shape); } void RuntimeOption::SetTrtMaxWorkspaceSize(size_t max_workspace_size) { - trt_max_workspace_size = max_workspace_size; + FDWARNING << "`RuntimeOption::SetTrtMaxWorkspaceSize` will be removed in " + "v1.2.0, please modify its member variable directly, e.g " + "`RuntimeOption.trt_option.max_workspace_size = " + << max_workspace_size << "`." << std::endl; + trt_option.max_workspace_size = max_workspace_size; } void RuntimeOption::SetTrtMaxBatchSize(size_t max_batch_size) { - trt_max_batch_size = max_batch_size; + FDWARNING << "`RuntimeOption::SetTrtMaxBatchSize` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`RuntimeOption.trt_option.max_batch_size = " + << max_batch_size << "`." 
<< std::endl; + trt_option.max_batch_size = max_batch_size; } -void RuntimeOption::EnableTrtFP16() { trt_enable_fp16 = true; } +void RuntimeOption::EnableTrtFP16() { + FDWARNING << "`RuntimeOption::EnableTrtFP16` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`runtime_option.trt_option.enable_fp16 = true;`" + << std::endl; + trt_option.enable_fp16 = true; +} -void RuntimeOption::DisableTrtFP16() { trt_enable_fp16 = false; } +void RuntimeOption::DisableTrtFP16() { + FDWARNING << "`RuntimeOption::DisableTrtFP16` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`runtime_option.trt_option.enable_fp16 = false;`" + << std::endl; + trt_option.enable_fp16 = false; +} void RuntimeOption::EnablePinnedMemory() { enable_pinned_memory = true; } void RuntimeOption::DisablePinnedMemory() { enable_pinned_memory = false; } void RuntimeOption::SetTrtCacheFile(const std::string& cache_file_path) { - trt_serialize_file = cache_file_path; + FDWARNING << "`RuntimeOption::SetTrtCacheFile` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`runtime_option.trt_option.serialize_file = \"" + << cache_file_path << "\"." << std::endl; + trt_option.serialize_file = cache_file_path; } void RuntimeOption::SetOpenVINOStreams(int num_streams) { - ov_num_streams = num_streams; + FDWARNING << "`RuntimeOption::SetOpenVINOStreams` will be removed in v1.2.0, " + "please modify its member variable directly, e.g " + "`runtime_option.openvino_option.num_streams = " + << num_streams << "`." << std::endl; + openvino_option.num_streams = num_streams; } -void RuntimeOption::EnablePaddleTrtCollectShape() { pd_collect_shape = true; } +void RuntimeOption::EnablePaddleTrtCollectShape() { + FDWARNING << "`RuntimeOption::EnablePaddleTrtCollectShape` will be removed " + "in v1.2.0, please modify its member variable directly, e.g " + "runtime_option.paddle_infer_option.collect_trt_shape = true`." + << std::endl; + paddle_infer_option.collect_trt_shape = true; +} -void RuntimeOption::DisablePaddleTrtCollectShape() { pd_collect_shape = false; } +void RuntimeOption::DisablePaddleTrtCollectShape() { + FDWARNING << "`RuntimeOption::DisablePaddleTrtCollectShape` will be removed " + "in v1.2.0, please modify its member variable directly, e.g " + "runtime_option.paddle_infer_option.collect_trt_shape = false`." + << std::endl; + paddle_infer_option.collect_trt_shape = false; +} void RuntimeOption::DisablePaddleTrtOPs(const std::vector& ops) { - trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end()); + FDWARNING << "`RuntimeOption::DisablePaddleTrtOps` will be removed in " + "v.1.20, please use " + "`runtime_option.paddle_infer_option.DisableTrtOps` instead." 
+ << std::endl; + paddle_infer_option.DisableTrtOps(ops); } void RuntimeOption::UseIpu(int device_num, int micro_batch_size, @@ -333,13 +458,4 @@ void RuntimeOption::UseIpu(int device_num, int micro_batch_size, #endif } -void RuntimeOption::SetIpuConfig(bool enable_fp16, int replica_num, - float available_memory_proportion, - bool enable_half_partial) { - ipu_enable_fp16 = enable_fp16; - ipu_replica_num = replica_num; - ipu_available_memory_proportion = available_memory_proportion; - ipu_enable_half_partial = enable_half_partial; -} - } // namespace fastdeploy diff --git a/fastdeploy/runtime/runtime_option.h b/fastdeploy/runtime/runtime_option.h index 3aea02e0d..0aa6bbec8 100644 --- a/fastdeploy/runtime/runtime_option.h +++ b/fastdeploy/runtime/runtime_option.h @@ -32,6 +32,7 @@ #include "fastdeploy/runtime/backends/rknpu2/option.h" #include "fastdeploy/runtime/backends/sophgo/option.h" #include "fastdeploy/runtime/backends/tensorrt/option.h" +#include "fastdeploy/benchmark/option.h" namespace fastdeploy { @@ -60,22 +61,19 @@ struct FASTDEPLOY_DECL RuntimeOption { /// Use cpu to inference, the runtime will inference on CPU by default void UseCpu(); - /// Use Nvidia GPU to inference void UseGpu(int gpu_id = 0); - + /// Use RKNPU2 e.g RK3588/RK356X to inference void UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name = fastdeploy::rknpu2::CpuName::RK3588, fastdeploy::rknpu2::CoreMask rknpu2_core = fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_0); - - /// Use TimVX to inference + /// Use TimVX e.g RV1126/A311D to inference void UseTimVX(); - /// Use Huawei Ascend to inference void UseAscend(); - - /// + /// Use Sophgo to inference + void UseSophgo(); /// \brief Turn on KunlunXin XPU. /// /// \param kunlunxin_id the KunlunXin XPU card to use (default is 0). @@ -105,227 +103,25 @@ struct FASTDEPLOY_DECL RuntimeOption { bool adaptive_seqlen = false, bool enable_multi_stream = false); - /// Use Sophgo to inference - void UseSophgo(); - void SetExternalStream(void* external_stream); - /* * @brief Set number of cpu threads while inference on CPU, by default it will decided by the different backends */ void SetCpuThreadNum(int thread_num); - - /// Set ORT graph opt level, default is decide by ONNX Runtime itself - void SetOrtGraphOptLevel(int level = -1); - /// Set Paddle Inference as inference backend, support CPU/GPU - void UsePaddleBackend(); - - /// Wrapper function of UsePaddleBackend() void UsePaddleInferBackend() { return UsePaddleBackend(); } - /// Set ONNX Runtime as inference backend, support CPU/GPU void UseOrtBackend(); - - /// Set SOPHGO Runtime as inference backend, support CPU/GPU + /// Set SOPHGO Runtime as inference backend, support SOPHGO void UseSophgoBackend(); - /// Set TensorRT as inference backend, only support GPU void UseTrtBackend(); - /// Set Poros backend as inference backend, support CPU/GPU void UsePorosBackend(); - /// Set OpenVINO as inference backend, only support CPU void UseOpenVINOBackend(); - /// Set Paddle Lite as inference backend, only support arm cpu - void UseLiteBackend(); - - /// Wrapper function of UseLiteBackend() void UsePaddleLiteBackend() { return UseLiteBackend(); } - - /// Set mkldnn switch while using Paddle Inference as inference backend - void SetPaddleMKLDNN(bool pd_mkldnn = true); - - /* - * @brief If TensorRT backend is used, EnablePaddleToTrt will change to use Paddle Inference backend, and use its integrated TensorRT instead. 
- */ - void EnablePaddleToTrt(); - - /** - * @brief Delete pass by name while using Paddle Inference as inference backend, this can be called multiple times to delete a set of passes - */ - void DeletePaddleBackendPass(const std::string& delete_pass_name); - - /** - * @brief Enable print debug information while using Paddle Inference as inference backend, the backend disable the debug information by default - */ - void EnablePaddleLogInfo(); - - /** - * @brief Disable print debug information while using Paddle Inference as inference backend - */ - void DisablePaddleLogInfo(); - - /** - * @brief Set shape cache size while using Paddle Inference with mkldnn, by default it will cache all the difference shape - */ - void SetPaddleMKLDNNCacheSize(int size); - - /** - * @brief Set device name for OpenVINO, default 'CPU', can also be 'AUTO', 'GPU', 'GPU.1'.... - */ - void SetOpenVINODevice(const std::string& name = "CPU"); - - /** - * @brief Set shape info for OpenVINO - */ - void SetOpenVINOShapeInfo( - const std::map>& shape_info) { - ov_shape_infos = shape_info; - } - - /** - * @brief While use OpenVINO backend with intel GPU, use this interface to specify operators run on CPU - */ - void SetOpenVINOCpuOperators(const std::vector& operators) { - ov_cpu_operators = operators; - } - - /** - * @brief Set optimzed model dir for Paddle Lite backend. - */ - void SetLiteOptimizedModelDir(const std::string& optimized_model_dir); - - /** - * @brief Set subgraph partition path for Paddle Lite backend. - */ - void SetLiteSubgraphPartitionPath( - const std::string& nnadapter_subgraph_partition_config_path); - - /** - * @brief Set subgraph partition path for Paddle Lite backend. - */ - void SetLiteSubgraphPartitionConfigBuffer( - const std::string& nnadapter_subgraph_partition_config_buffer); - - /** - * @brief Set device name for Paddle Lite backend. - */ - void - SetLiteDeviceNames(const std::vector& nnadapter_device_names); - - /** - * @brief Set context properties for Paddle Lite backend. - */ - void - SetLiteContextProperties(const std::string& nnadapter_context_properties); - - /** - * @brief Set model cache dir for Paddle Lite backend. - */ - void SetLiteModelCacheDir(const std::string& nnadapter_model_cache_dir); - - /** - * @brief Set dynamic shape info for Paddle Lite backend. - */ - void SetLiteDynamicShapeInfo( - const std::map>>& - nnadapter_dynamic_shape_info); - - /** - * @brief Set mixed precision quantization config path for Paddle Lite backend. 
- */ - void SetLiteMixedPrecisionQuantizationConfigPath( - const std::string& nnadapter_mixed_precision_quantization_config_path); - - /** - * @brief enable half precision while use paddle lite backend - */ - void EnableLiteFP16(); - - /** - * @brief disable half precision, change to full precision(float32) - */ - void DisableLiteFP16(); - - /** - * @brief enable int8 precision while use paddle lite backend - */ - void EnableLiteInt8(); - - /** - * @brief disable int8 precision, change to full precision(float32) - */ - void DisableLiteInt8(); - - /** - * @brief Set power mode while using Paddle Lite as inference backend, mode(0: LITE_POWER_HIGH; 1: LITE_POWER_LOW; 2: LITE_POWER_FULL; 3: LITE_POWER_NO_BIND, 4: LITE_POWER_RAND_HIGH; 5: LITE_POWER_RAND_LOW, refer [paddle lite](https://paddle-lite.readthedocs.io/zh/latest/api_reference/cxx_api_doc.html#set-power-mode) for more details) - */ - void SetLitePowerMode(LitePowerMode mode); - - /** \brief Set shape range of input tensor for the model that contain dynamic input shape while using TensorRT backend - * - * \param[in] input_name The name of input for the model which is dynamic shape - * \param[in] min_shape The minimal shape for the input tensor - * \param[in] opt_shape The optimized shape for the input tensor, just set the most common shape, if set as default value, it will keep same with min_shape - * \param[in] max_shape The maximum shape for the input tensor, if set as default value, it will keep same with min_shape - */ - void SetTrtInputShape( - const std::string& input_name, const std::vector& min_shape, - const std::vector& opt_shape = std::vector(), - const std::vector& max_shape = std::vector()); - - /// Set max_workspace_size for TensorRT, default 1<<30 - void SetTrtMaxWorkspaceSize(size_t trt_max_workspace_size); - - /// Set max_batch_size for TensorRT, default 32 - void SetTrtMaxBatchSize(size_t max_batch_size); - - /** - * @brief Enable FP16 inference while using TensorRT backend. Notice: not all the GPU device support FP16, on those device doesn't support FP16, FastDeploy will fallback to FP32 automaticly - */ - void EnableTrtFP16(); - - /// Disable FP16 inference while using TensorRT backend - void DisableTrtFP16(); - - /** - * @brief Set cache file path while use TensorRT backend. Loadding a Paddle/ONNX model and initialize TensorRT will take a long time, by this interface it will save the tensorrt engine to `cache_file_path`, and load it directly while execute the code again - */ - void SetTrtCacheFile(const std::string& cache_file_path); - - /** - * @brief Enable pinned memory. Pinned memory can be utilized to speedup the data transfer between CPU and GPU. Currently it's only suppurted in TRT backend and Paddle Inference backend. - */ - void EnablePinnedMemory(); - - /** - * @brief Disable pinned memory - */ - void DisablePinnedMemory(); - - /** - * @brief Enable to collect shape in paddle trt backend - */ - void EnablePaddleTrtCollectShape(); - - /** - * @brief Disable to collect shape in paddle trt backend - */ - void DisablePaddleTrtCollectShape(); - - /** - * @brief Prevent ops running in paddle trt backend - */ - void DisablePaddleTrtOPs(const std::vector& ops); - - /* - * @brief Set number of streams by the OpenVINO backends - */ - void SetOpenVINOStreams(int num_streams); - /** \Use Graphcore IPU to inference. * * \param[in] device_num the number of IPUs. 
@@ -336,93 +132,123 @@ struct FASTDEPLOY_DECL RuntimeOption { void UseIpu(int device_num = 1, int micro_batch_size = 1, bool enable_pipelining = false, int batches_per_step = 1); - /** \brief Set IPU config. - * - * \param[in] enable_fp16 enable fp16. - * \param[in] replica_num the number of graph replication. - * \param[in] available_memory_proportion the available memory proportion for matmul/conv. - * \param[in] enable_half_partial enable fp16 partial for matmul, only work with fp16. - */ - void SetIpuConfig(bool enable_fp16 = false, int replica_num = 1, - float available_memory_proportion = 1.0, - bool enable_half_partial = false); - - Backend backend = Backend::UNKNOWN; - - // for cpu inference - // default will let the backend choose their own default value - int cpu_thread_num = -1; - int device_id = 0; - - Device device = Device::CPU; - - void* external_stream_ = nullptr; - - bool enable_pinned_memory = false; - + /// Option to configure ONNX Runtime backend OrtBackendOption ort_option; - - // ======Only for Paddle Backend===== - bool pd_enable_mkldnn = true; - bool pd_enable_log_info = false; - bool pd_enable_trt = false; - bool pd_collect_shape = false; - int pd_mkldnn_cache_size = 1; - std::vector pd_delete_pass_names; - - // ======Only for Paddle IPU Backend ======= - int ipu_device_num = 1; - int ipu_micro_batch_size = 1; - bool ipu_enable_pipelining = false; - int ipu_batches_per_step = 1; - bool ipu_enable_fp16 = false; - int ipu_replica_num = 1; - float ipu_available_memory_proportion = 1.0; - bool ipu_enable_half_partial = false; - - // ======Only for Trt Backend======= - std::map> trt_max_shape; - std::map> trt_min_shape; - std::map> trt_opt_shape; - std::string trt_serialize_file = ""; - bool trt_enable_fp16 = false; - bool trt_enable_int8 = false; - size_t trt_max_batch_size = 1; - size_t trt_max_workspace_size = 1 << 30; - // ======Only for PaddleTrt Backend======= - std::vector trt_disabled_ops_{}; - - // ======Only for Poros Backend======= - bool is_dynamic = false; - bool long_to_int = true; - bool use_nvidia_tf32 = false; - int unconst_ops_thres = -1; - std::string poros_file = ""; - - // ======Only for OpenVINO Backend======= - int ov_num_streams = 0; - std::string openvino_device = "CPU"; - std::map> ov_shape_infos; - std::vector ov_cpu_operators; - - // ======Only for RKNPU2 Backend======= - fastdeploy::rknpu2::CpuName rknpu2_cpu_name_ = - fastdeploy::rknpu2::CpuName::RK3588; - fastdeploy::rknpu2::CoreMask rknpu2_core_mask_ = - fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO; - - + /// Option to configure TensorRT backend + TrtBackendOption trt_option; + /// Option to configure Paddle Inference backend + PaddleBackendOption paddle_infer_option; + /// Option to configure Poros backend + PorosBackendOption poros_option; + /// Option to configure OpenVINO backend + OpenVINOBackendOption openvino_option; /// Option to configure Paddle Lite backend LiteBackendOption paddle_lite_option; + /** \brief Set the profile mode as 'true'. + * + * \param[in] inclue_h2d_d2h Whether to include time of H2D_D2H for time of runtime. + * \param[in] repeat Repeat times for runtime inference. + * \param[in] warmup Warmup times for runtime inference. + */ + void EnableProfiling(bool inclue_h2d_d2h = false, + int repeat = 100, int warmup = 50) { + benchmark_option.enable_profile = true; + benchmark_option.warmup = warmup; + benchmark_option.repeats = repeat; + benchmark_option.include_h2d_d2h = inclue_h2d_d2h; + } + + /** \brief Set the profile mode as 'false'. 
+ */ + void DisableProfiling() { + benchmark_option.enable_profile = false; + } + + + /// Benchmark option + benchmark::BenchmarkOption benchmark_option; + // If model_from_memory is true, the model_file and params_file is // binary stream in memory; // Otherwise, the model_file and params_file means the path of file std::string model_file = ""; std::string params_file = ""; bool model_from_memory_ = false; - // format of input model + /// format of input model ModelFormat model_format = ModelFormat::PADDLE; + + // for cpu inference + // default will let the backend choose their own default value + int cpu_thread_num = -1; + int device_id = 0; + Backend backend = Backend::UNKNOWN; + + Device device = Device::CPU; + + void* external_stream_ = nullptr; + + bool enable_pinned_memory = false; + + // ======Only for RKNPU2 Backend======= + fastdeploy::rknpu2::CpuName rknpu2_cpu_name_ = + fastdeploy::rknpu2::CpuName::RK3588; + fastdeploy::rknpu2::CoreMask rknpu2_core_mask_ = + fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO; + + // *** The belowing api are deprecated, will be removed in v1.2.0 + // *** Do not use it anymore + + void SetPaddleMKLDNN(bool pd_mkldnn = true); + void EnablePaddleToTrt(); + void DeletePaddleBackendPass(const std::string& delete_pass_name); + void EnablePaddleLogInfo(); + void DisablePaddleLogInfo(); + void SetPaddleMKLDNNCacheSize(int size); + void SetOpenVINODevice(const std::string& name = "CPU"); + void SetOpenVINOShapeInfo( + const std::map>& shape_info) { + openvino_option.shape_infos = shape_info; + } + void SetOpenVINOCpuOperators(const std::vector& operators) { + openvino_option.SetCpuOperators(operators); + } + void SetLiteOptimizedModelDir(const std::string& optimized_model_dir); + void SetLiteSubgraphPartitionPath( + const std::string& nnadapter_subgraph_partition_config_path); + void SetLiteSubgraphPartitionConfigBuffer( + const std::string& nnadapter_subgraph_partition_config_buffer); + void + SetLiteContextProperties(const std::string& nnadapter_context_properties); + void SetLiteModelCacheDir(const std::string& nnadapter_model_cache_dir); + void SetLiteDynamicShapeInfo( + const std::map>>& + nnadapter_dynamic_shape_info); + void SetLiteMixedPrecisionQuantizationConfigPath( + const std::string& nnadapter_mixed_precision_quantization_config_path); + void EnableLiteFP16(); + void DisableLiteFP16(); + void EnableLiteInt8(); + void DisableLiteInt8(); + void SetLitePowerMode(LitePowerMode mode); + void SetTrtInputShape( + const std::string& input_name, const std::vector& min_shape, + const std::vector& opt_shape = std::vector(), + const std::vector& max_shape = std::vector()); + void SetTrtMaxWorkspaceSize(size_t trt_max_workspace_size); + void SetTrtMaxBatchSize(size_t max_batch_size); + void EnableTrtFP16(); + void DisableTrtFP16(); + void SetTrtCacheFile(const std::string& cache_file_path); + void EnablePinnedMemory(); + void DisablePinnedMemory(); + void EnablePaddleTrtCollectShape(); + void DisablePaddleTrtCollectShape(); + void DisablePaddleTrtOPs(const std::vector& ops); + void SetOpenVINOStreams(int num_streams); + void SetOrtGraphOptLevel(int level = -1); + void UsePaddleBackend(); + void UseLiteBackend(); }; } // namespace fastdeploy diff --git a/fastdeploy/vision.h b/fastdeploy/vision.h old mode 100755 new mode 100644 index 8788e889d..9051c66fc --- a/fastdeploy/vision.h +++ b/fastdeploy/vision.h @@ -16,7 +16,7 @@ #include "fastdeploy/core/config.h" #ifdef ENABLE_VISION #include "fastdeploy/vision/classification/contrib/resnet.h" -#include 
"fastdeploy/vision/classification/contrib/yolov5cls.h" +#include "fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls.h" #include "fastdeploy/vision/classification/ppcls/model.h" #include "fastdeploy/vision/detection/contrib/nanodet_plus.h" #include "fastdeploy/vision/detection/contrib/scaledyolov4.h" @@ -41,6 +41,8 @@ #include "fastdeploy/vision/facedet/contrib/ultraface.h" #include "fastdeploy/vision/facedet/contrib/yolov5face.h" #include "fastdeploy/vision/facedet/contrib/yolov7face/yolov7face.h" +#include "fastdeploy/vision/facedet/contrib/centerface/centerface.h" +#include "fastdeploy/vision/facedet/ppdet/blazeface/blazeface.h" #include "fastdeploy/vision/faceid/contrib/insightface/model.h" #include "fastdeploy/vision/faceid/contrib/adaface/adaface.h" #include "fastdeploy/vision/headpose/contrib/fsanet.h" diff --git a/fastdeploy/vision/classification/contrib/yolov5cls.cc b/fastdeploy/vision/classification/contrib/yolov5cls.cc deleted file mode 100755 index 8dfc0a9a0..000000000 --- a/fastdeploy/vision/classification/contrib/yolov5cls.cc +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "fastdeploy/vision/classification/contrib/yolov5cls.h" - -#include "fastdeploy/utils/perf.h" -#include "fastdeploy/vision/utils/utils.h" - -namespace fastdeploy { -namespace vision { -namespace classification { - -YOLOv5Cls::YOLOv5Cls(const std::string& model_file, - const std::string& params_file, - const RuntimeOption& custom_option, - const ModelFormat& model_format) { - if (model_format == ModelFormat::ONNX) { - valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; - valid_gpu_backends = {Backend::ORT, Backend::TRT}; - } else { - valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; - valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; - } - runtime_option = custom_option; - runtime_option.model_format = model_format; - runtime_option.model_file = model_file; - runtime_option.params_file = params_file; - initialized = Initialize(); -} - -bool YOLOv5Cls::Initialize() { - // preprocess parameters - size = {224, 224}; - if (!InitRuntime()) { - FDERROR << "Failed to initialize fastdeploy backend." 
<< std::endl; - return false; - } - return true; -} - -bool YOLOv5Cls::Preprocess(Mat* mat, FDTensor* output, - const std::vector& size) { - // CenterCrop - int crop_size = std::min(mat->Height(), mat->Width()); - CenterCrop::Run(mat, crop_size, crop_size); - Resize::Run(mat, size[0], size[1], -1, -1, cv::INTER_LINEAR); - // Normalize - BGR2RGB::Run(mat); - std::vector alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; - std::vector beta = {0.0f, 0.0f, 0.0f}; - Convert::Run(mat, alpha, beta); - std::vector mean = {0.485f, 0.456f, 0.406f}; - std::vector std = {0.229f, 0.224f, 0.225f}; - Normalize::Run(mat, mean, std, false); - HWC2CHW::Run(mat); - Cast::Run(mat, "float"); - - mat->ShareWithTensor(output); - output->shape.insert(output->shape.begin(), 1); - return true; -} - -bool YOLOv5Cls::Postprocess(const FDTensor& infer_result, - ClassifyResult* result, int topk) { - // Softmax - FDTensor infer_result_softmax; - function::Softmax(infer_result, &infer_result_softmax, 1); - int num_classes = infer_result_softmax.shape[1]; - const float* infer_result_buffer = - reinterpret_cast(infer_result_softmax.Data()); - topk = std::min(num_classes, topk); - result->label_ids = - utils::TopKIndices(infer_result_buffer, num_classes, topk); - result->scores.resize(topk); - for (int i = 0; i < topk; ++i) { - result->scores[i] = *(infer_result_buffer + result->label_ids[i]); - } - return true; -} - -bool YOLOv5Cls::Predict(cv::Mat* im, ClassifyResult* result, int topk) { - Mat mat(*im); - std::vector input_tensors(1); - if (!Preprocess(&mat, &input_tensors[0], size)) { - FDERROR << "Failed to preprocess input image." << std::endl; - return false; - } - - input_tensors[0].name = InputInfoOfRuntime(0).name; - std::vector output_tensors(1); - if (!Infer(input_tensors, &output_tensors)) { - FDERROR << "Failed to inference." << std::endl; - return false; - } - - if (!Postprocess(output_tensors[0], result, topk)) { - FDERROR << "Failed to post process." << std::endl; - return false; - } - return true; -} - -} // namespace classification -} // namespace vision -} // namespace fastdeploy diff --git a/fastdeploy/vision/classification/contrib/yolov5cls.h b/fastdeploy/vision/classification/contrib/yolov5cls.h deleted file mode 100755 index bbf93e9e4..000000000 --- a/fastdeploy/vision/classification/contrib/yolov5cls.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "fastdeploy/fastdeploy_model.h" -#include "fastdeploy/vision/common/processors/transform.h" -#include "fastdeploy/vision/common/result.h" - -namespace fastdeploy { -namespace vision { -/** \brief All image classification model APIs are defined inside this namespace - * - */ -namespace classification { - -/*! 
@brief YOLOv5Cls model object used when to load a YOLOv5Cls model exported by YOLOv5 - */ -class FASTDEPLOY_DECL YOLOv5Cls : public FastDeployModel { - public: - /** \brief Set path of model file and configuration file, and the configuration of runtime - * - * \param[in] model_file Path of model file, e.g yolov5cls/yolov5n-cls.onnx - * \param[in] params_file Path of parameter file, if the model format is ONNX, this parameter will be ignored - * \param[in] custom_option RuntimeOption for inference, the default will use cpu, and choose the backend defined in `valid_cpu_backends` - * \param[in] model_format Model format of the loaded model, default is ONNX format - */ - YOLOv5Cls(const std::string& model_file, const std::string& params_file = "", - const RuntimeOption& custom_option = RuntimeOption(), - const ModelFormat& model_format = ModelFormat::ONNX); - - /// Get model's name - virtual std::string ModelName() const { return "yolov5cls"; } - - /** \brief Predict the classification result for an input image - * - * \param[in] im The input image data, comes from cv::imread(), is a 3-D array with layout HWC, BGR format - * \param[in] result The output classification result will be writen to this structure - * \param[in] topk Returns the topk classification result with the highest predicted probability, the default is 1 - * \return true if the prediction successed, otherwise false - */ - virtual bool Predict(cv::Mat* im, ClassifyResult* result, int topk = 1); - - /// Preprocess image size, the default is (224, 224) - std::vector size; - - private: - bool Initialize(); - /// Preprocess an input image, and set the preprocessed results to `outputs` - bool Preprocess(Mat* mat, FDTensor* output, - const std::vector& size = {224, 224}); - - /// Postprocess the inferenced results, and set the final result to `result` - bool Postprocess(const FDTensor& infer_result, ClassifyResult* result, - int topk = 1); -}; - -} // namespace classification -} // namespace vision -} // namespace fastdeploy diff --git a/fastdeploy/vision/classification/contrib/yolov5cls/postprocessor.cc b/fastdeploy/vision/classification/contrib/yolov5cls/postprocessor.cc new file mode 100644 index 000000000..f4c40cfc8 --- /dev/null +++ b/fastdeploy/vision/classification/contrib/yolov5cls/postprocessor.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
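The postprocessor added below runs a softmax over the class logits and then keeps the top-k indices and scores for each image in the batch. A minimal standalone sketch of that selection step, using plain std::vector instead of the FDTensor and utils::TopKIndices helpers the real code relies on (illustrative only, not part of the patch):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

// Softmax over raw logits, then return the indices of the k highest scores.
std::vector<int> TopKAfterSoftmax(const std::vector<float>& logits, int k,
                                  std::vector<float>* topk_scores) {
  // Softmax with the usual max-subtraction for numerical stability.
  float max_logit = *std::max_element(logits.begin(), logits.end());
  std::vector<float> probs(logits.size());
  float sum = 0.0f;
  for (size_t i = 0; i < logits.size(); ++i) {
    probs[i] = std::exp(logits[i] - max_logit);
    sum += probs[i];
  }
  for (auto& p : probs) p /= sum;

  // Keep only the k best class indices, sorted by descending probability.
  k = std::min(k, static_cast<int>(probs.size()));
  std::vector<int> indices(probs.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&probs](int a, int b) { return probs[a] > probs[b]; });
  indices.resize(k);

  topk_scores->clear();
  for (int idx : indices) topk_scores->push_back(probs[idx]);
  return indices;
}

int main() {
  std::vector<float> logits = {0.1f, 2.3f, -1.0f, 0.7f};
  std::vector<float> scores;
  std::vector<int> ids = TopKAfterSoftmax(logits, 2, &scores);
  std::cout << "top-1 class: " << ids[0] << " score: " << scores[0] << std::endl;
  return 0;
}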
+ +#include "fastdeploy/vision/classification/contrib/yolov5cls/postprocessor.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace classification { + +YOLOv5ClsPostprocessor::YOLOv5ClsPostprocessor() { + topk_ = 1; +} + +bool YOLOv5ClsPostprocessor::Run( + const std::vector &tensors, std::vector *results, + const std::vector>> &ims_info) { + int batch = tensors[0].shape[0]; + FDTensor infer_result = tensors[0]; + FDTensor infer_result_softmax; + function::Softmax(infer_result, &infer_result_softmax, 1); + results->resize(batch); + + for (size_t bs = 0; bs < batch; ++bs) { + (*results)[bs].Clear(); + // output (1,1000) score classnum 1000 + int num_classes = infer_result_softmax.shape[1]; + const float* infer_result_buffer = + reinterpret_cast(infer_result_softmax.Data()) + bs * infer_result_softmax.shape[1]; + topk_ = std::min(num_classes, topk_); + (*results)[bs].label_ids = + utils::TopKIndices(infer_result_buffer, num_classes, topk_); + (*results)[bs].scores.resize(topk_); + for (int i = 0; i < topk_; ++i) { + (*results)[bs].scores[i] = *(infer_result_buffer + (*results)[bs].label_ids[i]); + } + + if ((*results)[bs].label_ids.size() == 0) { + return true; + } + } + return true; +} + +} // namespace classification +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/classification/contrib/yolov5cls/postprocessor.h b/fastdeploy/vision/classification/contrib/yolov5cls/postprocessor.h new file mode 100644 index 000000000..8fed59617 --- /dev/null +++ b/fastdeploy/vision/classification/contrib/yolov5cls/postprocessor.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { +namespace vision { + +namespace classification { +/*! @brief Postprocessor object for YOLOv5Cls serials model. 
+ */ +class FASTDEPLOY_DECL YOLOv5ClsPostprocessor { + public: + /** \brief Create a postprocessor instance for YOLOv5Cls serials model + */ + YOLOv5ClsPostprocessor(); + + /** \brief Process the result of runtime and fill to ClassifyResult structure + * + * \param[in] tensors The inference result from runtime + * \param[in] result The output result of classification + * \param[in] ims_info The shape info list, record input_shape and output_shape + * \return true if the postprocess successed, otherwise false + */ + bool Run(const std::vector& tensors, + std::vector* results, + const std::vector>>& ims_info); + + /// Set topk, default 1 + void SetTopK(const int& topk) { + topk_ = topk; + } + + /// Get topk, default 1 + float GetTopK() const { return topk_; } + + protected: + int topk_; +}; + +} // namespace classification +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.cc b/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.cc new file mode 100644 index 000000000..e252ba0ee --- /dev/null +++ b/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.h" +#include "fastdeploy/function/concat.h" + +namespace fastdeploy { +namespace vision { +namespace classification { + +YOLOv5ClsPreprocessor::YOLOv5ClsPreprocessor() { + size_ = {224, 224}; //{h,w} +} + +bool YOLOv5ClsPreprocessor::Preprocess(FDMat* mat, FDTensor* output, + std::map>* im_info) { + // Record the shape of image and the shape of preprocessed image + (*im_info)["input_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + // process after image load + double ratio = (size_[0] * 1.0) / std::max(static_cast(mat->Height()), + static_cast(mat->Width())); + + // yolov5cls's preprocess steps + // 1. CenterCrop + // 2. Normalize + // CenterCrop + int crop_size = std::min(mat->Height(), mat->Width()); + CenterCrop::Run(mat, crop_size, crop_size); + Resize::Run(mat, size_[0], size_[1], -1, -1, cv::INTER_LINEAR); + // Normalize + BGR2RGB::Run(mat); + std::vector alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; + std::vector beta = {0.0f, 0.0f, 0.0f}; + Convert::Run(mat, alpha, beta); + std::vector mean = {0.485f, 0.456f, 0.406f}; + std::vector std = {0.229f, 0.224f, 0.225f}; + NormalizeAndPermute::Run(mat, mean, std, false); + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + mat->ShareWithTensor(output); + output->ExpandDim(0); // reshape to n, h, w, c + return true; +} + +bool YOLOv5ClsPreprocessor::Run(std::vector* images, std::vector* outputs, + std::vector>>* ims_info) { + if (images->size() == 0) { + FDERROR << "The size of input images should be greater than 0." 
<< std::endl; + return false; + } + ims_info->resize(images->size()); + outputs->resize(1); + // Concat all the preprocessed data to a batch tensor + std::vector tensors(images->size()); + for (size_t i = 0; i < images->size(); ++i) { + if (!Preprocess(&(*images)[i], &tensors[i], &(*ims_info)[i])) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + } + + if (tensors.size() == 1) { + (*outputs)[0] = std::move(tensors[0]); + } else { + function::Concat(tensors, &((*outputs)[0]), 0); + } + return true; +} + +} // namespace classification +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.h b/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.h new file mode 100644 index 000000000..a075df613 --- /dev/null +++ b/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { +namespace vision { + +namespace classification { +/*! @brief Preprocessor object for YOLOv5Cls serials model. + */ +class FASTDEPLOY_DECL YOLOv5ClsPreprocessor { + public: + /** \brief Create a preprocessor instance for YOLOv5Cls serials model + */ + YOLOv5ClsPreprocessor(); + + /** \brief Process the input image and prepare input tensors for runtime + * + * \param[in] images The input image data list, all the elements are returned by cv::imread() + * \param[in] outputs The output tensors which will feed in runtime + * \param[in] ims_info The shape info list, record input_shape and output_shape + * \return true if the preprocess successed, otherwise false + */ + bool Run(std::vector* images, std::vector* outputs, + std::vector>>* ims_info); + + /// Set target size, tuple of (width, height), default size = {224, 224} + void SetSize(const std::vector& size) { size_ = size; } + + /// Get target size, tuple of (width, height), default size = {224, 224} + std::vector GetSize() const { return size_; } + + protected: + bool Preprocess(FDMat* mat, FDTensor* output, + std::map>* im_info); + + // target size, tuple of (width, height), default size = {224, 224} + std::vector size_; +}; + +} // namespace classification +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls.cc b/fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls.cc new file mode 100755 index 000000000..84cb8d7b5 --- /dev/null +++ b/fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace classification { + +YOLOv5Cls::YOLOv5Cls(const std::string& model_file, const std::string& params_file, + const RuntimeOption& custom_option, + const ModelFormat& model_format) { + if (model_format == ModelFormat::ONNX) { + valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; + valid_gpu_backends = {Backend::ORT, Backend::TRT}; + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool YOLOv5Cls::Initialize() { + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + return true; +} + +bool YOLOv5Cls::Predict(const cv::Mat& im, ClassifyResult* result) { + std::vector results; + if (!BatchPredict({im}, &results)) { + return false; + } + *result = std::move(results[0]); + return true; +} + +bool YOLOv5Cls::BatchPredict(const std::vector& images, std::vector* results) { + std::vector>> ims_info; + std::vector fd_images = WrapMat(images); + + if (!preprocessor_.Run(&fd_images, &reused_input_tensors_, &ims_info)) { + FDERROR << "Failed to preprocess the input image." << std::endl; + return false; + } + + reused_input_tensors_[0].name = InputInfoOfRuntime(0).name; + if (!Infer(reused_input_tensors_, &reused_output_tensors_)) { + FDERROR << "Failed to inference by runtime." << std::endl; + return false; + } + + if (!postprocessor_.Run(reused_output_tensors_, results, ims_info)) { + FDERROR << "Failed to postprocess the inference results by runtime." << std::endl; + return false; + } + return true; +} + +} // namespace classification +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls.h b/fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls.h new file mode 100755 index 000000000..5eab4eeeb --- /dev/null +++ b/fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. //NOLINT +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
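The new yolov5cls.cc above implements the refactored model class: Predict now takes a const cv::Mat& and BatchPredict handles several images in one call; the header declaring both follows. A rough end-to-end usage sketch, where the model path, image path and the CPU backend choice are placeholders rather than part of this change:

#include <iostream>
#include <vector>
#include "fastdeploy/vision.h"
#include "opencv2/opencv.hpp"

int main() {
  // Placeholder paths, for illustration only.
  fastdeploy::RuntimeOption option;
  option.UseCpu();
  fastdeploy::vision::classification::YOLOv5Cls model(
      "yolov5n-cls.onnx", "", option, fastdeploy::ModelFormat::ONNX);
  if (!model.Initialized()) {
    std::cerr << "Failed to initialize YOLOv5Cls." << std::endl;
    return -1;
  }

  cv::Mat im = cv::imread("test.jpg");
  fastdeploy::vision::ClassifyResult res;
  if (!model.Predict(im, &res)) {
    std::cerr << "Prediction failed." << std::endl;
    return -1;
  }
  std::cout << res.Str() << std::endl;

  // BatchPredict runs the whole pipeline once for several images.
  std::vector<cv::Mat> ims = {im, im};
  std::vector<fastdeploy::vision::ClassifyResult> results;
  model.BatchPredict(ims, &results);
  return 0;
}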
+ +#pragma once + +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.h" +#include "fastdeploy/vision/classification/contrib/yolov5cls/postprocessor.h" + +namespace fastdeploy { +namespace vision { +namespace classification { +/*! @brief YOLOv5Cls model object used when to load a YOLOv5Cls model exported by YOLOv5Cls. + */ +class FASTDEPLOY_DECL YOLOv5Cls : public FastDeployModel { + public: + /** \brief Set path of model file and the configuration of runtime. + * + * \param[in] model_file Path of model file, e.g ./yolov5cls.onnx + * \param[in] params_file Path of parameter file, e.g ppyoloe/model.pdiparams, if the model format is ONNX, this parameter will be ignored + * \param[in] custom_option RuntimeOption for inference, the default will use cpu, and choose the backend defined in "valid_cpu_backends" + * \param[in] model_format Model format of the loaded model, default is ONNX format + */ + YOLOv5Cls(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX); + + std::string ModelName() const { return "yolov5cls"; } + + /** \brief Predict the classification result for an input image + * + * \param[in] img The input image data, comes from cv::imread(), is a 3-D array with layout HWC, BGR format + * \param[in] result The output classification result will be writen to this structure + * \return true if the prediction successed, otherwise false + */ + virtual bool Predict(const cv::Mat& img, ClassifyResult* result); + + /** \brief Predict the classification results for a batch of input images + * + * \param[in] imgs, The input image list, each element comes from cv::imread() + * \param[in] results The output classification result list + * \return true if the prediction successed, otherwise false + */ + virtual bool BatchPredict(const std::vector& imgs, + std::vector* results); + + /// Get preprocessor reference of YOLOv5Cls + virtual YOLOv5ClsPreprocessor& GetPreprocessor() { + return preprocessor_; + } + + /// Get postprocessor reference of YOLOv5Cls + virtual YOLOv5ClsPostprocessor& GetPostprocessor() { + return postprocessor_; + } + + protected: + bool Initialize(); + YOLOv5ClsPreprocessor preprocessor_; + YOLOv5ClsPostprocessor postprocessor_; +}; + +} // namespace classification +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls_pybind.cc b/fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls_pybind.cc new file mode 100755 index 000000000..99e277d1f --- /dev/null +++ b/fastdeploy/vision/classification/contrib/yolov5cls/yolov5cls_pybind.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
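With preprocessing and postprocessing exposed through GetPreprocessor()/GetPostprocessor(), the topk argument that the removed Predict overload used to take moves onto the postprocessor. A small sketch of the equivalent call pattern, assuming a model and image set up as in the previous sketch:

#include <iostream>
#include "fastdeploy/vision.h"

// Assumes `model` and `im` are constructed as in the previous example.
void ClassifyTop5(fastdeploy::vision::classification::YOLOv5Cls& model,
                  const cv::Mat& im) {
  // Old API (removed above): model.Predict(&mat, &res, /*topk=*/5);
  model.GetPostprocessor().SetTopK(5);
  model.GetPreprocessor().SetSize({224, 224});  // default input size
  fastdeploy::vision::ClassifyResult res;
  if (model.Predict(im, &res)) {
    std::cout << res.Str() << std::endl;
  }
}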
+ +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindYOLOv5Cls(pybind11::module& m) { + pybind11::class_( + m, "YOLOv5ClsPreprocessor") + .def(pybind11::init<>()) + .def("run", [](vision::classification::YOLOv5ClsPreprocessor& self, std::vector& im_list) { + std::vector images; + for (size_t i = 0; i < im_list.size(); ++i) { + images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i]))); + } + std::vector outputs; + std::vector>> ims_info; + if (!self.Run(&images, &outputs, &ims_info)) { + throw std::runtime_error("raise Exception('Failed to preprocess the input data in YOLOv5ClsPreprocessor.')"); + } + for (size_t i = 0; i < outputs.size(); ++i) { + outputs[i].StopSharing(); + } + return make_pair(outputs, ims_info); + }) + .def_property("size", &vision::classification::YOLOv5ClsPreprocessor::GetSize, &vision::classification::YOLOv5ClsPreprocessor::SetSize); + + pybind11::class_( + m, "YOLOv5ClsPostprocessor") + .def(pybind11::init<>()) + .def("run", [](vision::classification::YOLOv5ClsPostprocessor& self, std::vector& inputs, + const std::vector>>& ims_info) { + std::vector results; + if (!self.Run(inputs, &results, ims_info)) { + throw std::runtime_error("raise Exception('Failed to postprocess the runtime result in YOLOv5ClsPostprocessor.')"); + } + return results; + }) + .def("run", [](vision::classification::YOLOv5ClsPostprocessor& self, std::vector& input_array, + const std::vector>>& ims_info) { + std::vector results; + std::vector inputs; + PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true); + if (!self.Run(inputs, &results, ims_info)) { + throw std::runtime_error("raise Exception('Failed to postprocess the runtime result in YOLOv5ClsPostprocessor.')"); + } + return results; + }) + .def_property("topk", &vision::classification::YOLOv5ClsPostprocessor::GetTopK, &vision::classification::YOLOv5ClsPostprocessor::SetTopK); + + pybind11::class_(m, "YOLOv5Cls") + .def(pybind11::init()) + .def("predict", + [](vision::classification::YOLOv5Cls& self, pybind11::array& data) { + auto mat = PyArrayToCvMat(data); + vision::ClassifyResult res; + self.Predict(mat, &res); + return res; + }) + .def("batch_predict", [](vision::classification::YOLOv5Cls& self, std::vector& data) { + std::vector images; + for (size_t i = 0; i < data.size(); ++i) { + images.push_back(PyArrayToCvMat(data[i])); + } + std::vector results; + self.BatchPredict(images, &results); + return results; + }) + .def_property_readonly("preprocessor", &vision::classification::YOLOv5Cls::GetPreprocessor) + .def_property_readonly("postprocessor", &vision::classification::YOLOv5Cls::GetPostprocessor); +} +} // namespace fastdeploy diff --git a/fastdeploy/vision/classification/ppcls/ppcls_pybind.cc b/fastdeploy/vision/classification/ppcls/ppcls_pybind.cc index 514f5ad9d..1d4a24adf 100644 --- a/fastdeploy/vision/classification/ppcls/ppcls_pybind.cc +++ b/fastdeploy/vision/classification/ppcls/ppcls_pybind.cc @@ -15,33 +15,9 @@ namespace fastdeploy { void BindPaddleClas(pybind11::module& m) { - pybind11::class_( - m, "PaddleClasPreprocessor") + pybind11::class_(m, "PaddleClasPreprocessor") .def(pybind11::init()) - .def("run", - [](vision::classification::PaddleClasPreprocessor& self, - std::vector& im_list) { - std::vector images; - for (size_t i = 0; i < im_list.size(); ++i) { - images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i]))); - } - std::vector outputs; - if (!self.Run(&images, &outputs)) { - throw std::runtime_error( - "Failed to preprocess the input data in " - 
"PaddleClasPreprocessor."); - } - if (!self.CudaUsed()) { - for (size_t i = 0; i < outputs.size(); ++i) { - outputs[i].StopSharing(); - } - } - return outputs; - }) - .def("use_cuda", - [](vision::classification::PaddleClasPreprocessor& self, - bool enable_cv_cuda = false, - int gpu_id = -1) { self.UseCuda(enable_cv_cuda, gpu_id); }) .def("disable_normalize", [](vision::classification::PaddleClasPreprocessor& self) { self.DisableNormalize(); @@ -49,6 +25,10 @@ void BindPaddleClas(pybind11::module& m) { .def("disable_permute", [](vision::classification::PaddleClasPreprocessor& self) { self.DisablePermute(); + }) + .def("initial_resize_on_cpu", + [](vision::classification::PaddleClasPreprocessor& self, bool v) { + self.InitialResizeOnCpu(v); }); pybind11::class_( diff --git a/fastdeploy/vision/classification/ppcls/preprocessor.cc b/fastdeploy/vision/classification/ppcls/preprocessor.cc index 90d40e094..ef0da9ce5 100644 --- a/fastdeploy/vision/classification/ppcls/preprocessor.cc +++ b/fastdeploy/vision/classification/ppcls/preprocessor.cc @@ -100,32 +100,23 @@ void PaddleClasPreprocessor::DisablePermute() { } } -bool PaddleClasPreprocessor::Apply(std::vector* images, +bool PaddleClasPreprocessor::Apply(FDMatBatch* image_batch, std::vector* outputs) { - for (size_t i = 0; i < images->size(); ++i) { - for (size_t j = 0; j < processors_.size(); ++j) { - bool ret = false; - ret = (*(processors_[j].get()))(&((*images)[i])); - if (!ret) { - FDERROR << "Failed to processs image:" << i << " in " - << processors_[j]->Name() << "." << std::endl; - return false; - } + for (size_t j = 0; j < processors_.size(); ++j) { + ProcLib lib = ProcLib::DEFAULT; + if (initial_resize_on_cpu_ && j == 0 && + processors_[j]->Name().find("Resize") == 0) { + lib = ProcLib::OPENCV; + } + if (!(*(processors_[j].get()))(image_batch, lib)) { + FDERROR << "Failed to processs image in " << processors_[j]->Name() << "." + << std::endl; + return false; } } outputs->resize(1); - // Concat all the preprocessed data to a batch tensor - std::vector tensors(images->size()); - for (size_t i = 0; i < images->size(); ++i) { - (*images)[i].ShareWithTensor(&(tensors[i])); - tensors[i].ExpandDim(0); - } - if (tensors.size() == 1) { - (*outputs)[0] = std::move(tensors[0]); - } else { - function::Concat(tensors, &((*outputs)[0]), 0); - } + (*outputs)[0] = std::move(*(image_batch->Tensor())); (*outputs)[0].device_id = DeviceId(); return true; } diff --git a/fastdeploy/vision/classification/ppcls/preprocessor.h b/fastdeploy/vision/classification/ppcls/preprocessor.h index 2f2beaddb..fc347fc3d 100644 --- a/fastdeploy/vision/classification/ppcls/preprocessor.h +++ b/fastdeploy/vision/classification/ppcls/preprocessor.h @@ -33,11 +33,11 @@ class FASTDEPLOY_DECL PaddleClasPreprocessor : public ProcessorManager { /** \brief Process the input image and prepare input tensors for runtime * - * \param[in] images The input image data list, all the elements are returned by cv::imread() + * \param[in] image_batch The input image batch * \param[in] outputs The output tensors which will feed in runtime * \return true if the preprocess successed, otherwise false */ - virtual bool Apply(std::vector* images, + virtual bool Apply(FDMatBatch* image_batch, std::vector* outputs); /// This function will disable normalize in preprocessing step. @@ -45,6 +45,14 @@ class FASTDEPLOY_DECL PaddleClasPreprocessor : public ProcessorManager { /// This function will disable hwc2chw in preprocessing step. 
void DisablePermute(); + /** \brief When the initial operator is Resize, and input image size is large, + * maybe it's better to run resize on CPU, because the HostToDevice memcpy + * is time consuming. Set this true to run the initial resize on CPU. + * + * \param[in] v ture or false + */ + void InitialResizeOnCpu(bool v) { initial_resize_on_cpu_ = v; } + private: bool BuildPreprocessPipelineFromConfig(); std::vector> processors_; @@ -54,6 +62,7 @@ class FASTDEPLOY_DECL PaddleClasPreprocessor : public ProcessorManager { bool disable_normalize_ = false; // read config file std::string config_file_; + bool initial_resize_on_cpu_ = false; }; } // namespace classification diff --git a/fastdeploy/vision/common/processors/base.cc b/fastdeploy/vision/common/processors/base.cc index a47cfe378..9c4a0177e 100644 --- a/fastdeploy/vision/common/processors/base.cc +++ b/fastdeploy/vision/common/processors/base.cc @@ -20,7 +20,7 @@ namespace fastdeploy { namespace vision { -bool Processor::operator()(Mat* mat, ProcLib lib) { +bool Processor::operator()(FDMat* mat, ProcLib lib) { ProcLib target = lib; if (lib == ProcLib::DEFAULT) { target = DefaultProcLib::default_lib; @@ -52,39 +52,38 @@ bool Processor::operator()(Mat* mat, ProcLib lib) { return ImplByOpenCV(mat); } -FDTensor* Processor::UpdateAndGetCachedTensor( - const std::vector& new_shape, const FDDataType& data_type, - const std::string& tensor_name, const Device& new_device, - const bool& use_pinned_memory) { - if (cached_tensors_.count(tensor_name) == 0) { - cached_tensors_[tensor_name] = FDTensor(); - } - cached_tensors_[tensor_name].is_pinned_memory = use_pinned_memory; - cached_tensors_[tensor_name].Resize(new_shape, data_type, tensor_name, - new_device); - return &cached_tensors_[tensor_name]; -} - -FDTensor* Processor::CreateCachedGpuInputTensor( - Mat* mat, const std::string& tensor_name) { -#ifdef WITH_GPU - FDTensor* src = mat->Tensor(); - if (src->device == Device::GPU) { - return src; - } else if (src->device == Device::CPU) { - FDTensor* tensor = UpdateAndGetCachedTensor(src->Shape(), src->Dtype(), - tensor_name, Device::GPU); - FDASSERT(cudaMemcpyAsync(tensor->Data(), src->Data(), tensor->Nbytes(), - cudaMemcpyHostToDevice, mat->Stream()) == 0, - "[ERROR] Error occurs while copy memory from CPU to GPU."); - return tensor; - } else { - FDASSERT(false, "FDMat is on unsupported device: %d", src->device); +bool Processor::operator()(FDMatBatch* mat_batch, ProcLib lib) { + ProcLib target = lib; + if (lib == ProcLib::DEFAULT) { + target = DefaultProcLib::default_lib; } + if (target == ProcLib::FLYCV) { +#ifdef ENABLE_FLYCV + return ImplByFlyCV(mat_batch); #else - FDASSERT(false, "FastDeploy didn't compile with WITH_GPU."); + FDASSERT(false, "FastDeploy didn't compile with FlyCV."); #endif - return nullptr; + } else if (target == ProcLib::CUDA) { +#ifdef WITH_GPU + FDASSERT( + mat_batch->Stream() != nullptr, + "CUDA processor requires cuda stream, please set stream for mat_batch"); + return ImplByCuda(mat_batch); +#else + FDASSERT(false, "FastDeploy didn't compile with WITH_GPU."); +#endif + } else if (target == ProcLib::CVCUDA) { +#ifdef ENABLE_CVCUDA + FDASSERT(mat_batch->Stream() != nullptr, + "CV-CUDA processor requires cuda stream, please set stream for " + "mat_batch"); + return ImplByCvCuda(mat_batch); +#else + FDASSERT(false, "FastDeploy didn't compile with CV-CUDA."); +#endif + } + // DEFAULT & OPENCV + return ImplByOpenCV(mat_batch); } void EnableFlyCV() { diff --git a/fastdeploy/vision/common/processors/base.h 
b/fastdeploy/vision/common/processors/base.h index 6fb3a33eb..786e88672 100644 --- a/fastdeploy/vision/common/processors/base.h +++ b/fastdeploy/vision/common/processors/base.h @@ -16,6 +16,7 @@ #include "fastdeploy/utils/utils.h" #include "fastdeploy/vision/common/processors/mat.h" +#include "fastdeploy/vision/common/processors/mat_batch.h" #include "opencv2/highgui/highgui.hpp" #include "opencv2/imgproc/imgproc.hpp" #include @@ -46,46 +47,63 @@ class FASTDEPLOY_DECL Processor { virtual std::string Name() = 0; - virtual bool ImplByOpenCV(Mat* mat) { + virtual bool ImplByOpenCV(FDMat* mat) { FDERROR << Name() << " Not Implement Yet." << std::endl; return false; } - virtual bool ImplByFlyCV(Mat* mat) { + virtual bool ImplByOpenCV(FDMatBatch* mat_batch) { + for (size_t i = 0; i < mat_batch->mats->size(); ++i) { + if (ImplByOpenCV(&(*(mat_batch->mats))[i]) != true) { + return false; + } + } + return true; + } + + virtual bool ImplByFlyCV(FDMat* mat) { return ImplByOpenCV(mat); } - virtual bool ImplByCuda(Mat* mat) { + virtual bool ImplByFlyCV(FDMatBatch* mat_batch) { + for (size_t i = 0; i < mat_batch->mats->size(); ++i) { + if (ImplByFlyCV(&(*(mat_batch->mats))[i]) != true) { + return false; + } + } + return true; + } + + virtual bool ImplByCuda(FDMat* mat) { return ImplByOpenCV(mat); } - virtual bool ImplByCvCuda(Mat* mat) { + virtual bool ImplByCuda(FDMatBatch* mat_batch) { + for (size_t i = 0; i < mat_batch->mats->size(); ++i) { + if (ImplByCuda(&(*(mat_batch->mats))[i]) != true) { + return false; + } + } + return true; + } + + virtual bool ImplByCvCuda(FDMat* mat) { return ImplByOpenCV(mat); } - virtual bool operator()(Mat* mat, ProcLib lib = ProcLib::DEFAULT); + virtual bool ImplByCvCuda(FDMatBatch* mat_batch) { + for (size_t i = 0; i < mat_batch->mats->size(); ++i) { + if (ImplByCvCuda(&(*(mat_batch->mats))[i]) != true) { + return false; + } + } + return true; + } - protected: - // Update and get the cached tensor from the cached_tensors_ map. - // The tensor is indexed by a string. - // If the tensor doesn't exists in the map, then create a new tensor. - // If the tensor exists and shape is getting larger, then realloc the buffer. - // If the tensor exists and shape is not getting larger, then return the - // cached tensor directly. - FDTensor* UpdateAndGetCachedTensor( - const std::vector& new_shape, const FDDataType& data_type, - const std::string& tensor_name, const Device& new_device = Device::CPU, - const bool& use_pinned_memory = false); + virtual bool operator()(FDMat* mat, ProcLib lib = ProcLib::DEFAULT); - // Create an input tensor on GPU and save into cached_tensors_. - // If the Mat is on GPU, return the mat->Tensor() directly. - // If the Mat is on CPU, then create a cached GPU tensor and copy the mat's - // CPU tensor to this new GPU tensor. 
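Illustrative sketch (not part of the patch; the class name is hypothetical): with the FDMatBatch overloads added to Processor above, a custom processor only has to implement the per-FDMat variant, and the inherited batch variants simply loop over mat_batch->mats.

#include "fastdeploy/vision/common/processors/base.h"

class MyGray : public fastdeploy::vision::Processor {
 public:
  std::string Name() { return "MyGray"; }
  bool ImplByOpenCV(fastdeploy::vision::FDMat* mat) {
    cv::Mat* im = mat->GetOpenCVMat();
    cv::cvtColor(*im, *im, cv::COLOR_BGR2GRAY);  // in-place BGR -> gray
    mat->SetChannels(1);  // a real processor would also refresh layout info
    return true;
  }
  // No FDMatBatch override is required: Processor::ImplByOpenCV(FDMatBatch*)
  // applies the FDMat version to every image in the batch.
};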
- FDTensor* CreateCachedGpuInputTensor(Mat* mat, - const std::string& tensor_name); - - private: - std::unordered_map cached_tensors_; + virtual bool operator()(FDMatBatch* mat_batch, + ProcLib lib = ProcLib::DEFAULT); }; } // namespace vision diff --git a/fastdeploy/vision/common/processors/center_crop.cc b/fastdeploy/vision/common/processors/center_crop.cc index bb0c96947..1857f7a81 100644 --- a/fastdeploy/vision/common/processors/center_crop.cc +++ b/fastdeploy/vision/common/processors/center_crop.cc @@ -23,7 +23,7 @@ namespace fastdeploy { namespace vision { -bool CenterCrop::ImplByOpenCV(Mat* mat) { +bool CenterCrop::ImplByOpenCV(FDMat* mat) { cv::Mat* im = mat->GetOpenCVMat(); int height = static_cast(im->rows); int width = static_cast(im->cols); @@ -42,7 +42,7 @@ bool CenterCrop::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool CenterCrop::ImplByFlyCV(Mat* mat) { +bool CenterCrop::ImplByFlyCV(FDMat* mat) { fcv::Mat* im = mat->GetFlyCVMat(); int height = static_cast(im->height()); int width = static_cast(im->width()); @@ -63,18 +63,15 @@ bool CenterCrop::ImplByFlyCV(Mat* mat) { #endif #ifdef ENABLE_CVCUDA -bool CenterCrop::ImplByCvCuda(Mat* mat) { +bool CenterCrop::ImplByCvCuda(FDMat* mat) { // Prepare input tensor - std::string tensor_name = Name() + "_cvcuda_src"; - FDTensor* src = CreateCachedGpuInputTensor(mat, tensor_name); + FDTensor* src = CreateCachedGpuInputTensor(mat); auto src_tensor = CreateCvCudaTensorWrapData(*src); // Prepare output tensor - tensor_name = Name() + "_cvcuda_dst"; - FDTensor* dst = - UpdateAndGetCachedTensor({height_, width_, mat->Channels()}, src->Dtype(), - tensor_name, Device::GPU); - auto dst_tensor = CreateCvCudaTensorWrapData(*dst); + mat->output_cache->Resize({height_, width_, mat->Channels()}, src->Dtype(), + "output_cache", Device::GPU); + auto dst_tensor = CreateCvCudaTensorWrapData(*(mat->output_cache)); int offset_x = static_cast((mat->Width() - width_) / 2); int offset_y = static_cast((mat->Height() - height_) / 2); @@ -82,16 +79,27 @@ bool CenterCrop::ImplByCvCuda(Mat* mat) { NVCVRectI crop_roi = {offset_x, offset_y, width_, height_}; crop_op(mat->Stream(), src_tensor, dst_tensor, crop_roi); - mat->SetTensor(dst); + mat->SetTensor(mat->output_cache); mat->SetWidth(width_); mat->SetHeight(height_); mat->device = Device::GPU; mat->mat_type = ProcLib::CVCUDA; return true; } + +bool CenterCrop::ImplByCvCuda(FDMatBatch* mat_batch) { + for (size_t i = 0; i < mat_batch->mats->size(); ++i) { + if (ImplByCvCuda(&((*(mat_batch->mats))[i])) != true) { + return false; + } + } + mat_batch->device = Device::GPU; + mat_batch->mat_type = ProcLib::CVCUDA; + return true; +} #endif -bool CenterCrop::Run(Mat* mat, const int& width, const int& height, +bool CenterCrop::Run(FDMat* mat, const int& width, const int& height, ProcLib lib) { auto c = CenterCrop(width, height); return c(mat, lib); diff --git a/fastdeploy/vision/common/processors/center_crop.h b/fastdeploy/vision/common/processors/center_crop.h index 7455773f6..3ca3a7391 100644 --- a/fastdeploy/vision/common/processors/center_crop.h +++ b/fastdeploy/vision/common/processors/center_crop.h @@ -22,16 +22,17 @@ namespace vision { class FASTDEPLOY_DECL CenterCrop : public Processor { public: CenterCrop(int width, int height) : height_(height), width_(width) {} - bool ImplByOpenCV(Mat* mat); + bool ImplByOpenCV(FDMat* mat); #ifdef ENABLE_FLYCV - bool ImplByFlyCV(Mat* mat); + bool ImplByFlyCV(FDMat* mat); #endif #ifdef ENABLE_CVCUDA - bool ImplByCvCuda(Mat* mat); + bool ImplByCvCuda(FDMat* mat); + bool 
ImplByCvCuda(FDMatBatch* mat_batch); #endif std::string Name() { return "CenterCrop"; } - static bool Run(Mat* mat, const int& width, const int& height, + static bool Run(FDMat* mat, const int& width, const int& height, ProcLib lib = ProcLib::DEFAULT); private: diff --git a/fastdeploy/vision/common/processors/cvcuda_utils.cc b/fastdeploy/vision/common/processors/cvcuda_utils.cc index 482d0dc69..c7d25361b 100644 --- a/fastdeploy/vision/common/processors/cvcuda_utils.cc +++ b/fastdeploy/vision/common/processors/cvcuda_utils.cc @@ -47,17 +47,19 @@ nvcv::TensorWrapData CreateCvCudaTensorWrapData(const FDTensor& tensor) { "When create CVCUDA tensor from FD tensor," "tensor shape should be 3-Dim, HWC layout"); int batchsize = 1; + int h = tensor.Shape()[0]; + int w = tensor.Shape()[1]; + int c = tensor.Shape()[2]; nvcv::TensorDataStridedCuda::Buffer buf; buf.strides[3] = FDDataTypeSize(tensor.Dtype()); - buf.strides[2] = tensor.shape[2] * buf.strides[3]; - buf.strides[1] = tensor.shape[1] * buf.strides[2]; - buf.strides[0] = tensor.shape[0] * buf.strides[1]; + buf.strides[2] = c * buf.strides[3]; + buf.strides[1] = w * buf.strides[2]; + buf.strides[0] = h * buf.strides[1]; buf.basePtr = reinterpret_cast(const_cast(tensor.Data())); nvcv::Tensor::Requirements req = nvcv::Tensor::CalcRequirements( - batchsize, {tensor.shape[1], tensor.shape[0]}, - CreateCvCudaImageFormat(tensor.Dtype(), tensor.shape[2])); + batchsize, {w, h}, CreateCvCudaImageFormat(tensor.Dtype(), c)); nvcv::TensorDataStridedCuda tensor_data( nvcv::TensorShape{req.shape, req.rank, req.layout}, @@ -70,6 +72,33 @@ void* GetCvCudaTensorDataPtr(const nvcv::TensorWrapData& tensor) { dynamic_cast(tensor.exportData()); return reinterpret_cast(data->basePtr()); } + +nvcv::ImageWrapData CreateImageWrapData(const FDTensor& tensor) { + FDASSERT(tensor.shape.size() == 3, + "When create CVCUDA image from FD tensor," + "tensor shape should be 3-Dim, HWC layout"); + int h = tensor.Shape()[0]; + int w = tensor.Shape()[1]; + int c = tensor.Shape()[2]; + nvcv::ImageDataStridedCuda::Buffer buf; + buf.numPlanes = 1; + buf.planes[0].width = w; + buf.planes[0].height = h; + buf.planes[0].rowStride = w * c * FDDataTypeSize(tensor.Dtype()); + buf.planes[0].basePtr = + reinterpret_cast(const_cast(tensor.Data())); + nvcv::ImageWrapData nvimg{nvcv::ImageDataStridedCuda{ + nvcv::ImageFormat{CreateCvCudaImageFormat(tensor.Dtype(), c)}, buf}}; + return nvimg; +} + +void CreateCvCudaImageBatchVarShape(std::vector& tensors, + nvcv::ImageBatchVarShape& img_batch) { + for (size_t i = 0; i < tensors.size(); ++i) { + FDASSERT(tensors[i]->device == Device::GPU, "Tensor must on GPU."); + img_batch.pushBack(CreateImageWrapData(*(tensors[i]))); + } +} #endif } // namespace vision diff --git a/fastdeploy/vision/common/processors/cvcuda_utils.h b/fastdeploy/vision/common/processors/cvcuda_utils.h index cd4eae8f6..60971ec49 100644 --- a/fastdeploy/vision/common/processors/cvcuda_utils.h +++ b/fastdeploy/vision/common/processors/cvcuda_utils.h @@ -18,6 +18,7 @@ #ifdef ENABLE_CVCUDA #include "nvcv/Tensor.hpp" +#include namespace fastdeploy { namespace vision { @@ -25,7 +26,10 @@ namespace vision { nvcv::ImageFormat CreateCvCudaImageFormat(FDDataType type, int channel); nvcv::TensorWrapData CreateCvCudaTensorWrapData(const FDTensor& tensor); void* GetCvCudaTensorDataPtr(const nvcv::TensorWrapData& tensor); +nvcv::ImageWrapData CreateImageWrapData(const FDTensor& tensor); +void CreateCvCudaImageBatchVarShape(std::vector& tensors, + nvcv::ImageBatchVarShape& img_batch); -} -} 
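Illustrative sketch (not part of the patch) of how the new CreateCvCudaImageBatchVarShape helper is meant to feed a var-shape CV-CUDA operator; the pattern mirrors ResizeByShort::ImplByCvCuda(FDMatBatch*) later in this diff. The function name is hypothetical, the tensors are assumed to be GPU-resident HWC FDTensors with the output tensors already sized, and WITH_GPU plus ENABLE_CVCUDA are required.

#include "fastdeploy/vision/common/processors/cvcuda_utils.h"
#include <cvcuda/OpResize.hpp>

void ResizeBatchSketch(std::vector<fastdeploy::FDTensor*>& src,
                       std::vector<fastdeploy::FDTensor*>& dst,
                       cudaStream_t stream) {
  nvcv::ImageBatchVarShape src_batch(src.size());
  fastdeploy::vision::CreateCvCudaImageBatchVarShape(src, src_batch);
  nvcv::ImageBatchVarShape dst_batch(dst.size());
  fastdeploy::vision::CreateCvCudaImageBatchVarShape(dst, dst_batch);
  cvcuda::Resize resize_op;
  resize_op(stream, src_batch, dst_batch, NVCV_INTERP_LINEAR);  // per-image resize
}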
+} // namespace vision +} // namespace fastdeploy #endif diff --git a/fastdeploy/vision/common/processors/manager.cc b/fastdeploy/vision/common/processors/manager.cc index 147e12ae8..45b29866b 100644 --- a/fastdeploy/vision/common/processors/manager.cc +++ b/fastdeploy/vision/common/processors/manager.cc @@ -62,13 +62,24 @@ bool ProcessorManager::Run(std::vector* images, return false; } - for (size_t i = 0; i < images->size(); ++i) { - if (CudaUsed()) { - SetStream(&((*images)[i])); - } + if (images->size() > input_caches_.size()) { + input_caches_.resize(images->size()); + output_caches_.resize(images->size()); } - bool ret = Apply(images, outputs); + FDMatBatch image_batch(images); + image_batch.input_cache = &batch_input_cache_; + image_batch.output_cache = &batch_output_cache_; + + for (size_t i = 0; i < images->size(); ++i) { + if (CudaUsed()) { + SetStream(&image_batch); + } + (*images)[i].input_cache = &input_caches_[i]; + (*images)[i].output_cache = &output_caches_[i]; + } + + bool ret = Apply(&image_batch, outputs); if (CudaUsed()) { SyncStream(); diff --git a/fastdeploy/vision/common/processors/manager.h b/fastdeploy/vision/common/processors/manager.h index 8721c7e10..6c119ff56 100644 --- a/fastdeploy/vision/common/processors/manager.h +++ b/fastdeploy/vision/common/processors/manager.h @@ -16,6 +16,7 @@ #include "fastdeploy/utils/utils.h" #include "fastdeploy/vision/common/processors/mat.h" +#include "fastdeploy/vision/common/processors/mat_batch.h" namespace fastdeploy { namespace vision { @@ -24,16 +25,28 @@ class FASTDEPLOY_DECL ProcessorManager { public: ~ProcessorManager(); + /** \brief Use CUDA to boost the performance of processors + * + * \param[in] enable_cv_cuda ture: use CV-CUDA, false: use CUDA only + * \param[in] gpu_id GPU device id + * \return true if the preprocess successed, otherwise false + */ void UseCuda(bool enable_cv_cuda = false, int gpu_id = -1); bool CudaUsed(); - void SetStream(Mat* mat) { + void SetStream(FDMat* mat) { #ifdef WITH_GPU mat->SetStream(stream_); #endif } + void SetStream(FDMatBatch* mat_batch) { +#ifdef WITH_GPU + mat_batch->SetStream(stream_); +#endif + } + void SyncStream() { #ifdef WITH_GPU FDASSERT(cudaStreamSynchronize(stream_) == cudaSuccess, @@ -51,13 +64,13 @@ class FASTDEPLOY_DECL ProcessorManager { */ bool Run(std::vector* images, std::vector* outputs); - /** \brief The body of Run() function which needs to be implemented by a derived class + /** \brief Apply() is the body of Run() function, it needs to be implemented by a derived class * - * \param[in] images The input image data list, all the elements are returned by cv::imread() + * \param[in] image_batch The input image batch * \param[in] outputs The output tensors which will feed in runtime * \return true if the preprocess successed, otherwise false */ - virtual bool Apply(std::vector* images, + virtual bool Apply(FDMatBatch* image_batch, std::vector* outputs) = 0; protected: @@ -68,6 +81,11 @@ class FASTDEPLOY_DECL ProcessorManager { cudaStream_t stream_ = nullptr; #endif int device_id_ = -1; + + std::vector input_caches_; + std::vector output_caches_; + FDTensor batch_input_cache_; + FDTensor batch_output_cache_; }; } // namespace vision diff --git a/fastdeploy/vision/common/processors/manager_pybind.cc b/fastdeploy/vision/common/processors/manager_pybind.cc new file mode 100644 index 000000000..65507cce5 --- /dev/null +++ b/fastdeploy/vision/common/processors/manager_pybind.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
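Illustrative usage note (not part of the patch) for the ProcessorManager API documented above: any preprocessor deriving from it can switch its whole pipeline to the CUDA / CV-CUDA path before the first Run() call. The config path is a placeholder.

#include "fastdeploy/vision/classification/ppcls/preprocessor.h"

void BuildGpuPreprocessor(const std::string& config_file) {
  fastdeploy::vision::classification::PaddleClasPreprocessor preprocessor(config_file);
  preprocessor.UseCuda(/*enable_cv_cuda=*/true, /*gpu_id=*/0);
  // Subsequent Run(&mats, &outputs) calls take the GPU path where a
  // CUDA/CV-CUDA implementation exists; ProcessorManager now owns the
  // per-image input/output cache tensors.
}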
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindProcessorManager(pybind11::module& m) { + pybind11::class_(m, "ProcessorManager") + .def("run", + [](vision::ProcessorManager& self, + std::vector& im_list) { + std::vector images; + for (size_t i = 0; i < im_list.size(); ++i) { + images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i]))); + } + std::vector outputs; + if (!self.Run(&images, &outputs)) { + throw std::runtime_error("Failed to process the input data"); + } + if (!self.CudaUsed()) { + for (size_t i = 0; i < outputs.size(); ++i) { + outputs[i].StopSharing(); + } + } + return outputs; + }) + .def("use_cuda", + [](vision::ProcessorManager& self, bool enable_cv_cuda = false, + int gpu_id = -1) { self.UseCuda(enable_cv_cuda, gpu_id); }); +} +} // namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/mat.cc b/fastdeploy/vision/common/processors/mat.cc index 93d11f871..f56d0b585 100644 --- a/fastdeploy/vision/common/processors/mat.cc +++ b/fastdeploy/vision/common/processors/mat.cc @@ -247,5 +247,40 @@ std::vector WrapMat(const std::vector& images) { return mats; } +bool CheckShapeConsistency(std::vector* mats) { + for (size_t i = 1; i < mats->size(); ++i) { + if ((*mats)[i].Channels() != (*mats)[0].Channels() || + (*mats)[i].Width() != (*mats)[0].Width() || + (*mats)[i].Height() != (*mats)[0].Height()) { + return false; + } + } + return true; +} + +FDTensor* CreateCachedGpuInputTensor(Mat* mat) { +#ifdef WITH_GPU + FDTensor* src = mat->Tensor(); + if (src->device == Device::GPU) { + return src; + } else if (src->device == Device::CPU) { + // Mats on CPU, we need copy these tensors from CPU to GPU + FDASSERT(src->Shape().size() == 3, "The CPU tensor must has 3 dims.") + mat->input_cache->Resize(src->Shape(), src->Dtype(), "input_cache", + Device::GPU); + FDASSERT( + cudaMemcpyAsync(mat->input_cache->Data(), src->Data(), src->Nbytes(), + cudaMemcpyHostToDevice, mat->Stream()) == 0, + "[ERROR] Error occurs while copy memory from CPU to GPU."); + return mat->input_cache; + } else { + FDASSERT(false, "FDMat is on unsupported device: %d", src->device); + } +#else + FDASSERT(false, "FastDeploy didn't compile with WITH_GPU."); +#endif + return nullptr; +} + } // namespace vision } // namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/mat.h b/fastdeploy/vision/common/processors/mat.h index 568744a04..c29fdd4b2 100644 --- a/fastdeploy/vision/common/processors/mat.h +++ b/fastdeploy/vision/common/processors/mat.h @@ -119,6 +119,11 @@ struct FASTDEPLOY_DECL Mat { void SetChannels(int s) { channels = s; } void SetWidth(int w) { width = w; } void SetHeight(int h) { height = h; } + + // When using CV-CUDA/CUDA, please set input/output cache, + // refer to manager.cc + FDTensor* input_cache = nullptr; + FDTensor* output_cache = nullptr; #ifdef WITH_GPU cudaStream_t Stream() const { return stream; } void SetStream(cudaStream_t s) { stream = s; } @@ -165,5 +170,12 @@ FASTDEPLOY_DECL 
FDMat WrapMat(const cv::Mat& image); */ FASTDEPLOY_DECL std::vector WrapMat(const std::vector& images); +bool CheckShapeConsistency(std::vector* mats); + +// Create an input tensor on GPU and save into input_cache. +// If the Mat is on GPU, return the mat->Tensor() directly. +// If the Mat is on CPU, then update the input cache tensor and copy the mat's +// CPU tensor to this new GPU input cache tensor. +FDTensor* CreateCachedGpuInputTensor(Mat* mat); } // namespace vision } // namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/mat_batch.cc b/fastdeploy/vision/common/processors/mat_batch.cc new file mode 100644 index 000000000..b73703588 --- /dev/null +++ b/fastdeploy/vision/common/processors/mat_batch.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "fastdeploy/vision/common/processors/mat_batch.h" + +namespace fastdeploy { +namespace vision { + +#ifdef WITH_GPU +void FDMatBatch::SetStream(cudaStream_t s) { + stream = s; + for (size_t i = 0; i < mats->size(); ++i) { + (*mats)[i].SetStream(s); + } +} +#endif + +FDTensor* FDMatBatch::Tensor() { + if (has_batched_tensor) { + return &fd_tensor; + } + FDASSERT(CheckShapeConsistency(mats), "Mats shapes are not consistent.") + // Each mat has its own tensor, + // to get a batched tensor, we need copy these tensors to a batched tensor + FDTensor* src = (*mats)[0].Tensor(); + auto new_shape = src->Shape(); + new_shape.insert(new_shape.begin(), mats->size()); + input_cache->Resize(new_shape, src->Dtype(), "batch_input_cache", device); + for (size_t i = 0; i < mats->size(); ++i) { + FDASSERT(device == (*mats)[i].Tensor()->device, + "Mats and MatBatch are not on the same device"); + uint8_t* p = reinterpret_cast(input_cache->Data()); + int num_bytes = (*mats)[i].Tensor()->Nbytes(); + FDTensor::CopyBuffer(p + i * num_bytes, (*mats)[i].Tensor()->Data(), + num_bytes, device, false); + } + SetTensor(input_cache); + return &fd_tensor; +} + +void FDMatBatch::SetTensor(FDTensor* tensor) { + fd_tensor.SetExternalData(tensor->Shape(), tensor->Dtype(), tensor->Data(), + tensor->device, tensor->device_id); + has_batched_tensor = true; +} + +FDTensor* CreateCachedGpuInputTensor(FDMatBatch* mat_batch) { +#ifdef WITH_GPU + auto mats = mat_batch->mats; + FDASSERT(CheckShapeConsistency(mats), "Mats shapes are not consistent.") + FDTensor* src = (*mats)[0].Tensor(); + if (mat_batch->device == Device::GPU) { + return mat_batch->Tensor(); + } else if (mat_batch->device == Device::CPU) { + // Mats on CPU, we need copy them to GPU and then get a batched GPU tensor + for (size_t i = 0; i < mats->size(); ++i) { + FDTensor* tensor = CreateCachedGpuInputTensor(&(*mats)[i]); + (*mats)[i].SetTensor(tensor); + } + return mat_batch->Tensor(); + } else { + FDASSERT(false, "FDMat is on unsupported device: %d", src->device); + } +#else + FDASSERT(false, "FastDeploy didn't compile with WITH_GPU."); +#endif + return nullptr; +} + +} // namespace vision +} // 
namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/mat_batch.h b/fastdeploy/vision/common/processors/mat_batch.h new file mode 100644 index 000000000..ed5b408c3 --- /dev/null +++ b/fastdeploy/vision/common/processors/mat_batch.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include "fastdeploy/vision/common/processors/mat.h" + +#ifdef WITH_GPU +#include +#endif + +namespace fastdeploy { +namespace vision { + +enum FDMatBatchLayout { NHWC, NCHW }; + +struct FASTDEPLOY_DECL FDMatBatch { + FDMatBatch() = default; + + // MatBatch is intialized with a list of mats, + // the data is stored in the mats separately. + // Call Tensor() function to get a batched 4-dimension tensor. + explicit FDMatBatch(std::vector* _mats) { + mats = _mats; + layout = FDMatBatchLayout::NHWC; + mat_type = ProcLib::OPENCV; + } + + // Get the batched 4-dimension tensor. + FDTensor* Tensor(); + + void SetTensor(FDTensor* tensor); + + private: +#ifdef WITH_GPU + cudaStream_t stream = nullptr; +#endif + FDTensor fd_tensor; + + public: + // When using CV-CUDA/CUDA, please set input/output cache, + // refer to manager.cc + FDTensor* input_cache; + FDTensor* output_cache; +#ifdef WITH_GPU + cudaStream_t Stream() const { return stream; } + void SetStream(cudaStream_t s); +#endif + + std::vector* mats; + ProcLib mat_type = ProcLib::OPENCV; + FDMatBatchLayout layout = FDMatBatchLayout::NHWC; + Device device = Device::CPU; + + // False: the data is stored in the mats separately + // True: the data is stored in the fd_tensor continuously in 4 dimensions + bool has_batched_tensor = false; +}; + +// Create a batched input tensor on GPU and save into input_cache. +// If the MatBatch is on GPU, return the Tensor() directly. +// If the MatBatch is on CPU, then copy the CPU tensors to GPU and get a GPU +// batched input tensor. +FDTensor* CreateCachedGpuInputTensor(FDMatBatch* mat_batch); + +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/normalize_and_permute.cc b/fastdeploy/vision/common/processors/normalize_and_permute.cc index 93850b97f..d38aeca86 100755 --- a/fastdeploy/vision/common/processors/normalize_and_permute.cc +++ b/fastdeploy/vision/common/processors/normalize_and_permute.cc @@ -56,7 +56,7 @@ NormalizeAndPermute::NormalizeAndPermute(const std::vector& mean, swap_rb_ = swap_rb; } -bool NormalizeAndPermute::ImplByOpenCV(Mat* mat) { +bool NormalizeAndPermute::ImplByOpenCV(FDMat* mat) { cv::Mat* im = mat->GetOpenCVMat(); int origin_w = im->cols; int origin_h = im->rows; @@ -79,7 +79,7 @@ bool NormalizeAndPermute::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool NormalizeAndPermute::ImplByFlyCV(Mat* mat) { +bool NormalizeAndPermute::ImplByFlyCV(FDMat* mat) { if (mat->layout != Layout::HWC) { FDERROR << "Only supports input with HWC layout." 
<< std::endl; return false; @@ -109,7 +109,7 @@ bool NormalizeAndPermute::ImplByFlyCV(Mat* mat) { } #endif -bool NormalizeAndPermute::Run(Mat* mat, const std::vector& mean, +bool NormalizeAndPermute::Run(FDMat* mat, const std::vector& mean, const std::vector& std, bool is_scale, const std::vector& min, const std::vector& max, ProcLib lib, diff --git a/fastdeploy/vision/common/processors/normalize_and_permute.cu b/fastdeploy/vision/common/processors/normalize_and_permute.cu index 69bb6af1d..fd482e9d6 100644 --- a/fastdeploy/vision/common/processors/normalize_and_permute.cu +++ b/fastdeploy/vision/common/processors/normalize_and_permute.cu @@ -18,63 +18,110 @@ namespace fastdeploy { namespace vision { -__global__ void NormalizeAndPermuteKernel(uint8_t* src, float* dst, +__global__ void NormalizeAndPermuteKernel(const uint8_t* src, float* dst, const float* alpha, const float* beta, int num_channel, bool swap_rb, - int edge) { + int batch_size, int edge) { int idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= edge) return; - if (swap_rb) { - uint8_t tmp = src[num_channel * idx]; - src[num_channel * idx] = src[num_channel * idx + 2]; - src[num_channel * idx + 2] = tmp; - } + int img_size = edge / batch_size; + int n = idx / img_size; // batch index + int p = idx - (n * img_size); // pixel index within the image for (int i = 0; i < num_channel; ++i) { - dst[idx + edge * i] = src[num_channel * idx + i] * alpha[i] + beta[i]; + int j = i; + if (swap_rb) { + j = 2 - i; + } + dst[n * img_size * num_channel + i * img_size + p] = + src[num_channel * idx + j] * alpha[i] + beta[i]; } } -bool NormalizeAndPermute::ImplByCuda(Mat* mat) { +bool NormalizeAndPermute::ImplByCuda(FDMat* mat) { // Prepare input tensor - std::string tensor_name = Name() + "_cvcuda_src"; - FDTensor* src = CreateCachedGpuInputTensor(mat, tensor_name); + FDTensor* src = CreateCachedGpuInputTensor(mat); // Prepare output tensor - tensor_name = Name() + "_dst"; - FDTensor* dst = UpdateAndGetCachedTensor(src->Shape(), FDDataType::FP32, - tensor_name, Device::GPU); + mat->output_cache->Resize(src->Shape(), FDDataType::FP32, "output_cache", + Device::GPU); // Copy alpha and beta to GPU - tensor_name = Name() + "_alpha"; - FDMat alpha_mat = - FDMat::Create(1, 1, alpha_.size(), FDDataType::FP32, alpha_.data()); - FDTensor* alpha = CreateCachedGpuInputTensor(&alpha_mat, tensor_name); + gpu_alpha_.Resize({1, 1, static_cast(alpha_.size())}, FDDataType::FP32, + "alpha", Device::GPU); + cudaMemcpy(gpu_alpha_.Data(), alpha_.data(), gpu_alpha_.Nbytes(), + cudaMemcpyHostToDevice); - tensor_name = Name() + "_beta"; - FDMat beta_mat = - FDMat::Create(1, 1, beta_.size(), FDDataType::FP32, beta_.data()); - FDTensor* beta = CreateCachedGpuInputTensor(&beta_mat, tensor_name); + gpu_beta_.Resize({1, 1, static_cast(beta_.size())}, FDDataType::FP32, + "beta", Device::GPU); + cudaMemcpy(gpu_beta_.Data(), beta_.data(), gpu_beta_.Nbytes(), + cudaMemcpyHostToDevice); - int jobs = mat->Width() * mat->Height(); + int jobs = 1 * mat->Width() * mat->Height(); int threads = 256; int blocks = ceil(jobs / (float)threads); NormalizeAndPermuteKernel<<Stream()>>>( reinterpret_cast(src->Data()), - reinterpret_cast(dst->Data()), - reinterpret_cast(alpha->Data()), - reinterpret_cast(beta->Data()), mat->Channels(), swap_rb_, jobs); + reinterpret_cast(mat->output_cache->Data()), + reinterpret_cast(gpu_alpha_.Data()), + reinterpret_cast(gpu_beta_.Data()), mat->Channels(), swap_rb_, 1, + jobs); - mat->SetTensor(dst); + mat->SetTensor(mat->output_cache); mat->device = 
Device::GPU; mat->layout = Layout::CHW; mat->mat_type = ProcLib::CUDA; return true; } +bool NormalizeAndPermute::ImplByCuda(FDMatBatch* mat_batch) { + // Prepare input tensor + FDTensor* src = CreateCachedGpuInputTensor(mat_batch); + + // Prepare output tensor + mat_batch->output_cache->Resize(src->Shape(), FDDataType::FP32, + "output_cache", Device::GPU); + // NHWC -> NCHW + std::swap(mat_batch->output_cache->shape[1], + mat_batch->output_cache->shape[3]); + + // Copy alpha and beta to GPU + gpu_alpha_.Resize({1, 1, static_cast(alpha_.size())}, FDDataType::FP32, + "alpha", Device::GPU); + cudaMemcpy(gpu_alpha_.Data(), alpha_.data(), gpu_alpha_.Nbytes(), + cudaMemcpyHostToDevice); + + gpu_beta_.Resize({1, 1, static_cast(beta_.size())}, FDDataType::FP32, + "beta", Device::GPU); + cudaMemcpy(gpu_beta_.Data(), beta_.data(), gpu_beta_.Nbytes(), + cudaMemcpyHostToDevice); + + int jobs = + mat_batch->output_cache->Numel() / mat_batch->output_cache->shape[1]; + int threads = 256; + int blocks = ceil(jobs / (float)threads); + NormalizeAndPermuteKernel<<Stream()>>>( + reinterpret_cast(src->Data()), + reinterpret_cast(mat_batch->output_cache->Data()), + reinterpret_cast(gpu_alpha_.Data()), + reinterpret_cast(gpu_beta_.Data()), + mat_batch->output_cache->shape[1], swap_rb_, + mat_batch->output_cache->shape[0], jobs); + + mat_batch->SetTensor(mat_batch->output_cache); + mat_batch->device = Device::GPU; + mat_batch->layout = FDMatBatchLayout::NCHW; + mat_batch->mat_type = ProcLib::CUDA; + return true; +} + #ifdef ENABLE_CVCUDA -bool NormalizeAndPermute::ImplByCvCuda(Mat* mat) { return ImplByCuda(mat); } +bool NormalizeAndPermute::ImplByCvCuda(FDMat* mat) { return ImplByCuda(mat); } + +bool NormalizeAndPermute::ImplByCvCuda(FDMatBatch* mat_batch) { + return ImplByCuda(mat_batch); +} #endif } // namespace vision diff --git a/fastdeploy/vision/common/processors/normalize_and_permute.h b/fastdeploy/vision/common/processors/normalize_and_permute.h index ff8394c67..da7039db4 100644 --- a/fastdeploy/vision/common/processors/normalize_and_permute.h +++ b/fastdeploy/vision/common/processors/normalize_and_permute.h @@ -25,15 +25,17 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor { const std::vector& min = std::vector(), const std::vector& max = std::vector(), bool swap_rb = false); - bool ImplByOpenCV(Mat* mat); + bool ImplByOpenCV(FDMat* mat); #ifdef ENABLE_FLYCV - bool ImplByFlyCV(Mat* mat); + bool ImplByFlyCV(FDMat* mat); #endif #ifdef WITH_GPU - bool ImplByCuda(Mat* mat); + bool ImplByCuda(FDMat* mat); + bool ImplByCuda(FDMatBatch* mat_batch); #endif #ifdef ENABLE_CVCUDA - bool ImplByCvCuda(Mat* mat); + bool ImplByCvCuda(FDMat* mat); + bool ImplByCvCuda(FDMatBatch* mat_batch); #endif std::string Name() { return "NormalizeAndPermute"; } @@ -47,7 +49,7 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor { // There will be some precomputation in contruct function // and the `norm(mat)` only need to compute result = mat * alpha + beta // which will reduce lots of time - static bool Run(Mat* mat, const std::vector& mean, + static bool Run(FDMat* mat, const std::vector& mean, const std::vector& std, bool is_scale = true, const std::vector& min = std::vector(), const std::vector& max = std::vector(), @@ -76,6 +78,8 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor { private: std::vector alpha_; std::vector beta_; + FDTensor gpu_alpha_; + FDTensor gpu_beta_; bool swap_rb_; }; } // namespace vision diff --git a/fastdeploy/vision/common/processors/resize.cc 
b/fastdeploy/vision/common/processors/resize.cc index 29a8798ad..0de6ddfc7 100644 --- a/fastdeploy/vision/common/processors/resize.cc +++ b/fastdeploy/vision/common/processors/resize.cc @@ -23,7 +23,7 @@ namespace fastdeploy { namespace vision { -bool Resize::ImplByOpenCV(Mat* mat) { +bool Resize::ImplByOpenCV(FDMat* mat) { if (mat->layout != Layout::HWC) { FDERROR << "Resize: The format of input is not HWC." << std::endl; return false; @@ -61,7 +61,7 @@ bool Resize::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool Resize::ImplByFlyCV(Mat* mat) { +bool Resize::ImplByFlyCV(FDMat* mat) { if (mat->layout != Layout::HWC) { FDERROR << "Resize: The format of input is not HWC." << std::endl; return false; @@ -123,7 +123,7 @@ bool Resize::ImplByFlyCV(Mat* mat) { #endif #ifdef ENABLE_CVCUDA -bool Resize::ImplByCvCuda(Mat* mat) { +bool Resize::ImplByCvCuda(FDMat* mat) { if (width_ == mat->Width() && height_ == mat->Height()) { return true; } @@ -143,23 +143,20 @@ bool Resize::ImplByCvCuda(Mat* mat) { } // Prepare input tensor - std::string tensor_name = Name() + "_cvcuda_src"; - FDTensor* src = CreateCachedGpuInputTensor(mat, tensor_name); + FDTensor* src = CreateCachedGpuInputTensor(mat); auto src_tensor = CreateCvCudaTensorWrapData(*src); // Prepare output tensor - tensor_name = Name() + "_cvcuda_dst"; - FDTensor* dst = - UpdateAndGetCachedTensor({height_, width_, mat->Channels()}, mat->Type(), - tensor_name, Device::GPU); - auto dst_tensor = CreateCvCudaTensorWrapData(*dst); + mat->output_cache->Resize({height_, width_, mat->Channels()}, mat->Type(), + "output_cache", Device::GPU); + auto dst_tensor = CreateCvCudaTensorWrapData(*(mat->output_cache)); // CV-CUDA Interp value is compatible with OpenCV cvcuda::Resize resize_op; resize_op(mat->Stream(), src_tensor, dst_tensor, NVCVInterpolationType(interp_)); - mat->SetTensor(dst); + mat->SetTensor(mat->output_cache); mat->SetWidth(width_); mat->SetHeight(height_); mat->device = Device::GPU; @@ -168,8 +165,8 @@ bool Resize::ImplByCvCuda(Mat* mat) { } #endif -bool Resize::Run(Mat* mat, int width, int height, float scale_w, float scale_h, - int interp, bool use_scale, ProcLib lib) { +bool Resize::Run(FDMat* mat, int width, int height, float scale_w, + float scale_h, int interp, bool use_scale, ProcLib lib) { if (mat->Height() == height && mat->Width() == width) { return true; } diff --git a/fastdeploy/vision/common/processors/resize.h b/fastdeploy/vision/common/processors/resize.h index 54480108b..2b4f88a35 100644 --- a/fastdeploy/vision/common/processors/resize.h +++ b/fastdeploy/vision/common/processors/resize.h @@ -31,16 +31,16 @@ class FASTDEPLOY_DECL Resize : public Processor { use_scale_ = use_scale; } - bool ImplByOpenCV(Mat* mat); + bool ImplByOpenCV(FDMat* mat); #ifdef ENABLE_FLYCV - bool ImplByFlyCV(Mat* mat); + bool ImplByFlyCV(FDMat* mat); #endif #ifdef ENABLE_CVCUDA - bool ImplByCvCuda(Mat* mat); + bool ImplByCvCuda(FDMat* mat); #endif std::string Name() { return "Resize"; } - static bool Run(Mat* mat, int width, int height, float scale_w = -1.0, + static bool Run(FDMat* mat, int width, int height, float scale_w = -1.0, float scale_h = -1.0, int interp = 1, bool use_scale = false, ProcLib lib = ProcLib::DEFAULT); diff --git a/fastdeploy/vision/common/processors/resize_by_short.cc b/fastdeploy/vision/common/processors/resize_by_short.cc index 1d6309f5d..535652fc7 100644 --- a/fastdeploy/vision/common/processors/resize_by_short.cc +++ b/fastdeploy/vision/common/processors/resize_by_short.cc @@ -23,7 +23,7 @@ namespace fastdeploy { 
namespace vision { -bool ResizeByShort::ImplByOpenCV(Mat* mat) { +bool ResizeByShort::ImplByOpenCV(FDMat* mat) { cv::Mat* im = mat->GetOpenCVMat(); int origin_w = im->cols; int origin_h = im->rows; @@ -43,7 +43,7 @@ bool ResizeByShort::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool ResizeByShort::ImplByFlyCV(Mat* mat) { +bool ResizeByShort::ImplByFlyCV(FDMat* mat) { fcv::Mat* im = mat->GetFlyCVMat(); int origin_w = im->width(); int origin_h = im->height(); @@ -87,10 +87,9 @@ bool ResizeByShort::ImplByFlyCV(Mat* mat) { #endif #ifdef ENABLE_CVCUDA -bool ResizeByShort::ImplByCvCuda(Mat* mat) { +bool ResizeByShort::ImplByCvCuda(FDMat* mat) { // Prepare input tensor - std::string tensor_name = Name() + "_cvcuda_src"; - FDTensor* src = CreateCachedGpuInputTensor(mat, tensor_name); + FDTensor* src = CreateCachedGpuInputTensor(mat); auto src_tensor = CreateCvCudaTensorWrapData(*src); double scale = GenerateScale(mat->Width(), mat->Height()); @@ -98,23 +97,69 @@ bool ResizeByShort::ImplByCvCuda(Mat* mat) { int height = static_cast(round(scale * mat->Height())); // Prepare output tensor - tensor_name = Name() + "_cvcuda_dst"; - FDTensor* dst = UpdateAndGetCachedTensor( - {height, width, mat->Channels()}, mat->Type(), tensor_name, Device::GPU); - auto dst_tensor = CreateCvCudaTensorWrapData(*dst); + mat->output_cache->Resize({height, width, mat->Channels()}, mat->Type(), + "output_cache", Device::GPU); + auto dst_tensor = CreateCvCudaTensorWrapData(*(mat->output_cache)); // CV-CUDA Interp value is compatible with OpenCV cvcuda::Resize resize_op; resize_op(mat->Stream(), src_tensor, dst_tensor, NVCVInterpolationType(interp_)); - mat->SetTensor(dst); + mat->SetTensor(mat->output_cache); mat->SetWidth(width); mat->SetHeight(height); mat->device = Device::GPU; mat->mat_type = ProcLib::CVCUDA; return true; } + +bool ResizeByShort::ImplByCvCuda(FDMatBatch* mat_batch) { + // TODO(wangxinyu): to support batched tensor as input + FDASSERT(mat_batch->has_batched_tensor == false, + "ResizeByShort doesn't support batched tensor as input for now."); + // Prepare input batch + std::string tensor_name = Name() + "_cvcuda_src"; + std::vector src_tensors; + for (size_t i = 0; i < mat_batch->mats->size(); ++i) { + FDTensor* src = CreateCachedGpuInputTensor(&(*(mat_batch->mats))[i]); + src_tensors.push_back(src); + } + nvcv::ImageBatchVarShape src_batch(mat_batch->mats->size()); + CreateCvCudaImageBatchVarShape(src_tensors, src_batch); + + // Prepare output batch + tensor_name = Name() + "_cvcuda_dst"; + std::vector dst_tensors; + for (size_t i = 0; i < mat_batch->mats->size(); ++i) { + FDMat* mat = &(*(mat_batch->mats))[i]; + double scale = GenerateScale(mat->Width(), mat->Height()); + int width = static_cast(round(scale * mat->Width())); + int height = static_cast(round(scale * mat->Height())); + mat->output_cache->Resize({height, width, mat->Channels()}, mat->Type(), + "output_cache", Device::GPU); + dst_tensors.push_back(mat->output_cache); + } + nvcv::ImageBatchVarShape dst_batch(mat_batch->mats->size()); + CreateCvCudaImageBatchVarShape(dst_tensors, dst_batch); + + // CV-CUDA Interp value is compatible with OpenCV + cvcuda::Resize resize_op; + resize_op(mat_batch->Stream(), src_batch, dst_batch, + NVCVInterpolationType(interp_)); + + for (size_t i = 0; i < mat_batch->mats->size(); ++i) { + FDMat* mat = &(*(mat_batch->mats))[i]; + mat->SetTensor(dst_tensors[i]); + mat->SetWidth(dst_tensors[i]->Shape()[1]); + mat->SetHeight(dst_tensors[i]->Shape()[0]); + mat->device = Device::GPU; + mat->mat_type = 
ProcLib::CVCUDA; + } + mat_batch->device = Device::GPU; + mat_batch->mat_type = ProcLib::CVCUDA; + return true; +} #endif double ResizeByShort::GenerateScale(const int origin_w, const int origin_h) { @@ -143,7 +188,7 @@ double ResizeByShort::GenerateScale(const int origin_w, const int origin_h) { return scale; } -bool ResizeByShort::Run(Mat* mat, int target_size, int interp, bool use_scale, +bool ResizeByShort::Run(FDMat* mat, int target_size, int interp, bool use_scale, const std::vector& max_hw, ProcLib lib) { auto r = ResizeByShort(target_size, interp, use_scale, max_hw); return r(mat, lib); diff --git a/fastdeploy/vision/common/processors/resize_by_short.h b/fastdeploy/vision/common/processors/resize_by_short.h index 64a7f09f0..99078c708 100644 --- a/fastdeploy/vision/common/processors/resize_by_short.h +++ b/fastdeploy/vision/common/processors/resize_by_short.h @@ -28,16 +28,17 @@ class FASTDEPLOY_DECL ResizeByShort : public Processor { interp_ = interp; use_scale_ = use_scale; } - bool ImplByOpenCV(Mat* mat); + bool ImplByOpenCV(FDMat* mat); #ifdef ENABLE_FLYCV - bool ImplByFlyCV(Mat* mat); + bool ImplByFlyCV(FDMat* mat); #endif #ifdef ENABLE_CVCUDA - bool ImplByCvCuda(Mat* mat); + bool ImplByCvCuda(FDMat* mat); + bool ImplByCvCuda(FDMatBatch* mat_batch); #endif std::string Name() { return "ResizeByShort"; } - static bool Run(Mat* mat, int target_size, int interp = 1, + static bool Run(FDMat* mat, int target_size, int interp = 1, bool use_scale = true, const std::vector& max_hw = std::vector(), ProcLib lib = ProcLib::DEFAULT); diff --git a/fastdeploy/vision/common/result.cc b/fastdeploy/vision/common/result.cc index 446a39699..d48d9ddc4 100755 --- a/fastdeploy/vision/common/result.cc +++ b/fastdeploy/vision/common/result.cc @@ -16,11 +16,16 @@ namespace fastdeploy { namespace vision { -void ClassifyResult::Clear() { +void ClassifyResult::Free() { std::vector().swap(label_ids); std::vector().swap(scores); } +void ClassifyResult::Clear() { + label_ids.clear(); + scores.clear(); +} + std::string ClassifyResult::Str() { std::string out; out = "ClassifyResult(\nlabel_ids: "; @@ -47,11 +52,16 @@ void Mask::Reserve(int size) { data.reserve(size); } void Mask::Resize(int size) { data.resize(size); } -void Mask::Clear() { +void Mask::Free() { std::vector().swap(data); std::vector().swap(shape); } +void Mask::Clear() { + data.clear(); + shape.clear(); +} + std::string Mask::Str() { std::string out = "Mask("; size_t ndim = shape.size(); @@ -94,7 +104,7 @@ DetectionResult& DetectionResult::operator=(DetectionResult&& other) { return *this; } -void DetectionResult::Clear() { +void DetectionResult::Free() { std::vector>().swap(boxes); std::vector().swap(scores); std::vector().swap(label_ids); @@ -102,18 +112,30 @@ void DetectionResult::Clear() { contain_masks = false; } +void DetectionResult::Clear() { + boxes.clear(); + scores.clear(); + label_ids.clear(); + masks.clear(); + contain_masks = false; +} + void DetectionResult::Reserve(int size) { boxes.reserve(size); scores.reserve(size); label_ids.reserve(size); - masks.reserve(size); + if (contain_masks) { + masks.reserve(size); + } } void DetectionResult::Resize(int size) { boxes.resize(size); scores.resize(size); label_ids.resize(size); - masks.resize(size); + if (contain_masks) { + masks.resize(size); + } } std::string DetectionResult::Str() { @@ -139,12 +161,18 @@ std::string DetectionResult::Str() { return out; } -void KeyPointDetectionResult::Clear() { +void KeyPointDetectionResult::Free() { std::vector>().swap(keypoints); 
std::vector().swap(scores); num_joints = -1; } +void KeyPointDetectionResult::Clear() { + keypoints.clear(); + scores.clear(); + num_joints = -1; +} + void KeyPointDetectionResult::Reserve(int size) { keypoints.reserve(size); } void KeyPointDetectionResult::Resize(int size) { keypoints.resize(size); } @@ -155,8 +183,8 @@ std::string KeyPointDetectionResult::Str() { out = "KeyPointDetectionResult: [x, y, conf]\n"; for (size_t i = 0; i < keypoints.size(); ++i) { out = out + std::to_string(keypoints[i][0]) + "," + - std::to_string(keypoints[i][1]) + ", " + - std::to_string(scores[i]) + "\n"; + std::to_string(keypoints[i][1]) + ", " + std::to_string(scores[i]) + + "\n"; } out += "num_joints:" + std::to_string(num_joints) + "\n"; return out; @@ -170,22 +198,22 @@ void OCRResult::Clear() { cls_labels.clear(); } -void MOTResult::Clear(){ +void MOTResult::Clear() { boxes.clear(); ids.clear(); scores.clear(); class_ids.clear(); } -std::string MOTResult::Str(){ +std::string MOTResult::Str() { std::string out; - out = "MOTResult:\nall boxes counts: "+std::to_string(boxes.size())+"\n"; + out = "MOTResult:\nall boxes counts: " + std::to_string(boxes.size()) + "\n"; out += "[xmin\tymin\txmax\tymax\tid\tscore]\n"; for (size_t i = 0; i < boxes.size(); ++i) { - out = out + "["+ std::to_string(boxes[i][0]) + "\t" + + out = out + "[" + std::to_string(boxes[i][0]) + "\t" + std::to_string(boxes[i][1]) + "\t" + std::to_string(boxes[i][2]) + - "\t" + std::to_string(boxes[i][3]) + "\t" + - std::to_string(ids[i]) + "\t" + std::to_string(scores[i]) + "]\n"; + "\t" + std::to_string(boxes[i][3]) + "\t" + std::to_string(ids[i]) + + "\t" + std::to_string(scores[i]) + "]\n"; } return out; } @@ -197,13 +225,20 @@ FaceDetectionResult::FaceDetectionResult(const FaceDetectionResult& res) { landmarks_per_face = res.landmarks_per_face; } -void FaceDetectionResult::Clear() { +void FaceDetectionResult::Free() { std::vector>().swap(boxes); std::vector().swap(scores); std::vector>().swap(landmarks); landmarks_per_face = 0; } +void FaceDetectionResult::Clear() { + boxes.clear(); + scores.clear(); + landmarks.clear(); + landmarks_per_face = 0; +} + void FaceDetectionResult::Reserve(int size) { boxes.reserve(size); scores.reserve(size); @@ -257,23 +292,22 @@ std::string FaceDetectionResult::Str() { return out; } -void FaceAlignmentResult::Clear() { +void FaceAlignmentResult::Free() { std::vector>().swap(landmarks); } -void FaceAlignmentResult::Reserve(int size) { - landmarks.resize(size); -} +void FaceAlignmentResult::Clear() { landmarks.clear(); } -void FaceAlignmentResult::Resize(int size) { - landmarks.resize(size); -} +void FaceAlignmentResult::Reserve(int size) { landmarks.resize(size); } + +void FaceAlignmentResult::Resize(int size) { landmarks.resize(size); } std::string FaceAlignmentResult::Str() { std::string out; out = "FaceAlignmentResult: [x, y]\n"; - out = out + "There are " +std::to_string(landmarks.size()) + " landmarks, the top 10 are listed as below:\n"; + out = out + "There are " + std::to_string(landmarks.size()) + + " landmarks, the top 10 are listed as below:\n"; int landmarks_size = landmarks.size(); size_t result_length = std::min(10, landmarks_size); for (size_t i = 0; i < result_length; ++i) { @@ -355,7 +389,9 @@ FaceRecognitionResult::FaceRecognitionResult(const FaceRecognitionResult& res) { embedding.assign(res.embedding.begin(), res.embedding.end()); } -void FaceRecognitionResult::Clear() { std::vector().swap(embedding); } +void FaceRecognitionResult::Free() { std::vector().swap(embedding); } + +void 
FaceRecognitionResult::Clear() { embedding.clear(); } void FaceRecognitionResult::Reserve(int size) { embedding.reserve(size); } @@ -536,28 +572,23 @@ std::string OCRResult::Str() { return no_result; } -void HeadPoseResult::Clear() { - std::vector().swap(euler_angles); -} +void HeadPoseResult::Free() { std::vector().swap(euler_angles); } -void HeadPoseResult::Reserve(int size) { - euler_angles.resize(size); -} +void HeadPoseResult::Clear() { euler_angles.clear(); } -void HeadPoseResult::Resize(int size) { - euler_angles.resize(size); -} +void HeadPoseResult::Reserve(int size) { euler_angles.resize(size); } + +void HeadPoseResult::Resize(int size) { euler_angles.resize(size); } std::string HeadPoseResult::Str() { std::string out; out = "HeadPoseResult: [yaw, pitch, roll]\n"; - out = out + "yaw: " + std::to_string(euler_angles[0]) + "\n" + - "pitch: " + std::to_string(euler_angles[1]) + "\n" + - "roll: " + std::to_string(euler_angles[2]) + "\n"; + out = out + "yaw: " + std::to_string(euler_angles[0]) + "\n" + "pitch: " + + std::to_string(euler_angles[1]) + "\n" + "roll: " + + std::to_string(euler_angles[2]) + "\n"; return out; } - } // namespace vision } // namespace fastdeploy diff --git a/fastdeploy/vision/common/result.h b/fastdeploy/vision/common/result.h index c68f6d4cf..7c4efde23 100755 --- a/fastdeploy/vision/common/result.h +++ b/fastdeploy/vision/common/result.h @@ -51,9 +51,12 @@ struct FASTDEPLOY_DECL ClassifyResult : public BaseResult { std::vector scores; ResultType type = ResultType::CLASSIFY; - /// Clear result + /// Clear ClassifyResult void Clear(); + /// Clear ClassifyResult and free the memory + void Free(); + /// Copy constructor ClassifyResult(const ClassifyResult& other) = default; /// Move assignment @@ -72,9 +75,12 @@ struct FASTDEPLOY_DECL Mask : public BaseResult { std::vector shape; // (H,W) ... 
ResultType type = ResultType::MASK; - /// clear mask + /// clear Mask result void Clear(); + /// Clear Mask result and free the memory + void Free(); + /// Return a mutable pointer of the mask data buffer void* Data() { return data.data(); } @@ -117,9 +123,12 @@ struct FASTDEPLOY_DECL DetectionResult : public BaseResult { /// Move assignment DetectionResult& operator=(DetectionResult&& other); - /// Clear detection result + /// Clear DetectionResult void Clear(); + /// Clear DetectionResult and free the memory + void Free(); + void Reserve(int size); void Resize(int size); @@ -140,9 +149,12 @@ struct FASTDEPLOY_DECL KeyPointDetectionResult : public BaseResult { int num_joints = -1; ResultType type = ResultType::KEYPOINT_DETECTION; - /// Clear detection result + /// Clear KeyPointDetectionResult void Clear(); + /// Clear KeyPointDetectionResult and free the memory + void Free(); + void Reserve(int size); void Resize(int size); @@ -215,9 +227,12 @@ struct FASTDEPLOY_DECL FaceDetectionResult : public BaseResult { FaceDetectionResult() { landmarks_per_face = 0; } FaceDetectionResult(const FaceDetectionResult& res); - /// Clear detection result + /// Clear FaceDetectionResult void Clear(); + /// Clear FaceDetectionResult and free the memory + void Free(); + void Reserve(int size); void Resize(int size); @@ -233,9 +248,12 @@ struct FASTDEPLOY_DECL FaceAlignmentResult : public BaseResult { std::vector> landmarks; ResultType type = ResultType::FACE_ALIGNMENT; - /// Clear facealignment result + /// Clear FaceAlignmentResult void Clear(); + /// Clear FaceAlignmentResult and free the memory + void Free(); + void Reserve(int size); void Resize(int size); @@ -292,9 +310,12 @@ struct FASTDEPLOY_DECL FaceRecognitionResult : public BaseResult { FaceRecognitionResult() {} FaceRecognitionResult(const FaceRecognitionResult& res); - /// Clear detection result + /// Clear FaceRecognitionResult void Clear(); + /// Clear FaceRecognitionResult and free the memory + void Free(); + void Reserve(int size); void Resize(int size); @@ -347,9 +368,12 @@ struct FASTDEPLOY_DECL HeadPoseResult : public BaseResult { std::vector euler_angles; ResultType type = ResultType::HEADPOSE; - /// Clear headpose result + /// Clear HeadPoseResult void Clear(); + /// Clear HeadPoseResult and free the memory + void Free(); + void Reserve(int size); void Resize(int size); diff --git a/fastdeploy/vision/detection/contrib/rknpu2/postprocessor.h b/fastdeploy/vision/detection/contrib/rknpu2/postprocessor.h index eea3fe521..a6b6f0cc9 100755 --- a/fastdeploy/vision/detection/contrib/rknpu2/postprocessor.h +++ b/fastdeploy/vision/detection/contrib/rknpu2/postprocessor.h @@ -40,12 +40,12 @@ class FASTDEPLOY_DECL RKYOLOPostprocessor { std::vector* results); /// Set nms_threshold, default 0.45 - void SetNMSThreshold(const float& nms_threshold) { + void SetNMSThreshold(float nms_threshold) { nms_threshold_ = nms_threshold; } /// Set conf_threshold, default 0.25 - void SetConfThreshold(const float& conf_threshold) { + void SetConfThreshold(float conf_threshold) { conf_threshold_ = conf_threshold; } @@ -56,21 +56,21 @@ class FASTDEPLOY_DECL RKYOLOPostprocessor { float GetNMSThreshold() const { return nms_threshold_; } /// Set height and weight - void SetHeightAndWeight(int& height, int& width) { + void SetHeightAndWeight(int height,int width) { height_ = height; width_ = width; } /// Set pad_hw_values - void SetPadHWValues(std::vector> pad_hw_values) { + void SetPadHWValues(const std::vector>& pad_hw_values) { pad_hw_values_ = pad_hw_values; } /// Set 
scale - void SetScale(std::vector scale) { scale_ = scale; } + void SetScale(const std::vector& scale) { scale_ = scale; } /// Set Anchor - void SetAnchor(std::vector anchors, int anchor_per_branch) { + void SetAnchor(const std::vector& anchors, int anchor_per_branch) { anchors_ = anchors; anchor_per_branch_ = anchor_per_branch; } diff --git a/fastdeploy/vision/detection/contrib/rknpu2/utils.cc b/fastdeploy/vision/detection/contrib/rknpu2/utils.cc index 4271def4a..f86a37b12 100644 --- a/fastdeploy/vision/detection/contrib/rknpu2/utils.cc +++ b/fastdeploy/vision/detection/contrib/rknpu2/utils.cc @@ -18,26 +18,6 @@ namespace detection { float Clamp(float val, int min, int max) { return val > min ? (val < max ? val : max) : min; } - -float Sigmoid(float x) { return 1.0 / (1.0 + expf(-x)); } - -float UnSigmoid(float y) { return -1.0 * logf((1.0 / y) - 1.0); } - -inline int32_t __clip(float val, float min, float max) { - float f = val <= min ? min : (val >= max ? max : val); - return f; -} - -int8_t QntF32ToAffine(float f32, int32_t zp, float scale) { - float dst_val = (f32 / scale) + zp; - int8_t res = (int8_t)__clip(dst_val, -128, 127); - return res; -} - -float DeqntAffineToF32(int8_t qnt, int32_t zp, float scale) { - return ((float)qnt - (float)zp) * scale; -} - static float CalculateOverlap(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1, float ymax1) { diff --git a/fastdeploy/vision/detection/contrib/rknpu2/utils.h b/fastdeploy/vision/detection/contrib/rknpu2/utils.h index 1fa533082..f556bb245 100644 --- a/fastdeploy/vision/detection/contrib/rknpu2/utils.h +++ b/fastdeploy/vision/detection/contrib/rknpu2/utils.h @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once -#include +#include #include #include @@ -20,11 +20,6 @@ namespace fastdeploy { namespace vision { namespace detection { float Clamp(float val, int min, int max); -float Sigmoid(float x); -float UnSigmoid(float y); -inline static int32_t __clip(float val, float min, float max); -int8_t QntF32ToAffine(float f32, int32_t zp, float scale); -float DeqntAffineToF32(int8_t qnt, int32_t zp, float scale); int NMS(int valid_count, std::vector& output_locations, std::vector& class_id, std::vector& order, float threshold, bool class_agnostic); diff --git a/fastdeploy/vision/facedet/contrib/centerface/centerface.cc b/fastdeploy/vision/facedet/contrib/centerface/centerface.cc new file mode 100644 index 000000000..3d17f1938 --- /dev/null +++ b/fastdeploy/vision/facedet/contrib/centerface/centerface.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
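Illustrative note (not part of the patch) on the vision::*Result changes above: Clear() now only empties the containers and keeps their capacity, while the new Free() releases the memory with the vector swap idiom, so one result object can be reused across frames without re-allocating.

#include "fastdeploy/vision/common/result.h"

void ReuseResultSketch() {
  fastdeploy::vision::DetectionResult result;
  // ... fill "result" for one frame ...
  result.Clear();  // size back to 0, capacity kept for the next frame
  result.Free();   // capacity released, e.g. std::vector<...>().swap(boxes)
}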
+ +#include "fastdeploy/vision/facedet/contrib/centerface/centerface.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy{ + +namespace vision{ + +namespace facedet{ + +CenterFace::CenterFace(const std::string& model_file, + const std::string& params_file, + const RuntimeOption& custom_option, + const ModelFormat& model_format) { + if (model_format == ModelFormat::ONNX) { + valid_cpu_backends = {Backend::ORT}; + valid_gpu_backends = {Backend::ORT, Backend::TRT}; + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool CenterFace::Initialize(){ + if (!InitRuntime()){ + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + return true; +} + +bool CenterFace::Predict(const cv::Mat& im, FaceDetectionResult* result){ + std::vector results; + if (!BatchPredict({im}, &results)) { + return false; + } + *result = std::move(results[0]); + return true; +} + +bool CenterFace::BatchPredict(const std::vector& images, + std::vector* results){ + std::vector fd_images = WrapMat(images); + FDASSERT(images.size() == 1, "Only support batch = 1 now."); + std::vector>> ims_info; + if (!preprocessor_.Run(&fd_images, &reused_input_tensors_, &ims_info)) { + FDERROR << "Failed to preprocess the input image." << std::endl; + return false; + } + + reused_input_tensors_[0].name = InputInfoOfRuntime(0).name; + if (!Infer(reused_input_tensors_, &reused_output_tensors_)) { + FDERROR << "Failed to inference by runtime." << std::endl; + return false; + } + + if (!postprocessor_.Run(reused_output_tensors_, results, ims_info)){ + FDERROR << "Failed to postprocess the inference results by runtime." << std::endl; + return false; + } + + return true; +} + +} // namespace facedet + +} // namespace vision + +} // namespace fastdeploy \ No newline at end of file diff --git a/fastdeploy/vision/facedet/contrib/centerface/centerface.h b/fastdeploy/vision/facedet/contrib/centerface/centerface.h new file mode 100644 index 000000000..8d14d52b9 --- /dev/null +++ b/fastdeploy/vision/facedet/contrib/centerface/centerface.h @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" +#include "fastdeploy/vision/facedet/contrib/centerface/preprocessor.h" +#include "fastdeploy/vision/facedet/contrib/centerface/postprocessor.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { +/*! @brief CenterFace model object used when to load a CenterFace model exported by CenterFace. 
+ */ +class FASTDEPLOY_DECL CenterFace: public FastDeployModel{ + public: + /** \brief Set path of model file and the configuration of runtime. + * + * \param[in] model_file Path of model file, e.g ./centerface.onnx + * \param[in] params_file Path of parameter file, e.g ppyoloe/model.pdiparams, if the model format is ONNX, this parameter will be ignored + * \param[in] custom_option RuntimeOption for inference, the default will use cpu, and choose the backend defined in "valid_cpu_backends" + * \param[in] model_format Model format of the loaded model, default is ONNX format + */ + CenterFace(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX); + + std::string ModelName() {return "centerface";} + + /** \brief Predict the detection result for an input image + * + * \param[in] img The input image data, comes from cv::imread(), is a 3-D array with layout HWC, BGR format + * \param[in] result The output detection result will be writen to this structure + * \return true if the prediction successed, otherwise false + */ + virtual bool Predict(const cv::Mat& im, FaceDetectionResult* result); + + /** \brief Predict the detection results for a batch of input images + * + * \param[in] imgs, The input image list, each element comes from cv::imread() + * \param[in] results The output detection result list + * \return true if the prediction successed, otherwise false + */ + virtual bool BatchPredict(const std::vector& images, + std::vector* results); + + /// Get preprocessor reference of CenterFace + virtual CenterFacePreprocessor& GetPreprocessor() { + return preprocessor_; + } + + /// Get postprocessor reference of CenterFace + virtual CenterFacePostprocessor& GetPostprocessor() { + return postprocessor_; + } + + protected: + bool Initialize(); + CenterFacePreprocessor preprocessor_; + CenterFacePostprocessor postprocessor_; +}; + +} // namespace facedet + +} // namespace vision + +} // namespace fastdeploy diff --git a/fastdeploy/vision/facedet/contrib/centerface/centerface_pybind.cc b/fastdeploy/vision/facedet/contrib/centerface/centerface_pybind.cc new file mode 100644 index 000000000..586e427b9 --- /dev/null +++ b/fastdeploy/vision/facedet/contrib/centerface/centerface_pybind.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindCenterFace(pybind11::module& m) { + pybind11::class_( + m, "CenterFacePreprocessor") + .def(pybind11::init<>()) + .def("run", [](vision::facedet::CenterFacePreprocessor& self, std::vector& im_list) { + std::vector images; + for (size_t i = 0; i < im_list.size(); ++i) { + images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i]))); + } + std::vector outputs; + std::vector>> ims_info; + if (!self.Run(&images, &outputs, &ims_info)) { + throw std::runtime_error("Failed to preprocess the input data in CenterFacePreprocessor."); + } + for (size_t i = 0; i < outputs.size(); ++i) { + outputs[i].StopSharing(); + } + return make_pair(outputs, ims_info); + }) + .def_property("size", &vision::facedet::CenterFacePreprocessor::GetSize, &vision::facedet::CenterFacePreprocessor::SetSize); + + pybind11::class_( + m, "CenterFacePostprocessor") + .def(pybind11::init<>()) + .def("run", [](vision::facedet::CenterFacePostprocessor& self, std::vector& inputs, + const std::vector>>& ims_info) { + std::vector results; + if (!self.Run(inputs, &results, ims_info)) { + throw std::runtime_error("Failed to postprocess the runtime result in CenterFacePostprocessor."); + } + return results; + }) + .def("run", [](vision::facedet::CenterFacePostprocessor& self, std::vector& input_array, + const std::vector>>& ims_info) { + std::vector results; + std::vector inputs; + PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true); + if (!self.Run(inputs, &results, ims_info)) { + throw std::runtime_error("Failed to postprocess the runtime result in CenterFacePostprocessor."); + } + return results; + }) + .def_property("conf_threshold", &vision::facedet::CenterFacePostprocessor::GetConfThreshold, &vision::facedet::CenterFacePostprocessor::SetConfThreshold) + .def_property("nms_threshold", &vision::facedet::CenterFacePostprocessor::GetNMSThreshold, &vision::facedet::CenterFacePostprocessor::SetNMSThreshold); + + pybind11::class_(m, "CenterFace") + .def(pybind11::init()) + .def("predict", + [](vision::facedet::CenterFace& self, pybind11::array& data) { + auto mat = PyArrayToCvMat(data); + vision::FaceDetectionResult res; + self.Predict(mat, &res); + return res; + }) + .def("batch_predict", [](vision::facedet::CenterFace& self, std::vector& data) { + std::vector images; + for (size_t i = 0; i < data.size(); ++i) { + images.push_back(PyArrayToCvMat(data[i])); + } + std::vector results; + self.BatchPredict(images, &results); + return results; + }) + .def_property_readonly("preprocessor", &vision::facedet::CenterFace::GetPreprocessor) + .def_property_readonly("postprocessor", &vision::facedet::CenterFace::GetPostprocessor); +} +} // namespace fastdeploy diff --git a/fastdeploy/vision/facedet/contrib/centerface/postprocessor.cc b/fastdeploy/vision/facedet/contrib/centerface/postprocessor.cc new file mode 100644 index 000000000..98931956b --- /dev/null +++ b/fastdeploy/vision/facedet/contrib/centerface/postprocessor.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/facedet/contrib/centerface/postprocessor.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +CenterFacePostprocessor::CenterFacePostprocessor() { + conf_threshold_ = 0.5; + nms_threshold_ = 0.3; + landmarks_per_face_ = 5; +} + +bool CenterFacePostprocessor::Run(const std::vector& infer_result, + std::vector* results, + const std::vector>>& ims_info) { + int batch = infer_result[0].shape[0]; + + results->resize(batch); + FDTensor heatmap = infer_result[0]; //(1 1 160 160) + FDTensor scales = infer_result[1]; //(1 2 160 160) + FDTensor offsets = infer_result[2]; //(1 2 160 160) + FDTensor landmarks = infer_result[3]; //(1 10 160 160) + for (size_t bs = 0; bs < batch; ++bs) { + (*results)[bs].Clear(); + (*results)[bs].landmarks_per_face = landmarks_per_face_; + (*results)[bs].Reserve(heatmap.shape[2]); + if (infer_result[0].dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." << std::endl; + return false; + } + int fea_h = heatmap.shape[2]; + int fea_w = heatmap.shape[3]; + int spacial_size = fea_w * fea_h; + + float *heatmap_out = static_cast(heatmap.Data()); + + float *scale0 = static_cast(scales.Data()); + float *scale1 = scale0 + spacial_size; + + float *offset0 = static_cast(offsets.Data()); + float *offset1 = offset0 + spacial_size; + float confidence = 0.f; + + std::vector ids; + for (int i = 0; i < fea_h; i++) { + for (int j = 0; j < fea_w; j++) { + if (heatmap_out[i*fea_w + j] > conf_threshold_) { + ids.push_back(i); + ids.push_back(j); + } + } + } + + auto iter_out = ims_info[bs].find("output_shape"); + auto iter_ipt = ims_info[bs].find("input_shape"); + FDASSERT(iter_out != ims_info[bs].end() && iter_ipt != ims_info[bs].end(), + "Cannot find input_shape or output_shape from im_info."); + float out_h = iter_out->second[0]; + float out_w = iter_out->second[1]; + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + float scale_h = ipt_h / out_h; + float scale_w = ipt_w / out_w; + + for (int i = 0; i < ids.size() / 2; i++) { + int id_h = ids[2 * i]; + int id_w = ids[2 * i + 1]; + int index = id_h * fea_w + id_w; + confidence = heatmap_out[index]; + + float s0 = std::exp(scale0[index]) * 4; + float s1 = std::exp(scale1[index]) * 4; + float o0 = offset0[index]; + float o1 = offset1[index]; + + float x1 = (id_w + o1 + 0.5) * 4 - s1 / 2 > 0.f ? (id_w + o1 + 0.5) * 4 - s1 / 2 : 0; + float y1 =(id_h + o0 + 0.5) * 4 - s0 / 2 > 0 ? (id_h + o0 + 0.5) * 4 - s0 / 2 : 0; + float x2 = 0, y2 = 0; + x1 = x1 < (float)out_w ? x1 : (float)out_w; + y1 = y1 < (float)out_h ? y1 : (float)out_h; + x2 = x1 + s1 < (float)out_w ? x1 + s1 : (float)out_w; + y2 = y1 + s0 < (float)out_h ? 
y1 + s0 : (float)out_h;
+
+      (*results)[bs].boxes.emplace_back(std::array<float, 4>{x1, y1, x2, y2});
+      (*results)[bs].scores.push_back(confidence);
+      // decode landmarks (default 5 landmarks)
+      if (landmarks_per_face_ > 0) {
+        // reference: utils/box_utils.py#L241
+        for (size_t j = 0; j < landmarks_per_face_; j++) {
+          float *xmap = (float*)landmarks.Data() + (2 * j + 1) * spacial_size;
+          float *ymap = (float*)landmarks.Data() + (2 * j) * spacial_size;
+          float lx = (x1 + xmap[index] * s1) * scale_w;
+          float ly = (y1 + ymap[index] * s0) * scale_h;
+          (*results)[bs].landmarks.emplace_back(std::array<float, 2>{lx, ly});
+        }
+      }
+    }
+
+    if ((*results)[bs].boxes.size() == 0) {
+      return true;
+    }
+
+    utils::NMS(&((*results)[bs]), nms_threshold_);
+
+    for (size_t i = 0; i < (*results)[bs].boxes.size(); ++i) {
+      (*results)[bs].boxes[i][0] = std::max((*results)[bs].boxes[i][0] * scale_w, 0.0f);
+      (*results)[bs].boxes[i][1] = std::max((*results)[bs].boxes[i][1] * scale_h, 0.0f);
+      (*results)[bs].boxes[i][2] = std::max((*results)[bs].boxes[i][2] * scale_w, 0.0f);
+      (*results)[bs].boxes[i][3] = std::max((*results)[bs].boxes[i][3] * scale_h, 0.0f);
+      (*results)[bs].boxes[i][0] = std::min((*results)[bs].boxes[i][0], ipt_w - 1.0f);
+      (*results)[bs].boxes[i][1] = std::min((*results)[bs].boxes[i][1], ipt_h - 1.0f);
+      (*results)[bs].boxes[i][2] = std::min((*results)[bs].boxes[i][2], ipt_w - 1.0f);
+      (*results)[bs].boxes[i][3] = std::min((*results)[bs].boxes[i][3], ipt_h - 1.0f);
+    }
+  }
+  return true;
+}
+
+} // namespace facedet
+} // namespace vision
+} // namespace fastdeploy
\ No newline at end of file
diff --git a/fastdeploy/vision/facedet/contrib/centerface/postprocessor.h b/fastdeploy/vision/facedet/contrib/centerface/postprocessor.h
new file mode 100644
index 000000000..918b8ab1c
--- /dev/null
+++ b/fastdeploy/vision/facedet/contrib/centerface/postprocessor.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace facedet {
+
+class FASTDEPLOY_DECL CenterFacePostprocessor{
+ public:
+  /*! @brief Postprocessor object for the CenterFace series of models.
+ */ + CenterFacePostprocessor(); + + /** \brief Process the result of runtime and fill to FaceDetectionResult structure + * + * \param[in] infer_result The inference result from runtime + * \param[in] results The output result of detection + * \param[in] ims_info The shape info list, record input_shape and output_shape + * \return true if the postprocess successed, otherwise false + */ + bool Run(const std::vector& infer_result, + std::vector* results, + const std::vector>>& ims_info); + + /// Set conf_threshold, default 0.5 + void SetConfThreshold(const float& conf_threshold) { + conf_threshold_ = conf_threshold; + } + + /// Get conf_threshold, default 0.5 + float GetConfThreshold() const { return conf_threshold_; } + + /// Set nms_threshold, default 0.3 + void SetNMSThreshold(const float& nms_threshold) { + nms_threshold_ = nms_threshold; + } + + /// Get nms_threshold, default 0.3 + float GetNMSThreshold() const { return nms_threshold_; } + + protected: + float conf_threshold_; + float nms_threshold_; + int landmarks_per_face_; +}; + +} // namespace facedet +} // namespace vision +} // namespace fastdeploy \ No newline at end of file diff --git a/fastdeploy/vision/facedet/contrib/centerface/preprocessor.cc b/fastdeploy/vision/facedet/contrib/centerface/preprocessor.cc new file mode 100644 index 000000000..ae3cacb8d --- /dev/null +++ b/fastdeploy/vision/facedet/contrib/centerface/preprocessor.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/facedet/contrib/centerface/preprocessor.h" +#include "fastdeploy/function/concat.h" +#include "fastdeploy/vision/common/processors/mat.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +CenterFacePreprocessor::CenterFacePreprocessor() { + size_ = {640, 640}; +} + +bool CenterFacePreprocessor::Run(std::vector* images, std::vector* outputs, + std::vector>>* ims_info) { + if (images->size() == 0) { + FDERROR << "The size of input images should be greater than 0." << std::endl; + return false; + } + ims_info->resize(images->size()); + outputs->resize(1); + std::vector tensors(images->size()); + for (size_t i = 0; i < images->size(); i++) { + if (!Preprocess(&(*images)[i], &tensors[i], &(*ims_info)[i])) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + } + + if (tensors.size() == 1) { + (*outputs)[0] = std::move(tensors[0]); + } else { + function::Concat(tensors, &((*outputs)[0]), 0); + } + return true; +} + +bool CenterFacePreprocessor::Preprocess(FDMat* mat, FDTensor* output, + std::map>* im_info){ + // Record the shape of image and the shape of preprocessed image + (*im_info)["input_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + // centerface's preprocess steps + // 1. Resize + // 2. 
ConvertAndPermute + Resize::Run(mat, size_[0], size_[1]); + std::vector alpha = {1.0f, 1.0f, 1.0f}; + std::vector beta = {0.0f, 0.0f, 0.0f}; + ConvertAndPermute::Run(mat, alpha, beta,true); + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + mat->ShareWithTensor(output); + output->ExpandDim(0); + return true; +} + +} // namespace facedet + +} // namespace vision + +} // namespacefastdeploy \ No newline at end of file diff --git a/fastdeploy/vision/facedet/contrib/centerface/preprocessor.h b/fastdeploy/vision/facedet/contrib/centerface/preprocessor.h new file mode 100644 index 000000000..a856306cb --- /dev/null +++ b/fastdeploy/vision/facedet/contrib/centerface/preprocessor.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +class FASTDEPLOY_DECL CenterFacePreprocessor{ + public: + /** \brief Create a preprocessor instance for CenterFace serials model + */ + CenterFacePreprocessor(); + + /** \brief Process the input image and prepare input tensors for runtime + * + * \param[in] images The input image data list, all the elements are returned by cv::imread() + * \param[in] outputs The output tensors which will feed in runtime + * \param[in] ims_info The shape info list, record input_shape and output_shape + * \ret + */ + bool Run(std::vector* images, std::vector* outputs, + std::vector>>* ims_info); + + /// Set target size, tuple of (width, height), default size = {640, 640} + void SetSize(const std::vector& size) { size_ = size; } + + /// Get target size, tuple of (width, height), default size = {640, 640} + std::vector GetSize() const { return size_; } + + + protected: + bool Preprocess(FDMat * mat, FDTensor* output, + std::map>* im_info); + + // target size, tuple of (width, height), default size = {640, 640} + std::vector size_; + +}; + +} // namespace facedet + +} // namespace vision + +} // namespace fastdeploy diff --git a/fastdeploy/vision/facedet/facedet_pybind.cc b/fastdeploy/vision/facedet/facedet_pybind.cc index e5a62542d..a36eb0b83 100644 --- a/fastdeploy/vision/facedet/facedet_pybind.cc +++ b/fastdeploy/vision/facedet/facedet_pybind.cc @@ -20,6 +20,8 @@ void BindRetinaFace(pybind11::module& m); void BindUltraFace(pybind11::module& m); void BindYOLOv5Face(pybind11::module& m); void BindYOLOv7Face(pybind11::module& m); +void BindCenterFace(pybind11::module& m); +void BindBlazeFace(pybind11::module& m); void BindSCRFD(pybind11::module& m); void BindFaceDet(pybind11::module& m) { @@ -28,6 +30,8 @@ void BindFaceDet(pybind11::module& m) { BindUltraFace(facedet_module); BindYOLOv5Face(facedet_module); BindYOLOv7Face(facedet_module); + BindCenterFace(facedet_module); + BindBlazeFace(facedet_module); 
BindSCRFD(facedet_module); } } // namespace fastdeploy diff --git a/fastdeploy/vision/facedet/ppdet/blazeface/blazeface.cc b/fastdeploy/vision/facedet/ppdet/blazeface/blazeface.cc new file mode 100644 index 000000000..5541f5d67 --- /dev/null +++ b/fastdeploy/vision/facedet/ppdet/blazeface/blazeface.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/facedet/ppdet/blazeface/blazeface.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy{ + +namespace vision{ + +namespace facedet{ + +BlazeFace::BlazeFace(const std::string& model_file, + const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option, + const ModelFormat& model_format) + : preprocessor_(config_file){ + valid_cpu_backends = {Backend::OPENVINO, Backend::PDINFER, Backend::LITE}; + valid_gpu_backends = {Backend::OPENVINO, Backend::LITE, Backend::PDINFER}; + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool BlazeFace::Initialize(){ + if (!InitRuntime()){ + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + return true; +} + +bool BlazeFace::Predict(const cv::Mat& im, FaceDetectionResult* result){ + std::vector results; + if (!this->BatchPredict({im}, &results)) { + return false; + } + *result = std::move(results[0]); + return true; +} + +bool BlazeFace::BatchPredict(const std::vector& images, + std::vector* results){ + std::vector fd_images = WrapMat(images); + FDASSERT(images.size() == 1, "Only support batch = 1 now."); + std::vector>> ims_info; + if (!preprocessor_.Run(&fd_images, &reused_input_tensors_, &ims_info)) { + FDERROR << "Failed to preprocess the input image." << std::endl; + return false; + } + + reused_input_tensors_[0].name = "image"; + reused_input_tensors_[1].name = "scale_factor"; + reused_input_tensors_[2].name = "im_shape"; + + // Some models don't need scale_factor and im_shape as input + while (reused_input_tensors_.size() != NumInputsOfRuntime()) { + reused_input_tensors_.pop_back(); + } + + if (!Infer(reused_input_tensors_, &reused_output_tensors_)) { + FDERROR << "Failed to inference by runtime." << std::endl; + return false; + } + + if (!postprocessor_.Run(reused_output_tensors_, results, ims_info)){ + FDERROR << "Failed to postprocess the inference results by runtime." 
<< std::endl; + return false; + } + + return true; +} + +} // namespace facedet + +} // namespace vision + +} // namespace fastdeploy \ No newline at end of file diff --git a/fastdeploy/vision/facedet/ppdet/blazeface/blazeface.h b/fastdeploy/vision/facedet/ppdet/blazeface/blazeface.h new file mode 100644 index 000000000..b740240a8 --- /dev/null +++ b/fastdeploy/vision/facedet/ppdet/blazeface/blazeface.h @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" +#include "fastdeploy/vision/facedet/ppdet/blazeface/preprocessor.h" +#include "fastdeploy/vision/facedet/ppdet/blazeface/postprocessor.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { +/*! @brief BlazeFace model object used when to load a BlazeFace model exported by BlazeFace. + */ +class FASTDEPLOY_DECL BlazeFace: public FastDeployModel{ + public: + /** \brief Set path of model file and the configuration of runtime. + * + * \param[in] model_file Path of model file, e.g ./blazeface.onnx + * \param[in] params_file Path of parameter file, e.g ppyoloe/model.pdiparams, if the model format is ONNX, this parameter will be ignored + * \param[in] config_file Path of configuration file for deployment, e.g resnet/infer_cfg.yml + * \param[in] custom_option RuntimeOption for inference, the default will use cpu, and choose the backend defined in "valid_cpu_backends" + * \param[in] model_format Model format of the loaded model, default is ONNX format + */ + BlazeFace(const std::string& model_file, const std::string& params_file = "", + const std::string& config_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::PADDLE); + + std::string ModelName() {return "blaze-face";} + + /** \brief Predict the detection result for an input image + * + * \param[in] img The input image data, comes from cv::imread(), is a 3-D array with layout HWC, BGR format + * \param[in] result The output detection result will be writen to this structure + * \return true if the prediction successed, otherwise false + */ + bool Predict(const cv::Mat& im, FaceDetectionResult* result); + + /** \brief Predict the detection results for a batch of input images + * + * \param[in] imgs, The input image list, each element comes from cv::imread() + * \param[in] results The output detection result list + * \return true if the prediction successed, otherwise false + */ + virtual bool BatchPredict(const std::vector& images, + std::vector* results); + + /// Get preprocessor reference of BlazeFace + virtual BlazeFacePreprocessor& GetPreprocessor() { + return preprocessor_; + } + + /// Get postprocessor reference of BlazeFace + virtual BlazeFacePostprocessor& GetPostprocessor() { + return postprocessor_; + } + + protected: + bool Initialize(); + 
BlazeFacePreprocessor preprocessor_; + BlazeFacePostprocessor postprocessor_; +}; + +} // namespace facedet + +} // namespace vision + +} // namespace fastdeploy diff --git a/fastdeploy/vision/facedet/ppdet/blazeface/blazeface_pybind.cc b/fastdeploy/vision/facedet/ppdet/blazeface/blazeface_pybind.cc new file mode 100644 index 000000000..cc0066d40 --- /dev/null +++ b/fastdeploy/vision/facedet/ppdet/blazeface/blazeface_pybind.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindBlazeFace(pybind11::module& m) { + pybind11::class_( + m, "BlazeFacePreprocessor") + .def(pybind11::init<>()) + .def("run", [](vision::facedet::BlazeFacePreprocessor& self, std::vector& im_list) { + std::vector images; + for (size_t i = 0; i < im_list.size(); ++i) { + images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i]))); + } + std::vector outputs; + std::vector>> ims_info; + if (!self.Run(&images, &outputs, &ims_info)) { + throw std::runtime_error("Failed to preprocess the input data in BlazeFacePreprocessor."); + } + for (size_t i = 0; i < outputs.size(); ++i) { + outputs[i].StopSharing(); + } + return make_pair(outputs, ims_info); + }); + + pybind11::class_( + m, "BlazeFacePostprocessor") + .def(pybind11::init<>()) + .def("run", [](vision::facedet::BlazeFacePostprocessor& self, std::vector& inputs, + const std::vector>>& ims_info) { + std::vector results; + if (!self.Run(inputs, &results, ims_info)) { + throw std::runtime_error("Failed to postprocess the runtime result in BlazeFacePostprocessor."); + } + return results; + }) + .def("run", [](vision::facedet::BlazeFacePostprocessor& self, std::vector& input_array, + const std::vector>>& ims_info) { + std::vector results; + std::vector inputs; + PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true); + if (!self.Run(inputs, &results, ims_info)) { + throw std::runtime_error("Failed to postprocess the runtime result in BlazePostprocessor."); + } + return results; + }) + .def_property("conf_threshold", &vision::facedet::BlazeFacePostprocessor::GetConfThreshold, &vision::facedet::BlazeFacePostprocessor::SetConfThreshold) + .def_property("nms_threshold", &vision::facedet::BlazeFacePostprocessor::GetNMSThreshold, &vision::facedet::BlazeFacePostprocessor::SetNMSThreshold); + + pybind11::class_(m, "BlazeFace") + .def(pybind11::init()) + .def("predict", + [](vision::facedet::BlazeFace& self, pybind11::array& data) { + auto mat = PyArrayToCvMat(data); + vision::FaceDetectionResult res; + self.Predict(mat, &res); + return res; + }) + .def("batch_predict", [](vision::facedet::BlazeFace& self, std::vector& data) { + std::vector images; + for (size_t i = 0; i < data.size(); ++i) { + images.push_back(PyArrayToCvMat(data[i])); + } + std::vector results; + self.BatchPredict(images, &results); + return results; + }) + .def_property_readonly("preprocessor", &vision::facedet::BlazeFace::GetPreprocessor) + 
.def_property_readonly("postprocessor", &vision::facedet::BlazeFace::GetPostprocessor);
+}
+} // namespace fastdeploy
diff --git a/fastdeploy/vision/facedet/ppdet/blazeface/postprocessor.cc b/fastdeploy/vision/facedet/ppdet/blazeface/postprocessor.cc
new file mode 100644
index 000000000..8624a5c8c
--- /dev/null
+++ b/fastdeploy/vision/facedet/ppdet/blazeface/postprocessor.cc
@@ -0,0 +1,96 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/facedet/ppdet/blazeface/postprocessor.h"
+#include "fastdeploy/vision/utils/utils.h"
+#include "fastdeploy/vision/detection/ppdet/multiclass_nms.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace facedet {
+
+BlazeFacePostprocessor::BlazeFacePostprocessor() {
+  conf_threshold_ = 0.5;
+  nms_threshold_ = 0.3;
+}
+
+bool BlazeFacePostprocessor::Run(const std::vector<FDTensor>& tensors,
+                                 std::vector<FaceDetectionResult>* results,
+                                 const std::vector<std::map<std::string, std::array<float, 2>>>& ims_info) {
+  // Get number of boxes for each input image
+  std::vector<int> num_boxes(tensors[1].shape[0]);
+  int total_num_boxes = 0;
+  if (tensors[1].dtype == FDDataType::INT32) {
+    const auto* data = static_cast<const int32_t*>(tensors[1].CpuData());
+    for (size_t i = 0; i < tensors[1].shape[0]; ++i) {
+      num_boxes[i] = static_cast<int>(data[i]);
+      total_num_boxes += num_boxes[i];
+    }
+  } else if (tensors[1].dtype == FDDataType::INT64) {
+    const auto* data = static_cast<const int64_t*>(tensors[1].CpuData());
+    for (size_t i = 0; i < tensors[1].shape[0]; ++i) {
+      num_boxes[i] = static_cast<int>(data[i]);
+      total_num_boxes += num_boxes[i];
+    }
+  }
+
+  // Special case for TensorRT: its NMS output has a fixed shape,
+  // so the output may contain invalid (padding) boxes
+  int num_output_boxes = static_cast<int>(tensors[0].Shape()[0]);
+  bool contain_invalid_boxes = false;
+  if (total_num_boxes != num_output_boxes) {
+    if (num_output_boxes % num_boxes.size() == 0) {
+      contain_invalid_boxes = true;
+    } else {
+      FDERROR << "Cannot handle the output data for this model, unexpected " "situation."
+ << std::endl; + return false; + } + } + + // Get boxes for each input image + results->resize(num_boxes.size()); + + if (tensors[0].shape[0] == 0) { + // No detected boxes + return true; + } + + const auto* box_data = static_cast(tensors[0].CpuData()); + int offset = 0; + for (size_t i = 0; i < num_boxes.size(); ++i) { + const float* ptr = box_data + offset; + (*results)[i].Reserve(num_boxes[i]); + for (size_t j = 0; j < num_boxes[i]; ++j) { + if (ptr[j * 6 + 1] > conf_threshold_) { + (*results)[i].scores.push_back(ptr[j * 6 + 1]); + (*results)[i].boxes.emplace_back(std::array( + {ptr[j * 6 + 2], ptr[j * 6 + 3], ptr[j * 6 + 4], ptr[j * 6 + 5]})); + } + } + if (contain_invalid_boxes) { + offset += static_cast(num_output_boxes * 6 / num_boxes.size()); + } else { + offset += static_cast(num_boxes[i] * 6); + } + } +return true; +} + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/facedet/ppdet/blazeface/postprocessor.h b/fastdeploy/vision/facedet/ppdet/blazeface/postprocessor.h new file mode 100644 index 000000000..b7443a140 --- /dev/null +++ b/fastdeploy/vision/facedet/ppdet/blazeface/postprocessor.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +class FASTDEPLOY_DECL BlazeFacePostprocessor{ + public: + /*! @brief Postprocessor object for BlazeFace serials model. 
+ */ + BlazeFacePostprocessor(); + + /** \brief Process the result of runtime and fill to FaceDetectionResult structure + * + * \param[in] infer_result The inference result from runtime + * \param[in] results The output result of detection + * \param[in] ims_info The shape info list, record input_shape and output_shape + * \return true if the postprocess successed, otherwise false + */ + bool Run(const std::vector& infer_result, + std::vector* results, + const std::vector>>& ims_info); + + /// Set conf_threshold, default 0.5 + void SetConfThreshold(const float& conf_threshold) { + conf_threshold_ = conf_threshold; + } + + /// Get conf_threshold, default 0.5 + float GetConfThreshold() const { return conf_threshold_; } + + /// Set nms_threshold, default 0.3 + void SetNMSThreshold(const float& nms_threshold) { + nms_threshold_ = nms_threshold; + } + + /// Get nms_threshold, default 0.3 + float GetNMSThreshold() const { return nms_threshold_; } + + protected: + float conf_threshold_; + float nms_threshold_; +}; + +} // namespace facedet +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/facedet/ppdet/blazeface/preprocessor.cc b/fastdeploy/vision/facedet/ppdet/blazeface/preprocessor.cc new file mode 100644 index 000000000..a259f4a50 --- /dev/null +++ b/fastdeploy/vision/facedet/ppdet/blazeface/preprocessor.cc @@ -0,0 +1,207 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/facedet/ppdet/blazeface/preprocessor.h" +#include "fastdeploy/function/concat.h" +#include "fastdeploy/function/pad.h" +#include "fastdeploy/vision/common/processors/mat.h" +#include "yaml-cpp/yaml.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +BlazeFacePreprocessor::BlazeFacePreprocessor(const std::string& config_file) { + is_scale_ = false; + normalize_mean_ = {123, 117, 104}; + normalize_std_ = {127.502231, 127.502231, 127.502231}; + this->config_file_ = config_file; + FDASSERT(BuildPreprocessPipelineFromConfig(), + "Failed to create PaddleDetPreprocessor."); +} + +bool BlazeFacePreprocessor::Run(std::vector* images, std::vector* outputs, + std::vector>>* ims_info) { + if (images->size() == 0) { + FDERROR << "The size of input images should be greater than 0." 
<< std::endl; + return false; + } + ims_info->resize(images->size()); + outputs->resize(3); + int batch = static_cast(images->size()); + // Allocate memory for scale_factor + (*outputs)[1].Resize({batch, 2}, FDDataType::FP32); + // Allocate memory for im_shape + (*outputs)[2].Resize({batch, 2}, FDDataType::FP32); + + std::vector max_hw({-1, -1}); + + auto* scale_factor_ptr = + reinterpret_cast((*outputs)[1].MutableData()); + auto* im_shape_ptr = reinterpret_cast((*outputs)[2].MutableData()); + + // Concat all the preprocessed data to a batch tensor + std::vector im_tensors(images->size()); + + for (size_t i = 0; i < images->size(); ++i) { + int origin_w = (*images)[i].Width(); + int origin_h = (*images)[i].Height(); + scale_factor_ptr[2 * i] = 1.0; + scale_factor_ptr[2 * i + 1] = 1.0; + + for (size_t j = 0; j < processors_.size(); ++j) { + if (!(*(processors_[j].get()))(&((*images)[i]))) { + FDERROR << "Failed to processs image:" << i << " in " + << processors_[i]->Name() << "." << std::endl; + return false; + } + if (processors_[j]->Name().find("Resize") != std::string::npos) { + scale_factor_ptr[2 * i] = (*images)[i].Height() * 1.0 / origin_h; + scale_factor_ptr[2 * i + 1] = (*images)[i].Width() * 1.0 / origin_w; + } + } + + if ((*images)[i].Height() > max_hw[0]) { + max_hw[0] = (*images)[i].Height(); + } + if ((*images)[i].Width() > max_hw[1]) { + max_hw[1] = (*images)[i].Width(); + } + im_shape_ptr[2 * i] = max_hw[0]; + im_shape_ptr[2 * i + 1] = max_hw[1]; + + if ((*images)[i].Height() < max_hw[0] || (*images)[i].Width() < max_hw[1]) { + // if the size of image less than max_hw, pad to max_hw + FDTensor tensor; + (*images)[i].ShareWithTensor(&tensor); + function::Pad(tensor, &(im_tensors[i]), + {0, 0, max_hw[0] - (*images)[i].Height(), + max_hw[1] - (*images)[i].Width()}, + 0); + } else { + // No need pad + (*images)[i].ShareWithTensor(&(im_tensors[i])); + } + // Reshape to 1xCxHxW + im_tensors[i].ExpandDim(0); + } + + if (im_tensors.size() == 1) { + // If there's only 1 input, no need to concat + // skip memory copy + (*outputs)[0] = std::move(im_tensors[0]); + } else { + // Else concat the im tensor for each input image + // compose a batched input tensor + function::Concat(im_tensors, &((*outputs)[0]), 0); + } + + return true; +} + +bool BlazeFacePreprocessor::BuildPreprocessPipelineFromConfig() { + processors_.clear(); + YAML::Node cfg; + try { + cfg = YAML::LoadFile(config_file_); + } catch (YAML::BadFile& e) { + FDERROR << "Failed to load yaml file " << config_file_ + << ", maybe you should check this file." 
<< std::endl; + return false; + } + + processors_.push_back(std::make_shared()); + + bool has_permute = false; + for (const auto& op : cfg["Preprocess"]) { + std::string op_name = op["type"].as(); + if (op_name == "NormalizeImage") { + auto mean = op["mean"].as>(); + auto std = op["std"].as>(); + bool is_scale = true; + if (op["is_scale"]) { + is_scale = op["is_scale"].as(); + } + std::string norm_type = "mean_std"; + if (op["norm_type"]) { + norm_type = op["norm_type"].as(); + } + if (norm_type != "mean_std") { + std::fill(mean.begin(), mean.end(), 0.0); + std::fill(std.begin(), std.end(), 1.0); + } + processors_.push_back(std::make_shared(mean, std, is_scale)); + } else if (op_name == "Resize") { + bool keep_ratio = op["keep_ratio"].as(); + auto target_size = op["target_size"].as>(); + int interp = op["interp"].as(); + FDASSERT(target_size.size() == 2, + "Require size of target_size be 2, but now it's %lu.", + target_size.size()); + if (!keep_ratio) { + int width = target_size[1]; + int height = target_size[0]; + processors_.push_back( + std::make_shared(width, height, -1.0, -1.0, interp, false)); + } else { + int min_target_size = std::min(target_size[0], target_size[1]); + int max_target_size = std::max(target_size[0], target_size[1]); + std::vector max_size; + if (max_target_size > 0) { + max_size.push_back(max_target_size); + max_size.push_back(max_target_size); + } + processors_.push_back(std::make_shared( + min_target_size, interp, true, max_size)); + } + } else if (op_name == "Permute") { + // Do nothing, do permute as the last operation + has_permute = true; + continue; + } else if (op_name == "Pad") { + auto size = op["size"].as>(); + auto value = op["fill_value"].as>(); + processors_.push_back(std::make_shared("float")); + processors_.push_back( + std::make_shared(size[1], size[0], value)); + } else if (op_name == "PadStride") { + auto stride = op["stride"].as(); + processors_.push_back( + std::make_shared(stride, std::vector(3, 0))); + } else { + FDERROR << "Unexcepted preprocess operator: " << op_name << "." + << std::endl; + return false; + } + } + + if (has_permute) { + // permute = cast + HWC2CHW + processors_.push_back(std::make_shared("float")); + processors_.push_back(std::make_shared()); + } + + // Fusion will improve performance + FuseTransforms(&processors_); + + return true; +} + +} // namespace facedet + +} // namespace vision + +} // namespacefastdeploy \ No newline at end of file diff --git a/fastdeploy/vision/facedet/ppdet/blazeface/preprocessor.h b/fastdeploy/vision/facedet/ppdet/blazeface/preprocessor.h new file mode 100644 index 000000000..836fd6bfb --- /dev/null +++ b/fastdeploy/vision/facedet/ppdet/blazeface/preprocessor.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" +#include "fastdeploy/vision/detection/ppdet/preprocessor.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +class FASTDEPLOY_DECL BlazeFacePreprocessor: + public fastdeploy::vision::detection::PaddleDetPreprocessor { + public: + /** \brief Create a preprocessor instance for BlazeFace serials model + */ + BlazeFacePreprocessor() = default; + + /** \brief Create a preprocessor instance for Blazeface serials model + * + * \param[in] config_file Path of configuration file for deployment, e.g ppyoloe/infer_cfg.yml + */ + explicit BlazeFacePreprocessor(const std::string& config_file); + + /** \brief Process the input image and prepare input tensors for runtime + * + * \param[in] images The input image data list, all the elements are returned by cv::imread() + * \param[in] outputs The output tensors which will feed in runtime + * \param[in] ims_info The shape info list, record input_shape and output_shape + * \ret + */ + bool Run(std::vector* images, std::vector* outputs, + std::vector>>* ims_info); + + private: + bool BuildPreprocessPipelineFromConfig(); + + // if is_scale_up is false, the input image only can be zoom out, + // the maximum resize scale cannot exceed 1.0 + bool is_scale_; + + std::vector normalize_mean_; + + std::vector normalize_std_; + + std::vector> processors_; + // read config file + std::string config_file_; +}; + +} // namespace facedet + +} // namespace vision + +} // namespace fastdeploy diff --git a/fastdeploy/vision/vision_pybind.cc b/fastdeploy/vision/vision_pybind.cc index 22f7581be..03e625728 100755 --- a/fastdeploy/vision/vision_pybind.cc +++ b/fastdeploy/vision/vision_pybind.cc @@ -16,6 +16,7 @@ namespace fastdeploy { +void BindProcessorManager(pybind11::module& m); void BindDetection(pybind11::module& m); void BindClassification(pybind11::module& m); void BindSegmentation(pybind11::module& m); @@ -204,6 +205,7 @@ void BindVision(pybind11::module& m) { m.def("disable_flycv", &vision::DisableFlyCV, "Disable image preprocessing by FlyCV, change to use OpenCV."); + BindProcessorManager(m); BindDetection(m); BindClassification(m); BindSegmentation(m); diff --git a/python/fastdeploy/__init__.py b/python/fastdeploy/__init__.py index 730d98a73..1d9640c7b 100755 --- a/python/fastdeploy/__init__.py +++ b/python/fastdeploy/__init__.py @@ -39,3 +39,5 @@ from . import text from . import encryption from .download import download, download_and_decompress, download_model, get_model_list from . import serving +from .code_version import version, git_version +__version__ = version diff --git a/python/fastdeploy/model.py b/python/fastdeploy/model.py index 59833f775..224cbafdf 100644 --- a/python/fastdeploy/model.py +++ b/python/fastdeploy/model.py @@ -54,6 +54,11 @@ class FastDeployModel: def print_statis_info_of_runtime(self): return self._model.print_statis_info_of_runtime() + def get_profile_time(self): + """Get profile time of Runtime after the profile process is done. 
+ """ + return self._model.get_profile_time() + @property def runtime_option(self): return self._model.runtime_option if self._model is not None else None diff --git a/python/fastdeploy/runtime.py b/python/fastdeploy/runtime.py old mode 100755 new mode 100644 index d864a8897..cd7b6641b --- a/python/fastdeploy/runtime.py +++ b/python/fastdeploy/runtime.py @@ -144,76 +144,24 @@ class Runtime: index, self.num_outputs) return self._runtime.get_output_info(index) + def get_profile_time(self): + """Get profile time of Runtime after the profile process is done. + """ + return self._runtime.get_profile_time() + class RuntimeOption: """Options for FastDeploy Runtime. """ + __slots__ = ["_option"] + def __init__(self): """Initialize a FastDeploy RuntimeOption object. """ self._option = C.RuntimeOption() - @property - def is_dynamic(self): - """Only for Poros backend - - :param value: (bool)Whether to enable dynamic shape, default False - """ - return self._option.is_dynamic - - @property - def unconst_ops_thres(self): - """Only for Poros backend - - :param value: (int)Minimum number of subgraph OPs, default 10 - """ - return self._option.unconst_ops_thres - - @property - def long_to_int(self): - """Only for Poros backend - - :param value: (bool)Whether to convert long dtype to int dtype, default True - """ - return self._option.long_to_int - - @property - def use_nvidia_tf32(self): - """Only for Poros backend - - :param value: (bool)The calculation accuracy of tf32 mode exists on the A card, which can bring some performance improvements, default False - """ - return self._option.use_nvidia_tf32 - - @is_dynamic.setter - def is_dynamic(self, value): - assert isinstance( - value, bool), "The value to set `is_dynamic` must be type of bool." - self._option.is_dynamic = value - - @unconst_ops_thres.setter - def unconst_ops_thres(self, value): - assert isinstance( - value, - int), "The value to set `unconst_ops_thres` must be type of int." - self._option.unconst_ops_thres = value - - @long_to_int.setter - def long_to_int(self, value): - assert isinstance( - value, - bool), "The value to set `long_to_int` must be type of bool." - self._option.long_to_int = value - - @use_nvidia_tf32.setter - def use_nvidia_tf32(self, value): - assert isinstance( - value, - bool), "The value to set `use_nvidia_tf32` must be type of bool." - self._option.use_nvidia_tf32 = value - def set_model_path(self, model_path, params_path="", @@ -317,7 +265,10 @@ class RuntimeOption: :param level: (int)Optimization level, -1 means the default setting """ - return self._option.set_ort_graph_opt_level(level) + logging.warning( + "`RuntimeOption.set_ort_graph_opt_level` will be deprecated in v1.2.0, please use `RuntimeOption.graph_optimize_level = 99` instead." + ) + self._option.ort_option.graph_optimize_level = level def use_paddle_backend(self): """Use Paddle Inference backend, support inference Paddle model on CPU/Nvidia GPU. @@ -359,99 +310,143 @@ class RuntimeOption: """ return self.use_lite_backend() - def set_lite_device_names(self, device_names): - """Set nnadapter device name for Paddle Lite backend. - """ - return self._option.set_lite_device_names(device_names) - def set_lite_context_properties(self, context_properties): """Set nnadapter context properties for Paddle Lite backend. 
""" - return self._option.set_lite_context_properties(context_properties) + logging.warning( + "`RuntimeOption.set_lite_context_properties` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_lite_option.nnadapter_context_properties = ...` instead." + ) + self._option.paddle_lite_option.nnadapter_context_properties = context_properties def set_lite_model_cache_dir(self, model_cache_dir): """Set nnadapter model cache dir for Paddle Lite backend. """ - return self._option.set_lite_model_cache_dir(model_cache_dir) + logging.warning( + "`RuntimeOption.set_lite_model_cache_dir` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_lite_option.nnadapter_model_cache_dir = ...` instead." + ) + + self._option.paddle_lite_option.nnadapter_model_cache_dir = model_cache_dir def set_lite_dynamic_shape_info(self, dynamic_shape_info): """ Set nnadapter dynamic shape info for Paddle Lite backend. """ - return self._option.set_lite_dynamic_shape_info(dynamic_shape_info) + logging.warning( + "`RuntimeOption.set_lite_dynamic_shape_info` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_lite_option.nnadapter_dynamic_shape_info = ...` instead." + ) + self._option.paddle_lite_option.nnadapter_dynamic_shape_info = dynamic_shape_info def set_lite_subgraph_partition_path(self, subgraph_partition_path): """ Set nnadapter subgraph partition path for Paddle Lite backend. """ - return self._option.set_lite_subgraph_partition_path( - subgraph_partition_path) + logging.warning( + "`RuntimeOption.set_lite_subgraph_partition_path` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_lite_option.nnadapter_subgraph_partition_config_path = ...` instead." + ) + self._option.paddle_lite_option.nnadapter_subgraph_partition_config_path = subgraph_partition_path def set_lite_subgraph_partition_config_buffer(self, subgraph_partition_buffer): """ Set nnadapter subgraph partition buffer for Paddle Lite backend. """ - return self._option.set_lite_subgraph_partition_config_buffer( - subgraph_partition_buffer) + logging.warning( + "`RuntimeOption.set_lite_subgraph_partition_buffer` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_lite_option.nnadapter_subgraph_partition_config_buffer = ...` instead." + ) + self._option.paddle_lite_option.nnadapter_subgraph_partition_config_buffer = subgraph_partition_buffer def set_lite_mixed_precision_quantization_config_path( self, mixed_precision_quantization_config_path): """ Set nnadapter mixed precision quantization config path for Paddle Lite backend.. """ - return self._option.set_lite_mixed_precision_quantization_config_path( - mixed_precision_quantization_config_path) + logging.warning( + "`RuntimeOption.set_lite_mixed_precision_quantization_config_path` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_lite_option.nnadapter_mixed_precision_quantization_config_path = ...` instead." + ) + self._option.paddle_lite_option.nnadapter_mixed_precision_quantization_config_path = mixed_precision_quantization_config_path def set_paddle_mkldnn(self, use_mkldnn=True): """Enable/Disable MKLDNN while using Paddle Inference backend, mkldnn is enabled by default. """ - return self._option.set_paddle_mkldnn(use_mkldnn) + logging.warning( + "`RuntimeOption.set_paddle_mkldnn` will be derepcated in v1.2.0, please use `RuntimeOption.paddle_infer_option.enable_mkldnn = True` instead." 
+        )
+        self._option.paddle_infer_option.enable_mkldnn = use_mkldnn

    def set_openvino_device(self, name="CPU"):
        """Set device name for OpenVINO, default 'CPU', can also be 'AUTO', 'GPU', 'GPU.1'....
+        This interface is deprecated, please use `RuntimeOption.openvino_option.set_device` instead.
        """
-        return self._option.set_openvino_device(name)
+        logging.warning(
+            "`RuntimeOption.set_openvino_device` will be deprecated in v1.2.0, please use `RuntimeOption.openvino_option.set_device` instead."
+        )
+        self._option.openvino_option.set_device(name)

    def set_openvino_shape_info(self, shape_info):
        """Set shape information of the models' inputs, used for GPU to fix the shape
+        This interface is deprecated, please use `RuntimeOption.openvino_option.set_shape_info` instead.
        :param shape_info: (dict{str, list of int})Shape information of model's inputs, e.g {"image": [1, 3, 640, 640], "scale_factor": [1, 2]}
        """
-        return self._option.set_openvino_shape_info(shape_info)
+        logging.warning(
+            "`RuntimeOption.set_openvino_shape_info` will be deprecated in v1.2.0, please use `RuntimeOption.openvino_option.set_shape_info` instead."
+        )
+        self._option.openvino_option.set_shape_info(shape_info)

    def set_openvino_cpu_operators(self, operators):
        """While using OpenVINO backend and intel GPU, this interface specifies unsupported operators to run on CPU
+        This interface is deprecated, please use `RuntimeOption.openvino_option.set_cpu_operators` instead.
        :param operators: (list of string)list of operators' name, e.g ["MulticlasNms"]
        """
-        return self._option.set_openvino_cpu_operators(operators)
+        logging.warning(
+            "`RuntimeOption.set_openvino_cpu_operators` will be deprecated in v1.2.0, please use `RuntimeOption.openvino_option.set_cpu_operators` instead."
+        )
+        self._option.openvino_option.set_cpu_operators(operators)

    def enable_paddle_log_info(self):
        """Enable print out the debug log information while using Paddle Inference backend, the log information is disabled by default.
        """
-        return self._option.enable_paddle_log_info()
+        logging.warning(
+            "`RuntimeOption.enable_paddle_log_info` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_infer_option.enable_log_info = True` instead."
+        )
+        self._option.paddle_infer_option.enable_log_info = True

    def disable_paddle_log_info(self):
        """Disable print out the debug log information while using Paddle Inference backend, the log information is disabled by default.
        """
-        return self._option.disable_paddle_log_info()
+        logging.warning(
+            "`RuntimeOption.disable_paddle_log_info` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_infer_option.enable_log_info = False` instead."
+        )
+        self._option.paddle_infer_option.enable_log_info = False

    def set_paddle_mkldnn_cache_size(self, cache_size):
        """Set size of shape cache while using Paddle Inference backend with MKLDNN enabled, default will cache all the dynamic shape.
        """
-        return self._option.set_paddle_mkldnn_cache_size(cache_size)
+        logging.warning(
+            "`RuntimeOption.set_paddle_mkldnn_cache_size` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_infer_option.mkldnn_cache_size = {}` instead.".
+            format(cache_size))
+        self._option.paddle_infer_option.mkldnn_cache_size = cache_size

    def enable_lite_fp16(self):
        """Enable half precision inference while using Paddle Lite backend on ARM CPU, fp16 is disabled by default.
""" - return self._option.enable_lite_fp16() + logging.warning( + "`RuntimeOption.enable_lite_fp16` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_lite_option.enable_fp16 = True` instead." + ) + self._option.paddle_lite_option.enable_fp16 = True def disable_lite_fp16(self): """Disable half precision inference while using Paddle Lite backend on ARM CPU, fp16 is disabled by default. """ - return self._option.disable_lite_fp16() + logging.warning( + "`RuntimeOption.disable_lite_fp16` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_lite_option.enable_fp16 = False` instead." + ) + self._option.paddle_lite_option.enable_fp16 = False def set_lite_power_mode(self, mode): """Set POWER mode while using Paddle Lite backend on ARM CPU. """ - return self._option.set_lite_power_mode(mode) + logging.warning( + "`RuntimeOption.set_lite_powermode` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_lite_option.power_mode = {}` instead.". + format(mode)) + self._option.paddle_lite_option.power_mode = mode def set_trt_input_shape(self, tensor_name, @@ -465,30 +460,42 @@ class RuntimeOption: :param opt_shape: (list of int)Optimize shape of the input, this offten set as the most common input shape, if set to None, it will keep same with min_shape :param max_shape: (list of int)Maximum shape of the input, e.g [8, 3, 224, 224], if set to None, it will keep same with the min_shape """ + logging.warning( + "`RuntimeOption.set_trt_input_shape` will be deprecated in v1.2.0, please use `RuntimeOption.trt_option.set_shape()` instead." + ) if opt_shape is None and max_shape is None: opt_shape = min_shape max_shape = min_shape else: assert opt_shape is not None and max_shape is not None, "Set min_shape only, or set min_shape, opt_shape, max_shape both." - return self._option.set_trt_input_shape(tensor_name, min_shape, - opt_shape, max_shape) + return self._option.trt_option.set_shape(tensor_name, min_shape, + opt_shape, max_shape) def set_trt_cache_file(self, cache_file_path): """Set a cache file path while using TensorRT backend. While loading a Paddle/ONNX model with set_trt_cache_file("./tensorrt_cache/model.trt"), if file `./tensorrt_cache/model.trt` exists, it will skip building tensorrt engine and load the cache file directly; if file `./tensorrt_cache/model.trt` doesn't exist, it will building tensorrt engine and save the engine as binary string to the cache file. :param cache_file_path: (str)Path of tensorrt cache file """ - return self._option.set_trt_cache_file(cache_file_path) + logging.warning( + "`RuntimeOption.set_trt_cache_file` will be deprecated in v1.2.0, please use `RuntimeOption.trt_option.serialize_file = {}` instead.". + format(cache_file_path)) + self._option.trt_option.serialize_file = cache_file_path def enable_trt_fp16(self): """Enable half precision inference while using TensorRT backend, notice that not all the Nvidia GPU support FP16, in those cases, will fallback to FP32 inference. """ - return self._option.enable_trt_fp16() + logging.warning( + "`RuntimeOption.enable_trt_fp16` will be deprecated in v1.2.0, please use `RuntimeOption.trt_option.enable_fp16 = True` instead." + ) + self._option.trt_option.enable_fp16 = True def disable_trt_fp16(self): """Disable half precision inference while suing TensorRT backend. """ - return self._option.disable_trt_fp16() + logging.warning( + "`RuntimeOption.disable_trt_fp16` will be deprecated in v1.2.0, please use `RuntimeOption.trt_option.enable_fp16 = False` instead." 
+ ) + self._option.trt_option.enable_fp16 = False def enable_pinned_memory(self): """Enable pinned memory. Pinned memory can be utilized to speedup the data transfer between CPU and GPU. Currently it's only suppurted in TRT backend and Paddle Inference backend. @@ -503,37 +510,65 @@ class RuntimeOption: def enable_paddle_to_trt(self): """While using TensorRT backend, enable_paddle_to_trt() will change to use Paddle Inference backend, and use its integrated TensorRT instead. """ + logging.warning( + "`RuntimeOption.enable_paddle_to_trt` will be deprecated in v1.2.0, if you want to run tensorrt with Paddle Inference backend, please use the following method: " + ) + logging.warning(" ==============================================") + logging.warning(" import fastdeploy as fd") + logging.warning(" option = fd.RuntimeOption()") + logging.warning(" option.use_gpu(0)") + logging.warning(" option.use_paddle_infer_backend()") + logging.warning(" option.paddle_infer_option.enable_trt = True") + logging.warning(" ==============================================") return self._option.enable_paddle_to_trt() def set_trt_max_workspace_size(self, trt_max_workspace_size): """Set max workspace size while using TensorRT backend. """ - return self._option.set_trt_max_workspace_size(trt_max_workspace_size) + logging.warning( + "`RuntimeOption.set_trt_max_workspace_size` will be deprecated in v1.2.0, please use `RuntimeOption.trt_option.max_workspace_size = {}` instead.". + format(trt_max_workspace_size)) + self._option.trt_option.max_workspace_size = trt_max_workspace_size def set_trt_max_batch_size(self, trt_max_batch_size): """Set max batch size while using TensorRT backend. """ - return self._option.set_trt_max_batch_size(trt_max_batch_size) + logging.warning( + "`RuntimeOption.set_trt_max_batch_size` will be deprecated in v1.2.0, please use `RuntimeOption.trt_option.max_batch_size = {}` instead.". + format(trt_max_batch_size)) + self._option.trt_option.max_batch_size = trt_max_batch_size def enable_paddle_trt_collect_shape(self): """Enable collect subgraph shape information while using Paddle Inference with TensorRT """ - return self._option.enable_paddle_trt_collect_shape() + logging.warning( + "`RuntimeOption.enable_paddle_trt_collect_shape` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_infer_option.collect_trt_shape = True` instead." + ) + self._option.paddle_infer_option.collect_trt_shape = True def disable_paddle_trt_collect_shape(self): """Disable collect subgraph shape information while using Paddle Inference with TensorRT """ - return self._option.disable_paddle_trt_collect_shape() + logging.warning( + "`RuntimeOption.disable_paddle_trt_collect_shape` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_infer_option.collect_trt_shape = False` instead." + ) + self._option.paddle_infer_option.collect_trt_shape = False def delete_paddle_backend_pass(self, pass_name): """Delete pass by name in paddle backend """ - return self._option.delete_paddle_backend_pass(pass_name) + logging.warning( + "`RuntimeOption.delete_paddle_backend_pass` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_infer_option.delete_pass` instead."
+ ) + self._option.paddle_infer_option.delete_pass(pass_name) def disable_paddle_trt_ops(self, ops): """Disable some ops in paddle trt backend """ - return self._option.disable_paddle_trt_ops(ops) + logging.warning( + "`RuntimeOption.disable_paddle_trt_ops` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_infer_option.disable_trt_ops()` instead." + ) + self._option.paddle_infer_option.disable_trt_ops(ops) def use_ipu(self, device_num=1, @@ -548,10 +583,72 @@ class RuntimeOption: replica_num=1, available_memory_proportion=1.0, enable_half_partial=False): - return self._option.set_ipu_config(enable_fp16, replica_num, + logging.warning("`RuntimeOption.set_ipu_config` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_infer_option.set_ipu_config()` instead.") + self._option.paddle_infer_option.set_ipu_config(enable_fp16, replica_num, available_memory_proportion, enable_half_partial) + @property + def poros_option(self): + """Get PorosBackendOption object to configure Poros backend + + :return PorosBackendOption + """ + return self._option.poros_option + + @property + def paddle_lite_option(self): + """Get LiteBackendOption object to configure Paddle Lite backend + + :return LiteBackendOption + """ + return self._option.paddle_lite_option + + @property + def openvino_option(self): + """Get OpenVINOOption object to configure OpenVINO backend + + :return OpenVINOOption + """ + return self._option.openvino_option + + @property + def ort_option(self): + """Get OrtBackendOption object to configure ONNX Runtime backend + + :return OrtBackendOption + """ + return self._option.ort_option + + @property + def trt_option(self): + """Get TrtBackendOption object to configure TensorRT backend + + :return TrtBackendOption + """ + return self._option.trt_option + + @property + def paddle_infer_option(self): + """Get PaddleBackendOption object to configure Paddle Inference backend + + :return PaddleBackendOption + """ + return self._option.paddle_infer_option + + def enable_profiling(self, inclue_h2d_d2h=False, repeat=100, warmup=50): + """Set the profile mode as 'true'. + :param inclue_h2d_d2h Whether to include time of H2D_D2H for time of runtime. + :param repeat Repeat times for runtime inference. + :param warmup Warmup times for runtime inference. + """ + return self._option.enable_profiling(inclue_h2d_d2h, repeat, warmup) + + def disable_profiling(self): + """Set the profile mode as 'false'. + """ + return self._option.disable_profiling() + def __repr__(self): attrs = dir(self._option) message = "RuntimeOption(\n" @@ -560,8 +657,7 @@ class RuntimeOption: continue if hasattr(getattr(self._option, attr), "__call__"): continue - message += " {} : {}\t\n".format(attr, - getattr(self._option, attr)) + message += " {} : {}\t\n".format(attr, getattr(self._option, attr)) message.strip("\n") message += ")" return message diff --git a/python/fastdeploy/vision/classification/contrib/yolov5cls.py b/python/fastdeploy/vision/classification/contrib/yolov5cls.py index 5f401fa1d..a7d372a08 100644 --- a/python/fastdeploy/vision/classification/contrib/yolov5cls.py +++ b/python/fastdeploy/vision/classification/contrib/yolov5cls.py @@ -18,18 +18,78 @@ from .... import FastDeployModel, ModelFormat from .... 
import c_lib_wrap as C +class YOLOv5ClsPreprocessor: + def __init__(self): + """Create a preprocessor for YOLOv5Cls + """ + self._preprocessor = C.vision.classification.YOLOv5ClsPreprocessor() + + def run(self, input_ims): + """Preprocess input images for YOLOv5Cls + + :param: input_ims: (list of numpy.ndarray)The input image + :return: list of FDTensor + """ + return self._preprocessor.run(input_ims) + + @property + def size(self): + """ + Argument for image preprocessing step, the preprocess image size, tuple of (width, height), default size = [224, 224] + """ + return self._preprocessor.size + + @size.setter + def size(self, wh): + assert isinstance(wh, (list, tuple)),\ + "The value to set `size` must be type of tuple or list." + assert len(wh) == 2,\ + "The value to set `size` must contatins 2 elements means [width, height], but now it contains {} elements.".format( + len(wh)) + self._preprocessor.size = wh + + +class YOLOv5ClsPostprocessor: + def __init__(self): + """Create a postprocessor for YOLOv5Cls + """ + self._postprocessor = C.vision.classification.YOLOv5ClsPostprocessor() + + def run(self, runtime_results, ims_info): + """Postprocess the runtime results for YOLOv5Cls + + :param: runtime_results: (list of FDTensor)The output FDTensor results from runtime + :param: ims_info: (list of dict)Record input_shape and output_shape + :return: list of ClassifyResult(If the runtime_results is predict by batched samples, the length of this list equals to the batch size) + """ + return self._postprocessor.run(runtime_results, ims_info) + + @property + def topk(self): + """ + topk for postprocessing, default is 1 + """ + return self._postprocessor.topk + + @topk.setter + def topk(self, topk): + assert isinstance(topk, int),\ + "The value to set `top k` must be type of int." + self._postprocessor.topk = topk + + class YOLOv5Cls(FastDeployModel): def __init__(self, model_file, params_file="", runtime_option=None, model_format=ModelFormat.ONNX): - """Load a image classification model exported by YOLOv5. + """Load a YOLOv5Cls model exported by YOLOv5Cls. - :param model_file: (str)Path of model file, e.g yolov5cls/yolov5n-cls.onnx - :param params_file: (str)Path of parameters file, if the model_fomat is ModelFormat.ONNX, this param will be ignored, can be set as empty string + :param model_file: (str)Path of model file, e.g ./YOLOv5Cls.onnx + :param params_file: (str)Path of parameters file, e.g yolox/model.pdiparams, if the model_fomat is ModelFormat.ONNX, this param will be ignored, can be set as empty string :param runtime_option: (fastdeploy.RuntimeOption)RuntimeOption for inference this model, if it's None, will use the default backend on CPU - :param model_format: (fastdeploy.ModelForamt)Model format of the loaded model, default is ONNX + :param model_format: (fastdeploy.ModelForamt)Model format of the loaded model """ super(YOLOv5Cls, self).__init__(runtime_option) @@ -37,33 +97,39 @@ class YOLOv5Cls(FastDeployModel): assert model_format == ModelFormat.ONNX, "YOLOv5Cls only support model format of ModelFormat.ONNX now." self._model = C.vision.classification.YOLOv5Cls( model_file, params_file, self._runtime_option, model_format) + assert self.initialized, "YOLOv5Cls initialize failed." 
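With the preprocessor and postprocessor split out, `topk` moves from the old `predict()` argument onto the postprocessor, and the input size is configured on the preprocessor, as the following hunks show. A minimal usage sketch, assuming a local `yolov5n-cls.onnx` and a test image (both paths are placeholders):

```python
import cv2
import fastdeploy as fd

model = fd.vision.classification.YOLOv5Cls("yolov5n-cls.onnx")  # placeholder model path
model.preprocessor.size = [224, 224]   # (width, height), default [224, 224]
model.postprocessor.topk = 5           # replaces the removed predict(im, topk=...) argument

im = cv2.imread("test.jpg")            # placeholder image
result = model.predict(im)             # single image  -> ClassifyResult
results = model.batch_predict([im])    # batched input -> list of ClassifyResult
```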
- def predict(self, input_image, topk=1): + def predict(self, input_image): """Classify an input image - :param im: (numpy.ndarray)The input image data, 3-D array with layout HWC, BGR format - :param topk: (int)The topk result by the classify confidence score, default 1 + :param input_image: (numpy.ndarray)The input image data, 3-D array with layout HWC, BGR format :return: ClassifyResult """ + assert input_image is not None, "Input image is None." + return self._model.predict(input_image) - return self._model.predict(input_image, topk) + def batch_predict(self, images): + """Classify a batch of input image + + :param im: (list of numpy.ndarray) The input image list, each element is a 3-D array with layout HWC, BGR format + :return list of ClassifyResult + """ + + return self._model.batch_predict(images) @property - def size(self): - """ - Returns the preprocess image size, default is (224, 224) - """ - return self._model.size + def preprocessor(self): + """Get YOLOv5ClsPreprocessor object of the loaded model - @size.setter - def size(self, wh): + :return YOLOv5ClsPreprocessor """ - Set the preprocess image size + return self._model.preprocessor + + @property + def postprocessor(self): + """Get YOLOv5ClsPostprocessor object of the loaded model + + :return YOLOv5ClsPostprocessor """ - assert isinstance(wh, (list, tuple)),\ - "The value to set `size` must be type of tuple or list." - assert len(wh) == 2,\ - "The value to set `size` must contatins 2 elements means [width, height], but now it contains {} elements.".format( - len(wh)) - self._model.size = wh + return self._model.postprocessor diff --git a/python/fastdeploy/vision/classification/ppcls/__init__.py b/python/fastdeploy/vision/classification/ppcls/__init__.py index 455702271..7215bcfbc 100644 --- a/python/fastdeploy/vision/classification/ppcls/__init__.py +++ b/python/fastdeploy/vision/classification/ppcls/__init__.py @@ -16,44 +16,40 @@ from __future__ import absolute_import import logging from .... import FastDeployModel, ModelFormat from .... import c_lib_wrap as C +from ...common import ProcessorManager -class PaddleClasPreprocessor: +class PaddleClasPreprocessor(ProcessorManager): def __init__(self, config_file): """Create a preprocessor for PaddleClasModel from configuration file :param config_file: (str)Path of configuration file, e.g resnet50/inference_cls.yaml """ - self._preprocessor = C.vision.classification.PaddleClasPreprocessor( + super(PaddleClasPreprocessor, self).__init__() + self._manager = C.vision.classification.PaddleClasPreprocessor( config_file) - def run(self, input_ims): - """Preprocess input images for PaddleClasModel - - :param: input_ims: (list of numpy.ndarray)The input image - :return: list of FDTensor - """ - return self._preprocessor.run(input_ims) - - def use_cuda(self, enable_cv_cuda=False, gpu_id=-1): - """Use CUDA preprocessors - - :param: enable_cv_cuda: Whether to enable CV-CUDA - :param: gpu_id: GPU device id - """ - return self._preprocessor.use_cuda(enable_cv_cuda, gpu_id) - def disable_normalize(self): """ This function will disable normalize in preprocessing step. """ - self._preprocessor.disable_normalize() + self._manager.disable_normalize() def disable_permute(self): """ This function will disable hwc2chw in preprocessing step. 
""" - self._preprocessor.disable_permute() + self._manager.disable_permute() + + def initial_resize_on_cpu(self, v): + """ + When the initial operator is Resize, and input image size is large, + maybe it's better to run resize on CPU, because the HostToDevice memcpy + is time consuming. Set this True to run the initial resize on CPU. + + :param: v: True or False + """ + self._manager.initial_resize_on_cpu(v) class PaddleClasPostprocessor: diff --git a/python/fastdeploy/vision/common/__init__.py b/python/fastdeploy/vision/common/__init__.py new file mode 100644 index 000000000..6e010a427 --- /dev/null +++ b/python/fastdeploy/vision/common/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import + +from .manager import ProcessorManager diff --git a/python/fastdeploy/vision/common/manager.py b/python/fastdeploy/vision/common/manager.py new file mode 100644 index 000000000..05da3d68e --- /dev/null +++ b/python/fastdeploy/vision/common/manager.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import + + +class ProcessorManager: + def __init__(self): + self._manager = None + + def run(self, input_ims): + """Process input image + + :param: input_ims: (list of numpy.ndarray) The input images + :return: list of FDTensor + """ + return self._manager.run(input_ims) + + def use_cuda(self, enable_cv_cuda=False, gpu_id=-1): + """Use CUDA processors + + :param: enable_cv_cuda: Ture: use CV-CUDA, False: use CUDA only + :param: gpu_id: GPU device id + """ + return self._manager.use_cuda(enable_cv_cuda, gpu_id) diff --git a/python/fastdeploy/vision/facedet/__init__.py b/python/fastdeploy/vision/facedet/__init__.py index 869657a3c..d1c771c2b 100644 --- a/python/fastdeploy/vision/facedet/__init__.py +++ b/python/fastdeploy/vision/facedet/__init__.py @@ -15,6 +15,8 @@ from __future__ import absolute_import from .contrib.yolov5face import YOLOv5Face from .contrib.yolov7face import * +from .contrib.centerface import * +from .contrib.blazeface import * from .contrib.retinaface import RetinaFace from .contrib.scrfd import SCRFD from .contrib.ultraface import UltraFace diff --git a/python/fastdeploy/vision/facedet/contrib/blazeface.py b/python/fastdeploy/vision/facedet/contrib/blazeface.py new file mode 100644 index 000000000..f67b6ee3d --- /dev/null +++ b/python/fastdeploy/vision/facedet/contrib/blazeface.py @@ -0,0 +1,143 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +import logging +from .... import FastDeployModel, ModelFormat +from .... import c_lib_wrap as C + + +class BlazeFacePreprocessor: + def __init__(self): + """Create a preprocessor for BlazeFace + """ + self._preprocessor = C.vision.facedet.BlazeFacePreprocessor() + + def run(self, input_ims): + """Preprocess input images for BlazeFace + + :param: input_ims: (list of numpy.ndarray)The input image + :return: list of FDTensor + """ + return self._preprocessor.run(input_ims) + + @property + def is_scale_(self): + """ + is_scale_ for preprocessing, the input image only can be zoom out, the maximum resize scale cannot exceed 1.0, default true + """ + return self._preprocessor.is_scale_ + + @is_scale_.setter + def is_scale_(self, value): + assert isinstance( + value, + bool), "The value to set `is_scale_` must be type of bool." 
+ self._preprocessor.is_scale_ = value + + +class BlazeFacePostprocessor: + def __init__(self): + """Create a postprocessor for BlazeFace + """ + self._postprocessor = C.vision.facedet.BlazeFacePostprocessor() + + def run(self, runtime_results, ims_info): + """Postprocess the runtime results for BlazeFace + + :param: runtime_results: (list of FDTensor)The output FDTensor results from runtime + :param: ims_info: (list of dict)Record input_shape and output_shape + :return: list of DetectionResult(If the runtime_results is predict by batched samples, the length of this list equals to the batch size) + """ + return self._postprocessor.run(runtime_results, ims_info) + + @property + def conf_threshold(self): + """ + confidence threshold for postprocessing, default is 0.5 + """ + return self._postprocessor.conf_threshold + + @property + def nms_threshold(self): + """ + nms threshold for postprocessing, default is 0.3 + """ + return self._postprocessor.nms_threshold + + @conf_threshold.setter + def conf_threshold(self, conf_threshold): + assert isinstance(conf_threshold, float),\ + "The value to set `conf_threshold` must be type of float." + self._postprocessor.conf_threshold = conf_threshold + + @nms_threshold.setter + def nms_threshold(self, nms_threshold): + assert isinstance(nms_threshold, float),\ + "The value to set `nms_threshold` must be type of float." + self._postprocessor.nms_threshold = nms_threshold + + +class BlazeFace(FastDeployModel): + def __init__(self, + model_file, + params_file="", + config_file="", + runtime_option=None, + model_format=ModelFormat.PADDLE): + """Load a BlazeFace model exported by BlazeFace. + + :param model_file: (str)Path of model file, e.g ./Blazeface.onnx + :param params_file: (str)Path of parameters file, e.g yolox/model.pdiparams, if the model_fomat is ModelFormat.ONNX, this param will be ignored, can be set as empty string + :param runtime_option: (fastdeploy.RuntimeOption)RuntimeOption for inference this model, if it's None, will use the default backend on CPU + :param model_format: (fastdeploy.ModelForamt)Model format of the loaded model + """ + super(BlazeFace, self).__init__(runtime_option) + + self._model = C.vision.facedet.BlazeFace( + model_file, params_file, config_file, self._runtime_option, model_format) + + assert self.initialized, "BlazeFace initialize failed." 
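A short usage sketch for the new BlazeFace wrapper, assuming hypothetical Paddle-format model, params and config files alongside a test image; the default thresholds shown are the ones documented on the postprocessor above.

```python
import cv2
import fastdeploy as fd

# Placeholder file names, for illustration only.
model = fd.vision.facedet.BlazeFace(
    "blazeface/model.pdmodel",
    "blazeface/model.pdiparams",
    "blazeface/infer_cfg.yml")

model.postprocessor.conf_threshold = 0.5   # default 0.5
model.postprocessor.nms_threshold = 0.3    # default 0.3

im = cv2.imread("face.jpg")                # placeholder image
result = model.predict(im)                 # FaceDetectionResult
```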
+ + def predict(self, input_image): + """Detect the location and key points of human faces from an input image + :param input_image: (numpy.ndarray)The input image data, 3-D array with layout HWC, BGR format + :return: FaceDetectionResult + """ + return self._model.predict(input_image) + + def batch_predict(self, images): + """Classify a batch of input image + + :param im: (list of numpy.ndarray) The input image list, each element is a 3-D array with layout HWC, BGR format + :return list of FaceDetectionResult + """ + + return self._model.batch_predict(images) + + @property + def preprocessor(self): + """Get BlazefacePreprocessor object of the loaded model + + :return BlazefacePreprocessor + """ + return self._model.preprocessor + + @property + def postprocessor(self): + """Get BlazefacePostprocessor object of the loaded model + + :return BlazefacePostprocessor + """ + return self._model.postprocessor diff --git a/python/fastdeploy/vision/facedet/contrib/centerface.py b/python/fastdeploy/vision/facedet/contrib/centerface.py new file mode 100644 index 000000000..27a139789 --- /dev/null +++ b/python/fastdeploy/vision/facedet/contrib/centerface.py @@ -0,0 +1,144 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +import logging +from .... import FastDeployModel, ModelFormat +from .... import c_lib_wrap as C + + +class CenterFacePreprocessor: + def __init__(self): + """Create a preprocessor for CenterFace + """ + self._preprocessor = C.vision.facedet.CenterFacePreprocessor() + + def run(self, input_ims): + """Preprocess input images for CenterFace + + :param: input_ims: (list of numpy.ndarray)The input image + :return: list of FDTensor + """ + return self._preprocessor.run(input_ims) + + @property + def size(self): + """ + Argument for image preprocessing step, the preprocess image size, tuple of (width, height), default size = [640, 640] + """ + return self._preprocessor.size + + @size.setter + def size(self, wh): + assert isinstance(wh, (list, tuple)),\ + "The value to set `size` must be type of tuple or list." 
+ assert len(wh) == 2,\ + "The value to set `size` must contatins 2 elements means [width, height], but now it contains {} elements.".format( + len(wh)) + self._preprocessor.size = wh + + +class CenterFacePostprocessor: + def __init__(self): + """Create a postprocessor for CenterFace + """ + self._postprocessor = C.vision.facedet.CenterFacePostprocessor() + + def run(self, runtime_results, ims_info): + """Postprocess the runtime results for CenterFace + + :param: runtime_results: (list of FDTensor)The output FDTensor results from runtime + :param: ims_info: (list of dict)Record input_shape and output_shape + :return: list of DetectionResult(If the runtime_results is predict by batched samples, the length of this list equals to the batch size) + """ + return self._postprocessor.run(runtime_results, ims_info) + + @property + def conf_threshold(self): + """ + confidence threshold for postprocessing, default is 0.5 + """ + return self._postprocessor.conf_threshold + + @property + def nms_threshold(self): + """ + nms threshold for postprocessing, default is 0.3 + """ + return self._postprocessor.nms_threshold + + @conf_threshold.setter + def conf_threshold(self, conf_threshold): + assert isinstance(conf_threshold, float),\ + "The value to set `conf_threshold` must be type of float." + self._postprocessor.conf_threshold = conf_threshold + + @nms_threshold.setter + def nms_threshold(self, nms_threshold): + assert isinstance(nms_threshold, float),\ + "The value to set `nms_threshold` must be type of float." + self._postprocessor.nms_threshold = nms_threshold + + +class CenterFace(FastDeployModel): + def __init__(self, + model_file, + params_file="", + runtime_option=None, + model_format=ModelFormat.ONNX): + """Load a CenterFace model exported by CenterFace. + + :param model_file: (str)Path of model file, e.g ./CenterFace.onnx + :param params_file: (str)Path of parameters file, e.g yolox/model.pdiparams, if the model_fomat is ModelFormat.ONNX, this param will be ignored, can be set as empty string + :param runtime_option: (fastdeploy.RuntimeOption)RuntimeOption for inference this model, if it's None, will use the default backend on CPU + :param model_format: (fastdeploy.ModelForamt)Model format of the loaded model + """ + super(CenterFace, self).__init__(runtime_option) + + self._model = C.vision.facedet.CenterFace( + model_file, params_file, self._runtime_option, model_format) + + assert self.initialized, "CenterFace initialize failed." 
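An equivalent sketch for CenterFace, which defaults to ONNX format, using the preprocessor size and postprocessor thresholds exposed above (the model and image paths are placeholders):

```python
import cv2
import fastdeploy as fd

model = fd.vision.facedet.CenterFace("CenterFace.onnx")  # placeholder path, ONNX is the default format

model.preprocessor.size = [640, 640]       # (width, height), default [640, 640]
model.postprocessor.conf_threshold = 0.5   # default 0.5
model.postprocessor.nms_threshold = 0.3    # default 0.3

im = cv2.imread("face.jpg")                # placeholder image
result = model.predict(im)                 # FaceDetectionResult
results = model.batch_predict([im])        # one result per image in the batch
```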
+ + def predict(self, input_image): + """Detect the location and key points of human faces from an input image + :param input_image: (numpy.ndarray)The input image data, 3-D array with layout HWC, BGR format + :return: FaceDetectionResult + """ + return self._model.predict(input_image) + + def batch_predict(self, images): + """Classify a batch of input image + + :param im: (list of numpy.ndarray) The input image list, each element is a 3-D array with layout HWC, BGR format + :return list of DetectionResult + """ + + return self._model.batch_predict(images) + + @property + def preprocessor(self): + """Get CenterFacePreprocessor object of the loaded model + + :return CenterFacePreprocessor + """ + return self._model.preprocessor + + @property + def postprocessor(self): + """Get CenterFacePostprocessor object of the loaded model + + :return CenterFacePostprocessor + """ + return self._model.postprocessor diff --git a/python/setup.py b/python/setup.py index d1b02254e..df617287f 100755 --- a/python/setup.py +++ b/python/setup.py @@ -64,22 +64,22 @@ setup_configs["ENABLE_OPENVINO_BACKEND"] = os.getenv("ENABLE_OPENVINO_BACKEND", "OFF") setup_configs["ENABLE_PADDLE_BACKEND"] = os.getenv("ENABLE_PADDLE_BACKEND", "OFF") -setup_configs["ENABLE_POROS_BACKEND"] = os.getenv("ENABLE_POROS_BACKEND", - "OFF") +setup_configs["ENABLE_POROS_BACKEND"] = os.getenv("ENABLE_POROS_BACKEND", "OFF") setup_configs["ENABLE_TRT_BACKEND"] = os.getenv("ENABLE_TRT_BACKEND", "OFF") setup_configs["ENABLE_LITE_BACKEND"] = os.getenv("ENABLE_LITE_BACKEND", "OFF") setup_configs["PADDLELITE_URL"] = os.getenv("PADDLELITE_URL", "OFF") setup_configs["ENABLE_VISION"] = os.getenv("ENABLE_VISION", "OFF") setup_configs["ENABLE_ENCRYPTION"] = os.getenv("ENABLE_ENCRYPTION", "OFF") setup_configs["ENABLE_FLYCV"] = os.getenv("ENABLE_FLYCV", "OFF") +setup_configs["ENABLE_CVCUDA"] = os.getenv("ENABLE_CVCUDA", "OFF") setup_configs["ENABLE_TEXT"] = os.getenv("ENABLE_TEXT", "OFF") +setup_configs["ENABLE_BENCHMARK"] = os.getenv("ENABLE_BENCHMARK", "OFF") setup_configs["WITH_GPU"] = os.getenv("WITH_GPU", "OFF") setup_configs["WITH_IPU"] = os.getenv("WITH_IPU", "OFF") setup_configs["WITH_KUNLUNXIN"] = os.getenv("WITH_KUNLUNXIN", "OFF") setup_configs["BUILD_ON_JETSON"] = os.getenv("BUILD_ON_JETSON", "OFF") setup_configs["TRT_DIRECTORY"] = os.getenv("TRT_DIRECTORY", "UNDEFINED") -setup_configs["CUDA_DIRECTORY"] = os.getenv("CUDA_DIRECTORY", - "/usr/local/cuda") +setup_configs["CUDA_DIRECTORY"] = os.getenv("CUDA_DIRECTORY", "/usr/local/cuda") setup_configs["LIBRARY_NAME"] = PACKAGE_NAME setup_configs["PY_LIBRARY_NAME"] = PACKAGE_NAME + "_main" setup_configs["OPENCV_DIRECTORY"] = os.getenv("OPENCV_DIRECTORY", "") @@ -102,6 +102,7 @@ if os.getenv("CMAKE_CXX_COMPILER", None) is not None: setup_configs["CMAKE_CXX_COMPILER"] = os.getenv("CMAKE_CXX_COMPILER") SRC_DIR = os.path.join(TOP_DIR, PACKAGE_NAME) +PYTHON_SRC_DIR = os.path.join(TOP_DIR, "python", PACKAGE_NAME) CMAKE_BUILD_DIR = os.path.join(TOP_DIR, 'python', '.setuptools-cmake-build') WINDOWS = (os.name == 'nt') @@ -118,8 +119,7 @@ extras_require = {} # Default value is set to TRUE\1 to keep the settings same as the current ones. 
# However going forward the recomemded way to is to set this to False\0 -USE_MSVC_STATIC_RUNTIME = bool( - os.getenv('USE_MSVC_STATIC_RUNTIME', '1') == '1') +USE_MSVC_STATIC_RUNTIME = bool(os.getenv('USE_MSVC_STATIC_RUNTIME', '1') == '1') ONNX_NAMESPACE = os.getenv('ONNX_NAMESPACE', 'paddle2onnx') ################################################################################ # Version @@ -149,8 +149,7 @@ assert CMAKE, 'Could not find "cmake" executable!' @contextmanager def cd(path): if not os.path.isabs(path): - raise RuntimeError('Can only cd to absolute path, got: {}'.format( - path)) + raise RuntimeError('Can only cd to absolute path, got: {}'.format(path)) orig_path = os.getcwd() os.chdir(path) try: @@ -185,7 +184,7 @@ def get_all_files(dirname): class create_version(ONNXCommand): def run(self): - with open(os.path.join(SRC_DIR, 'version.py'), 'w') as f: + with open(os.path.join(PYTHON_SRC_DIR, 'code_version.py'), 'w') as f: f.write( dedent('''\ # This file is generated by setup.py. DO NOT EDIT! diff --git a/scripts/android/build_android_cpp_with_benchmark.sh b/scripts/android/build_android_cpp_with_benchmark.sh new file mode 100755 index 000000000..4a2c4084c --- /dev/null +++ b/scripts/android/build_android_cpp_with_benchmark.sh @@ -0,0 +1,118 @@ +#!/bin/bash +set -e +set +x + +# ------------------------------------------------------------------------------- +# mutable global variables +# ------------------------------------------------------------------------------- +TOOLCHAIN=clang # gcc/clang toolchain + +# ------------------------------------------------------------------------------- +# readonly global variables +# ------------------------------------------------------------------------------- +readonly ROOT_PATH=$(pwd) +readonly ANDROID_ABI=$1 +readonly ANDROID_PLATFORM="android-$2" +readonly BUILD_ROOT=build/Android +readonly BUILD_DIR=${BUILD_ROOT}/${ANDROID_ABI}-api-$2 + +# ------------------------------------------------------------------------------- +# tasks +# ------------------------------------------------------------------------------- +__make_build_dir() { + if [ ! -d "${BUILD_DIR}" ]; then + echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} not exists, setup manually ..." + if [ ! -d "${BUILD_ROOT}" ]; then + mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !" + fi + mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !" + else + echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}" + fi +} + +__check_cxx_envs() { + if [ $LDFLAGS ]; then + echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c" + echo "unset it before crossing compiling ${ANDROID_ABI}" + unset LDFLAGS + fi + if [ $CPPFLAGS ]; then + echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c" + echo "unset it before crossing compiling ${ANDROID_ABI}" + unset CPPFLAGS + fi + if [ $CPLUS_INCLUDE_PATH ]; then + echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${ANDROID_ABI}" + unset CPLUS_INCLUDE_PATH + fi + if [ $C_INCLUDE_PATH ]; then + echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${ANDROID_ABI}" + unset C_INCLUDE_PATH + fi +} + +__set_android_ndk() { + if [ -z $ANDROID_NDK ]; then + echo "-- [INFO] ANDROID_NDK not exists, please setup manually ..." 
+ exit 0 + else + echo "-- [INFO] Found ANDROID_NDK: ${ANDROID_NDK}" + fi + if [ "$ANDROID_NDK" ]; then + NDK_VERSION=$(echo $ANDROID_NDK | egrep -o "[0-9]{2}" | head -n 1) + if [ "$NDK_VERSION" -gt 17 ]; then + TOOLCHAIN=clang + fi + echo "-- [INFO] Checked ndk version: ${NDK_VERSION}" + echo "-- [INFO] Selected toolchain: ${TOOLCHAIN}" + fi +} + +__build_fastdeploy_android_shared() { + + local ANDROID_STL=c++_shared # c++_static + local ANDROID_TOOLCHAIN=${TOOLCHAIN} + local TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake + local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install" + cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}" + + cmake -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DANDROID_ABI=${ANDROID_ABI} \ + -DANDROID_NDK=${ANDROID_NDK} \ + -DANDROID_PLATFORM=${ANDROID_PLATFORM} \ + -DANDROID_STL=${ANDROID_STL} \ + -DANDROID_TOOLCHAIN=${ANDROID_TOOLCHAIN} \ + -DENABLE_ORT_BACKEND=OFF \ + -DENABLE_LITE_BACKEND=ON \ + -DENABLE_PADDLE2ONNX=OFF \ + -DENABLE_FLYCV=ON \ + -DENABLE_TEXT=OFF \ + -DENABLE_VISION=ON \ + -DBUILD_EXAMPLES=ON \ + -DENABLE_BENCHMARK=ON \ + -DWITH_OPENCV_STATIC=OFF \ + -DWITH_LITE_STATIC=OFF \ + -DWITH_OPENMP=OFF \ + -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \ + -Wno-dev ../../.. && make -j8 && make install + + echo "-- [INFO][built][${ANDROID_ABI}][${BUILD_DIR}/install]" +} + +main() { + __make_build_dir + __check_cxx_envs + __set_android_ndk + __build_fastdeploy_android_shared + exit 0 +} + +main + +# Usage: +# ./scripts/android/build_android_cpp_with_benchmark.sh arm64-v8a 21 +# ./scripts/android/build_android_cpp_with_benchmark.sh armeabi-v7a 21 diff --git a/scripts/android/build_android_cpp_with_text_api_only.sh b/scripts/android/build_android_cpp_with_text_api_only.sh new file mode 100755 index 000000000..7881d8ed2 --- /dev/null +++ b/scripts/android/build_android_cpp_with_text_api_only.sh @@ -0,0 +1,116 @@ +#!/bin/bash +set -e +set +x + +# ------------------------------------------------------------------------------- +# mutable global variables +# ------------------------------------------------------------------------------- +TOOLCHAIN=clang # gcc/clang toolchain + +# ------------------------------------------------------------------------------- +# readonly global variables +# ------------------------------------------------------------------------------- +readonly ROOT_PATH=$(pwd) +readonly ANDROID_ABI=$1 +readonly ANDROID_PLATFORM="android-$2" +readonly BUILD_ROOT=build/Android +readonly BUILD_DIR=${BUILD_ROOT}/${ANDROID_ABI}-api-$2 + +# ------------------------------------------------------------------------------- +# tasks +# ------------------------------------------------------------------------------- +__make_build_dir() { + if [ ! -d "${BUILD_DIR}" ]; then + echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} not exists, setup manually ..." + if [ ! -d "${BUILD_ROOT}" ]; then + mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !" + fi + mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !" 
+ else + echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}" + fi +} + +__check_cxx_envs() { + if [ $LDFLAGS ]; then + echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c" + echo "unset it before crossing compiling ${ANDROID_ABI}" + unset LDFLAGS + fi + if [ $CPPFLAGS ]; then + echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c" + echo "unset it before crossing compiling ${ANDROID_ABI}" + unset CPPFLAGS + fi + if [ $CPLUS_INCLUDE_PATH ]; then + echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${ANDROID_ABI}" + unset CPLUS_INCLUDE_PATH + fi + if [ $C_INCLUDE_PATH ]; then + echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${ANDROID_ABI}" + unset C_INCLUDE_PATH + fi +} + +__set_android_ndk() { + if [ -z $ANDROID_NDK ]; then + echo "-- [INFO] ANDROID_NDK not exists, please setup manually ..." + exit 0 + else + echo "-- [INFO] Found ANDROID_NDK: ${ANDROID_NDK}" + fi + if [ "$ANDROID_NDK" ]; then + NDK_VERSION=$(echo $ANDROID_NDK | egrep -o "[0-9]{2}" | head -n 1) + if [ "$NDK_VERSION" -gt 17 ]; then + TOOLCHAIN=clang + fi + echo "-- [INFO] Checked ndk version: ${NDK_VERSION}" + echo "-- [INFO] Selected toolchain: ${TOOLCHAIN}" + fi +} + +__build_fastdeploy_android_shared() { + + local ANDROID_STL=c++_shared # c++_static + local ANDROID_TOOLCHAIN=${TOOLCHAIN} + local TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake + local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install" + cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}" + + cmake -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DANDROID_ABI=${ANDROID_ABI} \ + -DANDROID_NDK=${ANDROID_NDK} \ + -DANDROID_PLATFORM=${ANDROID_PLATFORM} \ + -DANDROID_STL=${ANDROID_STL} \ + -DANDROID_TOOLCHAIN=${ANDROID_TOOLCHAIN} \ + -DENABLE_ORT_BACKEND=OFF \ + -DENABLE_LITE_BACKEND=ON \ + -DENABLE_PADDLE2ONNX=OFF \ + -DENABLE_FLYCV=OFF \ + -DENABLE_TEXT=ON \ + -DENABLE_VISION=OFF \ + -DBUILD_EXAMPLES=OFF \ + -DWITH_LITE_STATIC=ON \ + -DWITH_OPENMP=ON \ + -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \ + -Wno-dev ../../.. && make -j8 && make install + + echo "-- [INFO][built][${ANDROID_ABI}][${BUILD_DIR}/install]" +} + +main() { + __make_build_dir + __check_cxx_envs + __set_android_ndk + __build_fastdeploy_android_shared + exit 0 +} + +main + +# Usage: +# ./scripts/android/build_android_cpp_with_text_api_only.sh arm64-v8a 21 +# ./scripts/android/build_android_cpp_with_text_api_only.sh armeabi-v7a 21 diff --git a/scripts/linux/build_linux_x86_64_cpp_cpu.sh b/scripts/linux/build_linux_x86_64_cpp_cpu.sh new file mode 100755 index 000000000..e3ff7964b --- /dev/null +++ b/scripts/linux/build_linux_x86_64_cpp_cpu.sh @@ -0,0 +1,80 @@ +#!/bin/bash +set -e +set +x + +# ------------------------------------------------------------------------------- +# readonly global variables +# ------------------------------------------------------------------------------- +readonly ROOT_PATH=$(pwd) +readonly BUILD_ROOT=build/Linux +readonly BUILD_DIR=${BUILD_ROOT}/x86_64 + +# ------------------------------------------------------------------------------- +# tasks +# ------------------------------------------------------------------------------- +__make_build_dir() { + if [ ! -d "${BUILD_DIR}" ]; then + echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} not exists, setup manually ..." + if [ ! -d "${BUILD_ROOT}" ]; then + mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !" 
+ fi + mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !" + else + echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}" + fi +} + +__check_cxx_envs() { + if [ $LDFLAGS ]; then + echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset LDFLAGS + fi + if [ $CPPFLAGS ]; then + echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset CPPFLAGS + fi + if [ $CPLUS_INCLUDE_PATH ]; then + echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset CPLUS_INCLUDE_PATH + fi + if [ $C_INCLUDE_PATH ]; then + echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset C_INCLUDE_PATH + fi +} + +__build_fastdeploy_linux_x86_64_shared() { + + local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install" + cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}" + + cmake -DCMAKE_BUILD_TYPE=Release \ + -DWITH_GPU=OFF \ + -DENABLE_ORT_BACKEND=ON \ + -DENABLE_PADDLE_BACKEND=ON \ + -DENABLE_OPENVINO_BACKEND=ON \ + -DENABLE_PADDLE2ONNX=ON \ + -DENABLE_VISION=ON \ + -DENABLE_BENCHMARK=ON \ + -DBUILD_EXAMPLES=ON \ + -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \ + -Wno-dev ../../.. && make -j8 && make install + + echo "-- [INFO][built][x86_64]][${BUILD_DIR}/install]" +} + +main() { + __make_build_dir + __check_cxx_envs + __build_fastdeploy_linux_x86_64_shared + exit 0 +} + +main + +# Usage: +# ./scripts/linux/build_linux_x86_64_cpp_cpu.sh diff --git a/scripts/linux/build_linux_x86_64_cpp_gpu.sh b/scripts/linux/build_linux_x86_64_cpp_gpu.sh new file mode 100755 index 000000000..9ae91921e --- /dev/null +++ b/scripts/linux/build_linux_x86_64_cpp_gpu.sh @@ -0,0 +1,83 @@ +#!/bin/bash +set -e +set +x + +# ------------------------------------------------------------------------------- +# readonly global variables +# ------------------------------------------------------------------------------- +readonly ROOT_PATH=$(pwd) +readonly BUILD_ROOT=build/Linux +readonly BUILD_DIR="${BUILD_ROOT}/x86_64_gpu" + +# ------------------------------------------------------------------------------- +# tasks +# ------------------------------------------------------------------------------- +__make_build_dir() { + if [ ! -d "${BUILD_DIR}" ]; then + echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} not exists, setup manually ..." + if [ ! -d "${BUILD_ROOT}" ]; then + mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !" + fi + mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !" 
+ else + echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}" + fi +} + +__check_cxx_envs() { + if [ $LDFLAGS ]; then + echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset LDFLAGS + fi + if [ $CPPFLAGS ]; then + echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset CPPFLAGS + fi + if [ $CPLUS_INCLUDE_PATH ]; then + echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset CPLUS_INCLUDE_PATH + fi + if [ $C_INCLUDE_PATH ]; then + echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset C_INCLUDE_PATH + fi +} + +__build_fastdeploy_linux_x86_64_gpu_shared() { + + local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install" + cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}" + + cmake -DCMAKE_BUILD_TYPE=Release \ + -DWITH_GPU=ON \ + -DTRT_DIRECTORY=${TRT_DIRECTORY} \ + -DCUDA_DIRECTORY=${CUDA_DIRECTORY} \ + -DENABLE_ORT_BACKEND=ON \ + -DENABLE_TRT_BACKEND=ON \ + -DENABLE_PADDLE_BACKEND=ON \ + -DENABLE_OPENVINO_BACKEND=ON \ + -DENABLE_PADDLE2ONNX=ON \ + -DENABLE_VISION=ON \ + -DENABLE_BENCHMARK=OFF \ + -DBUILD_EXAMPLES=ON \ + -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \ + -Wno-dev ../../.. && make -j8 && make install + + echo "-- [INFO][built][x86_64_gpu}][${BUILD_DIR}/install]" +} + +main() { + __make_build_dir + __check_cxx_envs + __build_fastdeploy_linux_x86_64_gpu_shared + exit 0 +} + +main + +# Usage: +# ./scripts/linux/build_linux_x86_64_cpp_gpu.sh diff --git a/scripts/linux/build_linux_x86_64_cpp_gpu_with_benchmark.sh b/scripts/linux/build_linux_x86_64_cpp_gpu_with_benchmark.sh new file mode 100755 index 000000000..6f2b4ed7d --- /dev/null +++ b/scripts/linux/build_linux_x86_64_cpp_gpu_with_benchmark.sh @@ -0,0 +1,83 @@ +#!/bin/bash +set -e +set +x + +# ------------------------------------------------------------------------------- +# readonly global variables +# ------------------------------------------------------------------------------- +readonly ROOT_PATH=$(pwd) +readonly BUILD_ROOT=build/Linux +readonly BUILD_DIR="${BUILD_ROOT}/x86_64_gpu" + +# ------------------------------------------------------------------------------- +# tasks +# ------------------------------------------------------------------------------- +__make_build_dir() { + if [ ! -d "${BUILD_DIR}" ]; then + echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} not exists, setup manually ..." + if [ ! -d "${BUILD_ROOT}" ]; then + mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !" + fi + mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !" 
+ else + echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}" + fi +} + +__check_cxx_envs() { + if [ $LDFLAGS ]; then + echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset LDFLAGS + fi + if [ $CPPFLAGS ]; then + echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset CPPFLAGS + fi + if [ $CPLUS_INCLUDE_PATH ]; then + echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset CPLUS_INCLUDE_PATH + fi + if [ $C_INCLUDE_PATH ]; then + echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset C_INCLUDE_PATH + fi +} + +__build_fastdeploy_linux_x86_64_gpu_shared() { + + local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install" + cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}" + + cmake -DCMAKE_BUILD_TYPE=Release \ + -DWITH_GPU=ON \ + -DTRT_DIRECTORY=${TRT_DIRECTORY} \ + -DCUDA_DIRECTORY=${CUDA_DIRECTORY} \ + -DENABLE_ORT_BACKEND=ON \ + -DENABLE_TRT_BACKEND=ON \ + -DENABLE_PADDLE_BACKEND=ON \ + -DENABLE_OPENVINO_BACKEND=ON \ + -DENABLE_PADDLE2ONNX=ON \ + -DENABLE_VISION=ON \ + -DENABLE_BENCHMARK=ON \ + -DBUILD_EXAMPLES=ON \ + -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \ + -Wno-dev ../../.. && make -j8 && make install + + echo "-- [INFO][built][x86_64_gpu}][${BUILD_DIR}/install]" +} + +main() { + __make_build_dir + __check_cxx_envs + __build_fastdeploy_linux_x86_64_gpu_shared + exit 0 +} + +main + +# Usage: +# ./scripts/linux/build_linux_x86_64_cpp_gpu.sh diff --git a/scripts/macosx/build_macosx_cpp.sh b/scripts/macosx/build_macosx_cpp.sh new file mode 100755 index 000000000..4d8e08726 --- /dev/null +++ b/scripts/macosx/build_macosx_cpp.sh @@ -0,0 +1,102 @@ +#!/bin/bash +set -e +set +x + +# ------------------------------------------------------------------------------- +# readonly global variables +# ------------------------------------------------------------------------------- +readonly ROOT_PATH=$(pwd) +readonly BUILD_ROOT=build/MacOSX +readonly OSX_ARCH=$1 # arm64, x86_64 +readonly BUILD_DIR=${BUILD_ROOT}/${OSX_ARCH} + +# ------------------------------------------------------------------------------- +# tasks +# ------------------------------------------------------------------------------- +__make_build_dir() { + if [ ! -d "${BUILD_DIR}" ]; then + echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} not exists, setup manually ..." + if [ ! -d "${BUILD_ROOT}" ]; then + mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !" + fi + mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !" 
+ else + echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}" + fi +} + +__check_cxx_envs() { + if [ $LDFLAGS ]; then + echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset LDFLAGS + fi + if [ $CPPFLAGS ]; then + echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset CPPFLAGS + fi + if [ $CPLUS_INCLUDE_PATH ]; then + echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset CPLUS_INCLUDE_PATH + fi + if [ $C_INCLUDE_PATH ]; then + echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c" + echo "unset it before crossing compiling ${BUILD_DIR}" + unset C_INCLUDE_PATH + fi +} + +__build_fastdeploy_osx_arm64_shared() { + + local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install" + cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}" + + cmake -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DENABLE_ORT_BACKEND=ON \ + -DENABLE_PADDLE2ONNX=ON \ + -DENABLE_VISION=ON \ + -DENABLE_BENCHMARK=ON \ + -DBUILD_EXAMPLES=ON \ + -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \ + -Wno-dev ../../.. && make -j8 && make install + + echo "-- [INFO][built][${OSX_ARCH}][${BUILD_DIR}/install]" +} + +__build_fastdeploy_osx_x86_64_shared() { + + local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install" + cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}" + + cmake -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DENABLE_ORT_BACKEND=ON \ + -DENABLE_PADDLE_BACKEND=ON \ + -DENABLE_OPENVINO_BACKEND=ON \ + -DENABLE_PADDLE2ONNX=ON \ + -DENABLE_VISION=ON \ + -DENABLE_BENCHMARK=ON \ + -DBUILD_EXAMPLES=ON \ + -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \ + -Wno-dev ../../.. && make -j8 && make install + + echo "-- [INFO][built][${OSX_ARCH}][${BUILD_DIR}/install]" +} + +main() { + __make_build_dir + __check_cxx_envs + if [ "$OSX_ARCH" = "arm64" ]; then + __build_fastdeploy_osx_arm64_shared + else + __build_fastdeploy_osx_x86_64_shared + fi + exit 0 +} + +main + +# Usage: +# ./scripts/macosx/build_macosx_cpp.sh arm64 +# ./scripts/macosx/build_macosx_cpp.sh x86_64 diff --git a/serving/docs/EN/model_configuration-en.md b/serving/docs/EN/model_configuration-en.md index 2f9ee14ca..88f72e3b9 100644 --- a/serving/docs/EN/model_configuration-en.md +++ b/serving/docs/EN/model_configuration-en.md @@ -162,7 +162,8 @@ optimization { gpu_execution_accelerator : [ { name : "tensorrt" - # Use FP16 inference in TensorRT. You can also choose: trt_fp32, trt_int8 + # Use FP16 inference in TensorRT. 
You can also choose: trt_fp32 + # If the loaded model is a quantized model, this precision will be int8 automatically parameters { key: "precision" value: "trt_fp16" } } ] @@ -203,4 +204,4 @@ optimization { } ] }} -``` \ No newline at end of file +``` diff --git a/serving/docs/zh_CN/model_configuration.md b/serving/docs/zh_CN/model_configuration.md index 60803121c..03f8e09af 100644 --- a/serving/docs/zh_CN/model_configuration.md +++ b/serving/docs/zh_CN/model_configuration.md @@ -162,7 +162,8 @@ optimization { gpu_execution_accelerator : [ { name : "tensorrt" - # 使用TensorRT的FP16推理,其他可选项为: trt_fp32、trt_int8 + # 使用TensorRT的FP16推理,其他可选项为: trt_fp32 + # 如果加载的是量化模型,此精度设置无效,会默认使用int8进行推理 parameters { key: "precision" value: "trt_fp16" } } ] diff --git a/serving/src/fastdeploy_runtime.cc b/serving/src/fastdeploy_runtime.cc index 79479609c..062a8476b 100644 --- a/serving/src/fastdeploy_runtime.cc +++ b/serving/src/fastdeploy_runtime.cc @@ -168,7 +168,10 @@ TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, } ModelState::ModelState(TRITONBACKEND_Model* triton_model) - : BackendModel(triton_model), model_load_(false), main_runtime_(nullptr), is_clone_(true) { + : BackendModel(triton_model), + model_load_(false), + main_runtime_(nullptr), + is_clone_(true) { // Create runtime options that will be cloned and used for each // instance when creating that instance's runtime. runtime_options_.reset(new fastdeploy::RuntimeOption()); @@ -227,14 +230,14 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) ParseBoolValue(value_string, &pd_enable_mkldnn)); runtime_options_->SetPaddleMKLDNN(pd_enable_mkldnn); } else if (param_key == "use_paddle_log") { - runtime_options_->EnablePaddleLogInfo(); + runtime_options_->EnablePaddleLogInfo(); } else if (param_key == "num_streams") { - int num_streams; - THROW_IF_BACKEND_MODEL_ERROR( + int num_streams; + THROW_IF_BACKEND_MODEL_ERROR( ParseIntValue(value_string, &num_streams)); - runtime_options_->SetOpenVINOStreams(num_streams); + runtime_options_->openvino_option.num_streams = num_streams; } else if (param_key == "is_clone") { - THROW_IF_BACKEND_MODEL_ERROR( + THROW_IF_BACKEND_MODEL_ERROR( ParseBoolValue(value_string, &is_clone_)); } else if (param_key == "use_ipu") { // runtime_options_->UseIpu(); @@ -271,11 +274,11 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) std::vector shape; FDParseShape(params, input_name, &shape); if (name == "min_shape") { - runtime_options_->trt_min_shape[input_name] = shape; + runtime_options_->trt_option.min_shape[input_name] = shape; } else if (name == "max_shape") { - runtime_options_->trt_max_shape[input_name] = shape; + runtime_options_->trt_option.max_shape[input_name] = shape; } else { - runtime_options_->trt_opt_shape[input_name] = shape; + runtime_options_->trt_option.opt_shape[input_name] = shape; } } } @@ -292,12 +295,10 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) std::transform(value_string.begin(), value_string.end(), value_string.begin(), ::tolower); if (value_string == "trt_fp16") { - runtime_options_->EnableTrtFP16(); - } else if (value_string == "trt_int8") { - // TODO(liqi): use EnableTrtINT8 - runtime_options_->trt_enable_int8 = true; + runtime_options_->trt_option.enable_fp16 = true; } else if (value_string == "pd_fp16") { - // TODO(liqi): paddle inference don't currently have interface for fp16. + // TODO(liqi): paddle inference don't currently have interface + // for fp16. 
} // } else if( param_key == "max_batch_size") { // THROW_IF_BACKEND_MODEL_ERROR(ParseUnsignedLongLongValue( @@ -307,15 +308,15 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) // value_string, // &runtime_options_->trt_max_workspace_size)); } else if (param_key == "cache_file") { - runtime_options_->SetTrtCacheFile(value_string); + runtime_options_->trt_option.serialize_file = value_string; } else if (param_key == "use_paddle") { runtime_options_->EnablePaddleToTrt(); } else if (param_key == "use_paddle_log") { runtime_options_->EnablePaddleLogInfo(); } else if (param_key == "is_clone") { THROW_IF_BACKEND_MODEL_ERROR( - ParseBoolValue(value_string, &is_clone_)); - } + ParseBoolValue(value_string, &is_clone_)); + } } } } @@ -330,17 +331,17 @@ TRITONSERVER_Error* ModelState::LoadModel( const int32_t instance_group_device_id, std::string* model_path, std::string* params_path, fastdeploy::Runtime** runtime, cudaStream_t stream) { - // FastDeploy Runtime creation is not thread-safe, so multiple creations // are serialized with a global lock. // The Clone interface can be invoked only when the main_runtime_ is created. static std::mutex global_context_mu; std::lock_guard glock(global_context_mu); - if(model_load_ && is_clone_) { - if(main_runtime_ == nullptr) { - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND, - std::string("main_runtime is nullptr").c_str()); + if (model_load_ && is_clone_) { + if (main_runtime_ == nullptr) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string("main_runtime is nullptr").c_str()); } *runtime = main_runtime_->Clone((void*)stream, instance_group_device_id); } else { @@ -367,21 +368,21 @@ TRITONSERVER_Error* ModelState::LoadModel( if (not exists) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_NOT_FOUND, - std::string("Paddle params should be named as 'model.pdiparams' or " - "not provided.'") + std::string( + "Paddle params should be named as 'model.pdiparams' or " + "not provided.'") .c_str()); } - runtime_options_->model_format = fastdeploy::ModelFormat::PADDLE; - runtime_options_->model_file = *model_path; - runtime_options_->params_file = *params_path; + runtime_options_->SetModelPath(*model_path, *params_path, + fastdeploy::ModelFormat::PADDLE); } else { - runtime_options_->model_format = fastdeploy::ModelFormat::ONNX; - runtime_options_->model_file = *model_path; + runtime_options_->SetModelPath(*model_path, "", + fastdeploy::ModelFormat::ONNX); } } // GPU - #ifdef TRITON_ENABLE_GPU +#ifdef TRITON_ENABLE_GPU if ((instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) || (instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_AUTO)) { runtime_options_->UseGpu(instance_group_device_id); @@ -389,17 +390,17 @@ TRITONSERVER_Error* ModelState::LoadModel( } else if (runtime_options_->device != fastdeploy::Device::IPU) { runtime_options_->UseCpu(); } - #else +#else if (runtime_options_->device != fastdeploy::Device::IPU) { // If Device is set to IPU, just skip CPU setting. 
runtime_options_->UseCpu(); } - #endif // TRITON_ENABLE_GPU +#endif // TRITON_ENABLE_GPU *runtime = main_runtime_ = new fastdeploy::Runtime(); if (!(*runtime)->Init(*runtime_options_)) { return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND, - std::string("Runtime init error").c_str()); + std::string("Runtime init error").c_str()); } model_load_ = true; } @@ -942,8 +943,8 @@ void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, if (!all_response_failed) { FD_RESPOND_ALL_AND_SET_TRUE_IF_ERROR(responses, request_count, - all_response_failed, - Run(&responses, request_count)); + all_response_failed, + Run(&responses, request_count)); } uint64_t compute_end_ns = 0; @@ -1067,17 +1068,16 @@ TRITONSERVER_Error* ModelInstanceState::SetInputTensors( allowed_input_types; if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { allowed_input_types = {{TRITONSERVER_MEMORY_GPU, DeviceId()}, - {TRITONSERVER_MEMORY_CPU_PINNED, 0}, - {TRITONSERVER_MEMORY_CPU, 0}}; + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, + {TRITONSERVER_MEMORY_CPU, 0}}; } else { allowed_input_types = {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, - {TRITONSERVER_MEMORY_CPU, 0}}; + {TRITONSERVER_MEMORY_CPU, 0}}; } - RETURN_IF_ERROR( - collector->ProcessTensor( - input_name, nullptr, 0, allowed_input_types, &input_buffer, - &batchn_byte_size, &memory_type, &memory_type_id)); + RETURN_IF_ERROR(collector->ProcessTensor( + input_name, nullptr, 0, allowed_input_types, &input_buffer, + &batchn_byte_size, &memory_type, &memory_type_id)); int32_t device_id = -1; fastdeploy::Device device; @@ -1089,9 +1089,9 @@ TRITONSERVER_Error* ModelInstanceState::SetInputTensors( } fastdeploy::FDTensor fdtensor(in_name); - fdtensor.SetExternalData( - batchn_shape, ConvertDataTypeToFD(input_datatype), - const_cast(input_buffer), device, device_id); + fdtensor.SetExternalData(batchn_shape, ConvertDataTypeToFD(input_datatype), + const_cast(input_buffer), device, + device_id); runtime_->BindInputTensor(in_name, fdtensor); } @@ -1130,23 +1130,22 @@ TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors( for (auto& output_name : output_names_) { auto* output_tensor = runtime_->GetOutputTensor(output_name); if (output_tensor == nullptr) { - RETURN_IF_ERROR( - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("output tensor '") + output_name + "' is not found") - .c_str())); + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("output tensor '") + output_name + "' is not found") + .c_str())); } TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; int64_t memory_type_id = 0; - if(output_tensor->device == fastdeploy::Device::GPU) { + if (output_tensor->device == fastdeploy::Device::GPU) { memory_type = TRITONSERVER_MEMORY_GPU; memory_type_id = DeviceId(); } responder.ProcessTensor( output_tensor->name, ConvertFDType(output_tensor->dtype), output_tensor->shape, - reinterpret_cast(output_tensor->MutableData()), - memory_type, memory_type_id); + reinterpret_cast(output_tensor->MutableData()), memory_type, + memory_type_id); } // Finalize and wait for any pending buffer copies. 
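Note on the fastdeploy_runtime.cc changes above: the patch swaps the old RuntimeOption setter calls for the nested option structs (trt_option, openvino_option) and for SetModelPath(). The sketch below is not part of the patch; it only illustrates how those same fields look when used directly from C++, assuming the FastDeploy C++ SDK is available. The model paths, the "x" tensor name, the shapes, and the cache file name are made-up placeholders.

```cpp
// Minimal sketch (illustrative only) of the struct-style RuntimeOption
// fields that the serving backend above now writes to.
#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  // SetModelPath() replaces assigning model_file/params_file/model_format
  // one by one.
  option.SetModelPath("model.pdmodel", "model.pdiparams",
                      fastdeploy::ModelFormat::PADDLE);
  option.UseGpu(0);
  option.UseTrtBackend();

  // "precision: trt_fp16" in the serving config maps to this flag; the old
  // EnableTrtFP16()/SetTrtCacheFile()/trt_*_shape members are replaced by
  // the nested trt_option struct.
  option.trt_option.enable_fp16 = true;
  option.trt_option.serialize_file = "trt_cache.engine";
  option.trt_option.min_shape["x"] = {1, 3, 224, 224};
  option.trt_option.opt_shape["x"] = {1, 3, 224, 224};
  option.trt_option.max_shape["x"] = {4, 3, 224, 224};

  // SetOpenVINOStreams() likewise becomes a field on openvino_option
  // (only relevant when the OpenVINO backend is selected).
  option.openvino_option.num_streams = 4;

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {
    return -1;
  }

  // The backend serves extra model instances by cloning the main runtime
  // (stream, device id) instead of re-initializing it.
  fastdeploy::Runtime* cloned = runtime.Clone(nullptr, 0);
  delete cloned;
  return 0;
}
```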
diff --git a/tests/acc_eval/classification/run.sh b/tests/acc_eval/classification/run.sh old mode 100644 new mode 100755 index 16c1b2bb9..73fe957d4 --- a/tests/acc_eval/classification/run.sh +++ b/tests/acc_eval/classification/run.sh @@ -4,5 +4,5 @@ model_dir=`ls ./models/` for MODEL_NAME in $model_dir do - python infer.py --model ./models/$MODEL_NAME --image None --device $TARGET_DEVICE 2>&1 | tee ./log/${MODEL_NAME}_acc.log + python eval.py --model ./models/$MODEL_NAME --image None --device $TARGET_DEVICE 2>&1 | tee ./log/${MODEL_NAME}_acc.log done diff --git a/tests/acc_eval/detection/eval_yolov5.py b/tests/acc_eval/detection/eval_yolov5.py index 3d950b26a..f4aecbdc1 100755 --- a/tests/acc_eval/detection/eval_yolov5.py +++ b/tests/acc_eval/detection/eval_yolov5.py @@ -52,8 +52,8 @@ model = fd.vision.detection.YOLOv5( runtime_option=runtime_option, model_format=fd.ModelFormat.PADDLE) -image_file_path = "/xieyunyao/Project/coco/val2017" -annotation_file_path = "/xieyunyao/Project/coco/annotations/instances_val2017.json" +image_file_path = "../dataset/coco/val2017" +annotation_file_path = "../dataset/coco/annotations/instances_val2017.json" res = fd.vision.evaluation.eval_detection(model, image_file_path, annotation_file_path, 0.001, 0.65) diff --git a/tests/acc_eval/detection/eval_yolov6.py b/tests/acc_eval/detection/eval_yolov6.py index 3641194ca..3992c9f53 100755 --- a/tests/acc_eval/detection/eval_yolov6.py +++ b/tests/acc_eval/detection/eval_yolov6.py @@ -52,8 +52,8 @@ model = fd.vision.detection.YOLOv6( runtime_option=runtime_option, model_format=fd.ModelFormat.PADDLE) -image_file_path = "/xieyunyao/Project/coco/val2017" -annotation_file_path = "/xieyunyao/Project/coco/annotations/instances_val2017.json" +image_file_path = "../dataset/coco/val2017" +annotation_file_path = "../dataset/coco/annotations/instances_val2017.json" res = fd.vision.evaluation.eval_detection(model, image_file_path, annotation_file_path, 0.001, 0.65) diff --git a/tests/acc_eval/detection/eval_yolov7.py b/tests/acc_eval/detection/eval_yolov7.py index 3641194ca..3992c9f53 100755 --- a/tests/acc_eval/detection/eval_yolov7.py +++ b/tests/acc_eval/detection/eval_yolov7.py @@ -52,8 +52,8 @@ model = fd.vision.detection.YOLOv6( runtime_option=runtime_option, model_format=fd.ModelFormat.PADDLE) -image_file_path = "/xieyunyao/Project/coco/val2017" -annotation_file_path = "/xieyunyao/Project/coco/annotations/instances_val2017.json" +image_file_path = "../dataset/coco/val2017" +annotation_file_path = "../dataset/coco/annotations/instances_val2017.json" res = fd.vision.evaluation.eval_detection(model, image_file_path, annotation_file_path, 0.001, 0.65) diff --git a/tests/acc_eval/detection/run.sh b/tests/acc_eval/detection/run.sh old mode 100644 new mode 100755 index 59dff2e9b..051663215 --- a/tests/acc_eval/detection/run.sh +++ b/tests/acc_eval/detection/run.sh @@ -12,6 +12,6 @@ python eval_yolov3.py --model_dir ./models/yolov3_darknet53_270e_coco --image python eval_yolox.py --model_dir ./models/yolox_s_300e_coco --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolox_s_300e_coco.log python eval_faster_rcnn.py --model_dir ./models/faster_rcnn_r50_vd_fpn_2x_coco --image None --device $TARGET_DEVICE 2>&1 | tee ./log/faster_rcnn_r50_vd_fpn_2x_coco.log python eval_mask_rcnn.py --model_dir ./models/mask_rcnn_r50_1x_coco --image None --device $TARGET_DEVICE 2>&1 | tee ./log/mask_rcnn_r50_1x_coco.log -python eval_yolov5.py --model_dir ./models/yolov5s_infer --image None --device $TARGET_DEVICE 2>&1 | tee 
./log/yolov5s_infer.log -python eval_yolov6.py --model_dir ./models/yolov6s_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov6s_infer.log -python eval_yolov5.py --model_dir ./models/yolov7_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov7_infer.log +python eval_yolov5.py --model ./models/yolov5s_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov5s_infer.log +python eval_yolov6.py --model ./models/yolov6s_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov6s_infer.log +python eval_yolov7.py --model ./models/yolov7_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov7_infer.log diff --git a/tests/acc_eval/ppocr/eval_ppocrv2.py b/tests/acc_eval/ppocr/eval_ppocrv2.py old mode 100644 new mode 100755 index f4742df66..bb478db91 --- a/tests/acc_eval/ppocr/eval_ppocrv2.py +++ b/tests/acc_eval/ppocr/eval_ppocrv2.py @@ -103,7 +103,7 @@ rec_model = fd.vision.ocr.Recognizer( runtime_option=runtime_option) # PPOCR的Rec模型开启静态推理, 其他硬件不需要的话请注释掉. -rec_model.preprocessor.static_shape = True +rec_model.preprocessor.static_shape_infer = True # 创建PP-OCR,串联3个模型,其中cls_model可选,如无需求,可设置为None ppocr_v2 = fd.vision.ocr.PPOCRv2( diff --git a/tests/acc_eval/ppocr/eval_ppocrv3.py b/tests/acc_eval/ppocr/eval_ppocrv3.py old mode 100644 new mode 100755 index b6f4dcced..496781ba0 --- a/tests/acc_eval/ppocr/eval_ppocrv3.py +++ b/tests/acc_eval/ppocr/eval_ppocrv3.py @@ -103,7 +103,7 @@ rec_model = fd.vision.ocr.Recognizer( runtime_option=runtime_option) # PPOCR的Rec模型开启静态推理, 其他硬件不需要的话请注释掉. -rec_model.preprocessor.static_shape = True +rec_model.preprocessor.static_shape_infer = True # 创建PP-OCR,串联3个模型,其中cls_model可选,如无需求,可设置为None ppocr_v3 = fd.vision.ocr.PPOCRv3( diff --git a/tests/acc_eval/segmentation/eval.py b/tests/acc_eval/segmentation/eval.py old mode 100644 new mode 100755 index b77a69519..df0dc0aa8 --- a/tests/acc_eval/segmentation/eval.py +++ b/tests/acc_eval/segmentation/eval.py @@ -54,5 +54,5 @@ model = fd.vision.segmentation.PaddleSegModel( model_file, params_file, config_file, runtime_option=runtime_option) res = fd.vision.evaluation.eval_segmentation( - model=model, data_dir="../dataset/FD_dataset/data/cityscapes") + model=model, data_dir="../dataset/cityscapes") print(res) diff --git a/tests/models/test_blazeface.py b/tests/models/test_blazeface.py new file mode 100644 index 000000000..70bafd693 --- /dev/null +++ b/tests/models/test_blazeface.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from fastdeploy import ModelFormat
+import fastdeploy as fd
+import cv2
+import os
+import pickle
+import numpy as np
+import runtime_config as rc
+
+
+def test_detection_blazeface():
+    model_url = "https://bj.bcebos.com/paddlehub/fastdeploy/blazeface_1000e.tgz"
+    input_url1 = "https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg"
+    input_url2 = "https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000570688.jpg"
+    result_url1 = "https://bj.bcebos.com/paddlehub/fastdeploy/blazeface_result1.pkl"
+    result_url2 = "https://bj.bcebos.com/paddlehub/fastdeploy/blazeface_result2.pkl"
+    fd.download_and_decompress(model_url, "resources")
+    fd.download(input_url1, "resources")
+    fd.download(input_url2, "resources")
+    fd.download(result_url1, "resources")
+    fd.download(result_url2, "resources")
+
+    model_dir = "resources/blazeface_1000e"
+    model_file = os.path.join(model_dir, "model.pdmodel")
+    params_file = os.path.join(model_dir, "model.pdiparams")
+    config_file = os.path.join(model_dir, "infer_cfg.yml")
+    model = fd.vision.facedet.BlazeFace(
+        model_file, params_file, config_file, runtime_option=rc.test_option)
+    model.postprocessor.conf_threshold = 0.5
+
+    with open("resources/blazeface_result1.pkl", "rb") as f:
+        expect1 = pickle.load(f)
+
+    with open("resources/blazeface_result2.pkl", "rb") as f:
+        expect2 = pickle.load(f)
+
+    im1 = cv2.imread("./resources/000000014439.jpg")
+    im2 = cv2.imread("./resources/000000570688.jpg")
+
+    for i in range(3):
+        # test single predict
+        result1 = model.predict(im1)
+        result2 = model.predict(im2)
+
+        diff_boxes_1 = np.fabs(
+            np.array(result1.boxes) - np.array(expect1["boxes"]))
+        diff_boxes_2 = np.fabs(
+            np.array(result2.boxes) - np.array(expect2["boxes"]))
+
+        diff_scores_1 = np.fabs(
+            np.array(result1.scores) - np.array(expect1["scores"]))
+        diff_scores_2 = np.fabs(
+            np.array(result2.scores) - np.array(expect2["scores"]))
+
+        assert diff_boxes_1.max(
+        ) < 1e-04, "There's difference in detection boxes 1."
+        assert diff_scores_1.max(
+        ) < 1e-04, "There's difference in detection score 1."
+
+        assert diff_boxes_2.max(
+        ) < 1e-03, "There's difference in detection boxes 2."
+        assert diff_scores_2.max(
+        ) < 1e-04, "There's difference in detection score 2."
+
+        print("one image test success!")
+
+        # test batch predict
+        results = model.batch_predict([im1, im2])
+        result1 = results[0]
+        result2 = results[1]
+
+        diff_boxes_1 = np.fabs(
+            np.array(result1.boxes) - np.array(expect1["boxes"]))
+        diff_boxes_2 = np.fabs(
+            np.array(result2.boxes) - np.array(expect2["boxes"]))
+
+        diff_scores_1 = np.fabs(
+            np.array(result1.scores) - np.array(expect1["scores"]))
+        diff_scores_2 = np.fabs(
+            np.array(result2.scores) - np.array(expect2["scores"]))
+        assert diff_boxes_1.max(
+        ) < 1e-04, "There's difference in detection boxes 1."
+        assert diff_scores_1.max(
+        ) < 1e-03, "There's difference in detection score 1."
+
+        assert diff_boxes_2.max(
+        ) < 1e-04, "There's difference in detection boxes 2."
+        assert diff_scores_2.max(
+        ) < 1e-04, "There's difference in detection score 2."
+
+        print("batch predict success!")
+
+
+def test_detection_blazeface_runtime():
+    model_url = "https://bj.bcebos.com/paddlehub/fastdeploy/blazeface_1000e.tgz"
+    input_url1 = "https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg"
+    result_url1 = "https://bj.bcebos.com/paddlehub/fastdeploy/blazeface_result1.pkl"
+    fd.download_and_decompress(model_url, "resources")
+    fd.download(input_url1, "resources")
+    fd.download(result_url1, "resources")
+
+    model_dir = "resources/blazeface_1000e"
+    model_file = os.path.join(model_dir, "model.pdmodel")
+    params_file = os.path.join(model_dir, "model.pdiparams")
+    config_file = os.path.join(model_dir, "infer_cfg.yml")
+
+    preprocessor = fd.vision.facedet.BlazeFacePreprocessor()
+    postprocessor = fd.vision.facedet.BlazeFacePostprocessor()
+
+    rc.test_option.set_model_path(model_file, params_file, config_file, model_format=ModelFormat.PADDLE)
+    rc.test_option.use_openvino_backend()
+    runtime = fd.Runtime(rc.test_option)
+
+    with open("resources/blazeface_result1.pkl", "rb") as f:
+        expect1 = pickle.load(f)
+
+    im1 = cv2.imread("resources/000000014439.jpg")
+
+    for i in range(3):
+        # test runtime
+        input_tensors, ims_info = preprocessor.run([im1.copy()])
+        output_tensors = runtime.infer({"images": input_tensors[0]})
+        results = postprocessor.run(output_tensors, ims_info)
+        result1 = results[0]
+
+        diff_boxes_1 = np.fabs(
+            np.array(result1.boxes) - np.array(expect1["boxes"]))
+        diff_scores_1 = np.fabs(
+            np.array(result1.scores) - np.array(expect1["scores"]))
+
+        assert diff_boxes_1.max(
+        ) < 1e-03, "There's difference in detection boxes 1."
+        assert diff_scores_1.max(
+        ) < 1e-04, "There's difference in detection score 1."
+
+
+if __name__ == "__main__":
+    test_detection_blazeface()
+    test_detection_blazeface_runtime()
diff --git a/tests/models/test_centerface.py b/tests/models/test_centerface.py
new file mode 100644
index 000000000..9dbbff0c1
--- /dev/null
+++ b/tests/models/test_centerface.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from fastdeploy import ModelFormat +import fastdeploy as fd +import cv2 +import os +import pickle +import numpy as np +import runtime_config as rc + + +def test_facedet_centerface(): + model_url = "https://bj.bcebos.com/paddlehub/fastdeploy/CenterFace.onnx" + input_url1 = "https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg" + result_url1 = "https://bj.bcebos.com/paddlehub/fastdeploy/centerface_result1.pkl" + fd.download(model_url, "resources") + fd.download(input_url1, "resources") + fd.download(result_url1, "resources") + + model_file = "resources/CenterFace.onnx" + model = fd.vision.facedet.CenterFace( + model_file, runtime_option=rc.test_option) + + with open("resources/centerface_result1.pkl", "rb") as f: + expect1 = pickle.load(f) + + # compare diff + im1 = cv2.imread("./resources/test_lite_face_detector_3.jpg") + print(expect1) + for i in range(3): + # test single predict + result1 = model.predict(im1) + + diff_boxes_1 = np.fabs( + np.array(result1.boxes) - np.array(expect1["boxes"])) + diff_scores_1 = np.fabs( + np.array(result1.scores) - np.array(expect1["scores"])) + + assert diff_boxes_1.max( + ) < 1e-04, "There's difference in detection boxes 1." + assert diff_scores_1.max( + ) < 1e-05, "There's difference in detection score 1." + +def test_facedet_centerface_runtime(): + model_url = "https://bj.bcebos.com/paddlehub/fastdeploy/CenterFace.onnx" + input_url1 = "https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg" + result_url1 = "https://bj.bcebos.com/paddlehub/fastdeploy/centerface_result1.pkl" + fd.download(model_url, "resources") + fd.download(input_url1, "resources") + fd.download(result_url1, "resources") + + model_file = "resources/CenterFace.onnx" + + preprocessor = fd.vision.facedet.CenterFacePreprocessor() + postprocessor = fd.vision.facedet.CenterFacePostprocessor() + + rc.test_option.set_model_path(model_file, model_format=ModelFormat.ONNX) + rc.test_option.use_openvino_backend() + runtime = fd.Runtime(rc.test_option) + + with open("resources/centerface_result1.pkl", "rb") as f: + expect1 = pickle.load(f) + + # compare diff + im1 = cv2.imread("./resources/test_lite_face_detector_3.jpg") + + for i in range(3): + # test runtime + input_tensors, ims_info = preprocessor.run([im1.copy()]) + output_tensors = runtime.infer({"input.1": input_tensors[0]}) + results = postprocessor.run(output_tensors, ims_info) + result1 = results[0] + + diff_boxes_1 = np.fabs( + np.array(result1.boxes) - np.array(expect1["boxes"])) + diff_scores_1 = np.fabs( + np.array(result1.scores) - np.array(expect1["scores"])) + + assert diff_boxes_1.max( + ) < 1e-04, "There's difference in detection boxes 1." + assert diff_scores_1.max( + ) < 1e-05, "There's difference in detection score 1." 
+ + +if __name__ == "__main__": + test_facedet_centerface() + test_facedet_centerface_runtime() \ No newline at end of file diff --git a/tools/rknpu2/config/picodet_s_416_coco_lcnet_unquantized.yaml b/tools/rknpu2/config/picodet_s_416_coco_lcnet_unquantized.yaml index 418b99607..aeeb3fdb7 100644 --- a/tools/rknpu2/config/picodet_s_416_coco_lcnet_unquantized.yaml +++ b/tools/rknpu2/config/picodet_s_416_coco_lcnet_unquantized.yaml @@ -1,13 +1,13 @@ mean: - - - 127.5 - - 127.5 - - 127.5 + - 123.675 + - 116.28 + - 103.53 std: - - - 127.5 - - 127.5 - - 127.5 + - 58.395 + - 57.12 + - 57.375 model_path: ./picodet_s_416_coco_lcnet/picodet_s_416_coco_lcnet.onnx outputs_nodes: - 'p2o.Div.79' diff --git a/tools/rknpu2/config/yolov8_n_quantized.yaml b/tools/rknpu2/config/yolov8_n_quantized.yaml index c0811864f..bac5b8c20 100644 --- a/tools/rknpu2/config/yolov8_n_quantized.yaml +++ b/tools/rknpu2/config/yolov8_n_quantized.yaml @@ -1,5 +1,13 @@ mean: + - + - 0 + - 0 + - 0 std: + - + - 255 + - 255 + - 255 model_path: ./yolov8_n_500e_coco/yolov8_n_500e_coco.onnx outputs_nodes: - 'p2o.Mul.119' diff --git a/tools/rknpu2/config/yolov8_n_unquantized.yaml b/tools/rknpu2/config/yolov8_n_unquantized.yaml index c1a777e1d..3d1a0f053 100644 --- a/tools/rknpu2/config/yolov8_n_unquantized.yaml +++ b/tools/rknpu2/config/yolov8_n_unquantized.yaml @@ -1,9 +1,17 @@ mean: + - + - 0 + - 0 + - 0 std: + - + - 255 + - 255 + - 255 model_path: ./yolov8_n_500e_coco/yolov8_n_500e_coco.onnx outputs_nodes: - - 'p2o.Div.1' + - 'p2o.Mul.1' - 'p2o.Concat.49' do_quantization: False -dataset: "./dataset.txt" +dataset: "./yolov8_n_500e_coco/dataset.txt" output_folder: "./yolov8_n_500e_coco"
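A note on the RKNPU2 config values changed above: the new picodet mean/std are the standard ImageNet statistics (0.485/0.456/0.406 and 0.229/0.224/0.225) rescaled from the [0, 1] range to [0, 255], while the yolov8 configs (mean 0, std 255) simply scale pixels into [0, 1]. The sketch below only checks that arithmetic; it assumes the converter normalizes pixels as (pixel - mean) / std, which is not stated in this patch.

```cpp
// Sketch: verify where the picodet mean/std values in the YAML come from,
// and what the yolov8 mean=0 / std=255 setting does to a pixel.
#include <cstdio>

int main() {
  const double imagenet_mean[3] = {0.485, 0.456, 0.406};
  const double imagenet_std[3] = {0.229, 0.224, 0.225};
  for (int c = 0; c < 3; ++c) {
    // Expected output: 123.675/116.280/103.530 and 58.395/57.120/57.375,
    // matching picodet_s_416_coco_lcnet_unquantized.yaml above.
    std::printf("channel %d: mean=%.3f std=%.3f\n", c,
                imagenet_mean[c] * 255.0, imagenet_std[c] * 255.0);
  }

  // Example: normalizing one pixel with the yolov8 setting (mean 0, std 255),
  // assuming normalization is (pixel - mean) / std.
  double pixel = 128.0;
  double normalized = (pixel - 0.0) / 255.0;  // ~0.502
  std::printf("yolov8-style normalized pixel: %.3f\n", normalized);
  return 0;
}
```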