Merge branch 'develop' into matting

This commit is contained in:
huangjianhui
2023-02-14 17:39:12 +08:00
committed by GitHub
45 changed files with 1138 additions and 306 deletions


@@ -715,6 +715,16 @@ if(WITH_ASCEND)
)
endif()
if(WITH_CAPI)
install(
DIRECTORY ${PROJECT_SOURCE_DIR}/c_api/fastdeploy_capi
DESTINATION ${CMAKE_INSTALL_PREFIX}/include
FILES_MATCHING
PATTERN "*.h"
PATTERN "*/types_internal.h" EXCLUDE
)
endif()
include(${PROJECT_SOURCE_DIR}/cmake/config_cpack.cmake)
############################### Building: FastDeploy Python Wheel #############################


@@ -33,6 +33,7 @@ set(ORT_DIRECTORY "@ORT_DIRECTORY@")
set(OPENVINO_DIRECTORY "@OPENVINO_DIRECTORY@")
set(RKNN2_TARGET_SOC "@RKNN2_TARGET_SOC@")
set(WITH_KUNLUNXIN @WITH_KUNLUNXIN@)
set(WITH_CAPI @WITH_CAPI@)
# Whether to use FastDeploy static lib. The default
# value for this option is determined by the SDK
# build-time options.
@@ -357,6 +358,7 @@ message(STATUS " CMAKE_INSTALL_PREFIX : ${CMAKE_INSTALL_PREFIX}")
message(STATUS " CMAKE_MODULE_PATH : ${CMAKE_MODULE_PATH}")
message(STATUS "")
message(STATUS " WITH_GPU : ${WITH_GPU}")
message(STATUS " WITH_CAPI : ${WITH_CAPI}")
message(STATUS " ENABLE_ORT_BACKEND : ${ENABLE_ORT_BACKEND}")
message(STATUS " ENABLE_RKNPU2_BACKEND : ${ENABLE_RKNPU2_BACKEND}")
message(STATUS " ENABLE_SOPHGO_BACKEND : ${ENABLE_SOPHGO_BACKEND}")
@@ -365,6 +367,7 @@ message(STATUS " ENABLE_POROS_BACKEND : ${ENABLE_POROS_BACKEND}")
message(STATUS " ENABLE_OPENVINO_BACKEND : ${ENABLE_OPENVINO_BACKEND}")
message(STATUS " ENABLE_TRT_BACKEND : ${ENABLE_TRT_BACKEND}")
message(STATUS " ENABLE_LITE_BACKEND : ${ENABLE_LITE_BACKEND}")
if(ENABLE_PADDLE_BACKEND)
message(STATUS " Paddle Inference version : ${PADDLEINFERENCE_VERSION}")
endif()

benchmark/cpp/benchmark_ppyolov8.cc Normal file → Executable file

@@ -12,16 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "flags.h"
#include "macros.h"
#include "flags.h"
#include "option.h"
#ifdef WIN32
const char sep = '\\';
#else
const char sep = '/';
#endif
int main(int argc, char* argv[]) {
google::ParseCommandLineFlags(&argc, &argv, true);
auto im = cv::imread(FLAGS_image);
@@ -31,6 +25,7 @@ int main(int argc, char* argv[]) {
PrintUsage();
return false;
}
PrintBenchmarkInfo();
auto model_file = FLAGS_model + sep + "model.pdmodel";
auto params_file = FLAGS_model + sep + "model.pdiparams";
auto config_file = FLAGS_model + sep + "infer_cfg.yml";

benchmark/cpp/benchmark_yolov5.cc Normal file → Executable file

@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "flags.h"
#include "macros.h"
#include "flags.h"
#include "option.h"
int main(int argc, char* argv[]) {
@@ -25,6 +25,7 @@ int main(int argc, char* argv[]) {
PrintUsage();
return false;
}
PrintBenchmarkInfo();
auto model_yolov5 =
fastdeploy::vision::detection::YOLOv5(FLAGS_model, "", option);
fastdeploy::vision::DetectionResult res;


@@ -16,6 +16,12 @@
#include "gflags/gflags.h"
#ifdef WIN32
const char sep = '\\';
#else
const char sep = '/';
#endif
DEFINE_string(model, "", "Directory of the inference model.");
DEFINE_string(image, "", "Path of the image file.");
DEFINE_string(device, "cpu",
@@ -48,3 +54,35 @@ void PrintUsage() {
std::cout << "Default value of backend: default" << std::endl;
std::cout << "Default value of use_fp16: false" << std::endl;
}
void PrintBenchmarkInfo() {
// Get model name
std::vector<std::string> model_names;
fastdeploy::benchmark::Split(FLAGS_model, model_names, sep);
// Save benchmark info
std::stringstream ss;
ss.precision(3);
ss << "\n======= Model Info =======\n";
ss << "model_name: " << model_names[model_names.size() - 1] << std::endl;
ss << "profile_mode: " << FLAGS_profile_mode << std::endl;
if (FLAGS_profile_mode == "runtime") {
ss << "include_h2d_d2h: " << FLAGS_include_h2d_d2h << std::endl;
}
ss << "\n======= Backend Info =======\n";
ss << "warmup: " << FLAGS_warmup << std::endl;
ss << "repeats: " << FLAGS_repeat << std::endl;
ss << "device: " << FLAGS_device << std::endl;
if (FLAGS_device == "gpu") {
ss << "device_id: " << FLAGS_device_id << std::endl;
}
ss << "backend: " << FLAGS_backend << std::endl;
ss << "cpu_thread_nums: " << FLAGS_cpu_thread_nums << std::endl;
ss << "use_fp16: " << FLAGS_use_fp16 << std::endl;
ss << "collect_memory_info: " << FLAGS_collect_memory_info << std::endl;
if (FLAGS_collect_memory_info) {
ss << "sampling_interval: " << std::to_string(FLAGS_sampling_interval)
<< "ms" << std::endl;
}
std::cout << ss.str() << std::endl;
return;
}


@@ -18,7 +18,6 @@
#define BENCHMARK_MODEL(MODEL_NAME, BENCHMARK_FUNC) \
{ \
std::cout << "====" << #MODEL_NAME << "====" << std::endl; \
if (!MODEL_NAME.Initialized()) { \
std::cerr << "Failed to initialize." << std::endl; \
return 0; \
@@ -62,8 +61,8 @@
float __cpu_mem__ = __resource_moniter__.GetMaxCpuMem(); \
float __gpu_mem__ = __resource_moniter__.GetMaxGpuMem(); \
float __gpu_util__ = __resource_moniter__.GetMaxGpuUtil(); \
std::cout << "cpu_pss_mb: " << __cpu_mem__ << "MB." << std::endl; \
std::cout << "gpu_pss_mb: " << __gpu_mem__ << "MB." << std::endl; \
std::cout << "cpu_rss_mb: " << __cpu_mem__ << "MB." << std::endl; \
std::cout << "gpu_rss_mb: " << __gpu_mem__ << "MB." << std::endl; \
std::cout << "gpu_util: " << __gpu_util__ << std::endl; \
__resource_moniter__.Stop(); \
} \


@@ -19,6 +19,7 @@ if(NOT WITH_CAPI)
return()
endif()
configure_file(${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api/fastdeploy_capi/config.h.in ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api/fastdeploy_capi/config.h)
file(GLOB_RECURSE DEPLOY_CAPI_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api/fastdeploy_capi/*.cc)
if(NOT ENABLE_VISION)
file(GLOB_RECURSE DEPLOY_VISION_CAPI_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api/fastdeploy_capi/vision/*.cc)

c_api/fastdeploy_capi/config.h Executable file

@@ -0,0 +1,22 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef ENABLE_VISION
#define ENABLE_VISION
#endif
#ifndef ENABLE_TEXT
/* #undef ENABLE_TEXT */
#endif


@@ -0,0 +1,22 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef ENABLE_VISION
#cmakedefine ENABLE_VISION
#endif
#ifndef ENABLE_TEXT
#cmakedefine ENABLE_TEXT
#endif


@@ -0,0 +1,71 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#define FD_ENUM(type) \
typedef int32_t type; \
enum
FD_ENUM(FD_C_ModelFormat){
AUTOREC, ///< Auto recognize the model format by model file name
PADDLE, ///< Model with paddlepaddle format
ONNX, ///< Model with ONNX format
RKNN, ///< Model with RKNN format
TORCHSCRIPT, ///< Model with TorchScript format
SOPHGO, ///< Model with SOPHGO format
};
FD_ENUM(FD_C_rknpu2_CpuName){
RK356X = 0, /* run on RK356X. */
RK3588 = 1, /* default,run on RK3588. */
UNDEFINED,
};
FD_ENUM(FD_C_rknpu2_CoreMask){
RKNN_NPU_CORE_AUTO = 0, //< default, run on NPU core randomly.
RKNN_NPU_CORE_0 = 1, //< run on NPU core 0.
RKNN_NPU_CORE_1 = 2, //< run on NPU core 1.
RKNN_NPU_CORE_2 = 4, //< run on NPU core 2.
RKNN_NPU_CORE_0_1 = RKNN_NPU_CORE_0 |
RKNN_NPU_CORE_1, //< run on NPU core 0 and core 1.
RKNN_NPU_CORE_0_1_2 = RKNN_NPU_CORE_0_1 |
RKNN_NPU_CORE_2, //< run on NPU core 0, core 1 and core 2.
RKNN_NPU_CORE_UNDEFINED,
};
FD_ENUM(FD_C_LitePowerMode){
LITE_POWER_HIGH = 0, ///< Use Lite Backend with high power mode
LITE_POWER_LOW = 1, ///< Use Lite Backend with low power mode
LITE_POWER_FULL = 2, ///< Use Lite Backend with full power mode
LITE_POWER_NO_BIND = 3, ///< Use Lite Backend with no bind power mode
LITE_POWER_RAND_HIGH = 4, ///< Use Lite Backend with rand high mode
LITE_POWER_RAND_LOW = 5 ///< Use Lite Backend with rand low power mode
};
FD_ENUM(FD_C_ResultType){
UNKNOWN_RESULT,
CLASSIFY,
DETECTION,
SEGMENTATION,
OCR,
MOT,
FACE_DETECTION,
FACE_ALIGNMENT,
FACE_RECOGNITION,
MATTING,
MASK,
KEYPOINT_DETECTION,
HEADPOSE,
};
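For readers new to this header: `FD_ENUM` simply aliases the enum's name to `int32_t`, so the values cross the C ABI as plain 32-bit integers. A rough sketch of what the `FD_C_ModelFormat` definition above expands to (enumerator list abbreviated):

```c
#include <stdint.h>

typedef int32_t FD_C_ModelFormat;  // the named type is only an int32_t alias
enum { AUTOREC, PADDLE, ONNX };    // the enumerators become anonymous int constants

FD_C_ModelFormat format = PADDLE;  // values are stored and passed as int32_t
```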


@@ -58,43 +58,3 @@
typedef int8_t FD_C_Bool;
#define TRUE 1
#define FALSE 0
#define FD_ENUM(type) \
typedef int32_t type; \
enum
FD_ENUM(FD_C_ModelFormat){
AUTOREC, ///< Auto recognize the model format by model file name
PADDLE, ///< Model with paddlepaddle format
ONNX, ///< Model with ONNX format
RKNN, ///< Model with RKNN format
TORCHSCRIPT, ///< Model with TorchScript format
SOPHGO, ///< Model with SOPHGO format
};
FD_ENUM(FD_C_rknpu2_CpuName){
RK356X = 0, /* run on RK356X. */
RK3588 = 1, /* default,run on RK3588. */
UNDEFINED,
};
FD_ENUM(FD_C_rknpu2_CoreMask){
RKNN_NPU_CORE_AUTO = 0, //< default, run on NPU core randomly.
RKNN_NPU_CORE_0 = 1, //< run on NPU core 0.
RKNN_NPU_CORE_1 = 2, //< run on NPU core 1.
RKNN_NPU_CORE_2 = 4, //< run on NPU core 2.
RKNN_NPU_CORE_0_1 = RKNN_NPU_CORE_0 |
RKNN_NPU_CORE_1, //< run on NPU core 1 and core 2.
RKNN_NPU_CORE_0_1_2 = RKNN_NPU_CORE_0_1 |
RKNN_NPU_CORE_2, //< run on NPU core 1 and core 2.
RKNN_NPU_CORE_UNDEFINED,
};
FD_ENUM(FD_C_LitePowerMode){
LITE_POWER_HIGH = 0, ///< Use Lite Backend with high power mode
LITE_POWER_LOW = 1, ///< Use Lite Backend with low power mode
LITE_POWER_FULL = 2, ///< Use Lite Backend with full power mode
LITE_POWER_NO_BIND = 3, ///< Use Lite Backend with no bind power mode
LITE_POWER_RAND_HIGH = 4, ///< Use Lite Backend with rand high mode
LITE_POWER_RAND_LOW = 5 ///< Use Lite Backend with rand low power mode
};


@@ -0,0 +1,40 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy_capi/fd_type.h"
#include <opencv2/imgcodecs.hpp>
#include "fastdeploy_capi/fd_common.h"
#ifdef __cplusplus
extern "C" {
#endif
FD_C_Mat FD_C_Imread(const char* imgpath) {
cv::Mat image = cv::imread(imgpath);
return new cv::Mat(image);
}
FD_C_Bool FD_C_Imwrite(const char* savepath, FD_C_Mat img) {
cv::Mat cv_img = *(reinterpret_cast<cv::Mat*>(img));
bool result = cv::imwrite(savepath, cv_img);
return result;
}
void FD_C_DestroyMat(FD_C_Mat mat) { delete reinterpret_cast<cv::Mat*>(mat); }
#ifdef __cplusplus
}
#endif


@@ -17,7 +17,8 @@
#include <stdint.h>
#include <stdio.h>
#include "fastdeploy_capi/fd_common.h" // NOLINT
#include "fastdeploy_capi/enum_variables.h"
#include "fastdeploy_capi/fd_common.h"
typedef struct FD_C_OneDimArrayUint8 {
size_t size;
@@ -65,3 +66,19 @@ typedef struct FD_C_TwoDimArrayFloat {
} FD_C_TwoDimArrayFloat; // std::vector<std::vector<float>>
typedef void* FD_C_Mat;
#ifdef __cplusplus
extern "C" {
#endif
FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_Mat
FD_C_Imread(const char* imgpath);
FASTDEPLOY_CAPI_EXPORT extern FD_C_Bool FD_C_Imwrite(const char* savepath,
__fd_keep FD_C_Mat);
FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyMat(__fd_take FD_C_Mat mat);
#ifdef __cplusplus
}
#endif


@@ -17,7 +17,9 @@
#include "fastdeploy/utils/utils.h"
#include "fastdeploy_capi/types_internal.h"
#ifdef __cplusplus
extern "C" {
#endif
FD_C_RuntimeOptionWrapper* FD_C_CreateRuntimeOptionWrapper() {
FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper =
@@ -28,7 +30,7 @@ FD_C_RuntimeOptionWrapper* FD_C_CreateRuntimeOptionWrapper() {
return fd_c_runtime_option_wrapper;
}
void FD_C_DestroyRuntimeOption(
void FD_C_DestroyRuntimeOptionWrapper(
__fd_take FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
delete fd_c_runtime_option_wrapper;
}
@@ -404,15 +406,6 @@ void FD_C_RuntimeOptionWrapperUseIpu(
batches_per_step);
}
void FD_C_RuntimeOptionWrapperSetIpuConfig(
__fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
FD_C_Bool enable_fp16, int replica_num, float available_memory_proportion,
FD_C_Bool enable_half_partial) {
auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
fd_c_runtime_option_wrapper);
runtime_option->SetIpuConfig(enable_fp16, replica_num,
available_memory_proportion,
enable_half_partial);
#ifdef __cplusplus
}
} // extern "C"
#endif


@@ -14,9 +14,7 @@
#pragma once
#include <memory>
#include "fastdeploy_capi/fd_common.h"
#include "fastdeploy_capi/fd_type.h"
typedef struct FD_C_RuntimeOptionWrapper FD_C_RuntimeOptionWrapper;
@@ -499,19 +497,6 @@ FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseIpu(
int device_num, int micro_batch_size, FD_C_Bool enable_pipelining,
int batches_per_step);
/** \brief Set IPU config.
*
* \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
* \param[in] enable_fp16 enable fp16.
* \param[in] replica_num the number of graph replication.
* \param[in] available_memory_proportion the available memory proportion for matmul/conv.
* \param[in] enable_half_partial enable fp16 partial for matmul, only work with fp16.
*/
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetIpuConfig(
__fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
FD_C_Bool enable_fp16, int replica_num, float available_memory_proportion,
FD_C_Bool enable_half_partial);
#ifdef __cplusplus
} // extern "C"
#endif


@@ -0,0 +1,26 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "fastdeploy_capi/config.h"
#ifdef ENABLE_VISION
#include "fastdeploy_capi/vision/classification/ppcls/model.h"
#include "fastdeploy_capi/vision/detection/ppdet/model.h"
#include "fastdeploy_capi/vision/result.h"
#include "fastdeploy_capi/vision/visualize.h"
#endif
#include "fastdeploy_capi/fd_type.h"
#include "fastdeploy_capi/runtime_option.h"


@@ -16,7 +16,9 @@
#include "fastdeploy_capi/types_internal.h"
#ifdef __cplusplus
extern "C" {
#endif
FD_C_PaddleClasModelWrapper* FD_C_CreatePaddleClasModelWrapper(
const char* model_file, const char* params_file, const char* config_file,
@@ -50,4 +52,7 @@ FD_C_Bool FD_C_PaddleClasModelWrapperPredict(
ClassifyResultWrapper, fd_c_classify_result_wrapper);
return paddleclas_model->Predict(im, classify_result.get());
}
}
#ifdef __cplusplus
}
#endif
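A minimal sketch of how these classification wrappers chain together from C. `FD_C_CreateClassifyResultWrapper` is assumed by analogy with the detection API elsewhere in this diff, and the file paths are illustrative:

```c
#include "fastdeploy_capi/vision.h"

FD_C_RuntimeOptionWrapper* option = FD_C_CreateRuntimeOptionWrapper();
FD_C_RuntimeOptionWrapperUseCpu(option);
FD_C_PaddleClasModelWrapper* model = FD_C_CreatePaddleClasModelWrapper(
    "model.pdmodel", "model.pdiparams", "inference_cls.yaml", option, PADDLE);
FD_C_Mat im = FD_C_Imread("test.jpg");
// Assumed constructor, mirroring FD_C_CreateDetectionResultWrapper
FD_C_ClassifyResultWrapper* result = FD_C_CreateClassifyResultWrapper();
if (!FD_C_PaddleClasModelWrapperPredict(model, im, result)) {
  /* handle prediction failure */
}
```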


@@ -17,7 +17,9 @@
#include "fastdeploy_capi/types_internal.h"
#include "fastdeploy_capi/vision/visualize.h"
#ifdef __cplusplus
extern "C" {
#endif
FD_C_PPYOLOEWrapper* FD_C_CreatesPPYOLOEWrapper(
const char* model_file, const char* params_file, const char* config_file,
@@ -50,4 +52,7 @@ FD_C_Bool FD_C_PPYOLOEWrapperPredict(
DetectionResultWrapper, fd_c_detection_result_wrapper);
return ppyoloe_model->Predict(im, detection_result.get());
}
}
#ifdef __cplusplus
}
#endif


@@ -17,7 +17,9 @@
#include "fastdeploy/utils/utils.h"
#include "fastdeploy_capi/types_internal.h"
#ifdef __cplusplus
extern "C" {
#endif
// Classification Results
@@ -235,4 +237,6 @@ FD_C_DetectionResultWrapper* FD_C_CreateDetectionResultWrapperFromData(
return fd_c_detection_result_wrapper;
}
}
#ifdef __cplusplus
}
#endif


@@ -24,22 +24,6 @@ typedef struct FD_C_DetectionResultWrapper FD_C_DetectionResultWrapper;
extern "C" {
#endif
FD_ENUM(FD_C_ResultType){
UNKNOWN_RESULT,
CLASSIFY,
DETECTION,
SEGMENTATION,
OCR,
MOT,
FACE_DETECTION,
FACE_ALIGNMENT,
FACE_RECOGNITION,
MATTING,
MASK,
KEYPOINT_DETECTION,
HEADPOSE,
};
typedef struct FD_C_ClassifyResult {
FD_C_OneDimArrayInt32 label_ids;
FD_C_OneDimArrayFloat scores;


@@ -17,7 +17,9 @@
#include "fastdeploy/vision/visualize/visualize.h"
#include "fastdeploy_capi/types_internal.h"
#ifdef __cplusplus
extern "C" {
#endif
FD_C_Mat FD_C_VisDetection(FD_C_Mat im,
FD_C_DetectionResult* fd_c_detection_result,
@@ -32,4 +34,6 @@ FD_C_Mat FD_C_VisDetection(FD_C_Mat im,
line_size, font_size);
return new cv::Mat(result);
}
}
#ifdef __cplusplus
}
#endif


@@ -0,0 +1,13 @@
PROJECT(infer_demo C)
CMAKE_MINIMUM_REQUIRED (VERSION 3.10)
# Specify the path of the downloaded and extracted FastDeploy SDK
option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.")
include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
# Add FastDeploy dependency header files
include_directories(${FASTDEPLOY_INCS})
add_executable(infer_ppyoloe_demo ${PROJECT_SOURCE_DIR}/infer_ppyoloe.c)
target_link_libraries(infer_ppyoloe_demo ${FASTDEPLOY_LIBS})


@@ -0,0 +1,200 @@
English | [简体中文](README_CN.md)
# PaddleDetection C Deployment Example
This directory provides `infer_xxx.c` examples that quickly finish the deployment of PaddleDetection models (such as PPYOLOE) on CPU/GPU.
Before deployment, confirm the following two steps:
- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md)
- 2. Download the precompiled deployment library and samples code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../../docs/en/build_and_install/download_prebuilt_libraries.md)
Taking inference on Linux as an example, the compilation test can be completed by executing the following command in this directory. FastDeploy version 1.0.4 or above (x.x.x>=1.0.4) is required to support this model.
```bash
# PPYOLOE is taken as an example for inference deployment
mkdir build
cd build
# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
tar xvf fastdeploy-linux-x64-x.x.x.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
make -j
# Download the PPYOLOE model file and test images
wget https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco.tgz
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
tar xvf ppyoloe_crn_l_300e_coco.tgz
# CPU inference
./infer_ppyoloe_demo ./ppyoloe_crn_l_300e_coco 000000014439.jpg 0
# GPU inference
./infer_ppyoloe_demo ./ppyoloe_crn_l_300e_coco 000000014439.jpg 1
```
The above commands work on Linux and macOS. For how to use the SDK on Windows, refer to:
- [How to use FastDeploy C++ SDK in Windows](../../../../../docs/en/faq/use_sdk_on_windows.md)
## PaddleDetection C Interface
### RuntimeOption
```c
FD_C_RuntimeOptionWrapper* FD_C_CreateRuntimeOptionWrapper()
```
> Create a RuntimeOption object, and return a pointer to manipulate it.
>
> **Return**
> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): Pointer to manipulate RuntimeOption object.
```c
void FD_C_RuntimeOptionWrapperUseCpu(
FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper)
```
> Enable CPU inference.
>
> **Params**
>
> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): Pointer to manipulate RuntimeOption object.
```c
void FD_C_RuntimeOptionWrapperUseGpu(
FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
int gpu_id)
```
> Enable GPU inference.
>
> **Params**
>
> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): Pointer to manipulate RuntimeOption object.
> * **gpu_id**(int): gpu id
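A caller typically creates the option object once and then selects the device; a minimal sketch:

```c
FD_C_RuntimeOptionWrapper* option = FD_C_CreateRuntimeOptionWrapper();
FD_C_RuntimeOptionWrapperUseCpu(option);  // or: FD_C_RuntimeOptionWrapperUseGpu(option, 0);
```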
### Model
```c
FD_C_PPYOLOEWrapper* FD_C_CreatesPPYOLOEWrapper(
const char* model_file, const char* params_file, const char* config_file,
FD_C_RuntimeOptionWrapper* runtime_option,
const FD_C_ModelFormat model_format)
```
> Create a PPYOLOE model object, and return a pointer to manipulate it.
>
> **Params**
>
> * **model_file**(const char*): Model file path
> * **params_file**(const char*): Parameter file path
> * **config_file**(const char*): Configuration file path, which is the deployment yaml file exported by PaddleDetection
> * **runtime_option**(FD_C_RuntimeOptionWrapper*): Backend inference configuration (the RuntimeOption wrapper created above)
> * **model_format**(FD_C_ModelFormat): Model format. Paddle format by default
>
> **Return**
> * **fd_c_ppyoloe_wrapper**(FD_C_PPYOLOEWrapper*): Pointer to manipulate PPYOLOE object.
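For example, creating a PPYOLOE model from an exported Paddle inference directory may look like the sketch below (paths are illustrative):

```c
FD_C_PPYOLOEWrapper* model = FD_C_CreatesPPYOLOEWrapper(
    "ppyoloe_crn_l_300e_coco/model.pdmodel",
    "ppyoloe_crn_l_300e_coco/model.pdiparams",
    "ppyoloe_crn_l_300e_coco/infer_cfg.yml", option, PADDLE);
```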
#### Read and write image
```c
FD_C_Mat FD_C_Imread(const char* imgpath)
```
> Read an image, and return a pointer to cv::Mat.
>
> **Params**
>
> * **imgpath**(const char*): image path
>
> **Return**
>
> * **imgmat**(FD_C_Mat): pointer to cv::Mat object which holds the image.
```c
FD_C_Bool FD_C_Imwrite(const char* savepath, FD_C_Mat img);
```
> Write image to a file.
>
> **Params**
>
> * **savepath**(const char*): save path
> * **img**(FD_C_Mat): pointer to cv::Mat object
>
> **Return**
>
> * **result**(FD_C_Bool): bool to indicate success or failure
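A short round trip with these helpers (the image path is illustrative; FD_C_DestroyMat releases the underlying cv::Mat):

```c
FD_C_Mat im = FD_C_Imread("000000014439.jpg");
FD_C_Bool ok = FD_C_Imwrite("copy.jpg", im);  // returns TRUE on success
FD_C_DestroyMat(im);
```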
#### Prediction
```c
FD_C_Bool FD_C_PPYOLOEWrapperPredict(
__fd_take FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper, FD_C_Mat img,
FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper)
```
>
> Predict an image, and generate detection result.
>
> **Params**
> * **fd_c_ppyoloe_wrapper**(FD_C_PPYOLOEWrapper*): pointer to manipulate PPYOLOE object
> * **img**(FD_C_Mat): pointer to cv::Mat object, which can be obtained via the FD_C_Imread interface
> * **fd_c_detection_result_wrapper**(FD_C_DetectionResultWrapper*): Detection result, including detection boxes and the confidence of each box. Refer to [Vision Model Prediction Result](../../../../../docs/api/vision_results/) for a description of DetectionResult
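For example (the result wrapper is created beforehand, as described in the Result section below):

```c
FD_C_DetectionResultWrapper* result_wrapper = FD_C_CreateDetectionResultWrapper();
if (!FD_C_PPYOLOEWrapperPredict(model, im, result_wrapper)) {
  /* prediction failed */
}
```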
#### Result
```c
FD_C_DetectionResultWrapper* FD_C_CreateDetectionResultWrapper();
```
>
> Create a DetectionResult object to hold the detection result, and return a pointer to manipulate it.
>
> **Return**
> * **fd_c_detection_result_wrapper**(FD_C_DetectionResultWrapper*): pointer to manipulate DetectionResult object
```c
FD_C_DetectionResult* FD_C_DetectionResultWrapperGetData(
FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper)
```
>
> Get the C DetectionResult structure from FD_C_DetectionResultWrapper, which allows the fields to be accessed directly.
>
> **Params**
> * **fd_c_detection_result_wrapper**(FD_C_DetectionResultWrapper*): pointer to manipulate DetectionResult object
>
> **Return**
> * **fd_c_detection_result**(FD_C_DetectionResult*): pointer to C DetectionResult structure
```c
FD_C_Mat FD_C_VisDetection(FD_C_Mat im, FD_C_DetectionResult* fd_detection_result,
float score_threshold, int line_size, float font_size);
```
>
> Visualize detection results and return visualization image.
>
> **Params**
> * **im**(FD_C_Mat): pointer to input image
> * **fd_detection_result**(FD_C_DetectionResult*): pointer to C DetectionResult structure
> * **score_threshold**(float): score threshold
> * **line_size**(int): line size
> * **font_size**(float): font size
>
> **Return**
> * **vis_im**(FD_C_Mat): pointer to visualization image.
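Putting the result-handling calls together, a sketch that mirrors the full infer_ppyoloe.c example:

```c
FD_C_DetectionResult* result = FD_C_DetectionResultWrapperGetData(result_wrapper);
FD_C_Mat vis_im = FD_C_VisDetection(im, result, 0.5, 1, 0.5);
FD_C_Imwrite("vis_result.jpg", vis_im);
FD_C_DestroyMat(vis_im);
```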
- [Model Description](../../)
- [Python Deployment](../python)
- [Vision Model prediction results](../../../../../docs/api/vision_results/)
- [How to switch the model inference backend engine](../../../../../docs/en/faq/how_to_change_backend.md)


@@ -0,0 +1,204 @@
[English](README.md) | Simplified Chinese
# PaddleDetection C Deployment Example
This directory provides `infer_xxx.c` examples that call the C API to quickly deploy PaddleDetection models (such as PPYOLOE) on CPU/GPU.
Before deployment, confirm the following two steps:
- 1. The software and hardware environment meets the requirements. Refer to [FastDeploy Environment Requirements](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
- 2. Download the precompiled deployment library and sample code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
Taking inference on Linux as an example, run the following commands in this directory to complete the compilation test. FastDeploy version 1.0.4 or above (x.x.x>=1.0.4) is required to support this model.
```bash
# PPYOLOE is taken as an example for inference deployment
mkdir build
cd build
# Download the FastDeploy precompiled library. Users can choose an appropriate version from the `FastDeploy Precompiled Library` mentioned above
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
tar xvf fastdeploy-linux-x64-x.x.x.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
make -j
# Download the PPYOLOE model file and test image
wget https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco.tgz
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
tar xvf ppyoloe_crn_l_300e_coco.tgz
# CPU inference
./infer_ppyoloe_demo ./ppyoloe_crn_l_300e_coco 000000014439.jpg 0
# GPU inference
./infer_ppyoloe_demo ./ppyoloe_crn_l_300e_coco 000000014439.jpg 1
```
The above commands only work on Linux or macOS. For how to use the SDK on Windows, refer to:
- [How to use the FastDeploy C++ SDK on Windows](../../../../../docs/cn/faq/use_sdk_on_windows.md)
If you deploy with Huawei Ascend NPU, refer to the following document to initialize the deployment environment before deployment:
- [How to deploy with Huawei Ascend NPU](../../../../../docs/cn/faq/use_sdk_on_ascend.md)
## PaddleDetection C API
### Configuration
```c
FD_C_RuntimeOptionWrapper* FD_C_CreateRuntimeOptionWrapper()
```
> Create a RuntimeOption configuration object and return a pointer to manipulate it.
>
> **Return**
>
> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): Pointer to the RuntimeOption object
```c
void FD_C_RuntimeOptionWrapperUseCpu(
FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper)
```
> Enable CPU inference.
>
> **Params**
>
> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): Pointer to the RuntimeOption object
```c
void FD_C_RuntimeOptionWrapperUseGpu(
FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
int gpu_id)
```
> Enable GPU inference.
>
> **Params**
>
> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): Pointer to the RuntimeOption object
> * **gpu_id**(int): GPU device id
### Model
```c
FD_C_PPYOLOEWrapper* FD_C_CreatesPPYOLOEWrapper(
const char* model_file, const char* params_file, const char* config_file,
FD_C_RuntimeOptionWrapper* runtime_option,
const FD_C_ModelFormat model_format)
```
> Create a PPYOLOE model object and return a pointer to manipulate it.
>
> **Params**
>
> * **model_file**(const char*): Model file path
> * **params_file**(const char*): Parameter file path
> * **config_file**(const char*): Configuration file path, i.e. the deployment yaml file exported by PaddleDetection
> * **runtime_option**(FD_C_RuntimeOptionWrapper*): Pointer to the RuntimeOption wrapper, i.e. the backend inference configuration
> * **model_format**(FD_C_ModelFormat): Model format
>
> **Return**
> * **fd_c_ppyoloe_wrapper**(FD_C_PPYOLOEWrapper*): Pointer to the PPYOLOE model object
#### Read and write image
```c
FD_C_Mat FD_C_Imread(const char* imgpath)
```
> Read an image and return a pointer to cv::Mat.
>
> **Params**
>
> * **imgpath**(const char*): Image file path
>
> **Return**
>
> * **imgmat**(FD_C_Mat): Pointer to the cv::Mat object that holds the image data.
```c
FD_C_Bool FD_C_Imwrite(const char* savepath, FD_C_Mat img);
```
> Write an image to a file.
>
> **Params**
>
> * **savepath**(const char*): Path to save the image
> * **img**(FD_C_Mat): Pointer to the image data
>
> **Return**
>
> * **result**(FD_C_Bool): Whether the operation succeeded
#### Predict function
```c
FD_C_Bool FD_C_PPYOLOEWrapperPredict(
__fd_take FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper, FD_C_Mat img,
FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper)
```
>
> Model prediction interface. Input an image and the detection result is generated directly.
>
> **Params**
> * **fd_c_ppyoloe_wrapper**(FD_C_PPYOLOEWrapper*): Pointer to the PPYOLOE model
> * **img**(FD_C_Mat): Pointer to the input image (a cv::Mat object), which can be obtained by calling FD_C_Imread
> * **fd_c_detection_result_wrapper**(FD_C_DetectionResultWrapper*): Pointer to the detection result, which includes detection boxes and the confidence of each box. Refer to [Vision Model Prediction Result](../../../../../docs/api/vision_results/) for a description of DetectionResult
#### Prediction result
```c
FD_C_DetectionResultWrapper* FD_C_CreateDetectionResultWrapper();
```
>
> Create a DetectionResult object to hold the inference result, and return a pointer to the created DetectionResult object.
>
> **Return**
> * **fd_c_detection_result_wrapper**(FD_C_DetectionResultWrapper*): Pointer to the DetectionResult object
```c
FD_C_DetectionResult* FD_C_DetectionResultWrapperGetData(
FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper)
```
>
> Extract the plain C DetectionResult structure from the DetectionResult object and return a pointer to it; the fields of the structure can be accessed directly through this pointer.
>
> **Params**
> * **fd_c_detection_result_wrapper**(FD_C_DetectionResultWrapper*): Pointer to the DetectionResult object
>
> **Return**
> * **fd_c_detection_result**(FD_C_DetectionResult*): Pointer to the plain C DetectionResult structure
```c
FD_C_Mat FD_C_VisDetection(FD_C_Mat im, FD_C_DetectionResult* fd_detection_result,
float score_threshold, int line_size, float font_size);
```
>
> Visualize the detection result and return the visualization image.
>
> **Params**
> * **im**(FD_C_Mat): Pointer to the input image
> * **fd_detection_result**(FD_C_DetectionResult*): Pointer to the plain C DetectionResult structure
> * **score_threshold**(float): Score threshold for detection boxes
> * **line_size**(int): Line size of detection boxes
> * **font_size**(float): Font size of detection box labels
>
> **Return**
> * **vis_im**(FD_C_Mat): Pointer to the visualization image
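A minimal sketch that chains the calls above (see the complete, error-checked infer_ppyoloe.c example for cleanup of the created objects):

```c
#include "fastdeploy_capi/vision.h"

FD_C_RuntimeOptionWrapper* option = FD_C_CreateRuntimeOptionWrapper();
FD_C_RuntimeOptionWrapperUseCpu(option);
FD_C_PPYOLOEWrapper* model = FD_C_CreatesPPYOLOEWrapper(
    "model.pdmodel", "model.pdiparams", "infer_cfg.yml", option, PADDLE);
FD_C_Mat im = FD_C_Imread("test.jpg");
FD_C_DetectionResultWrapper* result_wrapper = FD_C_CreateDetectionResultWrapper();
if (FD_C_PPYOLOEWrapperPredict(model, im, result_wrapper)) {
  FD_C_DetectionResult* result = FD_C_DetectionResultWrapperGetData(result_wrapper);
  FD_C_Mat vis_im = FD_C_VisDetection(im, result, 0.5, 1, 0.5);
  FD_C_Imwrite("vis_result.jpg", vis_im);
}
```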
- [Model Description](../../)
- [Python Deployment](../python)
- [Vision Model Prediction Results](../../../../../docs/api/vision_results/)
- [How to switch the model inference backend engine](../../../../../docs/cn/faq/how_to_change_backend.md)


@@ -0,0 +1,124 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <stdlib.h>
#include "fastdeploy_capi/vision.h"
#ifdef WIN32
const char sep = '\\';
#else
const char sep = '/';
#endif
void CpuInfer(const char* model_dir, const char* image_file) {
char model_file[100];
char params_file[100];
char config_file[100];
int max_size = 99;
snprintf(model_file, max_size, "%s%c%s", model_dir, sep, "model.pdmodel");
snprintf(params_file, max_size, "%s%c%s", model_dir, sep, "model.pdiparams");
snprintf(config_file, max_size, "%s%c%s", model_dir, sep, "infer_cfg.yml");
FD_C_RuntimeOptionWrapper* option = FD_C_CreateRuntimeOptionWrapper();
FD_C_RuntimeOptionWrapperUseCpu(option);
FD_C_PPYOLOEWrapper* model = FD_C_CreatesPPYOLOEWrapper(
model_file, params_file, config_file, option, PADDLE);
FD_C_Mat im = FD_C_Imread(image_file);
FD_C_DetectionResultWrapper* result_wrapper =
FD_C_CreateDetectionResultWrapper();
if (!FD_C_PPYOLOEWrapperPredict(model, im, result_wrapper)) {
printf("Failed to predict.\n");
return;
}
FD_C_DetectionResult* result =
FD_C_DetectionResultWrapperGetData(result_wrapper);
FD_C_Mat vis_im = FD_C_VisDetection(im, result, 0.5, 1, 0.5);
FD_C_Imwrite("vis_result.jpg", vis_im);
printf("Visualized result saved in ./vis_result.jpg\n");
FD_C_DestroyRuntimeOptionWrapper(option);
FD_C_DestroyPPYOLOEWrapper(model);
FD_C_DestroyDetectionResultWrapper(result_wrapper);
FD_C_DestroyDetectionResult(result);
FD_C_DestroyMat(im);
FD_C_DestroyMat(vis_im);
}
void GpuInfer(const char* model_dir, const char* image_file) {
char model_file[100];
char params_file[100];
char config_file[100];
int max_size = 99;
snprintf(model_file, max_size, "%s%c%s", model_dir, sep, "model.pdmodel");
snprintf(params_file, max_size, "%s%c%s", model_dir, sep, "model.pdiparams");
snprintf(config_file, max_size, "%s%c%s", model_dir, sep, "infer_cfg.yml");
FD_C_RuntimeOptionWrapper* option = FD_C_CreateRuntimeOptionWrapper();
FD_C_RuntimeOptionWrapperUseGpu(option, 0);
FD_C_PPYOLOEWrapper* model = FD_C_CreatesPPYOLOEWrapper(
model_file, params_file, config_file, option, PADDLE);
FD_C_Mat im = FD_C_Imread(image_file);
FD_C_DetectionResultWrapper* result_wrapper =
FD_C_CreateDetectionResultWrapper();
if (!FD_C_PPYOLOEWrapperPredict(model, im, result_wrapper)) {
printf("Failed to predict.\n");
return;
}
FD_C_DetectionResult* result =
FD_C_DetectionResultWrapperGetData(result_wrapper);
FD_C_Mat vis_im = FD_C_VisDetection(im, result, 0.5, 1, 0.5);
FD_C_Imwrite("vis_result.jpg", vis_im);
printf("Visualized result saved in ./vis_result.jpg\n");
FD_C_DestroyRuntimeOptionWrapper(option);
FD_C_DestroyPPYOLOEWrapper(model);
FD_C_DestroyDetectionResultWrapper(result_wrapper);
FD_C_DestroyDetectionResult(result);
FD_C_DestroyMat(im);
FD_C_DestroyMat(vis_im);
}
int main(int argc, char* argv[]) {
if (argc < 4) {
printf(
"Usage: infer_demo path/to/model_dir path/to/image run_option, "
"e.g ./infer_model ./ppyoloe_model_dir ./test.jpeg 0"
"\n");
printf(
"The data type of run_option is int, 0: run with cpu; 1: run with gpu"
"\n");
return -1;
}
if (atoi(argv[3]) == 0) {
CpuInfer(argv[1], argv[2]);
} else if (atoi(argv[3]) == 1) {
GpuInfer(argv[1], argv[2]);
}
return 0;
}


@@ -4,12 +4,14 @@
## List of Supported Models
FastDeploy currently supports deploying the following PaddleDetection models with RKNPU2:
The PaddleDetection models that have passed testing on RKNPU2 are as follows:
- Picodet
- PPYOLOE
- PPYOLOE(int8)
- YOLOV8
For detailed speed information, see the [RKNPU2 Model Speed Table](../../../../../docs/cn/faq/rknpu2/rknpu2.md)
## Prepare and Convert PaddleDetection Deployment Models
Before deploying on RKNPU, the Paddle model needs to be converted into an RKNN model. The detailed steps are as follows:
@@ -20,8 +22,79 @@ RKNPU部署模型前需要将Paddle模型转换成RKNN模型具体步骤如
## Model Conversion Examples
- [Picodet RKNPU2 model conversion document](./picodet.md)
- [YOLOv8 RKNPU2 model conversion document](./yolov8.md)
### Notes
Pay attention to the following points when deploying PaddleDetection models on RKNPU2:
* The exported model needs to include the Decode step
* Since RKNPU2 does not support NMS, the output nodes must be truncated before the NMS node
* Due to the limitation of the RKNPU2 Div operator, the output nodes need to be truncated before the Div operator
### Convert a Paddle Model to an ONNX Model
The rknn-toolkit2 tool provided by Rockchip does not yet support exporting Paddle models directly to RKNN, so the Paddle model first needs to be exported to ONNX, and the ONNX model is then converted to an RKNN model.
```bash
# Take Picodet as an example
# Download the Paddle static graph model and extract it
wget https://paddledet.bj.bcebos.com/deploy/Inference/picodet_s_416_coco_lcnet.tar
tar xvf picodet_s_416_coco_lcnet.tar
# Convert the static graph model to ONNX; note that save_file here should match the archive name
paddle2onnx --model_dir picodet_s_416_coco_lcnet \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--save_file picodet_s_416_coco_lcnet/picodet_s_416_coco_lcnet.onnx \
--enable_dev_version True
# Fix the input shape
python -m paddle2onnx.optimize --input_model picodet_s_416_coco_lcnet/picodet_s_416_coco_lcnet.onnx \
--output_model picodet_s_416_coco_lcnet/picodet_s_416_coco_lcnet.onnx \
--input_shape_dict "{'image':[1,3,416,416]}"
```
### Write the yaml Config File
**Modify the normalize parameters**
If you need to run the normalize operation on the NPU, configure the normalize parameters according to your model, for example:
```yaml
mean:
-
- 123.675
- 116.28
- 103.53
std:
-
- 58.395
- 57.12
- 57.375
```
**Modify the outputs parameters**
The output node names of the converted model vary with the Paddle2ONNX version. Use [Netron](https://netron.app) to visualize the model and locate the NonMaxSuppression node marked by the blue box below; the node names marked by the red boxes are the target names.
For example, the following picture is obtained after visualization with Netron:
![](https://user-images.githubusercontent.com/58363586/212599781-e1952da7-6eae-4951-8ca7-bab7e6940692.png)
Locate the NonMaxSuppression node marked by the blue box. The two node names marked by the red boxes are p2o.Div.79 and p2o.Concat.9, so the outputs parameter needs to be modified accordingly:
```yaml
outputs_nodes:
- 'p2o.Mul.179'
- 'p2o.Concat.9'
```
### Convert the ONNX Model to an RKNN Model
To make this easier, we provide a Python script. With our preconfigured config file, you can quickly convert an ONNX model to an RKNN model:
```bash
python tools/rknpu2/export.py --config_path tools/rknpu2/config/picodet_s_416_coco_lcnet_unquantized.yaml \
--target_platform rk3588
```
## Other Links


@@ -1,68 +0,0 @@
# Picodet RKNPU2模型转换文档
以下步骤均在Ubuntu电脑上完成请参考配置文档完成转换模型环境配置。下面以Picodet-s为例子,教大家如何转换PaddleDetection模型到RKNN模型。
### 导出ONNX模型
```bash
# 下载Paddle静态图模型并解压
wget https://paddledet.bj.bcebos.com/deploy/Inference/picodet_s_416_coco_lcnet.tar
tar xvf picodet_s_416_coco_lcnet.tar
# 静态图转ONNX模型注意这里的save_file请和压缩包名对齐
paddle2onnx --model_dir picodet_s_416_coco_lcnet \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--save_file picodet_s_416_coco_lcnet/picodet_s_416_coco_lcnet.onnx \
--enable_dev_version True
# 固定shape
python -m paddle2onnx.optimize --input_model picodet_s_416_coco_lcnet/picodet_s_416_coco_lcnet.onnx \
--output_model picodet_s_416_coco_lcnet/picodet_s_416_coco_lcnet.onnx \
--input_shape_dict "{'image':[1,3,416,416]}"
```
### 编写模型导出配置文件
以转化RK3568的RKNN模型为例子我们需要编辑tools/rknpu2/config/picodet_s_416_coco_lcnet_unquantized.yaml来转换ONNX模型到RKNN模型。
**修改normalize参数**
如果你需要在NPU上执行normalize操作请根据你的模型配置normalize参数例如:
```yaml
mean:
-
- 127.5
- 127.5
- 127.5
std:
-
- 127.5
- 127.5
- 127.5
```
**修改outputs参数**
由于Paddle2ONNX版本的不同转换模型的输出节点名称也有所不同请使用[Netron](https://netron.app)对模型进行可视化并找到以下蓝色方框标记的NonMaxSuppression节点红色方框的节点名称即为目标名称。
例如使用Netron可视化后得到以下图片:
![](https://user-images.githubusercontent.com/58363586/212599781-e1952da7-6eae-4951-8ca7-bab7e6940692.png)
找到蓝色方框标记的NonMaxSuppression节点可以看到红色方框标记的两个节点名称为p2o.Div.79和p2o.Concat.9,因此需要修改outputs参数修改后如下:
```yaml
outputs_nodes: [ 'p2o.Div.79','p2o.Concat.9' ]
```
### 转换模型
```bash
# ONNX模型转RKNN模型
# 转换模型,模型将生成在picodet_s_320_coco_lcnet_non_postprocess目录下
python tools/rknpu2/export.py --config_path tools/rknpu2/config/picodet_s_416_coco_lcnet_unquantized.yaml \
--target_platform rk3588
```


@@ -45,15 +45,16 @@ if __name__ == "__main__":
# Configure the runtime and load the model
runtime_option = fd.RuntimeOption()
runtime_option.use_cpu()
runtime_option.use_rknpu2()
model = fd.vision.detection.PPYOLOE(
model_file,
params_file,
config_file,
runtime_option=runtime_option,
model_format=fd.ModelFormat.ONNX)
model_format=fd.ModelFormat.RKNN)
model.preprocessor.disable_normalize()
model.preprocessor.disable_permute()
model.postprocessor.apply_decode_and_nms()
# Run prediction on the image


@@ -1,50 +0,0 @@
# YOLOv8 RKNPU2模型转换文档
以下步骤均在Ubuntu电脑上完成请参考配置文档完成转换模型环境配置。下面以yolov8为例子,教大家如何转换PaddleDetection模型到RKNN模型。
### 导出ONNX模型
```bash
# 下载Paddle静态图模型并解压
# 静态图转ONNX模型注意这里的save_file请和压缩包名对齐
paddle2onnx --model_dir yolov8_n_500e_coco \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--save_file yolov8_n_500e_coco/yolov8_n_500e_coco.onnx \
--enable_dev_version True
# 固定shape
python -m paddle2onnx.optimize --input_model yolov8_n_500e_coco/yolov8_n_500e_coco.onnx \
--output_model yolov8_n_500e_coco/yolov8_n_500e_coco.onnx \
--input_shape_dict "{'image':[1,3,640,640],'scale_factor':[1,2]}"
```
### 编写模型导出配置文件
**修改outputs参数**
由于Paddle2ONNX版本的不同转换模型的输出节点名称也有所不同请使用[Netron](https://netron.app)对模型进行可视化并找到以下蓝色方框标记的NonMaxSuppression节点红色方框的节点名称即为目标名称。
例如使用Netron可视化后得到以下图片:
![](https://user-images.githubusercontent.com/58363586/212599658-8a2c4b79-f59a-40b5-ade7-f77c6fcfdf2a.png)
找到蓝色方框标记的NonMaxSuppression节点可以看到红色方框标记的两个节点名称为p2o.Div.1和p2o.Concat.9,因此需要修改outputs参数修改后如下:
```yaml
outputs_nodes: [ 'p2o.Div.1','p2o.Concat.49' ]
```
### 转换模型
```bash
# ONNX模型转RKNN模型
# 转换非全量化模型,模型将生成在yolov8_n目录下
python tools/rknpu2/export.py --config_path tools/rknpu2/config/yolov8_n_unquantized.yaml \
--target_platform rk3588
# 转换全量化模型,模型将生成在yolov8_n目录下
python tools/rknpu2/export.py --config_path tools/rknpu2/config/yolov8_n_quantized.yaml \
--target_platform rk3588
```


@@ -13,8 +13,8 @@
// limitations under the License.
#include <sys/types.h>
#if defined(__linux__) || defined(__ANDROID__)
#include <unistd.h>
#ifdef __linux__
#include <sys/resource.h>
#endif
#include <cmath>
@@ -23,8 +23,7 @@
namespace fastdeploy {
namespace benchmark {
// Remove the ch characters at both ends of str
static std::string strip(const std::string& str, char ch = ' ') {
std::string Strip(const std::string& str, char ch) {
int i = 0;
while (str[i] == ch) {
i++;
@@ -36,9 +35,8 @@ static std::string strip(const std::string& str, char ch = ' ') {
return str.substr(i, j + 1 - i);
}
// Split string
static void split(const std::string& s, std::vector<std::string>& tokens,
char delim = ' ') {
void Split(const std::string& s, std::vector<std::string>& tokens,
char delim) {
tokens.clear();
size_t lastPos = s.find_first_not_of(delim, 0);
size_t pos = s.find(delim, lastPos);
@@ -54,7 +52,7 @@ ResourceUsageMonitor::ResourceUsageMonitor(int sampling_interval_ms, int gpu_id)
: is_supported_(false),
sampling_interval_(sampling_interval_ms),
gpu_id_(gpu_id) {
#if defined(__linux__) || defined(__ANDROID__)
#ifdef __linux__
is_supported_ = true;
#else
is_supported_ = false;
@@ -67,7 +65,9 @@ ResourceUsageMonitor::ResourceUsageMonitor(int sampling_interval_ms, int gpu_id)
}
void ResourceUsageMonitor::Start() {
if (!is_supported_) return;
if (!is_supported_) {
return;
}
if (check_memory_thd_ != nullptr) {
FDINFO << "Memory monitoring has already started!" << std::endl;
return;
@@ -77,20 +77,24 @@ void ResourceUsageMonitor::Start() {
check_memory_thd_.reset(new std::thread(([this]() {
// Note we retrieve the memory usage at the very beginning of the thread.
while (true) {
std::string cpu_mem_info = GetCurrentCpuMemoryInfo();
// get max_cpu_mem
std::vector<std::string> cpu_tokens;
split(cpu_mem_info, cpu_tokens, ' ');
max_cpu_mem_ = std::max(max_cpu_mem_, stof(cpu_tokens[3]) / 1024);
#ifdef __linux__
rusage res;
if (getrusage(RUSAGE_SELF, &res) == 0) {
max_cpu_mem_ =
std::max(max_cpu_mem_, static_cast<float>(res.ru_maxrss / 1024.0));
}
#endif
#if defined(WITH_GPU)
std::string gpu_mem_info = GetCurrentGpuMemoryInfo(gpu_id_);
// get max_gpu_mem and max_gpu_util
std::vector<std::string> gpu_tokens;
split(gpu_mem_info, gpu_tokens, ',');
Split(gpu_mem_info, gpu_tokens, ',');
max_gpu_mem_ = std::max(max_gpu_mem_, stof(gpu_tokens[6]));
max_gpu_util_ = std::max(max_gpu_util_, stof(gpu_tokens[7]));
#endif
if (stop_signal_) break;
if (stop_signal_) {
break;
}
std::this_thread::sleep_for(
std::chrono::milliseconds(sampling_interval_));
}
@@ -121,26 +125,6 @@ void ResourceUsageMonitor::StopInternal() {
check_memory_thd_.reset(nullptr);
}
std::string ResourceUsageMonitor::GetCurrentCpuMemoryInfo() {
std::string result = "";
#if defined(__linux__) || defined(__ANDROID__)
int iPid = static_cast<int>(getpid());
std::string command = "pmap -x " + std::to_string(iPid) + " | grep total";
FILE* pp = popen(command.data(), "r");
if (!pp) return "";
char tmp[1024];
while (fgets(tmp, sizeof(tmp), pp) != NULL) {
result += tmp;
}
pclose(pp);
#else
FDASSERT(false,
"Currently collect cpu memory info only supports Linux and ANDROID.")
#endif
return result;
}
std::string ResourceUsageMonitor::GetCurrentGpuMemoryInfo(int device_id) {
std::string result = "";
#if defined(__linux__) && defined(WITH_GPU)


@@ -65,20 +65,26 @@ class FASTDEPLOY_DECL ResourceUsageMonitor {
private:
void StopInternal();
// Get current cpu memory info
std::string GetCurrentCpuMemoryInfo();
// Get current gpu memory info
std::string GetCurrentGpuMemoryInfo(int device_id);
bool is_supported_ = false;
bool stop_signal_ = false;
const int sampling_interval_;
float max_cpu_mem_ = 0.0f;
float max_gpu_mem_ = 0.0f;
float max_cpu_mem_ = 0.0f; // MB
float max_gpu_mem_ = 0.0f; // MB
float max_gpu_util_ = 0.0f;
const int gpu_id_ = 0;
std::unique_ptr<std::thread> check_memory_thd_ = nullptr;
};
// Remove the ch characters at both ends of str
FASTDEPLOY_DECL std::string Strip(const std::string& str, char ch = ' ');
// Split string
FASTDEPLOY_DECL void Split(const std::string& s,
std::vector<std::string>& tokens,
char delim = ' ');
} // namespace benchmark
} // namespace fastdeploy


@@ -15,9 +15,9 @@
#include <dlpack/dlpack.h>
#include "fastdeploy/core/fd_type.h"
#include "fastdeploy/utils/utils.h"
#include "fastdeploy/fastdeploy_model.h"
#include "fastdeploy/pybind/main.h"
#include "fastdeploy/utils/utils.h"
namespace fastdeploy {
@@ -68,8 +68,8 @@ DLDataType FDToDlpackType(FDDataType fd_dtype) {
break;
default:
FDASSERT(false,
"Convert to DlPack, FDType \"%s\" is not supported.", Str(fd_dtype).c_str());
FDASSERT(false, "Convert to DlPack, FDType \"%s\" is not supported.",
Str(fd_dtype).c_str());
}
dl_dtype.code = dl_code;
@@ -77,10 +77,8 @@ DLDataType FDToDlpackType(FDDataType fd_dtype) {
return dl_dtype;
}
FDDataType
DlpackToFDType(const DLDataType& data_type) {
FDASSERT(data_type.lanes == 1,
"FDTensor does not support dlpack lanes != 1")
FDDataType DlpackToFDType(const DLDataType& data_type) {
FDASSERT(data_type.lanes == 1, "FDTensor does not support dlpack lanes != 1")
if (data_type.code == DLDataTypeCode::kDLFloat) {
if (data_type.bits == 16) {
@@ -152,7 +150,7 @@ pybind11::capsule FDTensorToDLPack(FDTensor& fd_tensor) {
dlpack_tensor->dl_tensor.dtype = FDToDlpackType(fd_tensor.dtype);
dlpack_tensor->dl_tensor.device.device_id = fd_tensor.device_id;
if(fd_tensor.device == Device::GPU) {
if (fd_tensor.device == Device::GPU) {
if (fd_tensor.is_pinned_memory) {
dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCUDAHost;
} else {
@@ -162,8 +160,8 @@ pybind11::capsule FDTensorToDLPack(FDTensor& fd_tensor) {
dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCPU;
}
return pybind11::capsule(
static_cast<void*>(dlpack_tensor), "dltensor", &DeleteUnusedDltensor);
return pybind11::capsule(static_cast<void*>(dlpack_tensor), "dltensor",
&DeleteUnusedDltensor);
}
FDTensor FDTensorFromDLPack(const std::string& name,
@@ -178,9 +176,8 @@ FDTensor FDTensorFromDLPack(const std::string& name,
int64_t* strides = dl_managed_tensor->dl_tensor.strides;
int ndim = dl_managed_tensor->dl_tensor.ndim;
std::vector<int64_t> dims(
dl_managed_tensor->dl_tensor.shape,
dl_managed_tensor->dl_tensor.shape + ndim);
std::vector<int64_t> dims(dl_managed_tensor->dl_tensor.shape,
dl_managed_tensor->dl_tensor.shape + ndim);
// Check if the input is contiguous and in C order
if (strides != nullptr) {
@@ -196,8 +193,8 @@ FDTensor FDTensorFromDLPack(const std::string& name,
}
FDASSERT(is_contiguous_c_order,
"DLPack tensor is not contiguous. Only contiguous DLPack "
"tensors that are stored in C-Order are supported.");
"DLPack tensor is not contiguous. Only contiguous DLPack "
"tensors that are stored in C-Order are supported.");
}
Device device;
@@ -216,21 +213,20 @@ FDTensor FDTensorFromDLPack(const std::string& name,
is_pinned_memory = true;
break;
default:
FDASSERT(false,
FDASSERT(
false,
("DLDevice type " +
std::to_string(dl_managed_tensor->dl_tensor.device.device_type) +
" is not support by Python backend.").c_str());
std::to_string(dl_managed_tensor->dl_tensor.device.device_type) +
" is not support by Python backend.")
.c_str());
break;
}
FDDataType dtype =
DlpackToFDType(dl_managed_tensor->dl_tensor.dtype);
FDDataType dtype = DlpackToFDType(dl_managed_tensor->dl_tensor.dtype);
PyCapsule_SetName(dlpack_tensor.ptr(), "used_dlpack");
FDTensor fd_tensor(name);
fd_tensor.SetExternalData(
dims, dtype, memory_ptr, device, device_id
);
fd_tensor.SetExternalData(dims, dtype, memory_ptr, device, device_id);
fd_tensor.is_pinned_memory = is_pinned_memory;
return fd_tensor;
}
@@ -242,15 +238,52 @@ void BindFDTensor(pybind11::module& m) {
.def_readonly("shape", &FDTensor::shape)
.def_readonly("dtype", &FDTensor::dtype)
.def_readonly("device", &FDTensor::device)
.def("numpy", [](FDTensor& self) {
return TensorToPyArray(self);
})
.def("numpy", [](FDTensor& self) { return TensorToPyArray(self); })
.def("data", &FDTensor::MutableData)
.def("from_numpy", [](FDTensor& self, pybind11::array& pyarray, bool share_buffer = false) {
PyArrayToTensor(pyarray, &self, share_buffer);
})
.def("from_numpy",
[](FDTensor& self, pybind11::array& pyarray,
bool share_buffer = false) {
PyArrayToTensor(pyarray, &self, share_buffer);
})
.def("from_external_data",
[](const std::string& name, size_t data_addr,
const std::vector<int64_t>& shape, const std::string& data_type,
const std::string& data_place, int device_id) {
auto fd_data_type = FDDataType::UNKNOWN1;
if (data_type == "FP32") {
fd_data_type = FDDataType::FP32;
} else if (data_type == "FP16") {
fd_data_type = FDDataType::FP16;
} else if (data_type == "INT32") {
fd_data_type = FDDataType::INT32;
} else if (data_type == "INT64") {
fd_data_type = FDDataType::INT64;
} else {
FDASSERT(false,
"FDTensor.from_external_data, datatype \"%s\" is not "
"supported.",
data_type.c_str());
}
Device fd_data_place;
if (data_place.find("gpu") != data_place.npos) {
fd_data_place = Device::GPU;
} else {
FDASSERT(false,
("Device type " + data_place +
" is not support by FDTensor.from_external_data.")
.c_str());
}
void* data_ptr = nullptr;
data_ptr = reinterpret_cast<void*>(data_addr);
FDTensor fd_tensor(name);
fd_tensor.SetExternalData(shape, fd_data_type,
static_cast<void*>(data_ptr),
fd_data_place, device_id);
return fd_tensor;
})
.def("to_dlpack", &FDTensorToDLPack)
.def("from_dlpack",&FDTensorFromDLPack)
.def("from_dlpack", &FDTensorFromDLPack)
.def("print_info", &FDTensor::PrintInfo);
}


@@ -110,6 +110,7 @@ void BindRuntime(pybind11::module& m) {
return outputs;
})
.def("bind_input_tensor", &Runtime::BindInputTensor)
.def("bind_output_tensor", &Runtime::BindOutputTensor)
.def("infer", [](Runtime& self) { self.Infer(); })
.def("get_output_tensor",
[](Runtime& self, const std::string& name) {


@@ -25,6 +25,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
if (option.device == Device::GPU) {
config_.EnableUseGpu(option.gpu_mem_init_size, option.device_id);
if (option_.external_stream_) {
FDINFO << "Will use external stream for Paddle Backend." << std::endl;
config_.SetExecStream(option_.external_stream_);
}
if (option.enable_trt) {
@@ -226,23 +227,47 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
<< inputs_desc_.size() << ")." << std::endl;
return false;
}
// output share backend memory only support CPU or GPU
if (option_.device == Device::IPU) {
copy_to_fd = true;
}
RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
for (size_t i = 0; i < inputs.size(); ++i) {
auto handle = predictor_->GetInputHandle(inputs[i].name);
ShareTensorFromFDTensor(handle.get(), inputs[i]);
}
std::unordered_set<std::string> prebinded_output_name;
// prebinded outputs are only supported on GPU
if (!copy_to_fd) {
for (size_t i = 0; i < (*outputs).size(); ++i) {
auto output_name = (*outputs)[i].name;
// if an output is not prebinded,
// the name of the output is expected to be empty.
// We skip it here.
if (output_name.empty()) {
continue;
}
// Record the prebinded output_name.
// Those outputs do not need PaddleTensorToFDTensor
// after predictor_.Run()
prebinded_output_name.insert(output_name);
auto handle = predictor_->GetOutputHandle(output_name);
ShareOutTensorFromFDTensor(handle.get(), (*outputs)[i]);
}
}
RUNTIME_PROFILE_LOOP_BEGIN(1)
predictor_->Run();
RUNTIME_PROFILE_LOOP_END
// output share backend memory only support CPU or GPU
if (option_.device == Device::IPU) {
copy_to_fd = true;
}
outputs->resize(outputs_desc_.size());
for (size_t i = 0; i < outputs_desc_.size(); ++i) {
// skip prebinded output
if (copy_to_fd == false &&
prebinded_output_name.count(outputs_desc_[i].name)) {
continue;
}
auto handle = predictor_->GetOutputHandle(outputs_desc_[i].name);
if (copy_to_fd) {
(*outputs)[i].is_pinned_memory = option_.enable_pinned_memory;


@@ -35,6 +35,9 @@ paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device);
// Share memory buffer with paddle_infer::Tensor from fastdeploy::FDTensor
void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor, FDTensor& fd_tensor);
void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
FDTensor& fd_tensor);
// convert paddle_infer::Tensor to fastdeploy::FDTensor
// if copy_to_fd is true, copy memory data to FDTensor
/// else share memory to FDTensor
@@ -89,4 +92,4 @@ class PaddleBackend : public BaseBackend {
std::vector<TensorInfo> inputs_desc_;
std::vector<TensorInfo> outputs_desc_;
};
} // namespace fastdeploy
} // namespace fastdeploy


@@ -61,6 +61,43 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
Str(fd_tensor.dtype).c_str());
}
void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
FDTensor& fd_tensor) {
std::vector<int> shape(fd_tensor.shape.begin(), fd_tensor.shape.end());
auto place = ConvertFDDeviceToPlace(fd_tensor.device);
if (fd_tensor.dtype == FDDataType::FP32) {
if (place == paddle_infer::PlaceType::kGPU) {
tensor->ShareExternalData(static_cast<float*>(fd_tensor.MutableData()),
shape, place);
} else {
tensor->CopyToCpu(static_cast<float*>(fd_tensor.MutableData()));
}
return;
} else if (fd_tensor.dtype == FDDataType::INT32) {
if (place == paddle_infer::PlaceType::kGPU) {
tensor->ShareExternalData(static_cast<int32_t*>(fd_tensor.MutableData()),
shape, place);
} else {
tensor->CopyToCpu(static_cast<int32_t*>(fd_tensor.MutableData()));
}
return;
} else if (fd_tensor.dtype == FDDataType::INT64) {
if (place == paddle_infer::PlaceType::kGPU) {
tensor->ShareExternalData(static_cast<int64_t*>(fd_tensor.MutableData()),
shape, place);
} else {
tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor.MutableData()));
}
return;
} else if (fd_tensor.dtype == FDDataType::UINT8) {
tensor->ShareExternalData(static_cast<uint8_t*>(fd_tensor.MutableData()),
shape, paddle_infer::PlaceType::kCPU);
return;
}
FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
Str(fd_tensor.dtype).c_str());
}
void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
FDTensor* fd_tensor, bool copy_to_fd) {
auto fd_dtype = PaddleDataTypeToFD(tensor->type());


@@ -67,6 +67,7 @@ bool RKNPU2Backend::GetSDKAndDeviceVersion() {
***************************************************************/
void RKNPU2Backend::BuildOption(const RKNPU2BackendOption& option) {
this->option_ = option;
// save cpu_name
this->option_.cpu_name = option.cpu_name;
@@ -123,7 +124,7 @@ bool RKNPU2Backend::InitFromRKNN(const std::string& model_file,
* @return bool
* @note Only support RK3588
***************************************************************/
bool RKNPU2Backend::SetCoreMask(rknpu2::CoreMask& core_mask) const {
bool RKNPU2Backend::SetCoreMask(const rknpu2::CoreMask& core_mask) {
int ret = rknn_set_core_mask(ctx, static_cast<rknn_core_mask>(core_mask));
if (ret != RKNN_SUCC) {
FDERROR << "rknn_set_core_mask fail! ret=" << ret << std::endl;


@@ -25,7 +25,7 @@
namespace fastdeploy {
struct RKNPU2BackendOption {
rknpu2::CpuName cpu_name = rknpu2::CpuName::RK3588;
rknpu2::CpuName cpu_name = rknpu2::CpuName::RK356X;
// The specification of NPU core setting. It has the following choices:
// RKNN_NPU_CORE_AUTO : Referring to automatic mode, meaning that it will
@@ -49,7 +49,7 @@ class RKNPU2Backend : public BaseBackend {
bool GetSDKAndDeviceVersion();
bool SetCoreMask(rknpu2::CoreMask& core_mask) const;
bool SetCoreMask(const rknpu2::CoreMask& core_mask);
bool GetModelInputOutputInfos();


@@ -49,6 +49,10 @@ void BindOption(pybind11::module& m) {
.def_readwrite("poros_option", &RuntimeOption::poros_option)
.def_readwrite("paddle_infer_option", &RuntimeOption::paddle_infer_option)
.def("set_external_stream", &RuntimeOption::SetExternalStream)
.def("set_external_raw_stream",
[](RuntimeOption& self, size_t external_stream) {
self.SetExternalStream(reinterpret_cast<void*>(external_stream));
})
.def("set_cpu_thread_num", &RuntimeOption::SetCpuThreadNum)
.def("use_paddle_backend", &RuntimeOption::UsePaddleBackend)
.def("use_poros_backend", &RuntimeOption::UsePorosBackend)


@@ -224,6 +224,25 @@ void Runtime::BindInputTensor(const std::string& name, FDTensor& input) {
}
}
void Runtime::BindOutputTensor(const std::string& name, FDTensor& output) {
bool is_exist = false;
for (auto& t : output_tensors_) {
if (t.name == name) {
FDINFO << "The output name [" << name << "] is exist." << std::endl;
is_exist = true;
t.SetExternalData(output.shape, output.dtype, output.MutableData(),
output.device, output.device_id);
break;
}
}
if (!is_exist) {
FDINFO << "The output name [" << name << "] is prebinded added into output tensor list." << std::endl;
FDTensor new_tensor(name);
new_tensor.SetExternalData(output.shape, output.dtype, output.MutableData(),
output.device, output.device_id);
output_tensors_.emplace_back(std::move(new_tensor));
}
}
FDTensor* Runtime::GetOutputTensor(const std::string& name) {
for (auto& t : output_tensors_) {
if (t.name == name) {


@@ -75,6 +75,12 @@ struct FASTDEPLOY_DECL Runtime {
/** \brief Bind FDTensor by name, no copy and share input memory
*/
void BindInputTensor(const std::string& name, FDTensor& input);
/** \brief Bind FDTensor by name, no copy and share output memory.
* Please make sure the shape of the output tensor is set correctly.
*/
void BindOutputTensor(const std::string& name, FDTensor& output);
/** \brief Get output FDTensor by name, no copy and share backend output memory
*/
FDTensor* GetOutputTensor(const std::string& name);


@@ -71,9 +71,9 @@ struct FASTDEPLOY_DECL RuntimeOption {
void UseGpu(int gpu_id = 0);
/// Use RKNPU2 e.g RK3588/RK356X to inference
void UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name =
fastdeploy::rknpu2::CpuName::RK3588,
fastdeploy::rknpu2::CpuName::RK356X,
fastdeploy::rknpu2::CoreMask rknpu2_core =
fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_0);
fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO);
/// Use TimVX e.g RV1126/A311D to inference
void UseTimVX();
/// Use Huawei Ascend to inference
@@ -110,6 +110,7 @@ struct FASTDEPLOY_DECL RuntimeOption {
bool enable_multi_stream = false);
void SetExternalStream(void* external_stream);
/*
* @brief Set number of cpu threads while inference on CPU, by default it will be decided by the different backends
*/


@@ -72,6 +72,14 @@ class Runtime:
"""
self._runtime.bind_input_tensor(name, fdtensor)
def bind_output_tensor(self, name, fdtensor):
"""Bind FDTensor by name, no copy and share output memory
:param name: (str)The name of output data.
:param fdtensor: (fastdeploy.FDTensor)The output FDTensor.
"""
self._runtime.bind_output_tensor(name, fdtensor)
def zero_copy_infer(self):
"""No params inference the model.
@@ -657,6 +665,11 @@ class RuntimeOption:
"""
return self._option.disable_profiling()
def set_external_raw_stream(self, cuda_stream):
"""Set the external raw stream used by fastdeploy runtime.
"""
self._option.set_external_raw_stream(cuda_stream)
def __repr__(self):
attrs = dir(self._option)
message = "RuntimeOption(\n"


@@ -10,7 +10,7 @@ std:
- 57.375
model_path: ./picodet_s_416_coco_lcnet/picodet_s_416_coco_lcnet.onnx
outputs_nodes:
- 'p2o.Div.79'
- 'p2o.Mul.179'
- 'p2o.Concat.9'
do_quantization: False
dataset:


@@ -0,0 +1,17 @@
mean:
-
- 0
- 0
- 0
std:
-
- 255
- 255
- 255
model_path: ./ppyoloe_plus_crn_s_80e_coco/ppyoloe_plus_crn_s_80e_coco.onnx
outputs_nodes:
- 'p2o.Mul.224'
- 'p2o.Concat.29'
do_quantization: True
dataset: "./ppyoloe_plus_crn_s_80e_coco/dataset.txt"
output_folder: "./ppyoloe_plus_crn_s_80e_coco"