Merge branch 'develop' of https://github.com/paddlepaddle/fastdeploy into set_stream_infer-shareExData
CMakeLists.txt
@@ -73,6 +73,7 @@ option(WITH_ASCEND "Whether to compile for Huawei Ascend deploy." OFF)
 option(WITH_TIMVX "Whether to compile for TIMVX deploy." OFF)
 option(WITH_KUNLUNXIN "Whether to compile for KunlunXin XPU deploy." OFF)
 option(WITH_TESTING "Whether to compile with unittest." OFF)
+option(WITH_CAPI "Whether to compile with c api." OFF)
 
 ############################# Options for Android cross compiling #########################
 if(ANDROID)
@@ -416,6 +417,14 @@ if(ENABLE_PADDLE2ONNX)
   list(APPEND DEPEND_LIBS external_paddle2onnx)
 endif(ENABLE_PADDLE2ONNX)
 
+if(WITH_CAPI)
+  include(${PROJECT_SOURCE_DIR}/c_api/CMakeLists.txt)
+  if(MSVC)
+    add_definitions(-DFD_CAPI)
+  endif()
+endif()
+
+
 configure_file(${PROJECT_SOURCE_DIR}/FastDeploy.cmake.in ${PROJECT_SOURCE_DIR}/FastDeploy.cmake @ONLY)
 configure_file(${PROJECT_SOURCE_DIR}/python/fastdeploy/c_lib_wrap.py.in ${PROJECT_SOURCE_DIR}/python/fastdeploy/c_lib_wrap.py)
 configure_file(${PROJECT_SOURCE_DIR}/python/scripts/process_libraries.py.in ${PROJECT_SOURCE_DIR}/python/scripts/process_libraries.py)
benchmark/cpp/CMakeLists.txt
@@ -9,9 +9,12 @@ include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
 include_directories(${FASTDEPLOY_INCS})
 
 add_executable(benchmark_yolov5 ${PROJECT_SOURCE_DIR}/benchmark_yolov5.cc)
+add_executable(benchmark_ppyolov8 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov8.cc)
 
 if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
   target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags pthread)
+  target_link_libraries(benchmark_ppyolov8 ${FASTDEPLOY_LIBS} gflags pthread)
 else()
   target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags)
+  target_link_libraries(benchmark_ppyolov8 ${FASTDEPLOY_LIBS} gflags)
 endif()
benchmark/cpp/benchmark_ppyolov8.cc (new file, 125 lines)
@@ -0,0 +1,125 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy/benchmark/utils.h"
#include "fastdeploy/vision.h"
#include "flags.h"

#ifdef WIN32
const char sep = '\\';
#else
const char sep = '/';
#endif

bool RunModel(std::string model_dir, std::string image_file, size_t warmup,
              size_t repeats, size_t dump_period, std::string cpu_mem_file_name,
              std::string gpu_mem_file_name) {
  // Initialization
  auto option = fastdeploy::RuntimeOption();
  if (!CreateRuntimeOption(&option)) {
    PrintUsage();
    return false;
  }
  auto model_file = model_dir + sep + "model.pdmodel";
  auto params_file = model_dir + sep + "model.pdiparams";
  auto config_file = model_dir + sep + "infer_cfg.yml";

  if (FLAGS_profile_mode == "runtime") {
    option.EnableProfiling(FLAGS_include_h2d_d2h, repeats, warmup);
  }
  auto model = fastdeploy::vision::detection::PaddleYOLOv8(
      model_file, params_file, config_file, option);
  if (!model.Initialized()) {
    std::cerr << "Failed to initialize." << std::endl;
    return false;
  }
  auto im = cv::imread(image_file);
  // For Runtime
  if (FLAGS_profile_mode == "runtime") {
    fastdeploy::vision::DetectionResult res;
    if (!model.Predict(im, &res)) {
      std::cerr << "Failed to predict." << std::endl;
      return false;
    }
    double profile_time = model.GetProfileTime() * 1000;
    std::cout << "Runtime(ms): " << profile_time << "ms." << std::endl;
    auto vis_im = fastdeploy::vision::VisDetection(im, res);
    cv::imwrite("vis_result.jpg", vis_im);
    std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
  } else {
    // For End2End
    // Step1: warm up for warmup times
    std::cout << "Warmup " << warmup << " times..." << std::endl;
    for (int i = 0; i < warmup; i++) {
      fastdeploy::vision::DetectionResult res;
      if (!model.Predict(im, &res)) {
        std::cerr << "Failed to predict." << std::endl;
        return false;
      }
    }
    std::vector<float> end2end_statis;
    // Step2: repeat for repeats times
    std::cout << "Counting time..." << std::endl;
    fastdeploy::TimeCounter tc;
    fastdeploy::vision::DetectionResult res;
    for (int i = 0; i < repeats; i++) {
      if (FLAGS_collect_memory_info && i % dump_period == 0) {
        fastdeploy::benchmark::DumpCurrentCpuMemoryUsage(cpu_mem_file_name);
#if defined(WITH_GPU)
        fastdeploy::benchmark::DumpCurrentGpuMemoryUsage(gpu_mem_file_name,
                                                         FLAGS_device_id);
#endif
      }
      tc.Start();
      if (!model.Predict(im, &res)) {
        std::cerr << "Failed to predict." << std::endl;
        return false;
      }
      tc.End();
      end2end_statis.push_back(tc.Duration() * 1000);
    }
    float end2end = std::accumulate(end2end_statis.end() - repeats,
                                    end2end_statis.end(), 0.f) /
                    repeats;
    std::cout << "End2End(ms): " << end2end << "ms." << std::endl;
    auto vis_im = fastdeploy::vision::VisDetection(im, res);
    cv::imwrite("vis_result.jpg", vis_im);
    std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
  }

  return true;
}

int main(int argc, char* argv[]) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  int repeats = FLAGS_repeat;
  int warmup = FLAGS_warmup;
  int dump_period = FLAGS_dump_period;
  std::string cpu_mem_file_name = "result_cpu.txt";
  std::string gpu_mem_file_name = "result_gpu.txt";
  // Run model
  if (RunModel(FLAGS_model, FLAGS_image, warmup, repeats, dump_period,
               cpu_mem_file_name, gpu_mem_file_name) != true) {
    exit(1);
  }
  if (FLAGS_collect_memory_info) {
    float cpu_mem = fastdeploy::benchmark::GetCpuMemoryUsage(cpu_mem_file_name);
    std::cout << "cpu_pss_mb: " << cpu_mem << "MB." << std::endl;
#if defined(WITH_GPU)
    float gpu_mem = fastdeploy::benchmark::GetGpuMemoryUsage(gpu_mem_file_name);
    std::cout << "gpu_pss_mb: " << gpu_mem << "MB." << std::endl;
#endif
  }
  return 0;
}
benchmark/cpp/benchmark_yolov5.cc
@@ -17,8 +17,7 @@
 #include "flags.h"
 
 bool RunModel(std::string model_file, std::string image_file, size_t warmup,
-              size_t repeats, size_t dump_period, std::string cpu_mem_file_name,
-              std::string gpu_mem_file_name) {
+              size_t repeats, size_t sampling_interval) {
   // Initialization
   auto option = fastdeploy::RuntimeOption();
   if (!CreateRuntimeOption(&option)) {
@@ -34,6 +33,12 @@ bool RunModel(std::string model_file, std::string image_file, size_t warmup,
     return false;
   }
   auto im = cv::imread(image_file);
+  // For collect memory info
+  fastdeploy::benchmark::ResourceUsageMonitor resource_moniter(
+      sampling_interval, FLAGS_device_id);
+  if (FLAGS_collect_memory_info) {
+    resource_moniter.Start();
+  }
   // For Runtime
   if (FLAGS_profile_mode == "runtime") {
     fastdeploy::vision::DetectionResult res;
@@ -57,33 +62,34 @@ bool RunModel(std::string model_file, std::string image_file, size_t warmup,
         return false;
       }
     }
-    std::vector<float> end2end_statis;
     // Step2: repeat for repeats times
-    std::cout << "Counting time..." << std::endl;
-    fastdeploy::TimeCounter tc;
+    std::cout << "Repeat " << repeats << " times..." << std::endl;
     fastdeploy::vision::DetectionResult res;
-    for (int i = 0; i < repeats; i++) {
-      if (FLAGS_collect_memory_info && i % dump_period == 0) {
-        fastdeploy::benchmark::DumpCurrentCpuMemoryUsage(cpu_mem_file_name);
-        fastdeploy::benchmark::DumpCurrentGpuMemoryUsage(gpu_mem_file_name,
-                                                         FLAGS_device_id);
-      }
+    fastdeploy::TimeCounter tc;
     tc.Start();
+    for (int i = 0; i < repeats; i++) {
       if (!model.Predict(im, &res)) {
         std::cerr << "Failed to predict." << std::endl;
         return false;
       }
-      tc.End();
-      end2end_statis.push_back(tc.Duration() * 1000);
     }
-    float end2end = std::accumulate(end2end_statis.end() - repeats,
-                                    end2end_statis.end(), 0.f) /
-                    repeats;
+    tc.End();
+    double end2end = tc.Duration() / repeats * 1000;
     std::cout << "End2End(ms): " << end2end << "ms." << std::endl;
     auto vis_im = fastdeploy::vision::VisDetection(im, res);
     cv::imwrite("vis_result.jpg", vis_im);
     std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
   }
+  if (FLAGS_collect_memory_info) {
+    float cpu_mem = resource_moniter.GetMaxCpuMem();
+    float gpu_mem = resource_moniter.GetMaxGpuMem();
+    float gpu_util = resource_moniter.GetMaxGpuUtil();
+    std::cout << "cpu_pss_mb: " << cpu_mem << "MB." << std::endl;
+    std::cout << "gpu_pss_mb: " << gpu_mem << "MB." << std::endl;
+    std::cout << "gpu_util: " << gpu_util << std::endl;
+    resource_moniter.Stop();
+  }
 
   return true;
 }
@@ -92,19 +98,10 @@ int main(int argc, char* argv[]) {
   google::ParseCommandLineFlags(&argc, &argv, true);
   int repeats = FLAGS_repeat;
   int warmup = FLAGS_warmup;
-  int dump_period = FLAGS_dump_period;
-  std::string cpu_mem_file_name = "result_cpu.txt";
-  std::string gpu_mem_file_name = "result_gpu.txt";
+  int sampling_interval = FLAGS_sampling_interval;
   // Run model
-  if (RunModel(FLAGS_model, FLAGS_image, warmup, repeats, dump_period,
-               cpu_mem_file_name, gpu_mem_file_name) != true) {
+  if (!RunModel(FLAGS_model, FLAGS_image, warmup, repeats, sampling_interval)) {
     exit(1);
   }
-  if (FLAGS_collect_memory_info) {
-    float cpu_mem = fastdeploy::benchmark::GetCpuMemoryUsage(cpu_mem_file_name);
-    float gpu_mem = fastdeploy::benchmark::GetGpuMemoryUsage(gpu_mem_file_name);
-    std::cout << "cpu_pss_mb: " << cpu_mem << "MB." << std::endl;
-    std::cout << "gpu_pss_mb: " << gpu_mem << "MB." << std::endl;
-  }
   return 0;
 }
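The switch from dump_period polling inside the timed loop to ResourceUsageMonitor moves memory sampling onto a background thread that wakes every sampling_interval milliseconds, so the Predict() loop is timed without file I/O in it. FastDeploy's actual monitor lives in fastdeploy/benchmark/utils.h and is not shown in this diff; the sketch below only illustrates the generic Start()/GetMax()/Stop() sampling pattern with a placeholder metric source.

#include <atomic>
#include <chrono>
#include <thread>

// Generic background sampler: polls a metric every interval_ms and keeps the
// peak, mirroring the shape of the monitor used by the new benchmark code.
class PeakSampler {
 public:
  explicit PeakSampler(int interval_ms) : interval_ms_(interval_ms) {}
  void Start() {
    running_ = true;
    worker_ = std::thread([this] {
      while (running_) {
        float current = ReadMetric();  // e.g. process PSS in MB
        if (current > peak_) peak_ = current;
        std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms_));
      }
    });
  }
  void Stop() {
    running_ = false;
    if (worker_.joinable()) worker_.join();
  }
  // Call only after Stop(): the join synchronizes peak_ with the reader.
  float GetMax() const { return peak_; }

 private:
  static float ReadMetric() { return 0.0f; }  // placeholder metric source
  int interval_ms_;
  std::atomic<bool> running_{false};
  std::thread worker_;
  float peak_ = 0.0f;
};

int main() {
  PeakSampler sampler(50);  // e.g. a 50 ms interval, like the new flag default
  sampler.Start();
  // ... timed benchmark loop would run here ...
  sampler.Stop();
  return sampler.GetMax() >= 0.0f ? 0 : 1;
}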
benchmark/cpp/flags.h
@@ -20,28 +20,29 @@
 DEFINE_string(model, "", "Directory of the inference model.");
 DEFINE_string(image, "", "Path of the image file.");
 DEFINE_string(device, "cpu",
-              "Type of inference device, support 'cpu' or 'gpu'.");
-DEFINE_int32(device_id, 0, "device(gpu) id.");
+              "Type of inference device, support 'cpu/gpu/xpu'.");
+DEFINE_int32(device_id, 0, "device(gpu/xpu/...) id.");
 DEFINE_int32(warmup, 200, "Number of warmup for profiling.");
 DEFINE_int32(repeat, 1000, "Number of repeats for profiling.");
 DEFINE_string(profile_mode, "runtime", "runtime or end2end.");
 DEFINE_string(backend, "default",
               "The inference runtime backend, support: ['default', 'ort', "
-              "'paddle', 'ov', 'trt', 'paddle_trt']");
+              "'paddle', 'ov', 'trt', 'paddle_trt', 'lite']");
 DEFINE_int32(cpu_thread_nums, 8, "Set numbers of cpu thread.");
 DEFINE_bool(
     include_h2d_d2h, false, "Whether run profiling with h2d and d2h.");
 DEFINE_bool(
     use_fp16, false,
-    "Whether to use FP16 mode, only support 'trt' and 'paddle_trt' backend");
+    "Whether to use FP16 mode, only support 'trt', 'paddle_trt' "
+    "and 'lite' backend");
 DEFINE_bool(
     collect_memory_info, false, "Whether to collect memory info");
-DEFINE_int32(dump_period, 100, "How often to collect memory info.");
+DEFINE_int32(sampling_interval, 50, "How often to collect memory info(ms).");
 
 void PrintUsage() {
   std::cout << "Usage: infer_demo --model model_path --image img_path --device "
-               "[cpu|gpu] --backend "
-               "[default|ort|paddle|ov|trt|paddle_trt] "
+               "[cpu|gpu|xpu] --backend "
+               "[default|ort|paddle|ov|trt|paddle_trt|lite] "
                "--use_fp16 false"
             << std::endl;
   std::cout << "Default value of device: cpu" << std::endl;
@@ -51,14 +52,13 @@ void PrintUsage() {
 
 bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) {
   if (FLAGS_device == "gpu") {
-    option->UseGpu();
+    option->UseGpu(FLAGS_device_id);
     if (FLAGS_backend == "ort") {
       option->UseOrtBackend();
     } else if (FLAGS_backend == "paddle") {
       option->UsePaddleInferBackend();
     } else if (FLAGS_backend == "trt" || FLAGS_backend == "paddle_trt") {
       option->UseTrtBackend();
-      option->SetTrtInputShape("input", {1, 3, 112, 112});
       if (FLAGS_backend == "paddle_trt") {
         option->EnablePaddleToTrt();
       }
@@ -81,16 +81,40 @@ bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) {
       option->UseOpenVINOBackend();
     } else if (FLAGS_backend == "paddle") {
       option->UsePaddleInferBackend();
+    } else if (FLAGS_backend == "lite") {
+      option->UsePaddleLiteBackend();
+      if (FLAGS_use_fp16) {
+        option->EnableLiteFP16();
+      }
     } else if (FLAGS_backend == "default") {
       return true;
     } else {
       std::cout << "While inference with CPU, only support "
-                   "default/ort/ov/paddle now, "
+                   "default/ort/ov/paddle/lite now, "
                 << FLAGS_backend << " is not supported." << std::endl;
       return false;
     }
+  } else if (FLAGS_device == "xpu") {
+    option->UseKunlunXin(FLAGS_device_id);
+    if (FLAGS_backend == "ort") {
+      option->UseOrtBackend();
+    } else if (FLAGS_backend == "paddle") {
+      option->UsePaddleInferBackend();
+    } else if (FLAGS_backend == "lite") {
+      option->UsePaddleLiteBackend();
+      if (FLAGS_use_fp16) {
+        option->EnableLiteFP16();
+      }
+    } else if (FLAGS_backend == "default") {
+      return true;
+    } else {
+      std::cout << "While inference with XPU, only support "
+                   "default/ort/paddle/lite now, "
+                << FLAGS_backend << " is not supported." << std::endl;
+      return false;
+    }
   } else {
-    std::cerr << "Only support device CPU/GPU now, " << FLAGS_device
+    std::cerr << "Only support device CPU/GPU/XPU now, " << FLAGS_device
               << " is not supported." << std::endl;
     return false;
   }
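All of these flags follow the stock gflags pattern: each DEFINE_* macro creates a global FLAGS_* variable that google::ParseCommandLineFlags fills from argv, which is why the benchmarks' main() can read FLAGS_repeat and FLAGS_sampling_interval directly. A minimal standalone sketch of that mechanism (not part of the patch):

#include <gflags/gflags.h>
#include <iostream>

DEFINE_string(device, "cpu", "Type of inference device.");
DEFINE_int32(repeat, 1000, "Number of repeats for profiling.");

int main(int argc, char* argv[]) {
  // Parses e.g. --device=gpu --repeat=50 and strips them from argv.
  google::ParseCommandLineFlags(&argc, &argv, true);
  // After parsing, the values are visible as FLAGS_* globals.
  std::cout << FLAGS_device << " x " << FLAGS_repeat << std::endl;
  return 0;
}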
c_api/CMakeLists.txt (new file, 28 lines)
@@ -0,0 +1,28 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

##################################### Building: FastDeploy C API #######################################
message("----start--CAPI-------")

if(NOT WITH_CAPI)
  return()
endif()

file(GLOB_RECURSE DEPLOY_CAPI_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api/fastdeploy_capi/*.cc)
if(NOT ENABLE_VISION)
  file(GLOB_RECURSE DEPLOY_VISION_CAPI_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api/fastdeploy_capi/vision/*.cc)
  list(REMOVE_ITEM DEPLOY_CAPI_SRCS ${DEPLOY_VISION_CAPI_SRCS})
endif()
list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_CAPI_SRCS})
include_directories(${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/c_api)
c_api/fastdeploy_capi/fd_common.h (new file, 100 lines)
@@ -0,0 +1,100 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <stdint.h>
#include <stdio.h>

#if defined(_WIN32)
#ifdef FD_CAPI
#define FASTDEPLOY_CAPI_EXPORT __declspec(dllexport)
#else
#define FASTDEPLOY_CAPI_EXPORT __declspec(dllimport)
#endif  // FD_CAPI
#else
#define FASTDEPLOY_CAPI_EXPORT __attribute__((visibility("default")))
#endif  // _WIN32

///
/// __fd_give means that a new object is returned. The user should make sure
/// that the returned pointer is used exactly once as a value for an __fd_take
/// argument. In between, it can be used as a value for as many __fd_keep
/// arguments as the user likes.
///
#ifndef __fd_give
#define __fd_give
#endif
///
/// __fd_take means that the object the argument points to is taken over by the
/// function and may no longer be used by the user as an argument to any other
/// function. The pointer value must be one returned by a function returning an
/// __fd_give pointer.
///
#ifndef __fd_take
#define __fd_take
#endif
///
/// __fd_keep means that the function will only use the object temporarily. The
/// object which the argument points to is not taken over by the function. After
/// the function has finished, the user can still use it as an argument to other
/// functions.
///
#ifndef __fd_keep
#define __fd_keep
#endif

typedef int8_t FD_C_Bool;
#define TRUE 1
#define FALSE 0

#define FD_ENUM(type)                                                          \
  typedef int32_t type;                                                        \
  enum

FD_ENUM(FD_C_ModelFormat){
    AUTOREC,      ///< Auto recognize the model format by model file name
    PADDLE,       ///< Model with paddlepaddle format
    ONNX,         ///< Model with ONNX format
    RKNN,         ///< Model with RKNN format
    TORCHSCRIPT,  ///< Model with TorchScript format
    SOPHGO,       ///< Model with SOPHGO format
};

FD_ENUM(FD_C_rknpu2_CpuName){
    RK356X = 0, /* run on RK356X. */
    RK3588 = 1, /* default, run on RK3588. */
    UNDEFINED,
};

FD_ENUM(FD_C_rknpu2_CoreMask){
    RKNN_NPU_CORE_AUTO = 0,  //< default, run on NPU core randomly.
    RKNN_NPU_CORE_0 = 1,     //< run on NPU core 0.
    RKNN_NPU_CORE_1 = 2,     //< run on NPU core 1.
    RKNN_NPU_CORE_2 = 4,     //< run on NPU core 2.
    RKNN_NPU_CORE_0_1 = RKNN_NPU_CORE_0 |
                        RKNN_NPU_CORE_1,  //< run on NPU core 0 and core 1.
    RKNN_NPU_CORE_0_1_2 = RKNN_NPU_CORE_0_1 |
                          RKNN_NPU_CORE_2,  //< run on NPU core 0, core 1 and core 2.
    RKNN_NPU_CORE_UNDEFINED,
};

FD_ENUM(FD_C_LitePowerMode){
    LITE_POWER_HIGH = 0,       ///< Use Lite Backend with high power mode
    LITE_POWER_LOW = 1,        ///< Use Lite Backend with low power mode
    LITE_POWER_FULL = 2,       ///< Use Lite Backend with full power mode
    LITE_POWER_NO_BIND = 3,    ///< Use Lite Backend with no bind power mode
    LITE_POWER_RAND_HIGH = 4,  ///< Use Lite Backend with rand high mode
    LITE_POWER_RAND_LOW = 5    ///< Use Lite Backend with rand low power mode
};
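FD_ENUM keeps every C API enum ABI-stable by making the named type a fixed-width int32_t while the brace block merely supplies named constants. Roughly, the preprocessor turns FD_ENUM(FD_C_ModelFormat){...}; into the following (an approximation for illustration, not part of the patch):

typedef int32_t FD_C_ModelFormat;  // the API type is a plain 32-bit integer
enum {                             // the enumerators only name constant values
  AUTOREC,  // auto recognize the model format by model file name
  PADDLE,
  ONNX,
  RKNN,
  TORCHSCRIPT,
  SOPHGO,
};

Because the typedef is an int32_t rather than a C enum type, the size and signedness of values crossing the C boundary never depend on the compiler's enum representation.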
c_api/fastdeploy_capi/fd_type.h (new file, 67 lines)
@@ -0,0 +1,67 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <stdint.h>
#include <stdio.h>

#include "fastdeploy_capi/fd_common.h"  // NOLINT

typedef struct FD_C_OneDimArrayUint8 {
  size_t size;
  uint8_t* data;
} FD_C_OneDimArrayUint8;  // std::vector<uint8_t>

typedef struct FD_C_OneDimArrayInt32 {
  size_t size;
  int32_t* data;
} FD_C_OneDimArrayInt32;  // std::vector<int32_t>

typedef struct FD_C_OneDimArraySize {
  size_t size;
  size_t* data;
} FD_C_OneDimArraySize;  // std::vector<size_t>

typedef struct FD_C_OneDimArrayInt64 {
  size_t size;
  int64_t* data;
} FD_C_OneDimArrayInt64;  // std::vector<int64_t>

typedef struct FD_C_OneDimArrayFloat {
  size_t size;
  float* data;
} FD_C_OneDimArrayFloat;  // std::vector<float>

typedef struct FD_C_Cstr {
  size_t size;
  char* data;
} FD_C_Cstr;  // std::string

typedef struct FD_C_OneDimArrayCstr {
  size_t size;
  FD_C_Cstr* data;
} FD_C_OneDimArrayCstr;  // std::vector<std::string>

typedef struct FD_C_TwoDimArraySize {
  size_t size;
  FD_C_OneDimArraySize* data;
} FD_C_TwoDimArraySize;  // std::vector<std::vector<size_t>>

typedef struct FD_C_TwoDimArrayFloat {
  size_t size;
  FD_C_OneDimArrayFloat* data;
} FD_C_TwoDimArrayFloat;  // std::vector<std::vector<float>>

typedef void* FD_C_Mat;
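These structs are POD mirrors of the C++ containers noted in the trailing comments, so ownership of the data pointer stays with whoever fills the struct. A minimal sketch of wrapping a std::vector<float> for the C boundary; the helper name ToCArray is hypothetical and not part of the patch:

#include <algorithm>
#include <cstdlib>
#include <vector>

#include "fastdeploy_capi/fd_type.h"

// Hypothetical helper: copy a std::vector<float> into the POD mirror so it
// can cross the C boundary. The receiver owns the malloc'd buffer and must
// free() it when done.
FD_C_OneDimArrayFloat ToCArray(const std::vector<float>& v) {
  FD_C_OneDimArrayFloat arr;
  arr.size = v.size();
  arr.data = static_cast<float*>(std::malloc(v.size() * sizeof(float)));
  std::copy(v.begin(), v.end(), arr.data);
  return arr;
}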
c_api/fastdeploy_capi/runtime_option.cc (new file, 418 lines)
@@ -0,0 +1,418 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy_capi/runtime_option.h"

#include "fastdeploy/utils/utils.h"
#include "fastdeploy_capi/types_internal.h"

extern "C" {

FD_C_RuntimeOptionWrapper* FD_C_CreateRuntimeOptionWrapper() {
  FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper =
      new FD_C_RuntimeOptionWrapper();
  fd_c_runtime_option_wrapper->runtime_option =
      std::unique_ptr<fastdeploy::RuntimeOption>(
          new fastdeploy::RuntimeOption());
  return fd_c_runtime_option_wrapper;
}

void FD_C_DestroyRuntimeOptionWrapper(
    __fd_take FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  delete fd_c_runtime_option_wrapper;
}

void FD_C_RuntimeOptionWrapperSetModelPath(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* model_path, const char* params_path,
    const FD_C_ModelFormat format) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetModelPath(std::string(model_path),
                               std::string(params_path),
                               static_cast<fastdeploy::ModelFormat>(format));
}

void FD_C_RuntimeOptionWrapperSetModelBuffer(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* model_buffer, const char* params_buffer,
    const FD_C_ModelFormat format) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetModelBuffer(model_buffer, params_buffer,
                                 static_cast<fastdeploy::ModelFormat>(format));
}

void FD_C_RuntimeOptionWrapperUseCpu(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseCpu();
}

void FD_C_RuntimeOptionWrapperUseGpu(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int gpu_id) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseGpu(gpu_id);
}

void FD_C_RuntimeOptionWrapperUseRKNPU2(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    FD_C_rknpu2_CpuName rknpu2_name, FD_C_rknpu2_CoreMask rknpu2_core) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseRKNPU2(
      static_cast<fastdeploy::rknpu2::CpuName>(rknpu2_name),
      static_cast<fastdeploy::rknpu2::CoreMask>(rknpu2_core));
}

void FD_C_RuntimeOptionWrapperUseTimVX(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseTimVX();
}

void FD_C_RuntimeOptionWrapperUseAscend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseAscend();
}

void FD_C_RuntimeOptionWrapperUseKunlunXin(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int kunlunxin_id, int l3_workspace_size, FD_C_Bool locked,
    FD_C_Bool autotune, const char* autotune_file, const char* precision,
    FD_C_Bool adaptive_seqlen, FD_C_Bool enable_multi_stream) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseKunlunXin(kunlunxin_id, l3_workspace_size, bool(locked),
                               bool(autotune), std::string(autotune_file),
                               std::string(precision), bool(adaptive_seqlen),
                               bool(enable_multi_stream));
}

void FD_C_RuntimeOptionWrapperUseSophgo(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseSophgo();
}

void FD_C_RuntimeOptionWrapperSetExternalStream(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    void* external_stream) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetExternalStream(external_stream);
}

void FD_C_RuntimeOptionWrapperSetCpuThreadNum(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int thread_num) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetCpuThreadNum(thread_num);
}

void FD_C_RuntimeOptionWrapperSetOrtGraphOptLevel(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int level) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetOrtGraphOptLevel(level);
}

void FD_C_RuntimeOptionWrapperUsePaddleBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UsePaddleBackend();
}

void FD_C_RuntimeOptionWrapperUsePaddleInferBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  return FD_C_RuntimeOptionWrapperUsePaddleBackend(fd_c_runtime_option_wrapper);
}

void FD_C_RuntimeOptionWrapperUseOrtBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseOrtBackend();
}

void FD_C_RuntimeOptionWrapperUseSophgoBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseSophgoBackend();
}

void FD_C_RuntimeOptionWrapperUseTrtBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseTrtBackend();
}

void FD_C_RuntimeOptionWrapperUsePorosBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UsePorosBackend();
}

void FD_C_RuntimeOptionWrapperUseOpenVINOBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseOpenVINOBackend();
}

void FD_C_RuntimeOptionWrapperUseLiteBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseLiteBackend();
}

void FD_C_RuntimeOptionWrapperUsePaddleLiteBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  return FD_C_RuntimeOptionWrapperUseLiteBackend(fd_c_runtime_option_wrapper);
}

void FD_C_RuntimeOptionWrapperSetPaddleMKLDNN(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    FD_C_Bool pd_mkldnn) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetPaddleMKLDNN(pd_mkldnn);
}

void FD_C_RuntimeOptionWrapperEnablePaddleToTrt(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->EnablePaddleToTrt();
}

void FD_C_RuntimeOptionWrapperDeletePaddleBackendPass(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* delete_pass_name) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->DeletePaddleBackendPass(std::string(delete_pass_name));
}

void FD_C_RuntimeOptionWrapperEnablePaddleLogInfo(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->EnablePaddleLogInfo();
}

void FD_C_RuntimeOptionWrapperDisablePaddleLogInfo(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->DisablePaddleLogInfo();
}

void FD_C_RuntimeOptionWrapperSetPaddleMKLDNNCacheSize(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int size) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetPaddleMKLDNNCacheSize(size);
}

void FD_C_RuntimeOptionWrapperSetOpenVINODevice(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* name) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetOpenVINODevice(std::string(name));
}

void FD_C_RuntimeOptionWrapperSetLiteOptimizedModelDir(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* optimized_model_dir) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetLiteOptimizedModelDir(std::string(optimized_model_dir));
}

void FD_C_RuntimeOptionWrapperSetLiteSubgraphPartitionPath(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_subgraph_partition_config_path) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetLiteSubgraphPartitionPath(
      std::string(nnadapter_subgraph_partition_config_path));
}

void FD_C_RuntimeOptionWrapperSetLiteSubgraphPartitionConfigBuffer(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_subgraph_partition_config_buffer) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetLiteSubgraphPartitionConfigBuffer(
      std::string(nnadapter_subgraph_partition_config_buffer));
}

void FD_C_RuntimeOptionWrapperSetLiteContextProperties(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_context_properties) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetLiteContextProperties(
      std::string(nnadapter_context_properties));
}

void FD_C_RuntimeOptionWrapperSetLiteModelCacheDir(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_model_cache_dir) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetLiteModelCacheDir(std::string(nnadapter_model_cache_dir));
}

void FD_C_RuntimeOptionWrapperSetLiteMixedPrecisionQuantizationConfigPath(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_mixed_precision_quantization_config_path) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetLiteMixedPrecisionQuantizationConfigPath(
      std::string(nnadapter_mixed_precision_quantization_config_path));
}

void FD_C_RuntimeOptionWrapperEnableLiteFP16(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->EnableLiteFP16();
}

void FD_C_RuntimeOptionWrapperDisableLiteFP16(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->DisableLiteFP16();
}

void FD_C_RuntimeOptionWrapperEnableLiteInt8(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->EnableLiteInt8();
}

void FD_C_RuntimeOptionWrapperDisableLiteInt8(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->DisableLiteInt8();
}

void FD_C_RuntimeOptionWrapperSetLitePowerMode(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    FD_C_LitePowerMode mode) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetLitePowerMode(
      static_cast<fastdeploy::LitePowerMode>(mode));
}

void FD_C_RuntimeOptionWrapperEnableTrtFP16(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->EnableTrtFP16();
}

void FD_C_RuntimeOptionWrapperDisableTrtFP16(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->DisableTrtFP16();
}

void FD_C_RuntimeOptionWrapperSetTrtCacheFile(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* cache_file_path) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetTrtCacheFile(std::string(cache_file_path));
}

void FD_C_RuntimeOptionWrapperEnablePinnedMemory(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->EnablePinnedMemory();
}

void FD_C_RuntimeOptionWrapperDisablePinnedMemory(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->DisablePinnedMemory();
}

void FD_C_RuntimeOptionWrapperEnablePaddleTrtCollectShape(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->EnablePaddleTrtCollectShape();
}

void FD_C_RuntimeOptionWrapperDisablePaddleTrtCollectShape(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->DisablePaddleTrtCollectShape();
}

void FD_C_RuntimeOptionWrapperSetOpenVINOStreams(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int num_streams) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetOpenVINOStreams(num_streams);
}

void FD_C_RuntimeOptionWrapperUseIpu(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int device_num, int micro_batch_size, FD_C_Bool enable_pipelining,
    int batches_per_step) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->UseIpu(device_num, micro_batch_size, enable_pipelining,
                         batches_per_step);
}

void FD_C_RuntimeOptionWrapperSetIpuConfig(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    FD_C_Bool enable_fp16, int replica_num, float available_memory_proportion,
    FD_C_Bool enable_half_partial) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  runtime_option->SetIpuConfig(enable_fp16, replica_num,
                               available_memory_proportion,
                               enable_half_partial);
}

}  // extern "C"
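Every function above follows the same pattern: unwrap the opaque C struct to reach the owned fastdeploy::RuntimeOption, then forward the arguments. types_internal.h, which defines CHECK_AND_CONVERT_FD_TYPE and the struct body, is not part of this diff, so the sketch below only approximates that machinery with a stand-in class; the struct's unique_ptr member is taken from how FD_C_CreateRuntimeOptionWrapper fills it above.

#include <memory>
#include <string>

// Stand-in for fastdeploy::RuntimeOption, only to make the sketch compile.
namespace fastdeploy { struct RuntimeOption { std::string model_file; }; }

// The opaque C struct owns the C++ object via unique_ptr, matching the
// assignment in FD_C_CreateRuntimeOptionWrapper.
struct FD_C_RuntimeOptionWrapper {
  std::unique_ptr<fastdeploy::RuntimeOption> runtime_option;
};

// CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, p) is assumed to validate p
// and yield a reference to the owned object, roughly like this:
static std::unique_ptr<fastdeploy::RuntimeOption>& Convert(
    FD_C_RuntimeOptionWrapper* p) {
  return p->runtime_option;
}

int main() {
  FD_C_RuntimeOptionWrapper* w = new FD_C_RuntimeOptionWrapper();
  w->runtime_option.reset(new fastdeploy::RuntimeOption());
  Convert(w)->model_file = "model.pdmodel";
  delete w;  // the unique_ptr releases the C++ object with the wrapper
  return 0;
}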
c_api/fastdeploy_capi/runtime_option.h (new file, 517 lines)
@@ -0,0 +1,517 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>

#include "fastdeploy_capi/fd_common.h"

typedef struct FD_C_RuntimeOptionWrapper FD_C_RuntimeOptionWrapper;

#ifdef __cplusplus
extern "C" {
#endif

/** \brief Create a new FD_C_RuntimeOptionWrapper object
 *
 * \return Return a pointer to FD_C_RuntimeOptionWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_RuntimeOptionWrapper*
FD_C_CreateRuntimeOptionWrapper();

/** \brief Destroy a FD_C_RuntimeOptionWrapper object
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyRuntimeOptionWrapper(
    __fd_take FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/** \brief Set path of model file and parameter file
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] model_path Path of model file, e.g ResNet50/model.pdmodel for Paddle format model / ResNet50/model.onnx for ONNX format model
 * \param[in] params_path Path of parameter file, this only used when the model format is Paddle, e.g Resnet50/model.pdiparams
 * \param[in] format Format of the loaded model
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetModelPath(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* model_path, const char* params_path,
    const FD_C_ModelFormat format);

/** \brief Specify the memory buffer of model and parameter. Used when model and params are loaded directly from memory
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] model_buffer The memory buffer of model
 * \param[in] params_buffer The memory buffer of the combined parameters file
 * \param[in] format Format of the loaded model
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetModelBuffer(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* model_buffer, const char* params_buffer,
    const FD_C_ModelFormat format);

/** \brief Use CPU for inference; the runtime will run on CPU by default
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseCpu(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/** \brief Use Nvidia GPU for inference
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseGpu(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int gpu_id);

/** \brief Use RKNPU2 for inference
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] rknpu2_name CpuName enum value
 * \param[in] rknpu2_core CoreMask enum value
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseRKNPU2(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    FD_C_rknpu2_CpuName rknpu2_name, FD_C_rknpu2_CoreMask rknpu2_core);

/** \brief Use TimVX for inference
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseTimVX(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/** \brief Use Huawei Ascend for inference
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseAscend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

///
/// \brief Turn on KunlunXin XPU.
///
/// \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
/// \param[in] kunlunxin_id the KunlunXin XPU card to use (default is 0).
/// \param[in] l3_workspace_size The size of the video memory allocated by
///            the l3 cache, the maximum is 16M.
/// \param[in] locked Whether the allocated L3 cache can be locked. If false,
///            it means that the L3 cache is not locked, and the allocated L3
///            cache can be shared by multiple models, and multiple models
///            sharing the L3 cache will be executed sequentially on the card.
/// \param[in] autotune Whether to autotune the conv operator in the model. If
///            true, when the conv operator of a certain dimension is executed
///            for the first time, it will automatically search for a better
///            algorithm to improve the performance of subsequent conv operators
///            of the same dimension.
/// \param[in] autotune_file Specify the path of the autotune file. If
///            autotune_file is specified, the algorithm specified in the
///            file will be used and autotune will not be performed again.
/// \param[in] precision Calculation accuracy of multi_encoder
/// \param[in] adaptive_seqlen Whether the input of multi_encoder is variable length
/// \param[in] enable_multi_stream Whether to enable the multi stream of
///            KunlunXin XPU.
///
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseKunlunXin(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int kunlunxin_id, int l3_workspace_size, FD_C_Bool locked,
    FD_C_Bool autotune, const char* autotune_file, const char* precision,
    FD_C_Bool adaptive_seqlen, FD_C_Bool enable_multi_stream);

/** \brief Use Sophgo for inference
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseSophgo(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetExternalStream(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    void* external_stream);

/**
 * @brief Set number of cpu threads while inference on CPU, by default it will be decided by the different backends
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] thread_num number of threads
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetCpuThreadNum(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int thread_num);

/**
 * @brief Set ORT graph opt level, default is decided by ONNX Runtime itself
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] level optimization level
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetOrtGraphOptLevel(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int level);

/**
 * @brief Set Paddle Inference as inference backend, support CPU/GPU
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUsePaddleBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Wrapper function of UsePaddleBackend()
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperUsePaddleInferBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Set ONNX Runtime as inference backend, support CPU/GPU
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseOrtBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Set SOPHGO Runtime as inference backend, support CPU/GPU
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseSophgoBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Set TensorRT as inference backend, only support GPU
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseTrtBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Set Poros backend as inference backend, support CPU/GPU
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUsePorosBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Set OpenVINO as inference backend, only support CPU
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseOpenVINOBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Set Paddle Lite as inference backend, only support arm cpu
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseLiteBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Wrapper function of UseLiteBackend()
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperUsePaddleLiteBackend(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Set mkldnn switch while using Paddle Inference as inference backend
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] pd_mkldnn whether to use mkldnn
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetPaddleMKLDNN(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    FD_C_Bool pd_mkldnn);

/**
 * @brief If TensorRT backend is used, EnablePaddleToTrt will change to use Paddle Inference backend, and use its integrated TensorRT instead.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnablePaddleToTrt(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Delete pass by name while using Paddle Inference as inference backend, this can be called multiple times to delete a set of passes
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] delete_pass_name pass name
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperDeletePaddleBackendPass(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* delete_pass_name);

/**
 * @brief Enable print debug information while using Paddle Inference as inference backend; the backend disables the debug information by default
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnablePaddleLogInfo(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Disable print debug information while using Paddle Inference as inference backend
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperDisablePaddleLogInfo(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Set shape cache size while using Paddle Inference with mkldnn; by default it will cache all the different shapes
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] size cache size
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperSetPaddleMKLDNNCacheSize(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, int size);

/**
 * @brief Set device name for OpenVINO, default 'CPU', can also be 'AUTO', 'GPU', 'GPU.1'....
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] name device name
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetOpenVINODevice(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* name);

/**
 * @brief Set optimized model dir for Paddle Lite backend.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] optimized_model_dir optimized model dir
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperSetLiteOptimizedModelDir(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* optimized_model_dir);

/**
 * @brief Set subgraph partition path for Paddle Lite backend.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] nnadapter_subgraph_partition_config_path subgraph partition path
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperSetLiteSubgraphPartitionPath(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_subgraph_partition_config_path);

/**
 * @brief Set subgraph partition configuration buffer for Paddle Lite backend.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] nnadapter_subgraph_partition_config_buffer subgraph partition configuration buffer
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperSetLiteSubgraphPartitionConfigBuffer(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_subgraph_partition_config_buffer);

/**
 * @brief Set context properties for Paddle Lite backend.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] nnadapter_context_properties context properties
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperSetLiteContextProperties(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_context_properties);

/**
 * @brief Set model cache dir for Paddle Lite backend.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] nnadapter_model_cache_dir model cache dir
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperSetLiteModelCacheDir(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_model_cache_dir);

/**
 * @brief Set mixed precision quantization config path for Paddle Lite backend.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] nnadapter_mixed_precision_quantization_config_path mixed precision quantization config path
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperSetLiteMixedPrecisionQuantizationConfigPath(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const char* nnadapter_mixed_precision_quantization_config_path);

/**
 * @brief Enable half precision while using Paddle Lite backend
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnableLiteFP16(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Disable half precision, change to full precision (float32)
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperDisableLiteFP16(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);
|
||||
|
||||
/**
|
||||
* @brief enable int8 precision while use paddle lite backend
|
||||
*
|
||||
* \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
|
||||
*/
|
||||
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnableLiteInt8(
|
||||
__fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);
|
||||
|
||||
/**
|
||||
* @brief disable int8 precision, change to full precision(float32)
|
||||
*
|
||||
* \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
|
||||
*/
|
||||
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperDisableLiteInt8(
|
||||
__fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);
|
||||
|
||||
/**
|
||||
* @brief Set power mode while using Paddle Lite as inference backend, mode(0: LITE_POWER_HIGH; 1: LITE_POWER_LOW; 2: LITE_POWER_FULL; 3: LITE_POWER_NO_BIND, 4: LITE_POWER_RAND_HIGH; 5: LITE_POWER_RAND_LOW, refer [paddle lite](https://paddle-lite.readthedocs.io/zh/latest/api_reference/cxx_api_doc.html#set-power-mode) for more details)
|
||||
*
|
||||
* \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
|
||||
* \param[in] mode power mode
|
||||
*/
|
||||
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetLitePowerMode(
|
||||
__fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
|
||||
FD_C_LitePowerMode mode);
|
||||
|
||||
/**
|
||||
* @brief Enable FP16 inference while using TensorRT backend. Notice: not all the GPU device support FP16, on those device doesn't support FP16, FastDeploy will fallback to FP32 automaticly
|
||||
*
|
||||
* \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
|
||||
*/
|
||||
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnableTrtFP16(
|
||||
__fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);
|
||||
|
||||
/**
|
||||
* @brief Disable FP16 inference while using TensorRT backend
|
||||
*
|
||||
* \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
|
||||
*/
|
||||
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperDisableTrtFP16(
|
||||
__fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);
|
||||
|
||||
/**
|
||||
* @brief Set cache file path while use TensorRT backend. Loadding a Paddle/ONNX model and initialize TensorRT will take a long time, by this interface it will save the tensorrt engine to `cache_file_path`, and load it directly while execute the code again
|
||||
*
|
||||
* \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
|
||||
* \param[in] cache_file_path cache file path
|
||||
*/
|
||||
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetTrtCacheFile(
|
||||
__fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
|
||||
const char* cache_file_path);
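
// Usage sketch (hedged): the two TensorRT calls above are typically paired —
// enable FP16 first, then point the option at an engine cache so later runs
// skip the slow TensorRT build. FD_C_CreateRuntimeOptionWrapper and a GPU
// selector are assumed to be declared earlier in this header; the path is
// illustrative.
//
//   FD_C_RuntimeOptionWrapper* opt = FD_C_CreateRuntimeOptionWrapper();
//   FD_C_RuntimeOptionWrapperUseGpu(opt, 0);
//   FD_C_RuntimeOptionWrapperEnableTrtFP16(opt);
//   FD_C_RuntimeOptionWrapperSetTrtCacheFile(opt, "./model.trt");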

/**
 * @brief Enable pinned memory. Pinned memory can be used to speed up data transfer between CPU and GPU. Currently it is only supported in the TRT backend and the Paddle Inference backend.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperEnablePinnedMemory(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Disable pinned memory
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperDisablePinnedMemory(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Enable collecting shape info in the Paddle-TRT backend
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperEnablePaddleTrtCollectShape(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Disable collecting shape info in the Paddle-TRT backend
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 */
FASTDEPLOY_CAPI_EXPORT extern void
FD_C_RuntimeOptionWrapperDisablePaddleTrtCollectShape(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);

/**
 * @brief Set the number of streams for the OpenVINO backend
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] num_streams number of streams
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetOpenVINOStreams(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int num_streams);

/**
 * @brief Use Graphcore IPU for inference.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] device_num the number of IPUs.
 * \param[in] micro_batch_size the batch size in the graph; only takes effect when the graph has no batch shape info.
 * \param[in] enable_pipelining enable pipelining.
 * \param[in] batches_per_step the number of batches per run in pipelining.
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperUseIpu(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    int device_num, int micro_batch_size, FD_C_Bool enable_pipelining,
    int batches_per_step);

/** \brief Set the IPU config.
 *
 * \param[in] fd_c_runtime_option_wrapper pointer to FD_C_RuntimeOptionWrapper object
 * \param[in] enable_fp16 enable fp16.
 * \param[in] replica_num the number of graph replications.
 * \param[in] available_memory_proportion the available memory proportion for matmul/conv.
 * \param[in] enable_half_partial enable fp16 partials for matmul; only takes effect with fp16.
 */
FASTDEPLOY_CAPI_EXPORT extern void FD_C_RuntimeOptionWrapperSetIpuConfig(
    __fd_keep FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    FD_C_Bool enable_fp16, int replica_num, float available_memory_proportion,
    FD_C_Bool enable_half_partial);
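
// Usage sketch (hedged): a typical IPU setup pairs the two calls above. The
// wrapper constructor is assumed to be declared earlier in this header; all
// numeric values are illustrative, and FD_C_Bool arguments are passed as 0/1.
//
//   FD_C_RuntimeOptionWrapper* opt = FD_C_CreateRuntimeOptionWrapper();
//   FD_C_RuntimeOptionWrapperUseIpu(opt, /*device_num=*/1,
//                                   /*micro_batch_size=*/1,
//                                   /*enable_pipelining=*/0,
//                                   /*batches_per_step=*/1);
//   FD_C_RuntimeOptionWrapperSetIpuConfig(opt, /*enable_fp16=*/0,
//                                         /*replica_num=*/1,
//                                         /*available_memory_proportion=*/0.3f,
//                                         /*enable_half_partial=*/0);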

#ifdef __cplusplus
} // extern "C"
#endif
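
Taken together, the setters in this header follow one pattern: create a single FD_C_RuntimeOptionWrapper, apply any number of backend-specific options, then hand the wrapper to a model constructor. A minimal hedged sketch in C++; the wrapper constructor/destructor and FD_C_RuntimeOptionWrapperUseCpu are assumed to be declared earlier in this header:

#include "fastdeploy_capi/runtime_option.h"

int main() {
  // Assumed creator declared earlier in this header.
  FD_C_RuntimeOptionWrapper* opt = FD_C_CreateRuntimeOptionWrapper();
  FD_C_RuntimeOptionWrapperUseCpu(opt);              // assumed CPU selector
  FD_C_RuntimeOptionWrapperSetPaddleMKLDNN(opt, 1);  // enable MKLDNN
  FD_C_RuntimeOptionWrapperSetPaddleMKLDNNCacheSize(opt, 10);
  FD_C_RuntimeOptionWrapperSetOpenVINODevice(opt, "CPU");
  // ... pass `opt` to a model constructor, e.g.
  // FD_C_CreatePaddleClasModelWrapper(...), then destroy the wrapper
  // (a FD_C_DestroyRuntimeOptionWrapper counterpart is assumed to exist).
  return 0;
}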
63
c_api/fastdeploy_capi/types_internal.cc
Normal file
@@ -0,0 +1,63 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy_capi/types_internal.h"

namespace fastdeploy {

#ifdef ENABLE_VISION

std::unique_ptr<fastdeploy::vision::classification::PaddleClasModel>&
FD_C_CheckAndConvertPaddleClasModelWrapper(
    FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper) {
  FDASSERT(
      fd_c_paddleclas_model_wrapper != nullptr,
      "The pointer of fd_c_paddleclas_model_wrapper shouldn't be nullptr.");
  return fd_c_paddleclas_model_wrapper->paddleclas_model;
}

std::unique_ptr<fastdeploy::vision::detection::PPYOLOE>&
FD_C_CheckAndConvertPPYOLOEWrapper(FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper) {
  FDASSERT(fd_c_ppyoloe_wrapper != nullptr,
           "The pointer of fd_c_ppyoloe_wrapper shouldn't be nullptr.");
  return fd_c_ppyoloe_wrapper->ppyoloe_model;
}

std::unique_ptr<fastdeploy::vision::ClassifyResult>&
FD_C_CheckAndConvertClassifyResultWrapper(
    FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper) {
  FDASSERT(fd_c_classify_result_wrapper != nullptr,
           "The pointer of fd_c_classify_result_wrapper shouldn't be nullptr.");
  return fd_c_classify_result_wrapper->classify_result;
}

std::unique_ptr<fastdeploy::vision::DetectionResult>&
FD_C_CheckAndConvertDetectionResultWrapper(
    FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper) {
  FDASSERT(
      fd_c_detection_result_wrapper != nullptr,
      "The pointer of fd_c_detection_result_wrapper shouldn't be nullptr.");
  return fd_c_detection_result_wrapper->detection_result;
}
#endif

std::unique_ptr<fastdeploy::RuntimeOption>&
FD_C_CheckAndConvertRuntimeOptionWrapper(
    FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) {
  FDASSERT(fd_c_runtime_option_wrapper != nullptr,
           "The pointer of fd_c_runtime_option_wrapper shouldn't be nullptr.");
  return fd_c_runtime_option_wrapper->runtime_option;
}

} // namespace fastdeploy
70
c_api/fastdeploy_capi/types_internal.h
Normal file
@@ -0,0 +1,70 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "fastdeploy/runtime/runtime_option.h"
#include "fastdeploy_capi/fd_type.h"
#include <memory>

#ifdef ENABLE_VISION
#include "fastdeploy/vision/classification/ppcls/model.h"
#include "fastdeploy/vision/common/result.h"
#include "fastdeploy/vision/detection/ppdet/model.h"

typedef struct FD_C_ClassifyResultWrapper {
  std::unique_ptr<fastdeploy::vision::ClassifyResult> classify_result;
} FD_C_ClassifyResultWrapper;

typedef struct FD_C_DetectionResultWrapper {
  std::unique_ptr<fastdeploy::vision::DetectionResult> detection_result;
} FD_C_DetectionResultWrapper;

typedef struct FD_C_PaddleClasModelWrapper {
  std::unique_ptr<fastdeploy::vision::classification::PaddleClasModel>
      paddleclas_model;
} FD_C_PaddleClasModelWrapper;

typedef struct FD_C_PPYOLOEWrapper {
  std::unique_ptr<fastdeploy::vision::detection::PPYOLOE> ppyoloe_model;
} FD_C_PPYOLOEWrapper;

namespace fastdeploy {
std::unique_ptr<fastdeploy::vision::ClassifyResult>&
FD_C_CheckAndConvertClassifyResultWrapper(
    FD_C_ClassifyResultWrapper* fd_classify_result_wrapper);
std::unique_ptr<fastdeploy::vision::DetectionResult>&
FD_C_CheckAndConvertDetectionResultWrapper(
    FD_C_DetectionResultWrapper* fd_detection_result_wrapper);
std::unique_ptr<fastdeploy::vision::classification::PaddleClasModel>&
FD_C_CheckAndConvertPaddleClasModelWrapper(
    FD_C_PaddleClasModelWrapper* fd_paddleclas_model_wrapper);
std::unique_ptr<fastdeploy::vision::detection::PPYOLOE>&
FD_C_CheckAndConvertPPYOLOEWrapper(FD_C_PPYOLOEWrapper* fd_ppyoloe_wrapper);
} // namespace fastdeploy

#endif

typedef struct FD_C_RuntimeOptionWrapper {
  std::unique_ptr<fastdeploy::RuntimeOption> runtime_option;
} FD_C_RuntimeOptionWrapper;

namespace fastdeploy {
std::unique_ptr<fastdeploy::RuntimeOption>&
FD_C_CheckAndConvertRuntimeOptionWrapper(
    FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper);
}

#define CHECK_AND_CONVERT_FD_TYPE(TYPENAME, variable_name) \
  fastdeploy::FD_C_CheckAndConvert##TYPENAME(variable_name)
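
The CHECK_AND_CONVERT_FD_TYPE macro is plain token pasting: it routes a public C handle to the matching FD_C_CheckAndConvert* helper declared above, which asserts the pointer is non-null and returns a reference to the owned C++ object. A short sketch of how the implementation files use it:

// CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, wrapper) expands to
//   fastdeploy::FD_C_CheckAndConvertRuntimeOptionWrapper(wrapper)
// so a C API implementation can unwrap a handle in one line:
auto& runtime_option =
    CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper, fd_c_runtime_option_wrapper);
runtime_option->SetTrtCacheFile("./model.trt");  // unique_ptr -> C++ RuntimeOption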
53
c_api/fastdeploy_capi/vision/classification/ppcls/model.cc
Normal file
@@ -0,0 +1,53 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy_capi/vision/classification/ppcls/model.h"

#include "fastdeploy_capi/types_internal.h"

extern "C" {

FD_C_PaddleClasModelWrapper* FD_C_CreatePaddleClasModelWrapper(
    const char* model_file, const char* params_file, const char* config_file,
    FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const FD_C_ModelFormat model_format) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper =
      new FD_C_PaddleClasModelWrapper();
  fd_c_paddleclas_model_wrapper->paddleclas_model =
      std::unique_ptr<fastdeploy::vision::classification::PaddleClasModel>(
          new fastdeploy::vision::classification::PaddleClasModel(
              std::string(model_file), std::string(params_file),
              std::string(config_file), *runtime_option,
              static_cast<fastdeploy::ModelFormat>(model_format)));
  return fd_c_paddleclas_model_wrapper;
}

void FD_C_DestroyPaddleClasModelWrapper(
    __fd_take FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper) {
  delete fd_c_paddleclas_model_wrapper;
}

FD_C_Bool FD_C_PaddleClasModelWrapperPredict(
    __fd_take FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper,
    FD_C_Mat img, FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper) {
  cv::Mat* im = reinterpret_cast<cv::Mat*>(img);
  auto& paddleclas_model = CHECK_AND_CONVERT_FD_TYPE(
      PaddleClasModelWrapper, fd_c_paddleclas_model_wrapper);
  auto& classify_result = CHECK_AND_CONVERT_FD_TYPE(
      ClassifyResultWrapper, fd_c_classify_result_wrapper);
  return paddleclas_model->Predict(im, classify_result.get());
}
} // extern "C"
66
c_api/fastdeploy_capi/vision/classification/ppcls/model.h
Normal file
@@ -0,0 +1,66 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "fastdeploy_capi/fd_common.h"
#include "fastdeploy_capi/fd_type.h"
#include "fastdeploy_capi/runtime_option.h"
#include "fastdeploy_capi/vision/result.h"

typedef struct FD_C_PaddleClasModelWrapper FD_C_PaddleClasModelWrapper;

#ifdef __cplusplus
extern "C" {
#endif

/** \brief Create a new FD_C_PaddleClasModelWrapper object
 *
 * \param[in] model_file Path of the model file, e.g. resnet/model.pdmodel
 * \param[in] params_file Path of the parameter file, e.g. resnet/model.pdiparams; if the model format is ONNX, this parameter will be ignored
 * \param[in] config_file Path of the configuration file for deployment, e.g. resnet/infer_cfg.yml
 * \param[in] fd_c_runtime_option_wrapper RuntimeOption for inference; the default uses CPU and chooses the backend defined in `valid_cpu_backends`
 * \param[in] model_format Model format of the loaded model, default is Paddle format
 *
 * \return Return a pointer to FD_C_PaddleClasModelWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_PaddleClasModelWrapper*
FD_C_CreatePaddleClasModelWrapper(
    const char* model_file, const char* params_file, const char* config_file,
    FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const FD_C_ModelFormat model_format);

/** \brief Destroy a FD_C_PaddleClasModelWrapper object
 *
 * \param[in] fd_c_paddleclas_model_wrapper pointer to FD_C_PaddleClasModelWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyPaddleClasModelWrapper(
    __fd_take FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper);

/** \brief Predict the classification result for an input image
 *
 * \param[in] fd_c_paddleclas_model_wrapper pointer to FD_C_PaddleClasModelWrapper object
 * \param[in] img pointer to cv::Mat image
 * \param[in] fd_c_classify_result_wrapper pointer to FD_C_ClassifyResultWrapper object, which stores the result.
 */

FASTDEPLOY_CAPI_EXPORT extern FD_C_Bool FD_C_PaddleClasModelWrapperPredict(
    __fd_take FD_C_PaddleClasModelWrapper* fd_c_paddleclas_model_wrapper,
    FD_C_Mat img, FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper);

#ifdef __cplusplus
} // extern "C"
#endif
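
A hedged end-to-end sketch of this header from C++: construct the model, run Predict on a cv::Mat, and copy the result out through the accessors declared in vision/result.h. The reinterpret_cast mirrors the implementation above, which treats FD_C_Mat as a pointer to cv::Mat; the FD_C_ModelFormat enumerator name is an assumption.

#include <opencv2/opencv.hpp>
#include "fastdeploy_capi/vision/classification/ppcls/model.h"

void RunClassification(FD_C_RuntimeOptionWrapper* opt) {
  FD_C_PaddleClasModelWrapper* model = FD_C_CreatePaddleClasModelWrapper(
      "resnet/model.pdmodel", "resnet/model.pdiparams", "resnet/infer_cfg.yml",
      opt, FD_C_ModelFormat_PADDLE);  // enumerator name assumed
  cv::Mat image = cv::imread("test.jpg");
  FD_C_Mat img = reinterpret_cast<FD_C_Mat>(&image);  // FD_C_Mat wraps cv::Mat*
  FD_C_ClassifyResultWrapper* result = FD_C_CreateClassifyResultWrapper();
  if (FD_C_PaddleClasModelWrapperPredict(model, img, result)) {
    FD_C_ClassifyResult* data = FD_C_ClassifyResultWrapperGetData(result);
    // ... read data->label_ids.data[i] / data->scores.data[i] ...
    FD_C_DestroyClassifyResult(data);  // frees the deep copy made by GetData
  }
  FD_C_DestroyClassifyResultWrapper(result);
  FD_C_DestroyPaddleClasModelWrapper(model);
}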
53
c_api/fastdeploy_capi/vision/detection/ppdet/model.cc
Normal file
@@ -0,0 +1,53 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy_capi/vision/detection/ppdet/model.h"

#include "fastdeploy_capi/types_internal.h"
#include "fastdeploy_capi/vision/visualize.h"

extern "C" {

FD_C_PPYOLOEWrapper* FD_C_CreatesPPYOLOEWrapper(
    const char* model_file, const char* params_file, const char* config_file,
    FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const FD_C_ModelFormat model_format) {
  auto& runtime_option = CHECK_AND_CONVERT_FD_TYPE(RuntimeOptionWrapper,
                                                   fd_c_runtime_option_wrapper);
  FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper = new FD_C_PPYOLOEWrapper();
  fd_c_ppyoloe_wrapper->ppyoloe_model =
      std::unique_ptr<fastdeploy::vision::detection::PPYOLOE>(
          new fastdeploy::vision::detection::PPYOLOE(
              std::string(model_file), std::string(params_file),
              std::string(config_file), *runtime_option,
              static_cast<fastdeploy::ModelFormat>(model_format)));
  return fd_c_ppyoloe_wrapper;
}

void FD_C_DestroyPPYOLOEWrapper(
    __fd_take FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper) {
  delete fd_c_ppyoloe_wrapper;
}

FD_C_Bool FD_C_PPYOLOEWrapperPredict(
    FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper, FD_C_Mat img,
    FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper) {
  cv::Mat* im = reinterpret_cast<cv::Mat*>(img);
  auto& ppyoloe_model =
      CHECK_AND_CONVERT_FD_TYPE(PPYOLOEWrapper, fd_c_ppyoloe_wrapper);
  auto& detection_result = CHECK_AND_CONVERT_FD_TYPE(
      DetectionResultWrapper, fd_c_detection_result_wrapper);
  return ppyoloe_model->Predict(im, detection_result.get());
}
} // extern "C"
67
c_api/fastdeploy_capi/vision/detection/ppdet/model.h
Normal file
@@ -0,0 +1,67 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "fastdeploy_capi/fd_common.h"
#include "fastdeploy_capi/fd_type.h"
#include "fastdeploy_capi/runtime_option.h"
#include "fastdeploy_capi/vision/result.h"

typedef struct FD_C_PPYOLOEWrapper FD_C_PPYOLOEWrapper;
typedef struct FD_C_RuntimeOptionWrapper FD_C_RuntimeOptionWrapper;

#ifdef __cplusplus
extern "C" {
#endif

/** \brief Create a new FD_C_PPYOLOEWrapper object
 *
 * \param[in] model_file Path of the model file, e.g. ppyoloe/model.pdmodel
 * \param[in] params_file Path of the parameter file, e.g. ppyoloe/model.pdiparams; if the model format is ONNX, this parameter will be ignored
 * \param[in] config_file Path of the configuration file for deployment, e.g. ppyoloe/infer_cfg.yml
 * \param[in] fd_c_runtime_option_wrapper RuntimeOption for inference; the default uses CPU and chooses the backend defined in `valid_cpu_backends`
 * \param[in] model_format Model format of the loaded model, default is Paddle format
 *
 * \return Return a pointer to FD_C_PPYOLOEWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_PPYOLOEWrapper*
FD_C_CreatesPPYOLOEWrapper(
    const char* model_file, const char* params_file, const char* config_file,
    FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper,
    const FD_C_ModelFormat model_format);

/** \brief Destroy a FD_C_PPYOLOEWrapper object
 *
 * \param[in] fd_c_ppyoloe_wrapper pointer to FD_C_PPYOLOEWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern void
FD_C_DestroyPPYOLOEWrapper(__fd_take FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper);

/** \brief Predict the detection result for an input image
 *
 * \param[in] fd_c_ppyoloe_wrapper pointer to FD_C_PPYOLOEWrapper object
 * \param[in] img pointer to cv::Mat image
 * \param[in] fd_c_detection_result_wrapper pointer to FD_C_DetectionResultWrapper object, which stores the result.
 */

FASTDEPLOY_CAPI_EXPORT extern FD_C_Bool FD_C_PPYOLOEWrapperPredict(
    __fd_take FD_C_PPYOLOEWrapper* fd_c_ppyoloe_wrapper, FD_C_Mat img,
    FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper);

#ifdef __cplusplus
} // extern "C"
#endif
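
Detection mirrors the classification flow; note the creator really is spelled FD_C_CreatesPPYOLOEWrapper (with the extra 's'), as declared above. A minimal hedged sketch, under the same FD_C_Mat and enumerator assumptions as the classification example:

#include "fastdeploy_capi/vision/detection/ppdet/model.h"

void RunDetection(FD_C_RuntimeOptionWrapper* opt, FD_C_Mat img) {
  FD_C_PPYOLOEWrapper* det = FD_C_CreatesPPYOLOEWrapper(
      "ppyoloe/model.pdmodel", "ppyoloe/model.pdiparams",
      "ppyoloe/infer_cfg.yml", opt, FD_C_ModelFormat_PADDLE);  // name assumed
  FD_C_DetectionResultWrapper* res = FD_C_CreateDetectionResultWrapper();
  if (FD_C_PPYOLOEWrapperPredict(det, img, res)) {
    FD_C_DetectionResult* data = FD_C_DetectionResultWrapperGetData(res);
    // ... boxes / scores / label_ids, see vision/result.h ...
    FD_C_DestroyDetectionResult(data);
  }
  FD_C_DestroyDetectionResultWrapper(res);
  FD_C_DestroyPPYOLOEWrapper(det);
}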
238
c_api/fastdeploy_capi/vision/result.cc
Normal file
@@ -0,0 +1,238 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy_capi/vision/result.h"

#include "fastdeploy/utils/utils.h"
#include "fastdeploy_capi/types_internal.h"

extern "C" {

// Classification Results

FD_C_ClassifyResultWrapper* FD_C_CreateClassifyResultWrapper() {
  FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper =
      new FD_C_ClassifyResultWrapper();
  fd_c_classify_result_wrapper->classify_result =
      std::unique_ptr<fastdeploy::vision::ClassifyResult>(
          new fastdeploy::vision::ClassifyResult());
  return fd_c_classify_result_wrapper;
}

void FD_C_DestroyClassifyResultWrapper(
    __fd_take FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper) {
  delete fd_c_classify_result_wrapper;
}

void FD_C_DestroyClassifyResult(
    __fd_take FD_C_ClassifyResult* fd_c_classify_result) {
  if (fd_c_classify_result == nullptr) return;
  // delete label_ids
  delete[] fd_c_classify_result->label_ids.data;
  // delete scores
  delete[] fd_c_classify_result->scores.data;
  delete fd_c_classify_result;
}

FD_C_ClassifyResult* FD_C_ClassifyResultWrapperGetData(
    __fd_keep FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper) {
  auto& classify_result = CHECK_AND_CONVERT_FD_TYPE(
      ClassifyResultWrapper, fd_c_classify_result_wrapper);
  FD_C_ClassifyResult* fd_c_classify_result_data = new FD_C_ClassifyResult();
  // copy label_ids
  fd_c_classify_result_data->label_ids.size = classify_result->label_ids.size();
  fd_c_classify_result_data->label_ids.data =
      new int32_t[fd_c_classify_result_data->label_ids.size];
  memcpy(fd_c_classify_result_data->label_ids.data,
         classify_result->label_ids.data(),
         sizeof(int32_t) * fd_c_classify_result_data->label_ids.size);
  // copy scores
  fd_c_classify_result_data->scores.size = classify_result->scores.size();
  fd_c_classify_result_data->scores.data =
      new float[fd_c_classify_result_data->scores.size];
  memcpy(fd_c_classify_result_data->scores.data, classify_result->scores.data(),
         sizeof(float) * fd_c_classify_result_data->scores.size);
  fd_c_classify_result_data->type =
      static_cast<FD_C_ResultType>(classify_result->type);
  return fd_c_classify_result_data;
}

FD_C_ClassifyResultWrapper* FD_C_CreateClassifyResultWrapperFromData(
    __fd_keep FD_C_ClassifyResult* fd_c_classify_result) {
  FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper =
      FD_C_CreateClassifyResultWrapper();
  auto& classify_result = CHECK_AND_CONVERT_FD_TYPE(
      ClassifyResultWrapper, fd_c_classify_result_wrapper);
  // copy label_ids
  classify_result->label_ids.resize(fd_c_classify_result->label_ids.size);
  memcpy(classify_result->label_ids.data(),
         fd_c_classify_result->label_ids.data,
         sizeof(int32_t) * fd_c_classify_result->label_ids.size);
  // copy scores
  classify_result->scores.resize(fd_c_classify_result->scores.size);
  memcpy(classify_result->scores.data(), fd_c_classify_result->scores.data,
         sizeof(float) * fd_c_classify_result->scores.size);
  classify_result->type =
      static_cast<fastdeploy::vision::ResultType>(fd_c_classify_result->type);
  return fd_c_classify_result_wrapper;
}

// Detection Results

FD_C_DetectionResultWrapper* FD_C_CreateDetectionResultWrapper() {
  FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper =
      new FD_C_DetectionResultWrapper();
  fd_c_detection_result_wrapper->detection_result =
      std::unique_ptr<fastdeploy::vision::DetectionResult>(
          new fastdeploy::vision::DetectionResult());
  return fd_c_detection_result_wrapper;
}

void FD_C_DestroyDetectionResultWrapper(
    __fd_take FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper) {
  delete fd_c_detection_result_wrapper;
}

void FD_C_DestroyDetectionResult(
    __fd_take FD_C_DetectionResult* fd_c_detection_result) {
  if (fd_c_detection_result == nullptr) return;
  // delete boxes
  for (size_t i = 0; i < fd_c_detection_result->boxes.size; i++) {
    delete[] fd_c_detection_result->boxes.data[i].data;
  }
  delete[] fd_c_detection_result->boxes.data;
  // delete scores
  delete[] fd_c_detection_result->scores.data;
  // delete label_ids
  delete[] fd_c_detection_result->label_ids.data;
  // delete masks (inner buffers, then the mask array itself)
  for (size_t i = 0; i < fd_c_detection_result->masks.size; i++) {
    delete[] fd_c_detection_result->masks.data[i].data.data;
    delete[] fd_c_detection_result->masks.data[i].shape.data;
  }
  delete[] fd_c_detection_result->masks.data;
  delete fd_c_detection_result;
}

FD_C_DetectionResult* FD_C_DetectionResultWrapperGetData(
    __fd_keep FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper) {
  auto& detection_result = CHECK_AND_CONVERT_FD_TYPE(
      DetectionResultWrapper, fd_c_detection_result_wrapper);
  FD_C_DetectionResult* fd_c_detection_result = new FD_C_DetectionResult();
  // copy boxes
  const int boxes_coordinate_dim = 4;
  fd_c_detection_result->boxes.size = detection_result->boxes.size();
  fd_c_detection_result->boxes.data =
      new FD_C_OneDimArrayFloat[fd_c_detection_result->boxes.size];
  for (size_t i = 0; i < detection_result->boxes.size(); i++) {
    fd_c_detection_result->boxes.data[i].size = boxes_coordinate_dim;
    fd_c_detection_result->boxes.data[i].data = new float[boxes_coordinate_dim];
    for (size_t j = 0; j < boxes_coordinate_dim; j++) {
      fd_c_detection_result->boxes.data[i].data[j] =
          detection_result->boxes[i][j];
    }
  }
  // copy scores
  fd_c_detection_result->scores.size = detection_result->scores.size();
  fd_c_detection_result->scores.data =
      new float[fd_c_detection_result->scores.size];
  memcpy(fd_c_detection_result->scores.data, detection_result->scores.data(),
         sizeof(float) * fd_c_detection_result->scores.size);
  // copy label_ids
  fd_c_detection_result->label_ids.size = detection_result->label_ids.size();
  fd_c_detection_result->label_ids.data =
      new int32_t[fd_c_detection_result->label_ids.size];
  memcpy(fd_c_detection_result->label_ids.data,
         detection_result->label_ids.data(),
         sizeof(int32_t) * fd_c_detection_result->label_ids.size);
  // copy masks
  fd_c_detection_result->masks.size = detection_result->masks.size();
  fd_c_detection_result->masks.data =
      new FD_C_Mask[fd_c_detection_result->masks.size];
  for (size_t i = 0; i < detection_result->masks.size(); i++) {
    // copy data in mask
    fd_c_detection_result->masks.data[i].data.size =
        detection_result->masks[i].data.size();
    fd_c_detection_result->masks.data[i].data.data =
        new uint8_t[detection_result->masks[i].data.size()];
    memcpy(fd_c_detection_result->masks.data[i].data.data,
           detection_result->masks[i].data.data(),
           sizeof(uint8_t) * detection_result->masks[i].data.size());
    // copy shape in mask
    fd_c_detection_result->masks.data[i].shape.size =
        detection_result->masks[i].shape.size();
    fd_c_detection_result->masks.data[i].shape.data =
        new int64_t[detection_result->masks[i].shape.size()];
    memcpy(fd_c_detection_result->masks.data[i].shape.data,
           detection_result->masks[i].shape.data(),
           sizeof(int64_t) * detection_result->masks[i].shape.size());
    fd_c_detection_result->masks.data[i].type =
        static_cast<FD_C_ResultType>(detection_result->masks[i].type);
  }
  fd_c_detection_result->contain_masks = detection_result->contain_masks;
  fd_c_detection_result->type =
      static_cast<FD_C_ResultType>(detection_result->type);
  return fd_c_detection_result;
}

FD_C_DetectionResultWrapper* FD_C_CreateDetectionResultWrapperFromData(
    __fd_keep FD_C_DetectionResult* fd_c_detection_result) {
  FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper =
      FD_C_CreateDetectionResultWrapper();
  auto& detection_result = CHECK_AND_CONVERT_FD_TYPE(
      DetectionResultWrapper, fd_c_detection_result_wrapper);

  // copy boxes
  const int boxes_coordinate_dim = 4;
  detection_result->boxes.resize(fd_c_detection_result->boxes.size);
  for (size_t i = 0; i < fd_c_detection_result->boxes.size; i++) {
    for (size_t j = 0; j < boxes_coordinate_dim; j++) {
      detection_result->boxes[i][j] =
          fd_c_detection_result->boxes.data[i].data[j];
    }
  }
  // copy scores
  detection_result->scores.resize(fd_c_detection_result->scores.size);
  memcpy(detection_result->scores.data(), fd_c_detection_result->scores.data,
         sizeof(float) * fd_c_detection_result->scores.size);
  // copy label_ids
  detection_result->label_ids.resize(fd_c_detection_result->label_ids.size);
  memcpy(detection_result->label_ids.data(),
         fd_c_detection_result->label_ids.data,
         sizeof(int32_t) * fd_c_detection_result->label_ids.size);
  // copy masks
  detection_result->masks.resize(fd_c_detection_result->masks.size);
  for (size_t i = 0; i < fd_c_detection_result->masks.size; i++) {
    // copy data in mask
    detection_result->masks[i].data.resize(
        fd_c_detection_result->masks.data[i].data.size);
    memcpy(detection_result->masks[i].data.data(),
           fd_c_detection_result->masks.data[i].data.data,
           sizeof(uint8_t) * fd_c_detection_result->masks.data[i].data.size);
    // copy shape in mask
    detection_result->masks[i].shape.resize(
        fd_c_detection_result->masks.data[i].shape.size);
    memcpy(detection_result->masks[i].shape.data(),
           fd_c_detection_result->masks.data[i].shape.data,
           sizeof(int64_t) * fd_c_detection_result->masks.data[i].shape.size);
    detection_result->masks[i].type =
        static_cast<fastdeploy::vision::ResultType>(
            fd_c_detection_result->masks.data[i].type);
  }
  detection_result->contain_masks = fd_c_detection_result->contain_masks;
  detection_result->type =
      static_cast<fastdeploy::vision::ResultType>(fd_c_detection_result->type);

  return fd_c_detection_result_wrapper;
}
} // extern "C"
161
c_api/fastdeploy_capi/vision/result.h
Normal file
@@ -0,0 +1,161 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "fastdeploy_capi/fd_common.h"
#include "fastdeploy_capi/fd_type.h"

typedef struct FD_C_ClassifyResultWrapper FD_C_ClassifyResultWrapper;
typedef struct FD_C_DetectionResultWrapper FD_C_DetectionResultWrapper;

#ifdef __cplusplus
extern "C" {
#endif

FD_ENUM(FD_C_ResultType){
    UNKNOWN_RESULT,
    CLASSIFY,
    DETECTION,
    SEGMENTATION,
    OCR,
    MOT,
    FACE_DETECTION,
    FACE_ALIGNMENT,
    FACE_RECOGNITION,
    MATTING,
    MASK,
    KEYPOINT_DETECTION,
    HEADPOSE,
};

typedef struct FD_C_ClassifyResult {
  FD_C_OneDimArrayInt32 label_ids;
  FD_C_OneDimArrayFloat scores;
  FD_C_ResultType type;
} FD_C_ClassifyResult;

typedef struct FD_C_Mask {
  FD_C_OneDimArrayUint8 data;
  FD_C_OneDimArrayInt64 shape;
  FD_C_ResultType type;
} FD_C_Mask;

typedef struct FD_C_OneDimMask {
  size_t size;
  FD_C_Mask* data;
} FD_C_OneDimMask;  // std::vector<FD_C_Mask>

typedef struct FD_C_DetectionResult {
  FD_C_TwoDimArrayFloat boxes;
  FD_C_OneDimArrayFloat scores;
  FD_C_OneDimArrayInt32 label_ids;
  FD_C_OneDimMask masks;
  FD_C_Bool contain_masks;
  FD_C_ResultType type;
} FD_C_DetectionResult;

// Classification Results

/** \brief Create a new FD_C_ClassifyResultWrapper object
 *
 * \return Return a pointer to FD_C_ClassifyResultWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_ClassifyResultWrapper*
FD_C_CreateClassifyResultWrapper();

/** \brief Destroy a FD_C_ClassifyResultWrapper object
 *
 * \param[in] fd_c_classify_result_wrapper pointer to FD_C_ClassifyResultWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyClassifyResultWrapper(
    __fd_take FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper);

/** \brief Destroy a FD_C_ClassifyResult object
 *
 * \param[in] fd_c_classify_result pointer to FD_C_ClassifyResult object
 */

FASTDEPLOY_CAPI_EXPORT extern void
FD_C_DestroyClassifyResult(__fd_take FD_C_ClassifyResult* fd_c_classify_result);

/** \brief Get a FD_C_ClassifyResult object from FD_C_ClassifyResultWrapper object
 *
 * \param[in] fd_c_classify_result_wrapper pointer to FD_C_ClassifyResultWrapper object
 * \return Return a pointer to FD_C_ClassifyResult object
 */
FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_ClassifyResult*
FD_C_ClassifyResultWrapperGetData(
    __fd_keep FD_C_ClassifyResultWrapper* fd_c_classify_result_wrapper);

/** \brief Create a new FD_C_ClassifyResultWrapper object from FD_C_ClassifyResult object
 *
 * \param[in] fd_c_classify_result pointer to FD_C_ClassifyResult object
 * \return Return a pointer to FD_C_ClassifyResultWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_ClassifyResultWrapper*
FD_C_CreateClassifyResultWrapperFromData(
    __fd_keep FD_C_ClassifyResult* fd_c_classify_result);

// Detection Results

/** \brief Create a new FD_C_DetectionResultWrapper object
 *
 * \return Return a pointer to FD_C_DetectionResultWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_DetectionResultWrapper*
FD_C_CreateDetectionResultWrapper();

/** \brief Destroy a FD_C_DetectionResultWrapper object
 *
 * \param[in] fd_c_detection_result_wrapper pointer to FD_C_DetectionResultWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyDetectionResultWrapper(
    __fd_take FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper);

/** \brief Destroy a FD_C_DetectionResult object
 *
 * \param[in] fd_c_detection_result pointer to FD_C_DetectionResult object
 */

FASTDEPLOY_CAPI_EXPORT extern void FD_C_DestroyDetectionResult(
    __fd_take FD_C_DetectionResult* fd_c_detection_result);

/** \brief Get a FD_C_DetectionResult object from FD_C_DetectionResultWrapper object
 *
 * \param[in] fd_c_detection_result_wrapper pointer to FD_C_DetectionResultWrapper object
 * \return Return a pointer to FD_C_DetectionResult object
 */
FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_DetectionResult*
FD_C_DetectionResultWrapperGetData(
    __fd_keep FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper);

/** \brief Create a new FD_C_DetectionResultWrapper object from FD_C_DetectionResult object
 *
 * \param[in] fd_c_detection_result pointer to FD_C_DetectionResult object
 * \return Return a pointer to FD_C_DetectionResultWrapper object
 */

FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_DetectionResultWrapper*
FD_C_CreateDetectionResultWrapperFromData(
    __fd_keep FD_C_DetectionResult* fd_c_detection_result);

#ifdef __cplusplus
} // extern "C"
#endif
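
The plain C structs above mirror the C++ result types field by field, so once GetData has deep-copied a result out of its wrapper, it can be walked with ordinary array indexing. A short sketch of printing a detection result:

#include <cstdio>
#include "fastdeploy_capi/vision/result.h"

void PrintDetection(FD_C_DetectionResult* data) {
  for (size_t i = 0; i < data->boxes.size; ++i) {
    const float* box = data->boxes.data[i].data;  // xmin, ymin, xmax, ymax
    std::printf("label=%d score=%.3f box=[%.1f %.1f %.1f %.1f]\n",
                (int)data->label_ids.data[i], data->scores.data[i],
                box[0], box[1], box[2], box[3]);
  }
}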
35
c_api/fastdeploy_capi/vision/visualize.cc
Normal file
@@ -0,0 +1,35 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy_capi/vision/visualize.h"

#include "fastdeploy/vision/visualize/visualize.h"
#include "fastdeploy_capi/types_internal.h"

extern "C" {

FD_C_Mat FD_C_VisDetection(FD_C_Mat im,
                           FD_C_DetectionResult* fd_c_detection_result,
                           float score_threshold, int line_size,
                           float font_size) {
  FD_C_DetectionResultWrapper* fd_c_detection_result_wrapper =
      FD_C_CreateDetectionResultWrapperFromData(fd_c_detection_result);
  auto& detection_result = CHECK_AND_CONVERT_FD_TYPE(
      DetectionResultWrapper, fd_c_detection_result_wrapper);
  cv::Mat result = fastdeploy::vision::Visualize::VisDetection(
      *(reinterpret_cast<cv::Mat*>(im)), *detection_result, score_threshold,
      line_size, font_size);
  // release the temporary wrapper created above
  FD_C_DestroyDetectionResultWrapper(fd_c_detection_result_wrapper);
  return new cv::Mat(result);
}
} // extern "C"
36
c_api/fastdeploy_capi/vision/visualize.h
Normal file
@@ -0,0 +1,36 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "fastdeploy_capi/fd_common.h"
#include "fastdeploy_capi/fd_type.h"
#include "fastdeploy_capi/vision/result.h"

#ifdef __cplusplus
extern "C" {
#endif

/** \brief Visualize detection results on an image
 *
 * \param[in] im pointer to cv::Mat image
 * \param[in] fd_detection_result pointer to FD_C_DetectionResult object
 * \param[in] score_threshold bounding boxes below this score are not drawn
 * \param[in] line_size line thickness of the bounding boxes
 * \param[in] font_size font size of the label text
 *
 * \return Return a pointer to cv::Mat object
 */

FASTDEPLOY_CAPI_EXPORT extern __fd_give FD_C_Mat
FD_C_VisDetection(FD_C_Mat im, FD_C_DetectionResult* fd_detection_result,
                  float score_threshold, int line_size, float font_size);

#ifdef __cplusplus
} // extern "C"
#endif
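
A hedged sketch tying the detection example to this header: draw the boxes onto the input image and save it. The cv::Mat casts follow visualize.cc above, which returns a heap-allocated cv::Mat; the threshold and sizes are illustrative.

#include <opencv2/opencv.hpp>
#include "fastdeploy_capi/vision/visualize.h"

void SaveVis(FD_C_Mat img, FD_C_DetectionResult* data) {
  FD_C_Mat vis = FD_C_VisDetection(img, data, /*score_threshold=*/0.5f,
                                   /*line_size=*/2, /*font_size=*/0.5f);
  cv::imwrite("vis_result.jpg", *reinterpret_cast<cv::Mat*>(vis));
  delete reinterpret_cast<cv::Mat*>(vis);  // VisDetection returns `new cv::Mat`
}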

@@ -61,7 +61,7 @@ endif(WIN32)
message("FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}")

set(FASTTOKENIZER_URL_BASE "https://bj.bcebos.com/paddlenlp/fast_tokenizer/")
set(FASTTOKENIZER_VERSION "1.0.1")
set(FASTTOKENIZER_VERSION "1.0.2")

# Set download url
if(WIN32)
@@ -80,7 +80,7 @@ if(PADDLEINFERENCE_DIRECTORY)
  endif()
else()
  set(PADDLEINFERENCE_URL_BASE "https://bj.bcebos.com/fastdeploy/third_libs/")
  set(PADDLEINFERENCE_VERSION "2.4-dev4")
  set(PADDLEINFERENCE_VERSION "2.4-dev5")
  if(WIN32)
    if (WITH_GPU)
      set(PADDLEINFERENCE_FILE "paddle_inference-win-x64-gpu-trt-${PADDLEINFERENCE_VERSION}.zip")
@@ -44,6 +44,7 @@ function(fastdeploy_summary)
  message(STATUS "  WITH_ASCEND : ${WITH_ASCEND}")
  message(STATUS "  WITH_TIMVX : ${WITH_TIMVX}")
  message(STATUS "  WITH_KUNLUNXIN : ${WITH_KUNLUNXIN}")
  message(STATUS "  WITH_CAPI : ${WITH_CAPI}")
  if(ENABLE_ORT_BACKEND)
    message(STATUS "  ONNXRuntime version : ${ONNXRUNTIME_VERSION}")
  endif()
@@ -10,7 +10,7 @@
- [Build and install the IPU deployment environment](cn/build_and_install/ipu.md)
- [Build and install the KunlunXin XPU deployment environment](cn/build_and_install/kunlunxin.md)
- [Build and install the Rockchip RV1126 deployment environment](cn/build_and_install/rv1126.md)
- [Build and install the Rockchip RK3588 deployment environment](cn/build_and_install/rknpu2.md)
- [Build and install the Rockchip RK3588 / RK356X deployment environment](cn/build_and_install/rknpu2.md)
- [Build and install the Amlogic A311D deployment environment](cn/build_and_install/a311d.md)
- [Build and install the Huawei Ascend deployment environment](cn/build_and_install/huawei_ascend.md)
- [Build and install the Jetson deployment environment](cn/build_and_install/jetson.md)
@@ -36,86 +36,131 @@ static std::string strip(const std::string& str, char ch = ' ') {
  return str.substr(i, j + 1 - i);
}

void DumpCurrentCpuMemoryUsage(const std::string& name) {
// Split string
static void split(const std::string& s, std::vector<std::string>& tokens,
                  char delim = ' ') {
  tokens.clear();
  size_t lastPos = s.find_first_not_of(delim, 0);
  size_t pos = s.find(delim, lastPos);
  while (lastPos != std::string::npos) {
    tokens.emplace_back(s.substr(lastPos, pos - lastPos));
    lastPos = s.find_first_not_of(delim, pos);
    pos = s.find(delim, lastPos);
  }
  return;
}

ResourceUsageMonitor::ResourceUsageMonitor(int sampling_interval_ms, int gpu_id)
    : is_supported_(false),
      sampling_interval_(sampling_interval_ms),
      gpu_id_(gpu_id) {
#if defined(__linux__) || defined(__ANDROID__)
  is_supported_ = true;
#else
  is_supported_ = false;
#endif
  if (!is_supported_) {
    FDASSERT(false,
             "Currently ResourceUsageMonitor only supports Linux and ANDROID.")
    return;
  }
}

void ResourceUsageMonitor::Start() {
  if (!is_supported_) return;
  if (check_memory_thd_ != nullptr) {
    FDINFO << "Memory monitoring has already started!" << std::endl;
    return;
  }
  FDINFO << "Start monitoring memory!" << std::endl;
  stop_signal_ = false;
  check_memory_thd_.reset(new std::thread(([this]() {
    // Note we retrieve the memory usage at the very beginning of the thread.
    while (true) {
      std::string cpu_mem_info = GetCurrentCpuMemoryInfo();
      // get max_cpu_mem
      std::vector<std::string> cpu_tokens;
      split(cpu_mem_info, cpu_tokens, ' ');
      max_cpu_mem_ = std::max(max_cpu_mem_, stof(cpu_tokens[3]) / 1024);
#if defined(WITH_GPU)
      std::string gpu_mem_info = GetCurrentGpuMemoryInfo(gpu_id_);
      // get max_gpu_mem and max_gpu_util
      std::vector<std::string> gpu_tokens;
      split(gpu_mem_info, gpu_tokens, ',');
      max_gpu_mem_ = std::max(max_gpu_mem_, stof(gpu_tokens[6]));
      max_gpu_util_ = std::max(max_gpu_util_, stof(gpu_tokens[7]));
#endif
      if (stop_signal_) break;
      std::this_thread::sleep_for(
          std::chrono::milliseconds(sampling_interval_));
    }
  })));
}

void ResourceUsageMonitor::Stop() {
  if (!is_supported_) {
    return;
  }
  if (check_memory_thd_ == nullptr) {
    FDINFO << "Memory monitoring hasn't started yet or has stopped!"
           << std::endl;
    return;
  }
  FDINFO << "Stop monitoring memory!" << std::endl;
  StopInternal();
}

void ResourceUsageMonitor::StopInternal() {
  stop_signal_ = true;
  if (check_memory_thd_ == nullptr) {
    return;
  }
  if (check_memory_thd_ != nullptr) {
    check_memory_thd_->join();
  }
  check_memory_thd_.reset(nullptr);
}

std::string ResourceUsageMonitor::GetCurrentCpuMemoryInfo() {
  std::string result = "";
#if defined(__linux__) || defined(__ANDROID__)
  int iPid = static_cast<int>(getpid());
  std::string command = "pmap -x " + std::to_string(iPid) + " | grep total";
  FILE* pp = popen(command.data(), "r");
  if (!pp) return;
  if (!pp) return "";
  char tmp[1024];

  while (fgets(tmp, sizeof(tmp), pp) != NULL) {
    std::ofstream write;
    write.open(name, std::ios::app);
    write << tmp;
    write.close();
    result += tmp;
  }
  pclose(pp);
#else
  FDASSERT(false,
           "Currently collect cpu memory info only supports Linux and ANDROID.")
#endif
  return;
  return result;
}

void DumpCurrentGpuMemoryUsage(const std::string& name, int device_id) {
std::string ResourceUsageMonitor::GetCurrentGpuMemoryInfo(int device_id) {
  std::string result = "";
#if defined(__linux__) && defined(WITH_GPU)
  std::string command = "nvidia-smi --id=" + std::to_string(device_id) +
                        " --query-gpu=index,uuid,name,timestamp,memory.total,"
                        "memory.free,memory.used,utilization.gpu,utilization."
                        "memory --format=csv,noheader,nounits";
  FILE* pp = popen(command.data(), "r");
  if (!pp) return;
  if (!pp) return "";
  char tmp[1024];

  while (fgets(tmp, sizeof(tmp), pp) != NULL) {
    std::ofstream write;
    write.open(name, std::ios::app);
    write << tmp;
    write.close();
    result += tmp;
  }
  pclose(pp);
#else
  FDASSERT(false,
           "Currently collect gpu memory info only supports Linux in GPU.")
#endif
  return;
}

float GetCpuMemoryUsage(const std::string& name) {
  std::ifstream read(name);
  std::string line;
  float max_cpu_mem = -1;
  while (getline(read, line)) {
    std::stringstream ss(line);
    std::string tmp;
    std::vector<std::string> nums;
    while (getline(ss, tmp, ' ')) {
      tmp = strip(tmp);
      if (tmp.empty()) continue;
      nums.push_back(tmp);
    }
    max_cpu_mem = std::max(max_cpu_mem, stof(nums[3]));
  }
  return max_cpu_mem / 1024;
}

float GetGpuMemoryUsage(const std::string& name) {
  std::ifstream read(name);
  std::string line;
  float max_gpu_mem = -1;
  while (getline(read, line)) {
    std::stringstream ss(line);
    std::string tmp;
    std::vector<std::string> nums;
    while (getline(ss, tmp, ',')) {
      tmp = strip(tmp);
      if (tmp.empty()) continue;
      nums.push_back(tmp);
    }
    max_gpu_mem = std::max(max_gpu_mem, stof(nums[6]));
  }
  return max_gpu_mem;
  return result;
}

} // namespace benchmark
@@ -13,23 +13,72 @@
// limitations under the License.
#pragma once

#include <memory>
#include <thread>  // NOLINT
#include "fastdeploy/utils/utils.h"

namespace fastdeploy {
namespace benchmark {
/*! @brief ResourceUsageMonitor object, used to collect memory info.
 */
class FASTDEPLOY_DECL ResourceUsageMonitor {
 public:
  /** \brief Set sampling_interval_ms and gpu_id for ResourceUsageMonitor.
   *
   * \param[in] sampling_interval_ms How often to collect memory info (ms).
   * \param[in] gpu_id Device (gpu) id, default 0.
   */
  explicit ResourceUsageMonitor(int sampling_interval_ms, int gpu_id = 0);

// Record current cpu memory usage into file
FASTDEPLOY_DECL void DumpCurrentCpuMemoryUsage(const std::string& name);
  ~ResourceUsageMonitor() { StopInternal(); }

// Record current gpu memory usage into file
FASTDEPLOY_DECL void DumpCurrentGpuMemoryUsage(const std::string& name,
                                               int device_id);
  /// Start memory info collection
  void Start();
  /// Stop memory info collection
  void Stop();
  /// Get maximum cpu memory usage
  float GetMaxCpuMem() const {
    if (!is_supported_ || check_memory_thd_ == nullptr) {
      return -1.0f;
    }
    return max_cpu_mem_;
  }
  /// Get maximum gpu memory usage
  float GetMaxGpuMem() const {
    if (!is_supported_ || check_memory_thd_ == nullptr) {
      return -1.0f;
    }
    return max_gpu_mem_;
  }
  /// Get maximum gpu util
  float GetMaxGpuUtil() const {
    if (!is_supported_ || check_memory_thd_ == nullptr) {
      return -1.0f;
    }
    return max_gpu_util_;
  }

// Get Max cpu memory usage
FASTDEPLOY_DECL float GetCpuMemoryUsage(const std::string& name);
  ResourceUsageMonitor(ResourceUsageMonitor&) = delete;
  ResourceUsageMonitor& operator=(const ResourceUsageMonitor&) = delete;
  ResourceUsageMonitor(ResourceUsageMonitor&&) = delete;
  ResourceUsageMonitor& operator=(const ResourceUsageMonitor&&) = delete;

// Get Max gpu memory usage
FASTDEPLOY_DECL float GetGpuMemoryUsage(const std::string& name);
 private:
  void StopInternal();
  // Get current cpu memory info
  std::string GetCurrentCpuMemoryInfo();
  // Get current gpu memory info
  std::string GetCurrentGpuMemoryInfo(int device_id);

  bool is_supported_ = false;
  bool stop_signal_ = false;
  const int sampling_interval_;
  float max_cpu_mem_ = 0.0f;
  float max_gpu_mem_ = 0.0f;
  float max_gpu_util_ = 0.0f;
  const int gpu_id_ = 0;
  std::unique_ptr<std::thread> check_memory_thd_ = nullptr;
};

} // namespace benchmark
} // namespace fastdeploy
|
||||
|
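A minimal usage sketch of the monitor declared above, based only on the declarations shown in this diff (the header path matches the include used by the benchmark binaries):

#include "fastdeploy/benchmark/utils.h"  // ResourceUsageMonitor

int main() {
  // Sample every 50 ms on GPU 0 (constructor args per the doc comment above).
  fastdeploy::benchmark::ResourceUsageMonitor monitor(
      /*sampling_interval_ms=*/50, /*gpu_id=*/0);
  monitor.Start();
  // ... run the inference / benchmark loop here ...
  monitor.Stop();
  // Each getter returns -1.0f when monitoring is unsupported or never started.
  float cpu_mb = monitor.GetMaxCpuMem();
  float gpu_mb = monitor.GetMaxGpuMem();
  float gpu_util = monitor.GetMaxGpuUtil();
  (void)cpu_mb; (void)gpu_mb; (void)gpu_util;
  return 0;
}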
@@ -51,8 +51,9 @@ void LiteBackend::ConfigureCpu(const LiteBackendOption& option) {

void LiteBackend::ConfigureKunlunXin(const LiteBackendOption& option) {
  std::vector<paddle::lite_api::Place> valid_places;
  valid_places.push_back(
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kInt8)});
  // TODO(yeliang): Placing kInt8 first may cause accuracy issues of some model
  // valid_places.push_back(
  //     paddle::lite_api::Place{TARGET(kXPU), PRECISION(kInt8)});
  if (option.enable_fp16) {
    valid_places.push_back(
        paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFP16)});
@@ -32,7 +32,7 @@ class LiteBackend : public BaseBackend {
  LiteBackend() {}
  virtual ~LiteBackend() = default;

  bool Init(const RuntimeOption& option);
  bool Init(const RuntimeOption& option) override;

  bool Infer(std::vector<FDTensor>& inputs,
             std::vector<FDTensor>* outputs,
@@ -75,6 +75,16 @@ struct PaddleBackendOption {
    delete_pass_names.push_back(pass_name);
  }

  void SetIpuConfig(bool enable_fp16, int replica_num,
                    float available_memory_proportion,
                    bool enable_half_partial) {
    ipu_option.ipu_enable_fp16 = enable_fp16;
    ipu_option.ipu_replica_num = replica_num;
    ipu_option.ipu_available_memory_proportion =
        available_memory_proportion;
    ipu_option.ipu_enable_half_partial = enable_half_partial;
  }

  // The following parameters may be removed; please do not
  // read or write them directly
  TrtBackendOption trt_option;
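A minimal sketch of how the new grouped IPU config is reached from user code, per the declarations in this diff (the umbrella header name is an assumption):

#include "fastdeploy/runtime.h"  // assumed umbrella header for RuntimeOption

int main() {
  fastdeploy::RuntimeOption option;
  option.UseIpu(/*device_num=*/1, /*micro_batch_size=*/1);
  // New style: SetIpuConfig now lives on the grouped paddle_infer_option
  // instead of on RuntimeOption directly (the old method is removed below).
  option.paddle_infer_option.SetIpuConfig(/*enable_fp16=*/true,
                                          /*replica_num=*/1,
                                          /*available_memory_proportion=*/0.6f,
                                          /*enable_half_partial=*/true);
  return 0;
}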
@@ -47,7 +47,8 @@ void BindPaddleOption(pybind11::module& m) {
      .def_readwrite("gpu_mem_init_size",
                     &PaddleBackendOption::gpu_mem_init_size)
      .def("disable_trt_ops", &PaddleBackendOption::DisableTrtOps)
      .def("delete_pass", &PaddleBackendOption::DeletePass);
      .def("delete_pass", &PaddleBackendOption::DeletePass)
      .def("set_ipu_config", &PaddleBackendOption::SetIpuConfig);
}

}  // namespace fastdeploy
@@ -45,7 +45,10 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
          "file will save to the directory where paddle model saved."
       << std::endl;
      use_static = true;
      config_.SetOptimCacheDir(option.trt_option.serialize_file);
      std::string opt_cache_dir =
          GetDirFromPath(option.trt_option.serialize_file);

      config_.SetOptimCacheDir(opt_cache_dir);
    }
    config_.EnableTensorRtEngine(option.trt_option.max_workspace_size,
                                 option.trt_option.max_batch_size, 20,
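The fix above passes a directory, not the serialize *file*, to SetOptimCacheDir, since the Paddle-TRT optimization cache expects a directory. A hypothetical stand-in for GetDirFromPath (the real helper lives in FastDeploy's utils) shows the intended behavior:

#include <iostream>
#include <string>

static std::string DirFromPath(const std::string& path) {
  size_t pos = path.find_last_of("/\\");
  // No separator: fall back to the current directory.
  return pos == std::string::npos ? std::string(".") : path.substr(0, pos);
}

int main() {
  std::cout << DirFromPath("models/trt_serialized.trt") << "\n";  // "models"
  return 0;
}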
@@ -462,14 +462,4 @@ void RuntimeOption::UseIpu(int device_num, int micro_batch_size,
#endif
}

void RuntimeOption::SetIpuConfig(bool enable_fp16, int replica_num,
                                 float available_memory_proportion,
                                 bool enable_half_partial) {
  paddle_infer_option.ipu_option.ipu_enable_fp16 = enable_fp16;
  paddle_infer_option.ipu_option.ipu_replica_num = replica_num;
  paddle_infer_option.ipu_option.ipu_available_memory_proportion =
      available_memory_proportion;
  paddle_infer_option.ipu_option.ipu_enable_half_partial = enable_half_partial;
}

}  // namespace fastdeploy
343
fastdeploy/runtime/runtime_option.h
Normal file → Executable file
@@ -61,22 +61,19 @@ struct FASTDEPLOY_DECL RuntimeOption {

  /// Use cpu to inference, the runtime will inference on CPU by default
  void UseCpu();

  /// Use Nvidia GPU to inference
  void UseGpu(int gpu_id = 0);

  /// Use RKNPU2 e.g. RK3588/RK356X to inference
  void UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name =
                     fastdeploy::rknpu2::CpuName::RK3588,
                 fastdeploy::rknpu2::CoreMask rknpu2_core =
                     fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_0);

  /// Use TimVX to inference
  /// Use TimVX e.g. RV1126/A311D to inference
  void UseTimVX();

  /// Use Huawei Ascend to inference
  void UseAscend();

  ///
  /// Use Sophgo to inference
  void UseSophgo();
  /// \brief Turn on KunlunXin XPU.
  ///
  /// \param kunlunxin_id the KunlunXin XPU card to use (default is 0).
@@ -106,9 +103,6 @@ struct FASTDEPLOY_DECL RuntimeOption {
                 bool adaptive_seqlen = false,
                 bool enable_multi_stream = false);

  /// Use Sophgo to inference
  void UseSophgo();

  void SetExternalStream(void* external_stream);

  void SetExternalRawStream(size_t external_stream);
@@ -117,212 +111,20 @@ struct FASTDEPLOY_DECL RuntimeOption {
   * @brief Set number of cpu threads while inference on CPU, by default it will be decided by the different backends
   */
  void SetCpuThreadNum(int thread_num);

  /// Set ORT graph opt level, default is decided by ONNX Runtime itself
  void SetOrtGraphOptLevel(int level = -1);

  /// Set Paddle Inference as inference backend, support CPU/GPU
  void UsePaddleBackend();

  /// Wrapper function of UsePaddleBackend()
  void UsePaddleInferBackend() { return UsePaddleBackend(); }

  /// Set ONNX Runtime as inference backend, support CPU/GPU
  void UseOrtBackend();

  /// Set SOPHGO Runtime as inference backend, support CPU/GPU
  /// Set SOPHGO Runtime as inference backend, support SOPHGO
  void UseSophgoBackend();

  /// Set TensorRT as inference backend, only support GPU
  void UseTrtBackend();

  /// Set Poros backend as inference backend, support CPU/GPU
  void UsePorosBackend();

  /// Set OpenVINO as inference backend, only support CPU
  void UseOpenVINOBackend();

  /// Set Paddle Lite as inference backend, only support arm cpu
  void UseLiteBackend();

  /// Wrapper function of UseLiteBackend()
  void UsePaddleLiteBackend() { return UseLiteBackend(); }

  /// Set mkldnn switch while using Paddle Inference as inference backend
  void SetPaddleMKLDNN(bool pd_mkldnn = true);

  /*
   * @brief If TensorRT backend is used, EnablePaddleToTrt will change to use Paddle Inference backend, and use its integrated TensorRT instead.
   */
  void EnablePaddleToTrt();

  /**
   * @brief Delete pass by name while using Paddle Inference as inference backend, this can be called multiple times to delete a set of passes
   */
  void DeletePaddleBackendPass(const std::string& delete_pass_name);

  /**
   * @brief Enable print debug information while using Paddle Inference as inference backend, the backend disables the debug information by default
   */
  void EnablePaddleLogInfo();

  /**
   * @brief Disable print debug information while using Paddle Inference as inference backend
   */
  void DisablePaddleLogInfo();

  /**
   * @brief Set shape cache size while using Paddle Inference with mkldnn, by default it will cache all the different shapes
   */
  void SetPaddleMKLDNNCacheSize(int size);

  /**
   * @brief Set device name for OpenVINO, default 'CPU', can also be 'AUTO', 'GPU', 'GPU.1'....
   */
  void SetOpenVINODevice(const std::string& name = "CPU");

  /**
   * @brief Set shape info for OpenVINO
   */
  void SetOpenVINOShapeInfo(
      const std::map<std::string, std::vector<int64_t>>& shape_info) {
    openvino_option.shape_infos = shape_info;
  }

  /**
   * @brief While using OpenVINO backend with intel GPU, use this interface to specify operators run on CPU
   */
  void SetOpenVINOCpuOperators(const std::vector<std::string>& operators) {
    openvino_option.SetCpuOperators(operators);
  }

  /**
   * @brief Set optimized model dir for Paddle Lite backend.
   */
  void SetLiteOptimizedModelDir(const std::string& optimized_model_dir);

  /**
   * @brief Set subgraph partition path for Paddle Lite backend.
   */
  void SetLiteSubgraphPartitionPath(
      const std::string& nnadapter_subgraph_partition_config_path);

  /**
   * @brief Set subgraph partition config buffer for Paddle Lite backend.
   */
  void SetLiteSubgraphPartitionConfigBuffer(
      const std::string& nnadapter_subgraph_partition_config_buffer);

  /**
   * @brief Set context properties for Paddle Lite backend.
   */
  void
  SetLiteContextProperties(const std::string& nnadapter_context_properties);

  /**
   * @brief Set model cache dir for Paddle Lite backend.
   */
  void SetLiteModelCacheDir(const std::string& nnadapter_model_cache_dir);

  /**
   * @brief Set dynamic shape info for Paddle Lite backend.
   */
  void SetLiteDynamicShapeInfo(
      const std::map<std::string, std::vector<std::vector<int64_t>>>&
          nnadapter_dynamic_shape_info);

  /**
   * @brief Set mixed precision quantization config path for Paddle Lite backend.
   */
  void SetLiteMixedPrecisionQuantizationConfigPath(
      const std::string& nnadapter_mixed_precision_quantization_config_path);

  /**
   * @brief enable half precision while using Paddle Lite backend
   */
  void EnableLiteFP16();

  /**
   * @brief disable half precision, change to full precision(float32)
   */
  void DisableLiteFP16();

  /**
   * @brief enable int8 precision while using Paddle Lite backend
   */
  void EnableLiteInt8();

  /**
   * @brief disable int8 precision, change to full precision(float32)
   */
  void DisableLiteInt8();

  /**
   * @brief Set power mode while using Paddle Lite as inference backend, mode(0: LITE_POWER_HIGH; 1: LITE_POWER_LOW; 2: LITE_POWER_FULL; 3: LITE_POWER_NO_BIND, 4: LITE_POWER_RAND_HIGH; 5: LITE_POWER_RAND_LOW, refer [paddle lite](https://paddle-lite.readthedocs.io/zh/latest/api_reference/cxx_api_doc.html#set-power-mode) for more details)
   */
  void SetLitePowerMode(LitePowerMode mode);

  /** \brief Set shape range of input tensor for the model that contains dynamic input shape while using TensorRT backend
   *
   * \param[in] input_name The name of input for the model which is dynamic shape
   * \param[in] min_shape The minimal shape for the input tensor
   * \param[in] opt_shape The optimized shape for the input tensor, just set the most common shape, if set as default value, it will keep same with min_shape
   * \param[in] max_shape The maximum shape for the input tensor, if set as default value, it will keep same with min_shape
   */
  void SetTrtInputShape(
      const std::string& input_name, const std::vector<int32_t>& min_shape,
      const std::vector<int32_t>& opt_shape = std::vector<int32_t>(),
      const std::vector<int32_t>& max_shape = std::vector<int32_t>());

  /// Set max_workspace_size for TensorRT, default 1<<30
  void SetTrtMaxWorkspaceSize(size_t trt_max_workspace_size);

  /// Set max_batch_size for TensorRT, default 32
  void SetTrtMaxBatchSize(size_t max_batch_size);

  /**
   * @brief Enable FP16 inference while using TensorRT backend. Notice: not all GPU devices support FP16; on devices that don't, FastDeploy will fall back to FP32 automatically
   */
  void EnableTrtFP16();

  /// Disable FP16 inference while using TensorRT backend
  void DisableTrtFP16();

  /**
   * @brief Set cache file path while using TensorRT backend. Loading a Paddle/ONNX model and initializing TensorRT takes a long time; through this interface the TensorRT engine is saved to `cache_file_path` and loaded directly on the next run
   */
  void SetTrtCacheFile(const std::string& cache_file_path);

  /**
   * @brief Enable pinned memory. Pinned memory can be utilized to speed up the data transfer between CPU and GPU. Currently it's only supported in the TRT backend and the Paddle Inference backend.
   */
  void EnablePinnedMemory();

  /**
   * @brief Disable pinned memory
   */
  void DisablePinnedMemory();

  /**
   * @brief Enable shape collection in the paddle trt backend
   */
  void EnablePaddleTrtCollectShape();

  /**
   * @brief Disable shape collection in the paddle trt backend
   */
  void DisablePaddleTrtCollectShape();

  /**
   * @brief Prevent ops from running in the paddle trt backend
   */
  void DisablePaddleTrtOPs(const std::vector<std::string>& ops);

  /*
   * @brief Set number of streams for the OpenVINO backend
   */
  void SetOpenVINOStreams(int num_streams);

  /** \brief Use Graphcore IPU to inference.
   *
   * \param[in] device_num the number of IPUs.
@@ -333,16 +135,18 @@ struct FASTDEPLOY_DECL RuntimeOption {
  void UseIpu(int device_num = 1, int micro_batch_size = 1,
              bool enable_pipelining = false, int batches_per_step = 1);

  /** \brief Set IPU config.
   *
   * \param[in] enable_fp16 enable fp16.
   * \param[in] replica_num the number of graph replication.
   * \param[in] available_memory_proportion the available memory proportion for matmul/conv.
   * \param[in] enable_half_partial enable fp16 partial for matmul, only works with fp16.
   */
  void SetIpuConfig(bool enable_fp16 = false, int replica_num = 1,
                    float available_memory_proportion = 1.0,
                    bool enable_half_partial = false);
  /// Option to configure ONNX Runtime backend
  OrtBackendOption ort_option;
  /// Option to configure TensorRT backend
  TrtBackendOption trt_option;
  /// Option to configure Paddle Inference backend
  PaddleBackendOption paddle_infer_option;
  /// Option to configure Poros backend
  PorosBackendOption poros_option;
  /// Option to configure OpenVINO backend
  OpenVINOBackendOption openvino_option;
  /// Option to configure Paddle Lite backend
  LiteBackendOption paddle_lite_option;

  /** \brief Set the profile mode as 'true'.
   *
@@ -364,46 +168,9 @@ struct FASTDEPLOY_DECL RuntimeOption {
    benchmark_option.enable_profile = false;
  }

  Backend backend = Backend::UNKNOWN;

  // for cpu inference
  // default will let the backend choose their own default value
  int cpu_thread_num = -1;
  int device_id = 0;

  Device device = Device::CPU;

  void* external_stream_ = nullptr;

  bool enable_pinned_memory = false;

  /// Option to configure ONNX Runtime backend
  OrtBackendOption ort_option;

  /// Option to configure TensorRT backend
  TrtBackendOption trt_option;

  /// Option to configure Paddle Inference backend
  PaddleBackendOption paddle_infer_option;

  // ======Only for PaddleTrt Backend=======
  std::vector<std::string> trt_disabled_ops_{};

  /// Option to configure Poros backend
  PorosBackendOption poros_option;

  /// Option to configure OpenVINO backend
  OpenVINOBackendOption openvino_option;

  // ======Only for RKNPU2 Backend=======
  fastdeploy::rknpu2::CpuName rknpu2_cpu_name_ =
      fastdeploy::rknpu2::CpuName::RK3588;
  fastdeploy::rknpu2::CoreMask rknpu2_core_mask_ =
      fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO;

  /// Option to configure Paddle Lite backend
  LiteBackendOption paddle_lite_option;
  /// Benchmark option
  benchmark::BenchmarkOption benchmark_option;

  // If model_from_memory is true, the model_file and params_file are
  // binary streams in memory;
@@ -414,8 +181,76 @@ struct FASTDEPLOY_DECL RuntimeOption {
  /// format of input model
  ModelFormat model_format = ModelFormat::PADDLE;

  /// Benchmark option
  benchmark::BenchmarkOption benchmark_option;
  // for cpu inference
  // default will let the backend choose their own default value
  int cpu_thread_num = -1;
  int device_id = 0;
  Backend backend = Backend::UNKNOWN;

  Device device = Device::CPU;

  void* external_stream_ = nullptr;

  bool enable_pinned_memory = false;

  // ======Only for RKNPU2 Backend=======
  fastdeploy::rknpu2::CpuName rknpu2_cpu_name_ =
      fastdeploy::rknpu2::CpuName::RK3588;
  fastdeploy::rknpu2::CoreMask rknpu2_core_mask_ =
      fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO;

  // *** The APIs below are deprecated and will be removed in v1.2.0
  // *** Do not use them anymore
  void SetPaddleMKLDNN(bool pd_mkldnn = true);
  void EnablePaddleToTrt();
  void DeletePaddleBackendPass(const std::string& delete_pass_name);
  void EnablePaddleLogInfo();
  void DisablePaddleLogInfo();
  void SetPaddleMKLDNNCacheSize(int size);
  void SetOpenVINODevice(const std::string& name = "CPU");
  void SetOpenVINOShapeInfo(
      const std::map<std::string, std::vector<int64_t>>& shape_info) {
    openvino_option.shape_infos = shape_info;
  }
  void SetOpenVINOCpuOperators(const std::vector<std::string>& operators) {
    openvino_option.SetCpuOperators(operators);
  }
  void SetLiteOptimizedModelDir(const std::string& optimized_model_dir);
  void SetLiteSubgraphPartitionPath(
      const std::string& nnadapter_subgraph_partition_config_path);
  void SetLiteSubgraphPartitionConfigBuffer(
      const std::string& nnadapter_subgraph_partition_config_buffer);
  void
  SetLiteContextProperties(const std::string& nnadapter_context_properties);
  void SetLiteModelCacheDir(const std::string& nnadapter_model_cache_dir);
  void SetLiteDynamicShapeInfo(
      const std::map<std::string, std::vector<std::vector<int64_t>>>&
          nnadapter_dynamic_shape_info);
  void SetLiteMixedPrecisionQuantizationConfigPath(
      const std::string& nnadapter_mixed_precision_quantization_config_path);
  void EnableLiteFP16();
  void DisableLiteFP16();
  void EnableLiteInt8();
  void DisableLiteInt8();
  void SetLitePowerMode(LitePowerMode mode);
  void SetTrtInputShape(
      const std::string& input_name, const std::vector<int32_t>& min_shape,
      const std::vector<int32_t>& opt_shape = std::vector<int32_t>(),
      const std::vector<int32_t>& max_shape = std::vector<int32_t>());
  void SetTrtMaxWorkspaceSize(size_t trt_max_workspace_size);
  void SetTrtMaxBatchSize(size_t max_batch_size);
  void EnableTrtFP16();
  void DisableTrtFP16();
  void SetTrtCacheFile(const std::string& cache_file_path);
  void EnablePinnedMemory();
  void DisablePinnedMemory();
  void EnablePaddleTrtCollectShape();
  void DisablePaddleTrtCollectShape();
  void DisablePaddleTrtOPs(const std::vector<std::string>& ops);
  void SetOpenVINOStreams(int num_streams);
  void SetOrtGraphOptLevel(int level = -1);
  void UsePaddleBackend();
  void UseLiteBackend();
};

}  // namespace fastdeploy
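A minimal sketch of the migration this diff encodes: the flat RuntimeOption setters are deprecated in favor of writing through the grouped per-backend option structs (fields shown are the ones referenced elsewhere in this diff; the umbrella header name is an assumption):

#include "fastdeploy/runtime.h"  // assumed umbrella header for RuntimeOption

int main() {
  fastdeploy::RuntimeOption option;
  option.UseGpu(0);
  option.UseTrtBackend();
  // Before (deprecated, removal planned for v1.2.0):
  //   option.SetTrtMaxWorkspaceSize(1 << 30);
  //   option.SetTrtCacheFile("models/trt_serialized.trt");
  // After: write through the grouped TensorRT option struct.
  option.trt_option.max_workspace_size = 1 << 30;
  option.trt_option.serialize_file = "models/trt_serialized.trt";
  return 0;
}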
@@ -56,7 +56,7 @@ class FASTDEPLOY_DECL RKYOLOPostprocessor {
  float GetNMSThreshold() const { return nms_threshold_; }

  /// Set height and width
  void SetHeightAndWeight(int height,int width) {
  void SetHeightAndWeight(int height, int width) {
    height_ = height;
    width_ = width;
  }
@@ -80,6 +80,10 @@ class FASTDEPLOY_DECL RKYOLOPostprocessor {
    obj_class_num_ = num;
    prob_box_size_ = obj_class_num_ + 5;
  }
  /// Get the number of classes
  int GetClassNum() {
    return obj_class_num_;
  }

 private:
  std::vector<int> anchors_ = {10, 13, 16, 30, 33, 23, 30, 61, 62,
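A one-line check of the relation SetClassNum encodes above: each predicted box stores 4 box coordinates plus 1 objectness score plus one score per class, hence prob_box_size = class_num + 5.

#include <cassert>

int main() {
  int class_num = 80;  // COCO default, per the Python docs in this diff
  int prob_box_size = class_num + 5;
  assert(prob_box_size == 85);  // 4 coords + 1 objectness + 80 class scores
  return 0;
}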
@@ -65,7 +65,9 @@ void BindRKYOLO(pybind11::module& m) {
      .def_property("conf_threshold", &vision::detection::RKYOLOPostprocessor::GetConfThreshold,
                    &vision::detection::RKYOLOPostprocessor::SetConfThreshold)
      .def_property("nms_threshold", &vision::detection::RKYOLOPostprocessor::GetNMSThreshold,
                    &vision::detection::RKYOLOPostprocessor::SetNMSThreshold);
                    &vision::detection::RKYOLOPostprocessor::SetNMSThreshold)
      .def_property("class_num", &vision::detection::RKYOLOPostprocessor::GetClassNum,
                    &vision::detection::RKYOLOPostprocessor::SetClassNum);

  pybind11::class_<vision::detection::RKYOLOV5, FastDeployModel>(m, "RKYOLOV5")
      .def(pybind11::init<std::string,
@@ -591,7 +591,8 @@ class RuntimeOption:
                       replica_num=1,
                       available_memory_proportion=1.0,
                       enable_half_partial=False):
        return self._option.set_ipu_config(enable_fp16, replica_num,
        logging.warning("`RuntimeOption.set_ipu_config` will be deprecated in v1.2.0, please use `RuntimeOption.paddle_infer_option.set_ipu_config()` instead.")
        self._option.paddle_infer_option.set_ipu_config(enable_fp16, replica_num,
                                                        available_memory_proportion,
                                                        enable_half_partial)
@@ -108,11 +108,11 @@ class RKYOLOPostprocessor:
        return self._postprocessor.nms_threshold

    @property
    def multi_label(self):
    def class_num(self):
        """
        multi_label for postprocessing, set true for eval, default is True
        class_num for postprocessing, default is 80
        """
        return self._postprocessor.multi_label
        return self._postprocessor.class_num

    @conf_threshold.setter
    def conf_threshold(self, conf_threshold):
@@ -126,13 +126,14 @@ class RKYOLOPostprocessor:
            "The value to set `nms_threshold` must be type of float."
        self._postprocessor.nms_threshold = nms_threshold

    @multi_label.setter
    def multi_label(self, value):
        assert isinstance(
            value,
            bool), "The value to set `multi_label` must be type of bool."
        self._postprocessor.multi_label = value

    @class_num.setter
    def class_num(self, class_num):
        """
        class_num for postprocessing, default is 80
        """
        assert isinstance(class_num, int), \
            "The value to set `class_num` must be type of int."
        self._postprocessor.class_num = class_num


class RKYOLOV5(FastDeployModel):
    def __init__(self,
118
scripts/android/build_android_cpp_with_benchmark.sh
Executable file
@@ -0,0 +1,118 @@
#!/bin/bash
set -e
set +x

# -------------------------------------------------------------------------------
# mutable global variables
# -------------------------------------------------------------------------------
TOOLCHAIN=clang  # gcc/clang toolchain

# -------------------------------------------------------------------------------
# readonly global variables
# -------------------------------------------------------------------------------
readonly ROOT_PATH=$(pwd)
readonly ANDROID_ABI=$1
readonly ANDROID_PLATFORM="android-$2"
readonly BUILD_ROOT=build/Android
readonly BUILD_DIR=${BUILD_ROOT}/${ANDROID_ABI}-api-$2

# -------------------------------------------------------------------------------
# tasks
# -------------------------------------------------------------------------------
__make_build_dir() {
  if [ ! -d "${BUILD_DIR}" ]; then
    echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} not exists, setup manually ..."
    if [ ! -d "${BUILD_ROOT}" ]; then
      mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !"
    fi
    mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !"
  else
    echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}"
  fi
}

__check_cxx_envs() {
  if [ $LDFLAGS ]; then
    echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c"
    echo "unset it before cross compiling ${ANDROID_ABI}"
    unset LDFLAGS
  fi
  if [ $CPPFLAGS ]; then
    echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c"
    echo "unset it before cross compiling ${ANDROID_ABI}"
    unset CPPFLAGS
  fi
  if [ $CPLUS_INCLUDE_PATH ]; then
    echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${ANDROID_ABI}"
    unset CPLUS_INCLUDE_PATH
  fi
  if [ $C_INCLUDE_PATH ]; then
    echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${ANDROID_ABI}"
    unset C_INCLUDE_PATH
  fi
}

__set_android_ndk() {
  if [ -z $ANDROID_NDK ]; then
    echo "-- [INFO] ANDROID_NDK not exists, please setup manually ..."
    exit 0
  else
    echo "-- [INFO] Found ANDROID_NDK: ${ANDROID_NDK}"
  fi
  if [ "$ANDROID_NDK" ]; then
    NDK_VERSION=$(echo $ANDROID_NDK | egrep -o "[0-9]{2}" | head -n 1)
    if [ "$NDK_VERSION" -gt 17 ]; then
      TOOLCHAIN=clang
    fi
    echo "-- [INFO] Checked ndk version: ${NDK_VERSION}"
    echo "-- [INFO] Selected toolchain: ${TOOLCHAIN}"
  fi
}

__build_fastdeploy_android_shared() {

  local ANDROID_STL=c++_shared  # c++_static
  local ANDROID_TOOLCHAIN=${TOOLCHAIN}
  local TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake
  local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install"
  cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}"

  cmake -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \
        -DCMAKE_BUILD_TYPE=MinSizeRel \
        -DANDROID_ABI=${ANDROID_ABI} \
        -DANDROID_NDK=${ANDROID_NDK} \
        -DANDROID_PLATFORM=${ANDROID_PLATFORM} \
        -DANDROID_STL=${ANDROID_STL} \
        -DANDROID_TOOLCHAIN=${ANDROID_TOOLCHAIN} \
        -DENABLE_ORT_BACKEND=OFF \
        -DENABLE_LITE_BACKEND=ON \
        -DENABLE_PADDLE2ONNX=OFF \
        -DENABLE_FLYCV=ON \
        -DENABLE_TEXT=OFF \
        -DENABLE_VISION=ON \
        -DBUILD_EXAMPLES=ON \
        -DENABLE_BENCHMARK=ON \
        -DWITH_OPENCV_STATIC=OFF \
        -DWITH_LITE_STATIC=OFF \
        -DWITH_OPENMP=OFF \
        -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \
        -Wno-dev ../../.. && make -j8 && make install

  echo "-- [INFO][built][${ANDROID_ABI}][${BUILD_DIR}/install]"
}

main() {
  __make_build_dir
  __check_cxx_envs
  __set_android_ndk
  __build_fastdeploy_android_shared
  exit 0
}

main

# Usage:
# ./scripts/android/build_android_cpp_with_benchmark.sh arm64-v8a 21
# ./scripts/android/build_android_cpp_with_benchmark.sh armeabi-v7a 21
61
scripts/android/bundle_android_cpp_with_text_api_only.sh
Executable file
@@ -0,0 +1,61 @@
#!/bin/bash
set -e
set +x

FASTDEPLOY_DIR=$(pwd)
BUILT_PACKAGE_DIR=build/Android
CXX_PACKAGE_PREFIX=fastdeploy-android-latest-shared-dev
CXX_PACKAGE_NAME=${BUILT_PACKAGE_DIR}/${CXX_PACKAGE_PREFIX}
ARMV8_CXX_PACKAGE_NAME=${BUILT_PACKAGE_DIR}/arm64-v8a-api-21/install
ARMV7_CXX_PACKAGE_NAME=${BUILT_PACKAGE_DIR}/armeabi-v7a-api-21/install

# check package name
echo "[INFO] --- FASTDEPLOY_DIR: ${FASTDEPLOY_DIR}"

# check arm v7 & v8 c++ sdk
if [ ! -d "${BUILT_PACKAGE_DIR}" ]; then
  echo "[ERROR] --- ${BUILT_PACKAGE_DIR} not exist, please build c++ sdk first!"
  exit 0
fi
if [ ! -d "${ARMV8_CXX_PACKAGE_NAME}" ]; then
  echo "[ERROR] --- ${ARMV8_CXX_PACKAGE_NAME} not exist, please build c++ sdk first!"
  exit 0
fi
if [ ! -d "${ARMV7_CXX_PACKAGE_NAME}" ]; then
  echo "[ERROR] --- ${ARMV7_CXX_PACKAGE_NAME} not exist, please build c++ sdk first!"
  exit 0
fi

# remove old package
echo "[INFO] --- Packing ${CXX_PACKAGE_NAME} package ..."
if [ -d "${CXX_PACKAGE_NAME}" ]; then
  rm -rf ${CXX_PACKAGE_NAME}
  echo "[INFO] --- Removed old ${CXX_PACKAGE_NAME} done !"
  if [ -f "${CXX_PACKAGE_NAME}.tgz" ]; then
    rm ${CXX_PACKAGE_NAME}.tgz
    echo "[INFO] --- Removed old ${CXX_PACKAGE_NAME}.tgz done !"
  fi
fi

# package latest c++ sdk
mkdir ${CXX_PACKAGE_NAME}
echo "[INFO] --- Collecting package contents ..."
cp -r ${ARMV7_CXX_PACKAGE_NAME}/* ${CXX_PACKAGE_NAME}/
cp -r ${ARMV8_CXX_PACKAGE_NAME}/* ${CXX_PACKAGE_NAME}/
if [ -d "${CXX_PACKAGE_NAME}/examples" ]; then
  rm -rf ${CXX_PACKAGE_NAME}/examples
fi
echo "[INFO] --- Removed examples files ..."
echo "[INFO] --- Removing static .a files: "
static_files=$(find ${CXX_PACKAGE_NAME}/third_libs/install/ -name "*.a")
if [ ${#static_files[@]} -gt 10 ]; then
  echo "${#static_files[@]}: ${static_files}"
  rm $(find ${CXX_PACKAGE_NAME}/third_libs/install/ -name "*.a")
fi
echo "[INFO] --- Taring ${CXX_PACKAGE_NAME}.tgz package ..."
tar -zcvf ${CXX_PACKAGE_NAME}.tgz ${CXX_PACKAGE_NAME}/* >> ${BUILT_PACKAGE_DIR}/pkg.log 2>&1
echo "[INFO] --- Package ${CXX_PACKAGE_NAME}.tgz done ! Package size info: "
du -sh ${BUILT_PACKAGE_DIR}/* | grep ${CXX_PACKAGE_PREFIX}

# Usage:
# ./scripts/android/bundle_android_cpp_with_text_api_only.sh
@@ -62,7 +62,7 @@ __build_fastdeploy_linux_x86_64_gpu_shared() {
        -DENABLE_OPENVINO_BACKEND=ON \
        -DENABLE_PADDLE2ONNX=ON \
        -DENABLE_VISION=ON \
        -DENABLE_BENCHMARK=ON \
        -DENABLE_BENCHMARK=OFF \
        -DBUILD_EXAMPLES=ON \
        -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \
        -Wno-dev ../../.. && make -j8 && make install
83
scripts/linux/build_linux_x86_64_cpp_gpu_with_benchmark.sh
Executable file
@@ -0,0 +1,83 @@
#!/bin/bash
set -e
set +x

# -------------------------------------------------------------------------------
# readonly global variables
# -------------------------------------------------------------------------------
readonly ROOT_PATH=$(pwd)
readonly BUILD_ROOT=build/Linux
readonly BUILD_DIR="${BUILD_ROOT}/x86_64_gpu"

# -------------------------------------------------------------------------------
# tasks
# -------------------------------------------------------------------------------
__make_build_dir() {
  if [ ! -d "${BUILD_DIR}" ]; then
    echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} not exists, setup manually ..."
    if [ ! -d "${BUILD_ROOT}" ]; then
      mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !"
    fi
    mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !"
  else
    echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}"
  fi
}

__check_cxx_envs() {
  if [ $LDFLAGS ]; then
    echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset LDFLAGS
  fi
  if [ $CPPFLAGS ]; then
    echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPPFLAGS
  fi
  if [ $CPLUS_INCLUDE_PATH ]; then
    echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPLUS_INCLUDE_PATH
  fi
  if [ $C_INCLUDE_PATH ]; then
    echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset C_INCLUDE_PATH
  fi
}

__build_fastdeploy_linux_x86_64_gpu_shared() {

  local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install"
  cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}"

  cmake -DCMAKE_BUILD_TYPE=Release \
        -DWITH_GPU=ON \
        -DTRT_DIRECTORY=${TRT_DIRECTORY} \
        -DCUDA_DIRECTORY=${CUDA_DIRECTORY} \
        -DENABLE_ORT_BACKEND=ON \
        -DENABLE_TRT_BACKEND=ON \
        -DENABLE_PADDLE_BACKEND=ON \
        -DENABLE_OPENVINO_BACKEND=ON \
        -DENABLE_PADDLE2ONNX=ON \
        -DENABLE_VISION=ON \
        -DENABLE_BENCHMARK=ON \
        -DBUILD_EXAMPLES=ON \
        -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \
        -Wno-dev ../../.. && make -j8 && make install

  echo "-- [INFO][built][x86_64_gpu][${BUILD_DIR}/install]"
}

main() {
  __make_build_dir
  __check_cxx_envs
  __build_fastdeploy_linux_x86_64_gpu_shared
  exit 0
}

main

# Usage:
# ./scripts/linux/build_linux_x86_64_cpp_gpu_with_benchmark.sh
79
scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh
Executable file
@@ -0,0 +1,79 @@
#!/bin/bash
set -e
set +x

# -------------------------------------------------------------------------------
# readonly global variables
# -------------------------------------------------------------------------------
readonly ROOT_PATH=$(pwd)
readonly BUILD_ROOT=build/Linux
readonly BUILD_DIR="${BUILD_ROOT}/x86_64_xpu"

# -------------------------------------------------------------------------------
# tasks
# -------------------------------------------------------------------------------
__make_build_dir() {
  if [ ! -d "${BUILD_DIR}" ]; then
    echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} not exists, setup manually ..."
    if [ ! -d "${BUILD_ROOT}" ]; then
      mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !"
    fi
    mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !"
  else
    echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}"
  fi
}

__check_cxx_envs() {
  if [ $LDFLAGS ]; then
    echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset LDFLAGS
  fi
  if [ $CPPFLAGS ]; then
    echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPPFLAGS
  fi
  if [ $CPLUS_INCLUDE_PATH ]; then
    echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPLUS_INCLUDE_PATH
  fi
  if [ $C_INCLUDE_PATH ]; then
    echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset C_INCLUDE_PATH
  fi
}

__build_fastdeploy_linux_x86_64_xpu_shared() {

  local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install"
  cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}"

  cmake -DWITH_KUNLUNXIN=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DWITH_GPU=OFF \
        -DENABLE_ORT_BACKEND=ON \
        -DENABLE_PADDLE_BACKEND=ON \
        -DENABLE_VISION=ON \
        -DENABLE_BENCHMARK=ON \
        -DBUILD_EXAMPLES=OFF \
        -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \
        -Wno-dev ../../.. && make -j8 && make install

  echo "-- [INFO][built][x86_64_xpu][${BUILD_DIR}/install]"
}

main() {
  __make_build_dir
  __check_cxx_envs
  __build_fastdeploy_linux_x86_64_xpu_shared
  exit 0
}

main

# Usage:
# ./scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh
2
tests/acc_eval/classification/run.sh
Normal file → Executable file
@@ -4,5 +4,5 @@ model_dir=`ls ./models/`

for MODEL_NAME in $model_dir
do
python infer.py --model ./models/$MODEL_NAME --image None --device $TARGET_DEVICE 2>&1 | tee ./log/${MODEL_NAME}_acc.log
python eval.py --model ./models/$MODEL_NAME --image None --device $TARGET_DEVICE 2>&1 | tee ./log/${MODEL_NAME}_acc.log
done
@@ -52,8 +52,8 @@ model = fd.vision.detection.YOLOv5(
    runtime_option=runtime_option,
    model_format=fd.ModelFormat.PADDLE)

image_file_path = "/xieyunyao/Project/coco/val2017"
annotation_file_path = "/xieyunyao/Project/coco/annotations/instances_val2017.json"
image_file_path = "../dataset/coco/val2017"
annotation_file_path = "../dataset/coco/annotations/instances_val2017.json"

res = fd.vision.evaluation.eval_detection(model, image_file_path,
                                          annotation_file_path, 0.001, 0.65)
@@ -52,8 +52,8 @@ model = fd.vision.detection.YOLOv6(
    runtime_option=runtime_option,
    model_format=fd.ModelFormat.PADDLE)

image_file_path = "/xieyunyao/Project/coco/val2017"
annotation_file_path = "/xieyunyao/Project/coco/annotations/instances_val2017.json"
image_file_path = "../dataset/coco/val2017"
annotation_file_path = "../dataset/coco/annotations/instances_val2017.json"

res = fd.vision.evaluation.eval_detection(model, image_file_path,
                                          annotation_file_path, 0.001, 0.65)
@@ -52,8 +52,8 @@ model = fd.vision.detection.YOLOv6(
    runtime_option=runtime_option,
    model_format=fd.ModelFormat.PADDLE)

image_file_path = "/xieyunyao/Project/coco/val2017"
annotation_file_path = "/xieyunyao/Project/coco/annotations/instances_val2017.json"
image_file_path = "../dataset/coco/val2017"
annotation_file_path = "../dataset/coco/annotations/instances_val2017.json"

res = fd.vision.evaluation.eval_detection(model, image_file_path,
                                          annotation_file_path, 0.001, 0.65)
6
tests/acc_eval/detection/run.sh
Normal file → Executable file
@@ -12,6 +12,6 @@ python eval_yolov3.py --model_dir ./models/yolov3_darknet53_270e_coco --image
python eval_yolox.py --model_dir ./models/yolox_s_300e_coco --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolox_s_300e_coco.log
python eval_faster_rcnn.py --model_dir ./models/faster_rcnn_r50_vd_fpn_2x_coco --image None --device $TARGET_DEVICE 2>&1 | tee ./log/faster_rcnn_r50_vd_fpn_2x_coco.log
python eval_mask_rcnn.py --model_dir ./models/mask_rcnn_r50_1x_coco --image None --device $TARGET_DEVICE 2>&1 | tee ./log/mask_rcnn_r50_1x_coco.log
python eval_yolov5.py --model_dir ./models/yolov5s_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov5s_infer.log
python eval_yolov6.py --model_dir ./models/yolov6s_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov6s_infer.log
python eval_yolov5.py --model_dir ./models/yolov7_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov7_infer.log
python eval_yolov5.py --model ./models/yolov5s_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov5s_infer.log
python eval_yolov6.py --model ./models/yolov6s_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov6s_infer.log
python eval_yolov7.py --model ./models/yolov7_infer --image None --device $TARGET_DEVICE 2>&1 | tee ./log/yolov7_infer.log
2
tests/acc_eval/ppocr/eval_ppocrv2.py
Normal file → Executable file
@@ -103,7 +103,7 @@ rec_model = fd.vision.ocr.Recognizer(
    runtime_option=runtime_option)

# Enable static-shape inference for the PPOCR Rec model; comment this out if your hardware does not need it.
rec_model.preprocessor.static_shape = True
rec_model.preprocessor.static_shape_infer = True

# Create PP-OCR by chaining the 3 models; cls_model is optional and can be set to None if not needed.
ppocr_v2 = fd.vision.ocr.PPOCRv2(
2
tests/acc_eval/ppocr/eval_ppocrv3.py
Normal file → Executable file
@@ -103,7 +103,7 @@ rec_model = fd.vision.ocr.Recognizer(
    runtime_option=runtime_option)

# Enable static-shape inference for the PPOCR Rec model; comment this out if your hardware does not need it.
rec_model.preprocessor.static_shape = True
rec_model.preprocessor.static_shape_infer = True

# Create PP-OCR by chaining the 3 models; cls_model is optional and can be set to None if not needed.
ppocr_v3 = fd.vision.ocr.PPOCRv3(
2
tests/acc_eval/segmentation/eval.py
Normal file → Executable file
@@ -54,5 +54,5 @@ model = fd.vision.segmentation.PaddleSegModel(
    model_file, params_file, config_file, runtime_option=runtime_option)

res = fd.vision.evaluation.eval_segmentation(
    model=model, data_dir="../dataset/FD_dataset/data/cityscapes")
    model=model, data_dir="../dataset/cityscapes")
print(res)