Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 16:48:03 +08:00
[Backend] support benchmark mode for runtime and backend (#1201)

* [backend] support benchmark mode for runtime and backend
* [pybind11] add benchmark methods pybind
* [Other] update build scripts and cmake/summary.cmake
* [Other] add ENABLE_BENCHMARK option -> setup.py
* optimize backend time recording
* [backend] optimize backend_time recording for trt
* [benchmark] remove redundant logs
* [benchmark] fixed ov_backend conflict
* [benchmark] fixed paddle_backend conflicts
* [benchmark] remove use_gpu option from ort backend option
* [benchmark] update benchmark_ppdet.py and benchmark_ppcls.py
* fixed lite backend conflicts
* [Lite] fixed lite xpu
* add RUNTIME_PROFILE_LOOP macros
* add comments for RUNTIME_PROFILE macros and the new APIs
* fixed bugs and removed unused codes
* optimize RUNTIME_PROFILE_LOOP macros
* add comments for benchmark option and result
* add docs for benchmark namespace
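What the changes below add, shown as a minimal end-to-end Python sketch (assumes a build with ENABLE_BENCHMARK=ON; the model files, input name, and input shape are placeholders, not taken from this PR):

import numpy as np
import fastdeploy as fd

option = fd.RuntimeOption()
option.set_model_path("model.pdmodel", "model.pdiparams")  # placeholder paths
# Profile the pure backend time: 50 warmup runs, then 100 timed repeats;
# the first argument (False) excludes host<->device copies from the timing.
option.enable_profiling(False, 100, 50)

runtime = fd.Runtime(option)
# One infer() call is enough: with profiling enabled, the backend loops
# warmup + repeat times internally around the engine call.
outputs = runtime.infer({"x": np.zeros((1, 3, 224, 224), dtype=np.float32)})
print("Avg runtime (ms):", runtime.get_profile_time() * 1000)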
@@ -68,6 +68,7 @@ option(ENABLE_TEXT "Whether to enable text models usage." OFF)
 option(ENABLE_FLYCV "Whether to enable flycv to boost image preprocess." OFF)
 option(ENABLE_CVCUDA "Whether to enable NVIDIA CV-CUDA to boost image preprocess." OFF)
 option(ENABLE_ENCRYPTION "Whether to enable ENCRYPTION." OFF)
+option(ENABLE_BENCHMARK "Whether to enable Benchmark mode." OFF)
 option(WITH_ASCEND "Whether to compile for Huawei Ascend deploy." OFF)
 option(WITH_TIMVX "Whether to compile for TIMVX deploy." OFF)
 option(WITH_KUNLUNXIN "Whether to compile for KunlunXin XPU deploy." OFF)
benchmark/.gitignore (new file, vendored, 13 lines)
@@ -0,0 +1,13 @@
+*.tgz
+*.zip
+*.tar
+*.tar.gz
+*.tgz
+*.jpg
+*.png
+*.jpeg
+*.txt
+*.log
+yolov8_s_*
+._yolov8_s_*
+Mobile*
@@ -17,7 +17,7 @@ import cv2
 import os
 import numpy as np
 import time
+from tqdm import tqdm

 def parse_arguments():
     import argparse
@@ -35,11 +35,22 @@ def parse_arguments():
     parser.add_argument(
         "--device_id", type=int, default=0, help="device(gpu) id")
     parser.add_argument(
-        "--iter_num",
+        "--profile_mode",
+        type=str,
+        default="runtime",
+        help="runtime or end2end.")
+    parser.add_argument(
+        "--repeat",
         required=True,
         type=int,
-        default=300,
-        help="number of iterations for computing performace.")
+        default=1000,
+        help="number of repeats for profiling.")
+    parser.add_argument(
+        "--warmup",
+        required=True,
+        type=int,
+        default=50,
+        help="number of warmup for profiling.")
     parser.add_argument(
         "--device",
         default="cpu",
@@ -59,6 +70,11 @@ def parse_arguments():
         type=ast.literal_eval,
         default=False,
         help="whether enable collect memory info")
+    parser.add_argument(
+        "--include_h2d_d2h",
+        type=ast.literal_eval,
+        default=False,
+        help="whether run profiling with h2d and d2h")
     args = parser.parse_args()
     return args

@@ -68,6 +84,8 @@ def build_option(args):
     device = args.device
     backend = args.backend
     enable_trt_fp16 = args.enable_trt_fp16
+    if args.profile_mode == "runtime":
+        option.enable_profiling(args.include_h2d_d2h, args.repeat, args.warmup)
     option.set_cpu_thread_num(args.cpu_num_thread)
     if device == "gpu":
         option.use_gpu()
@@ -229,7 +247,6 @@ if __name__ == '__main__':
    gpu_id = args.device_id
    enable_collect_memory_info = args.enable_collect_memory_info
    dump_result = dict()
-    end2end_statis = list()
    cpu_mem = list()
    gpu_mem = list()
    gpu_util = list()
@@ -258,18 +275,26 @@ if __name__ == '__main__':
            monitor = Monitor(enable_gpu, gpu_id)
            monitor.start()

-        model.enable_record_time_of_runtime()
        im_ori = cv2.imread(args.image)
-        for i in range(args.iter_num):
-            im = im_ori
+        if args.profile_mode == "runtime":
+            result = model.predict(im_ori)
+            profile_time = model.get_profile_time()
+            dump_result["runtime"] = profile_time * 1000
+            f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
+            print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
+        else:
+            # end2end
+            for i in range(args.warmup):
+                result = model.predict(im_ori)
+
            start = time.time()
-            result = model.predict(im)
-            end2end_statis.append(time.time() - start)
+            for i in tqdm(range(args.repeat)):
+                result = model.predict(im_ori)
+            end = time.time()
+            dump_result["end2end"] = ((end - start) / args.repeat) * 1000.0
+            f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
+            print("End2End(ms): {} \n".format(str(dump_result["end2end"])))

-        runtime_statis = model.print_statis_info_of_runtime()
-
-        warmup_iter = args.iter_num // 5
-        end2end_statis_repeat = end2end_statis[warmup_iter:]
        if enable_collect_memory_info:
            monitor.stop()
            mem_info = monitor.output()
@@ -280,13 +305,6 @@ if __name__ == '__main__':
            dump_result["gpu_util"] = mem_info['gpu'][
                'utilization.gpu'] if 'gpu' in mem_info else 0

-        dump_result["runtime"] = runtime_statis["avg_time"] * 1000
-        dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
-
-        f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
-        f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
-        print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
-        print("End2End(ms): {} \n".format(str(dump_result["end2end"])))
        if enable_collect_memory_info:
            f.writelines("cpu_rss_mb: {} \n".format(
                str(dump_result["cpu_rss_mb"])))
@@ -297,7 +315,8 @@ if __name__ == '__main__':
            print("cpu_rss_mb: {} \n".format(str(dump_result["cpu_rss_mb"])))
            print("gpu_rss_mb: {} \n".format(str(dump_result["gpu_rss_mb"])))
            print("gpu_util: {} \n".format(str(dump_result["gpu_util"])))
-    except:
+    except Exception as e:
        f.writelines("!!!!!Infer Failed\n")
+        raise e

    f.close()
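The script now has two mutually exclusive modes: --profile_mode runtime pushes the measurement loop down into the C++ Runtime, while end2end keeps the old Python-side wall-clock timing around predict(). Distilled to its core, runtime mode works roughly like this (a sketch; the detection model class and its files are placeholders, and note that profiling must be configured before the model is built):

import cv2
import fastdeploy as fd

im = cv2.imread("test.jpg")  # placeholder image
option = fd.RuntimeOption()
option.enable_profiling(False, repeat=1000, warmup=50)
model = fd.vision.detection.PPYOLOE("model.pdmodel", "model.pdiparams",
                                    "infer_cfg.yml", runtime_option=option)
result = model.predict(im)  # internally runs warmup + repeat iterations
print("Runtime(ms):", model.get_profile_time() * 1000)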
@@ -17,6 +17,7 @@ import cv2
 import os
 import numpy as np
 import time
+from sympy import EX
 from tqdm import tqdm

 def parse_arguments():
@@ -24,7 +25,7 @@ def parse_arguments():
     import ast
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--model", required=True, help="Path of PaddleDetection model.")
+        "--model", required=True, help="Path of PaddleClas model.")
     parser.add_argument(
         "--image", type=str, required=False, help="Path of test image file.")
     parser.add_argument(
@@ -35,20 +36,31 @@ def parse_arguments():
     parser.add_argument(
         "--device_id", type=int, default=0, help="device(gpu) id")
     parser.add_argument(
-        "--iter_num",
+        "--profile_mode",
+        type=str,
+        default="runtime",
+        help="runtime or end2end.")
+    parser.add_argument(
+        "--repeat",
         required=True,
         type=int,
-        default=300,
-        help="number of iterations for computing performace.")
+        default=1000,
+        help="number of repeats for profiling.")
+    parser.add_argument(
+        "--warmup",
+        required=True,
+        type=int,
+        default=50,
+        help="number of warmup for profiling.")
     parser.add_argument(
         "--device",
         default="cpu",
-        help="Type of inference device, support 'cpu', 'gpu', 'kunlunxin', 'ascend' etc.")
+        help="Type of inference device, support 'cpu' or 'gpu'.")
     parser.add_argument(
         "--backend",
         type=str,
         default="default",
-        help="inference backend, default, ort, ov, trt, paddle, paddle_trt, lite.")
+        help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
     parser.add_argument(
         "--enable_trt_fp16",
         type=ast.literal_eval,
@@ -58,12 +70,17 @@ def parse_arguments():
         "--enable_lite_fp16",
         type=ast.literal_eval,
         default=False,
-        help="whether enable fp16 in lite backend")
+        help="whether enable fp16 in Paddle Lite backend")
     parser.add_argument(
         "--enable_collect_memory_info",
         type=ast.literal_eval,
         default=False,
         help="whether enable collect memory info")
+    parser.add_argument(
+        "--include_h2d_d2h",
+        type=ast.literal_eval,
+        default=False,
+        help="whether run profiling with h2d and d2h")
     args = parser.parse_args()
     return args

@@ -74,6 +91,8 @@ def build_option(args):
     backend = args.backend
     enable_trt_fp16 = args.enable_trt_fp16
     enable_lite_fp16 = args.enable_lite_fp16
+    if args.profile_mode == "runtime":
+        option.enable_profiling(args.include_h2d_d2h, args.repeat, args.warmup)
     option.set_cpu_thread_num(args.cpu_num_thread)
     if device == "gpu":
         option.use_gpu()
@@ -266,8 +285,12 @@ if __name__ == '__main__':

    gpu_id = args.device_id
    enable_collect_memory_info = args.enable_collect_memory_info
+    enable_record_time_of_backend = args.enable_record_time_of_backend
+    backend_repeat = args.backend_repeat
    dump_result = dict()
    end2end_statis = list()
+    prepost_statis = list()
+    h2d_d2h_statis = list()
    cpu_mem = list()
    gpu_mem = list()
    gpu_util = list()
@@ -317,18 +340,26 @@ if __name__ == '__main__':
            monitor = Monitor(enable_gpu, gpu_id)
            monitor.start()

-        model.enable_record_time_of_runtime()
        im_ori = cv2.imread(args.image)
-        for i in tqdm(range(args.iter_num)):
-            im = im_ori
+        if args.profile_mode == "runtime":
+            result = model.predict(im_ori)
+            profile_time = model.get_profile_time()
+            dump_result["runtime"] = profile_time * 1000
+            f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
+            print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
+        else:
+            # end2end
+            for i in range(args.warmup):
+                result = model.predict(im_ori)
+
            start = time.time()
-            result = model.predict(im)
-            end2end_statis.append(time.time() - start)
+            for i in tqdm(range(args.repeat)):
+                result = model.predict(im_ori)
+            end = time.time()
+            dump_result["end2end"] = ((end - start) / args.repeat) * 1000.0
+            f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
+            print("End2End(ms): {} \n".format(str(dump_result["end2end"])))

-        runtime_statis = model.print_statis_info_of_runtime()
-
-        warmup_iter = args.iter_num // 5
-        end2end_statis_repeat = end2end_statis[warmup_iter:]
        if enable_collect_memory_info:
            monitor.stop()
            mem_info = monitor.output()
@@ -339,13 +370,6 @@ if __name__ == '__main__':
            dump_result["gpu_util"] = mem_info['gpu'][
                'utilization.gpu'] if 'gpu' in mem_info else 0

-        dump_result["runtime"] = runtime_statis["avg_time"] * 1000
-        dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
-
-        f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
-        f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
-        print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
-        print("End2End(ms): {} \n".format(str(dump_result["end2end"])))
        if enable_collect_memory_info:
            f.writelines("cpu_rss_mb: {} \n".format(
                str(dump_result["cpu_rss_mb"])))
@@ -356,7 +380,8 @@ if __name__ == '__main__':
            print("cpu_rss_mb: {} \n".format(str(dump_result["cpu_rss_mb"])))
            print("gpu_rss_mb: {} \n".format(str(dump_result["gpu_rss_mb"])))
            print("gpu_util: {} \n".format(str(dump_result["gpu_util"])))
-    except:
+    except Exception as e:
        f.writelines("!!!!!Infer Failed\n")
+        raise e

    f.close()
@@ -39,6 +39,7 @@ function(fastdeploy_summary)
  message(STATUS "  ENABLE_POROS_BACKEND : ${ENABLE_POROS_BACKEND}")
  message(STATUS "  ENABLE_TRT_BACKEND : ${ENABLE_TRT_BACKEND}")
  message(STATUS "  ENABLE_OPENVINO_BACKEND : ${ENABLE_OPENVINO_BACKEND}")
+  message(STATUS "  ENABLE_BENCHMARK : ${ENABLE_BENCHMARK}")
  message(STATUS "  WITH_GPU : ${WITH_GPU}")
  message(STATUS "  WITH_ASCEND : ${WITH_ASCEND}")
  message(STATUS "  WITH_TIMVX : ${WITH_TIMVX}")
fastdeploy/benchmark/benchmark.h (new file, 86 lines)
@@ -0,0 +1,86 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "fastdeploy/core/config.h"
+#include "fastdeploy/utils/utils.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/benchmark/option.h"
+#include "fastdeploy/benchmark/results.h"
+
+#ifdef ENABLE_BENCHMARK
+#define __RUNTIME_PROFILE_LOOP_BEGIN(option, base_loop)       \
+  int __p_loop = (base_loop);                                 \
+  const bool __p_enable_profile = option.enable_profile;      \
+  const bool __p_include_h2d_d2h = option.include_h2d_d2h;    \
+  const int __p_repeats = option.repeats;                     \
+  const int __p_warmup = option.warmup;                       \
+  if (__p_enable_profile && (!__p_include_h2d_d2h)) {         \
+    __p_loop = (__p_repeats) + (__p_warmup);                  \
+    FDINFO << option << std::endl;                            \
+  }                                                           \
+  TimeCounter __p_tc;                                         \
+  bool __p_tc_start = false;                                  \
+  for (int __p_i = 0; __p_i < __p_loop; ++__p_i) {            \
+    if (__p_i >= (__p_warmup) && (!__p_tc_start)) {           \
+      __p_tc.Start();                                         \
+      __p_tc_start = true;                                    \
+    }                                                         \
+
+#define __RUNTIME_PROFILE_LOOP_END(result)                    \
+  }                                                           \
+  if ((__p_enable_profile && (!__p_include_h2d_d2h))) {       \
+    if (__p_tc_start) {                                       \
+      __p_tc.End();                                           \
+      double __p_tc_duration = __p_tc.Duration();             \
+      result.time_of_runtime =                                \
+          __p_tc_duration / static_cast<double>(__p_repeats); \
+    }                                                         \
+  }
+
+#define __RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN(option, base_loop)  \
+  int __p_loop_h = (base_loop);                                  \
+  const bool __p_enable_profile_h = option.enable_profile;       \
+  const bool __p_include_h2d_d2h_h = option.include_h2d_d2h;     \
+  const int __p_repeats_h = option.repeats;                      \
+  const int __p_warmup_h = option.warmup;                        \
+  if (__p_enable_profile_h && __p_include_h2d_d2h_h) {           \
+    __p_loop_h = (__p_repeats_h) + (__p_warmup_h);               \
+    FDINFO << option << std::endl;                               \
+  }                                                              \
+  TimeCounter __p_tc_h;                                          \
+  bool __p_tc_start_h = false;                                   \
+  for (int __p_i_h = 0; __p_i_h < __p_loop_h; ++__p_i_h) {       \
+    if (__p_i_h >= (__p_warmup_h) && (!__p_tc_start_h)) {        \
+      __p_tc_h.Start();                                          \
+      __p_tc_start_h = true;                                     \
+    }                                                            \
+
+#define __RUNTIME_PROFILE_LOOP_H2D_D2H_END(result)                 \
+  }                                                                \
+  if ((__p_enable_profile_h && __p_include_h2d_d2h_h)) {           \
+    if (__p_tc_start_h) {                                          \
+      __p_tc_h.End();                                              \
+      double __p_tc_duration_h = __p_tc_h.Duration();              \
+      result.time_of_runtime =                                     \
+          __p_tc_duration_h / static_cast<double>(__p_repeats_h);  \
+    }                                                              \
+  }
+#else
+#define __RUNTIME_PROFILE_LOOP_BEGIN(option, base_loop) \
+  for (int __p_i = 0; __p_i < (base_loop); ++ __p_i) {
+#define __RUNTIME_PROFILE_LOOP_END(result) }
+#define __RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN(option, base_loop) \
+  for (int __p_i_h = 0; __p_i_h < (base_loop); ++ __p_i_h) {
+#define __RUNTIME_PROFILE_LOOP_H2D_D2H_END(result) }
+#endif
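What the BEGIN/END pair measures, restated as a runnable Python sketch (the function name is mine, not from the header): the loop runs warmup + repeats iterations, the timer starts at the first post-warmup iteration, and the result is the duration averaged over the timed repeats.

import time

def profiled_loop(run_once, repeats=100, warmup=50):
    for _ in range(warmup):        # untimed warmup iterations
        run_once()
    start = time.perf_counter()    # TimeCounter.Start() equivalent
    for _ in range(repeats):       # timed iterations
        run_once()
    # equivalent of result.time_of_runtime: average seconds per iteration
    return (time.perf_counter() - start) / repeats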
fastdeploy/benchmark/option.h (new file, 47 lines)
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+namespace fastdeploy {
+
+/** \brief All C++ FastDeploy benchmark profile APIs are defined inside this namespace
+ *
+ */
+namespace benchmark {
+
+/*! @brief Option object used to control the behavior of the benchmark profiling.
+ */
+struct BenchmarkOption {
+  int warmup = 50;  ///< Warmup for backend inference.
+  int repeats = 100;  ///< Repeats for backend inference.
+  bool enable_profile = false;  ///< Whether to use profile or not.
+  bool include_h2d_d2h = false;  ///< Whether to include time of H2D_D2H for time of runtime.
+
+  friend std::ostream& operator<<(
+      std::ostream& output, const BenchmarkOption &option) {
+    if (!option.include_h2d_d2h) {
+      output << "Running profiling for Runtime "
+             << "without H2D and D2H, ";
+    } else {
+      output << "Running profiling for Runtime "
+             << "with H2D and D2H, ";
+    }
+    output << "Repeats: " << option.repeats << ", "
+           << "Warmup: " << option.warmup;
+    return output;
+  }
+};
+
+}  // namespace benchmark
+}  // namespace fastdeploy
fastdeploy/benchmark/results.h (new file, 27 lines)
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+namespace fastdeploy {
+namespace benchmark {
+
+/*! @brief Result object used to record the time of runtime after benchmark profiling is done.
+ */
+struct BenchmarkResult {
+  ///< Means pure_backend_time+time_of_h2d_d2h(if include_h2d_d2h=true).
+  double time_of_runtime = 0.0f;
+};
+
+}  // namespace benchmark
+}  // namespace fastdeploy
@@ -56,3 +56,7 @@
 #ifndef ENABLE_TEXT
 #cmakedefine ENABLE_TEXT
 #endif
+
+#ifndef ENABLE_BENCHMARK
+#cmakedefine ENABLE_BENCHMARK
+#endif
@@ -31,7 +31,8 @@ std::string Str(const std::vector<Backend>& backends) {
  return oss.str();
 }

-bool IsSupported(const std::vector<Backend>& backends, Backend backend) {
+bool CheckBackendSupported(const std::vector<Backend>& backends,
+                           Backend backend) {
  for (size_t i = 0; i < backends.size(); ++i) {
    if (backends[i] == backend) {
      return true;
@@ -40,6 +41,22 @@ bool IsSupported(const std::vector<Backend>& backends, Backend backend) {
  return false;
 }

+bool FastDeployModel::IsSupported(const std::vector<Backend>& backends,
+                                  Backend backend) {
+#ifdef ENABLE_BENCHMARK
+  if (runtime_option.benchmark_option.enable_profile) {
+    FDWARNING << "In benchmark mode, we don't check to see if "
+              << "the backend [" << backend
+              << "] is supported for current model!"
+              << std::endl;
+    return true;
+  }
+  return CheckBackendSupported(backends, backend);
+#else
+  return CheckBackendSupported(backends, backend);
+#endif
+}
+
 bool FastDeployModel::InitRuntimeWithSpecifiedBackend() {
  if (!IsBackendAvailable(runtime_option.backend)) {
    FDERROR << runtime_option.backend
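The practical effect of this bypass: in an ENABLE_BENCHMARK build with profiling switched on, a backend outside the model's valid_*_backends whitelist is accepted with a warning instead of being rejected, so any backend can be benchmarked against any model. A hedged Python sketch:

import fastdeploy as fd

option = fd.RuntimeOption()
option.enable_profiling(False, 100, 50)
# Force a specific backend; with profiling enabled only a warning is logged
# even if this backend is not whitelisted for the chosen model.
option.use_ort_backend()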
@@ -373,6 +390,7 @@ bool FastDeployModel::Infer(std::vector<FDTensor>& input_tensors,
    }
    time_of_runtime_.push_back(tc.Duration());
  }
+
  return ret;
 }

@@ -416,6 +434,7 @@ std::map<std::string, float> FastDeployModel::PrintStatisInfoOfRuntime() {
  statis_info_of_runtime_dict["warmup_iter"] = warmup_iter;
  statis_info_of_runtime_dict["avg_time"] = avg_time;
  statis_info_of_runtime_dict["iterations"] = time_of_runtime_.size();
+
  return statis_info_of_runtime_dict;
 }
 }  // namespace fastdeploy
@@ -75,7 +75,7 @@ class FASTDEPLOY_DECL FastDeployModel {
    return runtime_initialized_ && initialized;
  }

-  /** \brief This is a debug interface, used to record the time of backend runtime
+  /** \brief This is a debug interface, used to record the time of runtime (backend + h2d + d2h)
   *
   * example code @code
   * auto model = fastdeploy::vision::PPYOLOE("model.pdmodel", "model.pdiparams", "infer_cfg.yml");
@@ -98,7 +98,7 @@ class FASTDEPLOY_DECL FastDeployModel {
    enable_record_time_of_runtime_ = true;
  }

-  /** \brief Disable to record the time of backend runtime, see `EnableRecordTimeOfRuntime()` for more detail
+  /** \brief Disable to record the time of runtime, see `EnableRecordTimeOfRuntime()` for more detail
   */
  virtual void DisableRecordTimeOfRuntime() {
    enable_record_time_of_runtime_ = false;
@@ -113,6 +113,11 @@ class FASTDEPLOY_DECL FastDeployModel {
  virtual bool EnabledRecordTimeOfRuntime() {
    return enable_record_time_of_runtime_;
  }
+  /** \brief Get profile time of Runtime after the profile process is done.
+   */
+  virtual double GetProfileTime() {
+    return runtime_->GetProfileTime();
+  }

  /** \brief Release reused input/output buffers
   */
@@ -153,13 +158,13 @@ class FASTDEPLOY_DECL FastDeployModel {
  bool CreateTimVXBackend();
  bool CreateKunlunXinBackend();
  bool CreateASCENDBackend();
+  bool IsSupported(const std::vector<Backend>& backends,
+                   Backend backend);

  std::shared_ptr<Runtime> runtime_;
  bool runtime_initialized_ = false;
  // whether to record inference time
  bool enable_record_time_of_runtime_ = false;

-  // record inference time for backend
  std::vector<double> time_of_runtime_;
 };

@@ -30,6 +30,8 @@ void BindFDModel(pybind11::module& m) {
           &FastDeployModel::DisableRecordTimeOfRuntime)
      .def("print_statis_info_of_runtime",
           &FastDeployModel::PrintStatisInfoOfRuntime)
+      .def("get_profile_time",
+           &FastDeployModel::GetProfileTime)
      .def("initialized", &FastDeployModel::Initialized)
      .def_readwrite("runtime_option", &FastDeployModel::runtime_option)
      .def_readwrite("valid_cpu_backends", &FastDeployModel::valid_cpu_backends)

@@ -77,6 +77,8 @@ void BindRuntime(pybind11::module& m) {
      .def("set_ipu_config", &RuntimeOption::SetIpuConfig)
      .def("delete_paddle_backend_pass",
           &RuntimeOption::DeletePaddleBackendPass)
+      .def("enable_profiling", &RuntimeOption::EnableProfiling)
+      .def("disable_profiling", &RuntimeOption::DisableProfiling)
      .def("disable_paddle_trt_ops", &RuntimeOption::DisablePaddleTrtOPs)
      .def_readwrite("model_file", &RuntimeOption::model_file)
      .def_readwrite("params_file", &RuntimeOption::params_file)
@@ -217,6 +219,7 @@ void BindRuntime(pybind11::module& m) {
      .def("num_outputs", &Runtime::NumOutputs)
      .def("get_input_info", &Runtime::GetInputInfo)
      .def("get_output_info", &Runtime::GetOutputInfo)
+      .def("get_profile_time", &Runtime::GetProfileTime)
      .def_readonly("option", &Runtime::option);

  pybind11::enum_<Backend>(m, "Backend", pybind11::arithmetic(),
@@ -22,6 +22,7 @@
 #include "fastdeploy/core/fd_tensor.h"
 #include "fastdeploy/core/fd_type.h"
 #include "fastdeploy/runtime/runtime_option.h"
+#include "fastdeploy/benchmark/benchmark.h"

 namespace fastdeploy {

@@ -79,7 +80,6 @@ class BaseBackend {
  virtual bool Infer(std::vector<FDTensor>& inputs,
                     std::vector<FDTensor>* outputs,
                     bool copy_to_fd = true) = 0;
-
  // Optional: For those backends which can share memory
  // while creating multiple inference engines with same model file
  virtual std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
@@ -88,6 +88,70 @@ class BaseBackend {
    FDERROR << "Clone no support" << std::endl;
    return nullptr;
  }
+
+  benchmark::BenchmarkOption benchmark_option_;
+  benchmark::BenchmarkResult benchmark_result_;
 };
+
+/** \brief Macros for Runtime benchmark profiling.
+ * The param 'base_loop' for 'RUNTIME_PROFILE_LOOP_BEGIN'
+ * indicates that the least number of times the loop
+ * will repeat when profiling mode is not enabled.
+ * In most cases, the value should be 1, i.e., results are
+ * obtained by running the inference process once, when
+ * the profile mode is turned off, such as ONNX Runtime,
+ * OpenVINO, TensorRT, Paddle Inference, Paddle Lite,
+ * RKNPU2, SOPHGO etc.
+ *
+ * example code @code
+ * // OpenVINOBackend::Infer
+ * RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
+ * // do something ....
+ * RUNTIME_PROFILE_LOOP_BEGIN(1)
+ * // The codes which wrapped by 'BEGIN(1) ~ END' scope
+ * // will only run once when profiling mode is not enabled.
+ * request_.infer();
+ * RUNTIME_PROFILE_LOOP_END
+ * // do something ....
+ * RUNTIME_PROFILE_LOOP_H2D_D2H_END
+ *
+ * @endcode In this case, No global variables inside a function
+ * are wrapped by BEGIN and END, which may be required for
+ * subsequent tasks. But, some times we need to set 'base_loop'
+ * as 0, such as POROS.
+ *
+ * * example code @code
+ * // PorosBackend::Infer
+ * RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
+ * // do something ....
+ * RUNTIME_PROFILE_LOOP_BEGIN(0) // set 'base_loop' as 0
+ * // The codes which wrapped by 'BEGIN(0) ~ END' scope
+ * // will not run when profiling mode is not enabled.
+ * auto poros_outputs = _poros_module->forward(poros_inputs);
+ * RUNTIME_PROFILE_LOOP_END
+ * // Run another inference beyond the scope of 'BEGIN ~ END'
+ * // to get valid outputs for subsequent tasks.
+ * auto poros_outputs = _poros_module->forward(poros_inputs);
+ * // do something .... will use 'poros_outputs' ...
+ * if (poros_outputs.isTensor()) {
+ *   // ...
+ * }
+ * RUNTIME_PROFILE_LOOP_H2D_D2H_END
+ *
+ * @endcode In this case, 'poros_outputs' inside a function
+ * are wrapped by BEGIN and END, which may be required for
+ * subsequent tasks. So, we set 'base_loop' as 0 and lanuch
+ * another infer to get the valid outputs beyond the scope
+ * of 'BEGIN ~ END' for subsequent tasks.
+ */
+
+#define RUNTIME_PROFILE_LOOP_BEGIN(base_loop) \
+  __RUNTIME_PROFILE_LOOP_BEGIN(benchmark_option_, (base_loop))
+#define RUNTIME_PROFILE_LOOP_END \
+  __RUNTIME_PROFILE_LOOP_END(benchmark_result_)
+#define RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN \
+  __RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN(benchmark_option_, 1)
+#define RUNTIME_PROFILE_LOOP_H2D_D2H_END \
+  __RUNTIME_PROFILE_LOOP_H2D_D2H_END(benchmark_result_)
+
 }  // namespace fastdeploy
@@ -13,23 +13,6 @@
 // limitations under the License.

 #include "fastdeploy/runtime/backends/lite/lite_backend.h"
-// https://github.com/PaddlePaddle/Paddle-Lite/issues/8290
-// When compiling the FastDeploy dynamic library, namely,
-// WITH_STATIC_LIB=OFF, and depending on the Paddle Lite
-// static library, you need to include the fake registration
-// codes of Paddle Lite. When you compile the FastDeploy static
-// library and depends on the Paddle Lite static library,
-// WITH_STATIC_LIB=ON, you do not need to include the fake
-// registration codes for Paddle Lite, but wait until you
-// use the FastDeploy static library.
-#if (defined(WITH_LITE_STATIC) && (!defined(WITH_STATIC_LIB)))
-#warning You are compiling the FastDeploy dynamic library with \
-Paddle Lite static lib We will automatically add some registration \
-codes for ops, kernels and passes for Paddle Lite.
-#include "paddle_use_kernels.h"  // NOLINT
-#include "paddle_use_ops.h"  // NOLINT
-#include "paddle_use_passes.h"  // NOLINT
-#endif

 #include <cstring>

@@ -156,4 +139,5 @@ void LiteBackend::ConfigureNNAdapter(const LiteBackendOption& option) {

  config_.set_nnadapter_dynamic_shape_info(option.nnadapter_dynamic_shape_info);
 }
+
 }  // namespace fastdeploy
@@ -100,7 +100,7 @@ bool LiteBackend::InitFromPaddle(const std::string& model_file,
    auto shape = tensor->shape();
    info.shape.assign(shape.begin(), shape.end());
    info.name = output_names[i];
-    if (!option_.device == Device::KUNLUNXIN) {
+    if (option_.device != Device::KUNLUNXIN) {
      info.dtype = LiteDataTypeToFD(tensor->precision());
    }
    outputs_desc_.emplace_back(info);
@@ -136,6 +136,8 @@ bool LiteBackend::Infer(std::vector<FDTensor>& inputs,
            << inputs_desc_.size() << ")." << std::endl;
    return false;
  }
+
+  RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto iter = inputs_order_.find(inputs[i].name);
    if (iter == inputs_order_.end()) {

@@ -143,6 +145,7 @@ bool LiteBackend::Infer(std::vector<FDTensor>& inputs,
              << " in loaded model." << std::endl;
      return false;
    }
+
    auto tensor = predictor_->GetInput(iter->second);
    // Adjust dims only, allocate lazy.
    tensor->Resize(inputs[i].shape);

@@ -175,7 +178,9 @@ bool LiteBackend::Infer(std::vector<FDTensor>& inputs,
    }
  }

+  RUNTIME_PROFILE_LOOP_BEGIN(1)
  predictor_->Run();
+  RUNTIME_PROFILE_LOOP_END

  outputs->resize(outputs_desc_.size());
  for (size_t i = 0; i < outputs_desc_.size(); ++i) {

@@ -188,6 +193,7 @@ bool LiteBackend::Infer(std::vector<FDTensor>& inputs,
    memcpy((*outputs)[i].MutableData(), tensor->data<void>(),
           (*outputs)[i].Nbytes());
  }
+  RUNTIME_PROFILE_LOOP_H2D_D2H_END
  return true;
 }

fastdeploy/runtime/backends/lite/lite_backend.h (mode change only: executable file → normal file, 0 lines changed)
@@ -375,6 +375,7 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
    return false;
  }

+  RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
  for (size_t i = 0; i < inputs.size(); ++i) {
    ov::Shape shape(inputs[i].shape.begin(), inputs[i].shape.end());
    ov::Tensor ov_tensor(FDDataTypeToOV(inputs[i].dtype), shape,

@@ -382,7 +383,9 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
    request_.set_tensor(inputs[i].name, ov_tensor);
  }

+  RUNTIME_PROFILE_LOOP_BEGIN(1)
  request_.infer();
+  RUNTIME_PROFILE_LOOP_END

  outputs->resize(output_infos_.size());
  for (size_t i = 0; i < output_infos_.size(); ++i) {

@@ -403,6 +406,7 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
                       out_tensor.data(), Device::CPU);
    }
  }
+  RUNTIME_PROFILE_LOOP_H2D_D2H_END
  return true;
 }

@@ -13,9 +13,6 @@
 // limitations under the License.

 #include "fastdeploy/runtime/backends/ort/ort_backend.h"
-
-#include <memory>
-
 #include "fastdeploy/core/float16.h"
 #include "fastdeploy/runtime/backends/ort/ops/adaptive_pool2d.h"
 #include "fastdeploy/runtime/backends/ort/ops/multiclass_nms.h"

@@ -25,6 +22,9 @@
 #include "paddle2onnx/converter.h"
 #endif

+#include <memory>
+
+
 namespace fastdeploy {

 std::vector<OrtCustomOp*> OrtBackend::custom_operators_ =
@@ -258,6 +258,7 @@ bool OrtBackend::Infer(std::vector<FDTensor>& inputs,
  }

  // from FDTensor to Ort Inputs
+  RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto ort_value = CreateOrtValue(inputs[i], option_.device == Device::GPU);
    binding_->BindInput(inputs[i].name.c_str(), ort_value);

@@ -270,12 +271,14 @@ bool OrtBackend::Infer(std::vector<FDTensor>& inputs,
  }

  // Inference with inputs
+  RUNTIME_PROFILE_LOOP_BEGIN(1)
  try {
    session_.Run({}, *(binding_.get()));
  } catch (const std::exception& e) {
    FDERROR << "Failed to Infer: " << e.what() << std::endl;
    return false;
  }
+  RUNTIME_PROFILE_LOOP_END

  // Convert result after inference
  std::vector<Ort::Value> ort_outputs = binding_->GetOutputValues();

@@ -284,7 +287,7 @@ bool OrtBackend::Infer(std::vector<FDTensor>& inputs,
    OrtValueToFDTensor(ort_outputs[i], &((*outputs)[i]), outputs_desc_[i].name,
                       copy_to_fd);
  }
+  RUNTIME_PROFILE_LOOP_H2D_D2H_END
  return true;
 }

@@ -222,12 +222,15 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
    return false;
  }

+  RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto handle = predictor_->GetInputHandle(inputs[i].name);
    ShareTensorFromFDTensor(handle.get(), inputs[i]);
  }

+  RUNTIME_PROFILE_LOOP_BEGIN(1)
  predictor_->Run();
+  RUNTIME_PROFILE_LOOP_END

  // output share backend memory only support CPU or GPU
  if (option_.use_ipu) {

@@ -241,6 +244,7 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
    }
    PaddleTensorToFDTensor(handle, &((*outputs)[i]), copy_to_fd);
  }
+  RUNTIME_PROFILE_LOOP_H2D_D2H_END
  return true;
 }

@@ -287,14 +287,18 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
    BuildTrtEngine();
  }

+  RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
  cudaSetDevice(option_.gpu_id);
  SetInputs(inputs);
  AllocateOutputsBuffer(outputs, copy_to_fd);

+  RUNTIME_PROFILE_LOOP_BEGIN(1)
  if (!context_->enqueueV2(bindings_.data(), stream_, nullptr)) {
    FDERROR << "Failed to Infer with TensorRT." << std::endl;
    return false;
  }
+  RUNTIME_PROFILE_LOOP_END

  for (size_t i = 0; i < outputs->size(); ++i) {
    // if the final output tensor's dtype is different from the model output
    // tensor's dtype, then we need cast the data to the final output's dtype

@@ -335,7 +339,7 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
    FDASSERT(cudaStreamSynchronize(stream_) == cudaSuccess,
             "[ERROR] Error occurs while sync cuda stream.");
  }
+  RUNTIME_PROFILE_LOOP_H2D_D2H_END
  return true;
 }

@@ -275,6 +275,8 @@ void Runtime::CreatePaddleBackend() {
 #endif
  backend_ = utils::make_unique<PaddleBackend>();
  auto casted_backend = dynamic_cast<PaddleBackend*>(backend_.get());
+  casted_backend->benchmark_option_ = option.benchmark_option;
+
  if (pd_option.model_from_memory_) {
    FDASSERT(casted_backend->InitFromPaddle(option.model_file,
                                            option.params_file, pd_option),

@@ -303,6 +305,7 @@
 void Runtime::CreateOpenVINOBackend() {
 #ifdef ENABLE_OPENVINO_BACKEND
  backend_ = utils::make_unique<OpenVINOBackend>();
+  backend_->benchmark_option_ = option.benchmark_option;
  FDASSERT(backend_->Init(option), "Failed to initialize OpenVINOBackend.");
 #else
  FDASSERT(false,

@@ -316,6 +319,8 @@
 void Runtime::CreateOrtBackend() {
 #ifdef ENABLE_ORT_BACKEND
  backend_ = utils::make_unique<OrtBackend>();
+  backend_->benchmark_option_ = option.benchmark_option;
+
  FDASSERT(backend_->Init(option), "Failed to initialize Backend::ORT.");
 #else
  FDASSERT(false,

@@ -351,6 +356,8 @@ void Runtime::CreateTrtBackend() {
  trt_option.external_stream_ = option.external_stream_;
  backend_ = utils::make_unique<TrtBackend>();
  auto casted_backend = dynamic_cast<TrtBackend*>(backend_.get());
+  casted_backend->benchmark_option_ = option.benchmark_option;
+
  if (option.model_format == ModelFormat::ONNX) {
    if (option.model_from_memory_) {
      FDASSERT(casted_backend->InitFromOnnx(option.model_file, trt_option),

@@ -403,6 +410,8 @@ void Runtime::CreateLiteBackend() {
           "LiteBackend only support model format of ModelFormat::PADDLE");
  backend_ = utils::make_unique<LiteBackend>();
  auto casted_backend = dynamic_cast<LiteBackend*>(backend_.get());
+  casted_backend->benchmark_option_ = option.benchmark_option;
+
  FDASSERT(casted_backend->InitFromPaddle(option.model_file, option.params_file,
                                          option.paddle_lite_option),
           "Load model from nb file failed while initializing LiteBackend.");
@@ -95,6 +95,11 @@ struct FASTDEPLOY_DECL Runtime {
   */
  bool Compile(std::vector<std::vector<FDTensor>>& prewarm_tensors,
               const RuntimeOption& _option);
+  /** \brief Get profile time of Runtime after the profile process is done.
+   */
+  double GetProfileTime() {
+    return backend_->benchmark_result_.time_of_runtime;
+  }

 private:
  void CreateOrtBackend();
@@ -32,6 +32,7 @@
 #include "fastdeploy/runtime/backends/rknpu2/option.h"
 #include "fastdeploy/runtime/backends/sophgo/option.h"
 #include "fastdeploy/runtime/backends/tensorrt/option.h"
+#include "fastdeploy/benchmark/option.h"

 namespace fastdeploy {

@@ -347,6 +348,26 @@ struct FASTDEPLOY_DECL RuntimeOption {
                    float available_memory_proportion = 1.0,
                    bool enable_half_partial = false);

+  /** \brief Set the profile mode as 'true'.
+   *
+   * \param[in] inclue_h2d_d2h Whether to include time of H2D_D2H for time of runtime.
+   * \param[in] repeat Repeat times for runtime inference.
+   * \param[in] warmup Warmup times for runtime inference.
+   */
+  void EnableProfiling(bool inclue_h2d_d2h = false,
+                       int repeat = 100, int warmup = 50) {
+    benchmark_option.enable_profile = true;
+    benchmark_option.warmup = warmup;
+    benchmark_option.repeats = repeat;
+    benchmark_option.include_h2d_d2h = inclue_h2d_d2h;
+  }
+
+  /** \brief Set the profile mode as 'false'.
+   */
+  void DisableProfiling() {
+    benchmark_option.enable_profile = false;
+  }
+
  Backend backend = Backend::UNKNOWN;

  // for cpu inference
@@ -419,6 +440,9 @@ struct FASTDEPLOY_DECL RuntimeOption {
  bool model_from_memory_ = false;
  // format of input model
  ModelFormat model_format = ModelFormat::PADDLE;
+
+  // Benchmark option
+  benchmark::BenchmarkOption benchmark_option;
 };

 }  // namespace fastdeploy

@@ -54,6 +54,11 @@ class FastDeployModel:
    def print_statis_info_of_runtime(self):
        return self._model.print_statis_info_of_runtime()

    def get_profile_time(self):
        """Get profile time of Runtime after the profile process is done.
        """
        return self._model.get_profile_time()

    @property
    def runtime_option(self):
        return self._model.runtime_option if self._model is not None else None
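
Together with the RuntimeOption wrapper shown further down, this lets
profiling be driven end to end at the model level. A hedged sketch (the
PaddleClas model files, config and test image are placeholders, not part of
this commit):

import cv2
import fastdeploy as fd

option = fd.RuntimeOption()
option.enable_profiling(repeat=100, warmup=50)
# Any FastDeployModel works; PaddleClasModel and the file names are examples.
model = fd.vision.classification.PaddleClasModel(
    "inference.pdmodel", "inference.pdiparams", "inference_cls.yaml",
    runtime_option=option)
_ = model.predict(cv2.imread("test.jpg"))  # triggers the internal profile loop
print("avg runtime:", model.get_profile_time())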
@@ -144,6 +144,11 @@ class Runtime:
                index, self.num_outputs)
        return self._runtime.get_output_info(index)

    def get_profile_time(self):
        """Get profile time of Runtime after the profile process is done.
        """
        return self._runtime.get_profile_time()


class RuntimeOption:
    """Options for FastDeploy Runtime.
@@ -552,6 +557,21 @@ class RuntimeOption:
                                               available_memory_proportion,
                                               enable_half_partial)

    def enable_profiling(self,
                         inclue_h2d_d2h=False,
                         repeat=100, warmup=50):
        """Set the profile mode as 'true'.

        :param inclue_h2d_d2h: Whether to include the H2D_D2H transfer time in the measured runtime.
        :param repeat: Repeat times for runtime inference.
        :param warmup: Warmup times for runtime inference.
        """
        return self._option.enable_profiling(inclue_h2d_d2h, repeat, warmup)

    def disable_profiling(self):
        """Set the profile mode as 'false'.
        """
        return self._option.disable_profiling()

    def __repr__(self):
        attrs = dir(self._option)
        message = "RuntimeOption(\n"
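
The same switches at the raw Runtime level; a minimal sketch assuming a
Paddle model on disk with a single input named "x" (both placeholders):

import numpy as np
import fastdeploy as fd

option = fd.RuntimeOption()
option.set_model_path("model.pdmodel", "model.pdiparams")
option.enable_profiling(inclue_h2d_d2h=False, repeat=100, warmup=50)

runtime = fd.Runtime(option)
# One infer() call; with profiling on, the backend loops warmup + repeat
# times internally and records the averaged runtime.
outputs = runtime.infer({"x": np.random.rand(1, 3, 224, 224).astype("float32")})
print("avg runtime:", runtime.get_profile_time())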
@@ -73,6 +73,7 @@ setup_configs["ENABLE_VISION"] = os.getenv("ENABLE_VISION", "OFF")
setup_configs["ENABLE_ENCRYPTION"] = os.getenv("ENABLE_ENCRYPTION", "OFF")
setup_configs["ENABLE_FLYCV"] = os.getenv("ENABLE_FLYCV", "OFF")
setup_configs["ENABLE_TEXT"] = os.getenv("ENABLE_TEXT", "OFF")
setup_configs["ENABLE_BENCHMARK"] = os.getenv("ENABLE_BENCHMARK", "OFF")
setup_configs["WITH_GPU"] = os.getenv("WITH_GPU", "OFF")
setup_configs["WITH_IPU"] = os.getenv("WITH_IPU", "OFF")
setup_configs["WITH_KUNLUNXIN"] = os.getenv("WITH_KUNLUNXIN", "OFF")
80
scripts/linux/build_linux_x86_64_cpp_cpu.sh
Executable file
@@ -0,0 +1,80 @@
#!/bin/bash
set -e
set +x

# -------------------------------------------------------------------------------
# readonly global variables
# -------------------------------------------------------------------------------
readonly ROOT_PATH=$(pwd)
readonly BUILD_ROOT=build/Linux
readonly BUILD_DIR=${BUILD_ROOT}/x86_64

# -------------------------------------------------------------------------------
# tasks
# -------------------------------------------------------------------------------
__make_build_dir() {
  if [ ! -d "${BUILD_DIR}" ]; then
    echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} does not exist, creating it manually ..."
    if [ ! -d "${BUILD_ROOT}" ]; then
      mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !"
    fi
    mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !"
  else
    echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}"
  fi
}

__check_cxx_envs() {
  if [ $LDFLAGS ]; then
    echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset LDFLAGS
  fi
  if [ $CPPFLAGS ]; then
    echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPPFLAGS
  fi
  if [ $CPLUS_INCLUDE_PATH ]; then
    echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPLUS_INCLUDE_PATH
  fi
  if [ $C_INCLUDE_PATH ]; then
    echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset C_INCLUDE_PATH
  fi
}

__build_fastdeploy_linux_x86_64_shared() {

  local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install"
  cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}"

  cmake -DCMAKE_BUILD_TYPE=Release \
        -DWITH_GPU=OFF \
        -DENABLE_ORT_BACKEND=ON \
        -DENABLE_PADDLE_BACKEND=ON \
        -DENABLE_OPENVINO_BACKEND=ON \
        -DENABLE_PADDLE2ONNX=ON \
        -DENABLE_VISION=ON \
        -DENABLE_BENCHMARK=ON \
        -DBUILD_EXAMPLES=ON \
        -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \
        -Wno-dev ../../.. && make -j8 && make install

  echo "-- [INFO][built][x86_64][${BUILD_DIR}/install]"
}

main() {
  __make_build_dir
  __check_cxx_envs
  __build_fastdeploy_linux_x86_64_shared
  exit 0
}

main

# Usage:
# ./scripts/linux/build_linux_x86_64_cpp_cpu.sh
83
scripts/linux/build_linux_x86_64_cpp_gpu.sh
Executable file
@@ -0,0 +1,83 @@
#!/bin/bash
set -e
set +x

# -------------------------------------------------------------------------------
# readonly global variables
# -------------------------------------------------------------------------------
readonly ROOT_PATH=$(pwd)
readonly BUILD_ROOT=build/Linux
readonly BUILD_DIR="${BUILD_ROOT}/x86_64_gpu"

# -------------------------------------------------------------------------------
# tasks
# -------------------------------------------------------------------------------
__make_build_dir() {
  if [ ! -d "${BUILD_DIR}" ]; then
    echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} does not exist, creating it manually ..."
    if [ ! -d "${BUILD_ROOT}" ]; then
      mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !"
    fi
    mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !"
  else
    echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}"
  fi
}

__check_cxx_envs() {
  if [ $LDFLAGS ]; then
    echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset LDFLAGS
  fi
  if [ $CPPFLAGS ]; then
    echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPPFLAGS
  fi
  if [ $CPLUS_INCLUDE_PATH ]; then
    echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPLUS_INCLUDE_PATH
  fi
  if [ $C_INCLUDE_PATH ]; then
    echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset C_INCLUDE_PATH
  fi
}

__build_fastdeploy_linux_x86_64_gpu_shared() {

  local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install"
  cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}"

  cmake -DCMAKE_BUILD_TYPE=Release \
        -DWITH_GPU=ON \
        -DTRT_DIRECTORY=${TRT_DIRECTORY} \
        -DCUDA_DIRECTORY=${CUDA_DIRECTORY} \
        -DENABLE_ORT_BACKEND=ON \
        -DENABLE_TRT_BACKEND=ON \
        -DENABLE_PADDLE_BACKEND=ON \
        -DENABLE_OPENVINO_BACKEND=ON \
        -DENABLE_PADDLE2ONNX=ON \
        -DENABLE_VISION=ON \
        -DENABLE_BENCHMARK=ON \
        -DBUILD_EXAMPLES=ON \
        -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \
        -Wno-dev ../../.. && make -j8 && make install

  echo "-- [INFO][built][x86_64_gpu][${BUILD_DIR}/install]"
}

main() {
  __make_build_dir
  __check_cxx_envs
  __build_fastdeploy_linux_x86_64_gpu_shared
  exit 0
}

main

# Usage:
# ./scripts/linux/build_linux_x86_64_cpp_gpu.sh
102
scripts/macosx/build_macosx_cpp.sh
Executable file
@@ -0,0 +1,102 @@
#!/bin/bash
set -e
set +x

# -------------------------------------------------------------------------------
# readonly global variables
# -------------------------------------------------------------------------------
readonly ROOT_PATH=$(pwd)
readonly BUILD_ROOT=build/MacOSX
readonly OSX_ARCH=$1  # arm64, x86_64
readonly BUILD_DIR=${BUILD_ROOT}/${OSX_ARCH}

# -------------------------------------------------------------------------------
# tasks
# -------------------------------------------------------------------------------
__make_build_dir() {
  if [ ! -d "${BUILD_DIR}" ]; then
    echo "-- [INFO] BUILD_DIR: ${BUILD_DIR} does not exist, creating it manually ..."
    if [ ! -d "${BUILD_ROOT}" ]; then
      mkdir -p "${BUILD_ROOT}" && echo "-- [INFO] Created ${BUILD_ROOT} !"
    fi
    mkdir -p "${BUILD_DIR}" && echo "-- [INFO] Created ${BUILD_DIR} !"
  else
    echo "-- [INFO] Found BUILD_DIR: ${BUILD_DIR}"
  fi
}

__check_cxx_envs() {
  if [ $LDFLAGS ]; then
    echo "-- [INFO] Found LDFLAGS: ${LDFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset LDFLAGS
  fi
  if [ $CPPFLAGS ]; then
    echo "-- [INFO] Found CPPFLAGS: ${CPPFLAGS}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPPFLAGS
  fi
  if [ $CPLUS_INCLUDE_PATH ]; then
    echo "-- [INFO] Found CPLUS_INCLUDE_PATH: ${CPLUS_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset CPLUS_INCLUDE_PATH
  fi
  if [ $C_INCLUDE_PATH ]; then
    echo "-- [INFO] Found C_INCLUDE_PATH: ${C_INCLUDE_PATH}, \c"
    echo "unset it before cross compiling ${BUILD_DIR}"
    unset C_INCLUDE_PATH
  fi
}

__build_fastdeploy_osx_arm64_shared() {

  local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install"
  cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}"

  cmake -DCMAKE_BUILD_TYPE=MinSizeRel \
        -DENABLE_ORT_BACKEND=ON \
        -DENABLE_PADDLE2ONNX=ON \
        -DENABLE_VISION=ON \
        -DENABLE_BENCHMARK=ON \
        -DBUILD_EXAMPLES=ON \
        -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \
        -Wno-dev ../../.. && make -j8 && make install

  echo "-- [INFO][built][${OSX_ARCH}][${BUILD_DIR}/install]"
}

__build_fastdeploy_osx_x86_64_shared() {

  local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install"
  cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}"

  cmake -DCMAKE_BUILD_TYPE=MinSizeRel \
        -DENABLE_ORT_BACKEND=ON \
        -DENABLE_PADDLE_BACKEND=ON \
        -DENABLE_OPENVINO_BACKEND=ON \
        -DENABLE_PADDLE2ONNX=ON \
        -DENABLE_VISION=ON \
        -DENABLE_BENCHMARK=ON \
        -DBUILD_EXAMPLES=ON \
        -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \
        -Wno-dev ../../.. && make -j8 && make install

  echo "-- [INFO][built][${OSX_ARCH}][${BUILD_DIR}/install]"
}

main() {
  __make_build_dir
  __check_cxx_envs
  if [ "$OSX_ARCH" = "arm64" ]; then
    __build_fastdeploy_osx_arm64_shared
  else
    __build_fastdeploy_osx_x86_64_shared
  fi
  exit 0
}

main

# Usage:
# ./scripts/macosx/build_macosx_cpp.sh arm64
# ./scripts/macosx/build_macosx_cpp.sh x86_64