[Backend] support bechmark mode for runtime and backend (#1201)

* [backend] support bechmark mode for runtime and backend * [backend] support bechmark mode for runtime and backend * [pybind11] add benchmark methods pybind * [pybind11] add benchmark methods pybind * [Other] Update build scripts * [Other] Update cmake/summary.cmake * [Other] update build scripts * [Other] add ENABLE_BENCHMARK option -> setup.py * optimize backend time recording * optimize backend time recording * optimize trt backend time record * [backend] optimze backend_time recording for trt * [benchmark] remove redundant logs * fixed ov_backend confilct * [benchmark] fixed paddle_backend conflicts * [benchmark] fixed paddle_backend conflicts * [benchmark] fixed paddle_backend conflicts * [benchmark] remove use_gpu option from ort backend option * [benchmark] update benchmark_ppdet.py * [benchmark] update benchmark_ppcls.py * fixed lite backend conflicts * [Lite] fixed lite xpu * add benchmark macro * add RUNTIME_PROFILE_LOOP macros * add comments for RUNTIME_PROFILE macros * add comments for new apis * add comments for new apis * update benchmark_ppdet.py * afixed bugs * remove unused codes * optimize RUNTIME_PROFILE_LOOP macros * optimize RUNTIME_PROFILE_LOOP macros * add comments for benchmark option and result * add docs for benchmark namespace
2025-10-05 16:48:03 +08:00 · 2023-02-06 14:29:35 +08:00
parent 42d14e7119
commit f73a538f61
34 changed files with 741 additions and 91 deletions
--- a/fastdeploy/runtime/backends/backend.h
+++ b/fastdeploy/runtime/backends/backend.h
@@ -22,6 +22,7 @@
 #include "fastdeploy/core/fd_tensor.h"
 #include "fastdeploy/core/fd_type.h"
 #include "fastdeploy/runtime/runtime_option.h"
+#include "fastdeploy/benchmark/benchmark.h"

 namespace fastdeploy {

@@ -79,7 +80,6 @@ class BaseBackend {
  virtual bool Infer(std::vector<FDTensor>& inputs,
                     std::vector<FDTensor>* outputs,
                     bool copy_to_fd = true) = 0;
-
  // Optional: For those backends which can share memory
  // while creating multiple inference engines with same model file
  virtual std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
@@ -88,6 +88,70 @@ class BaseBackend {
    FDERROR << "Clone no support" << std::endl;
    return nullptr;
  }
+
+  benchmark::BenchmarkOption benchmark_option_;  
+  benchmark::BenchmarkResult benchmark_result_; 
 };

+/** \brief Macros for Runtime benchmark profiling.
+ * The param 'base_loop' for 'RUNTIME_PROFILE_LOOP_BEGIN'
+ * indicates the minimum number of times the loop
+ * will repeat when profiling mode is not enabled.
+ * In most cases, the value should be 1, i.e., results are
+ * obtained by running the inference process once when
+ * the profile mode is turned off, such as for ONNX Runtime,
+ * OpenVINO, TensorRT, Paddle Inference, Paddle Lite,
+ * RKNPU2, SOPHGO, etc.
+ * 
+ * example code @code
+ * // OpenVINOBackend::Infer 
+ * RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
+ * // do something .... 
+ * RUNTIME_PROFILE_LOOP_BEGIN(1)
+ * // The code wrapped by the 'BEGIN(1) ~ END' scope
+ * // will run only once when profiling mode is not enabled.
+ * request_.infer();  
+ * RUNTIME_PROFILE_LOOP_END
+ * // do something .... 
+ * RUNTIME_PROFILE_LOOP_H2D_D2H_END
+ * 
+ * @endcode In this case, no function-scope variables that
+ * may be required for subsequent tasks are declared inside
+ * the 'BEGIN ~ END' scope. But sometimes we need to set
+ * 'base_loop' to 0, such as for POROS.
+ * 
+ * example code @code
+ * // PorosBackend::Infer
+ * RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
+ * // do something .... 
+ * RUNTIME_PROFILE_LOOP_BEGIN(0) // set 'base_loop' as 0
+ * // The code wrapped by the 'BEGIN(0) ~ END' scope
+ * // will not run when profiling mode is not enabled.
+ * auto poros_outputs = _poros_module->forward(poros_inputs); 
+ * RUNTIME_PROFILE_LOOP_END
+ * // Run another inference beyond the scope of 'BEGIN ~ END'
+ * // to get valid outputs for subsequent tasks.
+ * auto poros_outputs = _poros_module->forward(poros_inputs); 
+ * // do something .... will use 'poros_outputs' ...
+ * if (poros_outputs.isTensor()) {
+ * // ...
+ * }
+ * RUNTIME_PROFILE_LOOP_H2D_D2H_END
+ * 
+ * @endcode In this case, 'poros_outputs', which is required
+ * for subsequent tasks, is declared inside the 'BEGIN ~ END'
+ * scope. So, we set 'base_loop' to 0 and launch another
+ * inference outside the 'BEGIN ~ END' scope to get valid
+ * outputs for subsequent tasks.
+ */
+
+#define RUNTIME_PROFILE_LOOP_BEGIN(base_loop)            \
+  __RUNTIME_PROFILE_LOOP_BEGIN(benchmark_option_, (base_loop))
+#define RUNTIME_PROFILE_LOOP_END                         \
+  __RUNTIME_PROFILE_LOOP_END(benchmark_result_)
+#define RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN               \
+  __RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN(benchmark_option_, 1)
+#define RUNTIME_PROFILE_LOOP_H2D_D2H_END                 \
+  __RUNTIME_PROFILE_LOOP_H2D_D2H_END(benchmark_result_)
+
 }  // namespace fastdeploy