Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-16 05:30:58 +08:00
[Example] Update runtime examples (#1542)

* Add notes for tensors
* Optimize some apis
* Move some warnings
@@ -85,10 +85,10 @@ class BaseBackend {
                      bool copy_to_fd = true) = 0;
   // Optional: For those backends which can share memory
   // while creating multiple inference engines with same model file
-  virtual std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
-                                             void *stream = nullptr,
+  virtual std::unique_ptr<BaseBackend> Clone(RuntimeOption& runtime_option,
+                                             void* stream = nullptr,
                                              int device_id = -1) {
-    FDERROR << "Clone no support" << std::endl;
+    FDERROR << "Clone no support " << runtime_option.backend << " " << stream << " " << device_id << std::endl;
     return nullptr;
   }

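The change above keeps Clone as an optional capability: the default implementation still refuses to clone, but now reports which backend, stream, and device were requested. Below is a minimal, self-contained C++ sketch of the same pattern; the class and helper names are illustrative, not FastDeploy's.

```cpp
#include <iostream>
#include <memory>

struct DemoRuntimeOption { int backend = 0; };  // stand-in for RuntimeOption

class DemoBaseBackend {
 public:
  virtual ~DemoBaseBackend() = default;
  // Default Clone: unsupported, but report what was asked for
  // (mirrors the updated FDERROR message in the diff above).
  virtual std::unique_ptr<DemoBaseBackend> Clone(DemoRuntimeOption& option,
                                                 void* stream = nullptr,
                                                 int device_id = -1) {
    std::cerr << "Clone not supported: backend=" << option.backend
              << " stream=" << stream << " device_id=" << device_id << std::endl;
    return nullptr;
  }
};

// A backend that can share weights overrides Clone instead of reloading the model.
class SharedWeightBackend : public DemoBaseBackend {
 public:
  std::unique_ptr<DemoBaseBackend> Clone(DemoRuntimeOption& option, void* stream,
                                         int device_id) override {
    auto cloned = std::make_unique<SharedWeightBackend>();
    // A real implementation would bind the clone to `stream`/`device_id`
    // and point it at the already-initialized engine.
    (void)option; (void)stream; (void)device_id;
    return cloned;
  }
};
```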
@@ -53,32 +53,46 @@ enum LitePowerMode {
 struct LiteBackendOption {
   /// Paddle Lite power mode for mobile device.
   int power_mode = 3;
-  /// Number of threads while use CPU
+  // Number of threads while use CPU
   int cpu_threads = 1;
   /// Enable use half precision
   bool enable_fp16 = false;
-  /// Inference device, Paddle Lite support CPU/KUNLUNXIN/TIMVX/ASCEND
+  // Inference device, Paddle Lite support CPU/KUNLUNXIN/TIMVX/ASCEND
   Device device = Device::CPU;
-  /// Index of inference device
+  // Index of inference device
   int device_id = 0;

   /// kunlunxin_l3_workspace_size
   int kunlunxin_l3_workspace_size = 0xfffc00;
   /// kunlunxin_locked
   bool kunlunxin_locked = false;
   /// kunlunxin_autotune
   bool kunlunxin_autotune = true;
   /// kunlunxin_autotune_file
   std::string kunlunxin_autotune_file = "";
   /// kunlunxin_precision
   std::string kunlunxin_precision = "int16";
   /// kunlunxin_adaptive_seqlen
   bool kunlunxin_adaptive_seqlen = false;
   /// kunlunxin_enable_multi_stream
   bool kunlunxin_enable_multi_stream = false;

   /// Optimized model dir for CxxConfig
   std::string optimized_model_dir = "";
   /// nnadapter_subgraph_partition_config_path
   std::string nnadapter_subgraph_partition_config_path = "";
   /// nnadapter_subgraph_partition_config_buffer
   std::string nnadapter_subgraph_partition_config_buffer = "";
   /// nnadapter_context_properties
   std::string nnadapter_context_properties = "";
   /// nnadapter_model_cache_dir
   std::string nnadapter_model_cache_dir = "";
   /// nnadapter_mixed_precision_quantization_config_path
   std::string nnadapter_mixed_precision_quantization_config_path = "";
   /// nnadapter_dynamic_shape_info
   std::map<std::string, std::vector<std::vector<int64_t>>>
       nnadapter_dynamic_shape_info = {{"", {{0}}}};
   /// nnadapter_device_names
   std::vector<std::string> nnadapter_device_names = {};
 };
 } // namespace fastdeploy
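For orientation, LiteBackendOption bundles the general Paddle Lite settings (power mode, CPU threads, FP16, target device) with KunlunXin XPU and NNAdapter specific knobs. A hedged usage sketch follows; the header path and the Device enum value are assumptions, and in practice these fields are usually set through RuntimeOption helpers rather than by hand.

```cpp
#include "fastdeploy/runtime/backends/lite/option.h"  // assumed header path

int main() {
  fastdeploy::LiteBackendOption opt;
  opt.power_mode = 3;      // Paddle Lite power mode for mobile devices
  opt.cpu_threads = 4;     // threads used when running on CPU
  opt.enable_fp16 = true;  // use half precision where the device supports it
  opt.device = fastdeploy::Device::KUNLUNXIN;  // assumed enum value; CPU/KUNLUNXIN/TIMVX/ASCEND per the comment
  opt.device_id = 0;

  // KunlunXin-specific settings, matching the defaults in the struct above.
  opt.kunlunxin_l3_workspace_size = 0xfffc00;
  opt.kunlunxin_precision = "int16";
  opt.kunlunxin_adaptive_seqlen = false;
  return 0;
}
```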
@@ -25,27 +25,18 @@ namespace fastdeploy {
 /*! @brief Option object to configure ONNX Runtime backend
  */
 struct OrtBackendOption {
-  /*
-   * @brief Level of graph optimization, -1: mean default(Enable all the optimization strategy)/0: disable all the optimization strategy/1: enable basic strategy/2:enable extend strategy/99: enable all
-   */
+  /// Level of graph optimization, -1: mean default(Enable all the optimization strategy)/0: disable all the optimization strategy/1: enable basic strategy/2:enable extend strategy/99: enable all
   int graph_optimization_level = -1;
-  /*
-   * @brief Number of threads to execute the operator, -1: default
-   */
+  /// Number of threads to execute the operator, -1: default
   int intra_op_num_threads = -1;
-  /*
-   * @brief Number of threads to execute the graph, -1: default. This parameter only will bring effects while the `OrtBackendOption::execution_mode` set to 1.
-   */
+  /// Number of threads to execute the graph, -1: default. This parameter only will bring effects while the `OrtBackendOption::execution_mode` set to 1.
   int inter_op_num_threads = -1;
-  /*
-   * @brief Execution mode for the graph, -1: default(Sequential mode)/0: Sequential mode, execute the operators in graph one by one. /1: Parallel mode, execute the operators in graph parallelly.
-   */
+  /// Execution mode for the graph, -1: default(Sequential mode)/0: Sequential mode, execute the operators in graph one by one. /1: Parallel mode, execute the operators in graph parallelly.
   int execution_mode = -1;
-  /// Inference device, OrtBackend supports CPU/GPU
+  // Inference device, OrtBackend supports CPU/GPU
   Device device = Device::CPU;
-  /// Inference device id
+  // Inference device id
   int device_id = 0;

   void* external_stream_ = nullptr;
 };
 } // namespace fastdeploy
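The ONNX Runtime hunk mostly collapses the /* ... */ blocks into single /// lines, but the options themselves deserve a quick illustration, since per the comment inter_op_num_threads only takes effect in parallel execution mode (execution_mode == 1). A hedged sketch with an assumed header path:

```cpp
#include "fastdeploy/runtime/backends/ort/option.h"  // assumed header path

int main() {
  fastdeploy::OrtBackendOption opt;
  opt.graph_optimization_level = 99;  // 99: enable all graph optimizations
  opt.intra_op_num_threads = 4;       // threads used inside a single operator
  opt.execution_mode = 1;             // 1: run independent operators in parallel
  opt.inter_op_num_threads = 2;       // only effective because execution_mode == 1
  return 0;
}
```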
@@ -54,6 +54,8 @@ struct PaddleBackendOption {
   bool enable_mkldnn = true;
   /// Use Paddle Inference + TensorRT to inference model on GPU
   bool enable_trt = false;
+  /// Whether enable memory optimize, default true
+  bool enable_memory_optimize = true;

   /*
    * @brief IPU option, this will configure the IPU hardware, if inference model in IPU
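The new enable_memory_optimize flag sits next to the existing MKLDNN/TensorRT switches and defaults to true; it is consumed in InitFromPaddle, shown two hunks below. A short hedged sketch of toggling it (header path is an assumption):

```cpp
#include "fastdeploy/runtime/backends/paddle/option.h"  // assumed header path

int main() {
  fastdeploy::PaddleBackendOption opt;
  opt.enable_mkldnn = true;            // MKLDNN/oneDNN acceleration on CPU
  opt.enable_trt = false;              // Paddle Inference + TensorRT on GPU
  opt.enable_memory_optimize = false;  // newly added; defaults to true
  return 0;
}
```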
@@ -41,6 +41,7 @@ void BindPaddleOption(pybind11::module& m) {
       .def_readwrite("enable_log_info", &PaddleBackendOption::enable_log_info)
       .def_readwrite("enable_mkldnn", &PaddleBackendOption::enable_mkldnn)
       .def_readwrite("enable_trt", &PaddleBackendOption::enable_trt)
+      .def_readwrite("enable_memory_optimize", &PaddleBackendOption::enable_memory_optimize)
       .def_readwrite("ipu_option", &PaddleBackendOption::ipu_option)
       .def_readwrite("collect_trt_shape",
                      &PaddleBackendOption::collect_trt_shape)
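The binding change is a single def_readwrite line; for readers unfamiliar with the pattern, here is a self-contained pybind11 sketch that exposes a made-up option struct the same way (module and struct names are invented).

```cpp
#include <pybind11/pybind11.h>

struct DemoOption {
  bool enable_memory_optimize = true;
  bool enable_trt = false;
};

PYBIND11_MODULE(demo_option, m) {
  pybind11::class_<DemoOption>(m, "DemoOption")
      .def(pybind11::init<>())
      // Each def_readwrite turns a public member into a read/write Python attribute.
      .def_readwrite("enable_memory_optimize", &DemoOption::enable_memory_optimize)
      .def_readwrite("enable_trt", &DemoOption::enable_trt);
}
```

Once bound, Python code can simply assign the attribute (e.g. `opt.enable_memory_optimize = False`) before the option is handed to the runtime.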
@@ -147,7 +147,9 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_buffer,
   }
   config_.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
                          params_buffer.c_str(), params_buffer.size());
-  config_.EnableMemoryOptim();
+  if (option.enable_memory_optimize) {
+    config_.EnableMemoryOptim();
+  }
   BuildOption(option);

   // The input/output information get from predictor is not right, use
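The effect of this hunk is that memory optimization becomes opt-out through the new flag instead of always being enabled. A hedged sketch of the same guarded pattern against Paddle Inference's C++ config (the header name is an assumption; SetModelBuffer and EnableMemoryOptim are the calls shown in the diff):

```cpp
#include <string>
#include "paddle_inference_api.h"  // assumed Paddle Inference header

void ConfigureFromBuffers(const std::string& model_buffer,
                          const std::string& params_buffer,
                          bool enable_memory_optimize) {
  paddle_infer::Config config;
  // Load model and params from in-memory buffers, as in InitFromPaddle above.
  config.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
                        params_buffer.c_str(), params_buffer.size());
  if (enable_memory_optimize) {
    // Only enabled when requested, matching the change in this hunk.
    config.EnableMemoryOptim();
  }
}
```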
@@ -33,9 +33,8 @@ struct TrtBackendOption {
   /// Enable log while converting onnx model to tensorrt
   bool enable_log_info = false;

-  /*
-   * @brief Enable half precison inference, on some device not support half precision, it will fallback to float32 mode
-   */
+  /// Enable half precison inference, on some device not support half precision, it will fallback to float32 mode
   bool enable_fp16 = false;

   /** \brief Set shape range of input tensor for the model that contain dynamic input shape while using TensorRT backend
@@ -64,9 +63,7 @@ struct TrtBackendOption {
       max_shape[tensor_name].assign(max.begin(), max.end());
     }
   }
-  /**
-   * @brief Set cache file path while use TensorRT backend. Loadding a Paddle/ONNX model and initialize TensorRT will take a long time, by this interface it will save the tensorrt engine to `cache_file_path`, and load it directly while execute the code again
-   */
+  /// Set cache file path while use TensorRT backend. Loadding a Paddle/ONNX model and initialize TensorRT will take a long time, by this interface it will save the tensorrt engine to `cache_file_path`, and load it directly while execute the code again
   std::string serialize_file = "";

   // The below parameters may be removed in next version, please do not
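Taken together, the two TrtBackendOption hunks cover the settings a typical TensorRT deployment touches: FP16, dynamic shape ranges via the SetShape member excerpted above, and engine caching via serialize_file. A hedged sketch follows; the header path and the exact SetShape parameter list are assumptions based on the doc comment and body fragment in the diff, and the input name is hypothetical.

```cpp
#include "fastdeploy/runtime/backends/trt/option.h"  // assumed header path

int main() {
  fastdeploy::TrtBackendOption opt;
  opt.enable_fp16 = true;              // falls back to FP32 where half is unsupported
  opt.serialize_file = "./model.trt";  // cache the built engine for later runs

  // Register min/opt/max shapes for a dynamic input tensor named "x" (hypothetical).
  opt.SetShape("x", {1, 3, 224, 224}, {4, 3, 224, 224}, {8, 3, 224, 224});
  return 0;
}
```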