Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-17 22:21:48 +08:00
[Other] Deprecate some option api and parameters (#1243)
* Optimize Poros backend
* Fix error
* Add more pybind
* Fix conflicts
* Add some deprecation notices
* [Other] Deprecate some APIs in RuntimeOption (#1240)
* Deprecate more options
* Modify serving
* Update option.h
* Fix TensorRT error
* Update option_pybind.cc
* Update option_pybind.cc
* Fix error in serving
* Fix word spelling error
option.h (Paddle Lite backend)
@@ -48,6 +48,8 @@ enum LitePowerMode {
  LITE_POWER_RAND_LOW = 5  ///< Use Lite Backend with rand low power mode
};

/*! @brief Option object to configure Paddle Lite backend
 */
struct LiteBackendOption {
  /// Paddle Lite power mode for mobile device.
  int power_mode = 3;
@@ -55,12 +57,20 @@ struct LiteBackendOption {
  int cpu_threads = 1;
  /// Enable half precision
  bool enable_fp16 = false;
  /// Enable int8 precision for quantized model
  bool enable_int8 = false;

  /// Inference device, Paddle Lite supports CPU/KUNLUNXIN/TIMVX/ASCEND
  Device device = Device::CPU;
  /// Index of inference device
  int device_id = 0;

  // optimized model dir for CxxConfig
  int kunlunxin_l3_workspace_size = 0xfffc00;
  bool kunlunxin_locked = false;
  bool kunlunxin_autotune = true;
  std::string kunlunxin_autotune_file = "";
  std::string kunlunxin_precision = "int16";
  bool kunlunxin_adaptive_seqlen = false;
  bool kunlunxin_enable_multi_stream = false;

  /// Optimized model dir for CxxConfig
  std::string optimized_model_dir = "";
  std::string nnadapter_subgraph_partition_config_path = "";
  std::string nnadapter_subgraph_partition_config_buffer = "";
@@ -70,13 +80,5 @@ struct LiteBackendOption {
  std::map<std::string, std::vector<std::vector<int64_t>>>
      nnadapter_dynamic_shape_info = {{"", {{0}}}};
  std::vector<std::string> nnadapter_device_names = {};
  int device_id = 0;
  int kunlunxin_l3_workspace_size = 0xfffc00;
  bool kunlunxin_locked = false;
  bool kunlunxin_autotune = true;
  std::string kunlunxin_autotune_file = "";
  std::string kunlunxin_precision = "int16";
  bool kunlunxin_adaptive_seqlen = false;
  bool kunlunxin_enable_multi_stream = false;
};
}  // namespace fastdeploy
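LiteBackendOption is a plain struct, so configuring the Paddle Lite backend is just field assignment. A minimal sketch, assuming the header lives at fastdeploy/runtime/backends/lite/option.h (the path follows the pattern of the TensorRT files shown below) and using only members visible in this diff:

// Sketch: configure the Paddle Lite backend for a 4-thread CPU target.
// The include path is an assumption; all members appear in the struct above.
#include "fastdeploy/runtime/backends/lite/option.h"

fastdeploy::LiteBackendOption MakeLiteCpuOption() {
  fastdeploy::LiteBackendOption opt;
  opt.power_mode = 3;      // default Paddle Lite power mode
  opt.cpu_threads = 4;     // number of CPU threads for inference
  opt.enable_fp16 = true;  // half precision where the SoC supports it
  opt.device = fastdeploy::Device::CPU;
  opt.device_id = 0;
  opt.optimized_model_dir = "./lite_opt_model";  // cache dir for CxxConfig
  return opt;
}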
option_pybind.cc (Paddle Lite backend)
@@ -23,7 +23,6 @@ void BindLiteOption(pybind11::module& m) {
      .def_readwrite("power_mode", &LiteBackendOption::power_mode)
      .def_readwrite("cpu_threads", &LiteBackendOption::cpu_threads)
      .def_readwrite("enable_fp16", &LiteBackendOption::enable_fp16)
      .def_readwrite("enable_int8", &LiteBackendOption::enable_int8)
      .def_readwrite("device", &LiteBackendOption::device)
      .def_readwrite("optimized_model_dir",
                     &LiteBackendOption::optimized_model_dir)
option.h (OpenVINO backend)
@@ -23,9 +23,13 @@
#include <set>
namespace fastdeploy {

/*! @brief Option object to configure OpenVINO backend
 */
struct OpenVINOBackendOption {
  std::string device = "CPU";
  int cpu_thread_num = -1;

  /// Number of streams while using OpenVINO
  int num_streams = 0;

  /**
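The OpenVINO options follow the same pattern; num_streams trades per-request latency for throughput. A hedged sketch (include path assumed, values illustrative):

// Sketch: OpenVINO backend tuned for throughput; -1 would keep the
// thread default. The include path is an assumption from the repo layout.
#include "fastdeploy/runtime/backends/openvino/option.h"

fastdeploy::OpenVINOBackendOption MakeOpenVINOOption() {
  fastdeploy::OpenVINOBackendOption opt;
  opt.device = "CPU";      // OpenVINO device string
  opt.cpu_thread_num = 8;  // cap inference at 8 CPU threads
  opt.num_streams = 2;     // two concurrent inference streams
  return opt;
}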
option.h (ONNX Runtime backend)
@@ -22,20 +22,30 @@
#include <map>
namespace fastdeploy {

/*! @brief Option object to configure ONNX Runtime backend
 */
struct OrtBackendOption {
  // -1 means default
  // 0: ORT_DISABLE_ALL
  // 1: ORT_ENABLE_BASIC
  // 2: ORT_ENABLE_EXTENDED
  // 99: ORT_ENABLE_ALL (enables some custom optimizations, e.g. BERT)
  /*
   * @brief Level of graph optimization. -1: default (enable all optimization strategies) / 0: disable all optimizations / 1: enable basic optimizations / 2: enable extended optimizations / 99: enable all optimizations
   */
  int graph_optimization_level = -1;
  /*
   * @brief Number of threads to execute the operator, -1: default
   */
  int intra_op_num_threads = -1;
  /*
   * @brief Number of threads to execute the graph, -1: default. This parameter only takes effect when `OrtBackendOption::execution_mode` is set to 1.
   */
  int inter_op_num_threads = -1;
  // 0: ORT_SEQUENTIAL
  // 1: ORT_PARALLEL
  /*
   * @brief Execution mode for the graph. -1: default (sequential mode) / 0: sequential mode, execute the operators in the graph one by one / 1: parallel mode, execute the operators in the graph in parallel
   */
  int execution_mode = -1;
  /// Inference device, OrtBackend supports CPU/GPU
  Device device = Device::CPU;
  /// Inference device id
  int device_id = 0;

  void* external_stream_ = nullptr;
};
}  // namespace fastdeploy
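The comment on inter_op_num_threads is easy to miss: it only takes effect in parallel execution mode. A sketch that sets the two knobs together (include path assumed; numeric values mirror the comments above):

// Sketch: ONNX Runtime options with parallel graph execution.
// inter_op_num_threads is only honored because execution_mode == 1.
#include "fastdeploy/runtime/backends/ort/option.h"  // assumed path

fastdeploy::OrtBackendOption MakeParallelOrtOption() {
  fastdeploy::OrtBackendOption opt;
  opt.graph_optimization_level = 99;  // ORT_ENABLE_ALL
  opt.execution_mode = 1;             // ORT_PARALLEL
  opt.intra_op_num_threads = 4;       // threads inside one operator
  opt.inter_op_num_threads = 2;       // threads across operators
  opt.device = fastdeploy::Device::CPU;
  return opt;
}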
option.h (Poros backend)
@@ -22,6 +22,8 @@

namespace fastdeploy {

/*! @brief Option object to configure Poros backend
 */
struct PorosBackendOption {
  Device device = Device::CPU;
  int device_id = 0;
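For completeness, the Poros members visible in this hunk configure the same way; a sketch (assuming Device::GPU is a valid enum value, as it is for the other backends, and an assumed include path):

// Sketch: Poros backend on the first GPU. Only members shown in the
// hunk above are used; Device::GPU and the include path are assumptions.
#include "fastdeploy/runtime/backends/poros/option.h"

fastdeploy::PorosBackendOption MakePorosGpuOption() {
  fastdeploy::PorosBackendOption opt;
  opt.device = fastdeploy::Device::GPU;
  opt.device_id = 0;
  return opt;
}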
option.h (TensorRT backend)
@@ -21,23 +21,64 @@

namespace fastdeploy {

/*! @brief Option object to configure TensorRT backend
 */
struct TrtBackendOption {
  std::string model_file = "";   // Path of model file
  std::string params_file = "";  // Path of parameters file, can be empty

  // format of input model
  ModelFormat model_format = ModelFormat::AUTOREC;

  int gpu_id = 0;
  bool enable_fp16 = false;
  bool enable_int8 = false;
  /// `max_batch_size`, deprecated in TensorRT 8.x
  size_t max_batch_size = 32;

  /// `max_workspace_size` for TensorRT
  size_t max_workspace_size = 1 << 30;

  /*
   * @brief Enable half precision inference; on devices that do not support half precision, it will fall back to float32 mode
   */
  bool enable_fp16 = false;

  /** \brief Set shape range of input tensor for the model that contains dynamic input shape while using TensorRT backend
   *
   * \param[in] tensor_name The name of the input for the model which has dynamic shape
   * \param[in] min The minimal shape for the input tensor
   * \param[in] opt The optimized shape for the input tensor; just set the most common shape. If set to the default value, it will keep the same as min_shape
   * \param[in] max The maximum shape for the input tensor. If set to the default value, it will keep the same as min_shape
   */
  void SetShape(const std::string& tensor_name,
                const std::vector<int32_t>& min,
                const std::vector<int32_t>& opt,
                const std::vector<int32_t>& max) {
    min_shape[tensor_name].clear();
    max_shape[tensor_name].clear();
    opt_shape[tensor_name].clear();
    min_shape[tensor_name].assign(min.begin(), min.end());
    if (opt.size() == 0) {
      opt_shape[tensor_name].assign(min.begin(), min.end());
    } else {
      opt_shape[tensor_name].assign(opt.begin(), opt.end());
    }
    if (max.size() == 0) {
      max_shape[tensor_name].assign(min.begin(), min.end());
    } else {
      max_shape[tensor_name].assign(max.begin(), max.end());
    }
  }
  /**
   * @brief Set cache file path while using TensorRT backend. Loading a Paddle/ONNX model and initializing TensorRT takes a long time; through this interface the TensorRT engine is saved to `cache_file_path` and loaded directly when the code runs again
   */
  std::string serialize_file = "";

  // The parameters below may be removed in the next version, please do not
  // visit or use them directly
  std::map<std::string, std::vector<int32_t>> max_shape;
  std::map<std::string, std::vector<int32_t>> min_shape;
  std::map<std::string, std::vector<int32_t>> opt_shape;
  std::string serialize_file = "";
  bool enable_pinned_memory = false;
  void* external_stream_ = nullptr;
  int gpu_id = 0;
  std::string model_file = "";   // Path of model file
  std::string params_file = "";  // Path of parameters file, can be empty
  // format of input model
  ModelFormat model_format = ModelFormat::AUTOREC;
};

}  // namespace fastdeploy
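SetShape's fallback behavior (an empty opt or max reuses min) is the part worth a worked example. A sketch using only the members defined above; the tensor names are illustrative, not names from the FastDeploy sources:

// Sketch: dynamic-shape TensorRT setup. "x" and "scale_factor" are
// hypothetical input names chosen for illustration.
#include "fastdeploy/runtime/backends/tensorrt/option.h"

fastdeploy::TrtBackendOption MakeTrtOption() {
  fastdeploy::TrtBackendOption opt;
  opt.enable_fp16 = true;            // falls back to float32 if unsupported
  opt.max_workspace_size = 1 << 30;  // 1 GiB build workspace
  // Batch dim ranges 1..16, with 8 as the most common (optimized) case.
  opt.SetShape("x", {1, 3, 224, 224}, {8, 3, 224, 224}, {16, 3, 224, 224});
  // Fixed-shape input: empty opt/max both fall back to the min shape.
  opt.SetShape("scale_factor", {1, 2}, {}, {});
  // Cache the built engine so later runs skip TensorRT initialization.
  opt.serialize_file = "./model.trt";
  return opt;
}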
fastdeploy/runtime/backends/tensorrt/option_pybind.cc (new file, 31 lines)
@@ -0,0 +1,31 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy/pybind/main.h"
#include "fastdeploy/runtime/backends/tensorrt/option.h"

namespace fastdeploy {

void BindTrtOption(pybind11::module& m) {
  pybind11::class_<TrtBackendOption>(m, "TrtBackendOption")
      .def(pybind11::init())
      .def_readwrite("enable_fp16", &TrtBackendOption::enable_fp16)
      .def_readwrite("max_batch_size", &TrtBackendOption::max_batch_size)
      .def_readwrite("max_workspace_size",
                     &TrtBackendOption::max_workspace_size)
      .def_readwrite("serialize_file", &TrtBackendOption::serialize_file)
      .def("set_shape", &TrtBackendOption::SetShape);
}

}  // namespace fastdeploy
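BindTrtOption only defines the Python class; somewhere in the module init it still has to be called. FastDeploy's actual call site is not part of this diff, so the wiring below is a hypothetical standalone module for illustration; only the BindTrtOption signature comes from the file above:

// Hypothetical module that registers TrtBackendOption via BindTrtOption.
// The module name and this wiring are assumptions for demonstration only.
#include <pybind11/pybind11.h>

namespace fastdeploy {
void BindTrtOption(pybind11::module& m);  // defined in option_pybind.cc above
}

PYBIND11_MODULE(trt_option_demo, m) {   // hypothetical module name
  fastdeploy::BindTrtOption(m);         // exposes enable_fp16, set_shape, etc.
}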