Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-16 05:30:58 +08:00
[Example] Update runtime examples (#1542)

* Add notes for tensors
* Optimize some apis
* Move some warnings
@@ -85,10 +85,10 @@ class BaseBackend {
                      bool copy_to_fd = true) = 0;
   // Optional: For those backends which can share memory
   // while creating multiple inference engines with same model file
-  virtual std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
-                                             void *stream = nullptr,
+  virtual std::unique_ptr<BaseBackend> Clone(RuntimeOption& runtime_option,
+                                             void* stream = nullptr,
                                              int device_id = -1) {
-    FDERROR << "Clone no support" << std::endl;
+    FDERROR << "Clone no support " << runtime_option.backend << " " << stream << " " << device_id << std::endl;
     return nullptr;
   }

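The change above keeps Clone as an optional capability: the default implementation still refuses to clone, but now reports which backend, stream, and device were requested. Below is a minimal, self-contained C++ sketch of the same pattern; the class and helper names are illustrative, not FastDeploy's.

```cpp
#include <iostream>
#include <memory>

struct DemoRuntimeOption { int backend = 0; };  // stand-in for RuntimeOption

class DemoBaseBackend {
 public:
  virtual ~DemoBaseBackend() = default;
  // Default Clone: unsupported, but report what was asked for
  // (mirrors the updated FDERROR message in the diff above).
  virtual std::unique_ptr<DemoBaseBackend> Clone(DemoRuntimeOption& option,
                                                 void* stream = nullptr,
                                                 int device_id = -1) {
    std::cerr << "Clone not supported: backend=" << option.backend
              << " stream=" << stream << " device_id=" << device_id << std::endl;
    return nullptr;
  }
};

// A backend that can share weights overrides Clone instead of reloading the model.
class SharedWeightBackend : public DemoBaseBackend {
 public:
  std::unique_ptr<DemoBaseBackend> Clone(DemoRuntimeOption& option, void* stream,
                                         int device_id) override {
    auto cloned = std::make_unique<SharedWeightBackend>();
    // A real implementation would bind the clone to `stream`/`device_id`
    // and point it at the already-initialized engine.
    (void)option; (void)stream; (void)device_id;
    return cloned;
  }
};
```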
@@ -53,32 +53,46 @@ enum LitePowerMode {
 struct LiteBackendOption {
   /// Paddle Lite power mode for mobile device.
   int power_mode = 3;
-  /// Number of threads while use CPU
+  // Number of threads while use CPU
   int cpu_threads = 1;
   /// Enable use half precision
   bool enable_fp16 = false;
-  /// Inference device, Paddle Lite support CPU/KUNLUNXIN/TIMVX/ASCEND
+  // Inference device, Paddle Lite support CPU/KUNLUNXIN/TIMVX/ASCEND
   Device device = Device::CPU;
-  /// Index of inference device
+  // Index of inference device
   int device_id = 0;

   /// kunlunxin_l3_workspace_size
   int kunlunxin_l3_workspace_size = 0xfffc00;
   /// kunlunxin_locked
   bool kunlunxin_locked = false;
   /// kunlunxin_autotune
   bool kunlunxin_autotune = true;
   /// kunlunxin_autotune_file
   std::string kunlunxin_autotune_file = "";
   /// kunlunxin_precision
   std::string kunlunxin_precision = "int16";
   /// kunlunxin_adaptive_seqlen
   bool kunlunxin_adaptive_seqlen = false;
   /// kunlunxin_enable_multi_stream
   bool kunlunxin_enable_multi_stream = false;

   /// Optimized model dir for CxxConfig
   std::string optimized_model_dir = "";
   /// nnadapter_subgraph_partition_config_path
   std::string nnadapter_subgraph_partition_config_path = "";
   /// nnadapter_subgraph_partition_config_buffer
   std::string nnadapter_subgraph_partition_config_buffer = "";
   /// nnadapter_context_properties
   std::string nnadapter_context_properties = "";
   /// nnadapter_model_cache_dir
   std::string nnadapter_model_cache_dir = "";
   /// nnadapter_mixed_precision_quantization_config_path
   std::string nnadapter_mixed_precision_quantization_config_path = "";
   /// nnadapter_dynamic_shape_info
   std::map<std::string, std::vector<std::vector<int64_t>>>
       nnadapter_dynamic_shape_info = {{"", {{0}}}};
   /// nnadapter_device_names
   std::vector<std::string> nnadapter_device_names = {};
 };
 } // namespace fastdeploy
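For orientation, LiteBackendOption bundles the general Paddle Lite settings (power mode, CPU threads, FP16, target device) with KunlunXin XPU and NNAdapter specific knobs. A hedged usage sketch follows; the header path and the Device enum value are assumptions, and in practice these fields are usually set through RuntimeOption helpers rather than by hand.

```cpp
#include "fastdeploy/runtime/backends/lite/option.h"  // assumed header path

int main() {
  fastdeploy::LiteBackendOption opt;
  opt.power_mode = 3;      // Paddle Lite power mode for mobile devices
  opt.cpu_threads = 4;     // threads used when running on CPU
  opt.enable_fp16 = true;  // use half precision where the device supports it
  opt.device = fastdeploy::Device::KUNLUNXIN;  // assumed enum value; CPU/KUNLUNXIN/TIMVX/ASCEND per the comment
  opt.device_id = 0;

  // KunlunXin-specific settings, matching the defaults in the struct above.
  opt.kunlunxin_l3_workspace_size = 0xfffc00;
  opt.kunlunxin_precision = "int16";
  opt.kunlunxin_adaptive_seqlen = false;
  return 0;
}
```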
@@ -25,27 +25,18 @@ namespace fastdeploy {
 /*! @brief Option object to configure ONNX Runtime backend
  */
 struct OrtBackendOption {
-  /*
-   * @brief Level of graph optimization, -1: mean default(Enable all the optimization strategy)/0: disable all the optimization strategy/1: enable basic strategy/2:enable extend strategy/99: enable all
-   */
+  /// Level of graph optimization, -1: mean default(Enable all the optimization strategy)/0: disable all the optimization strategy/1: enable basic strategy/2:enable extend strategy/99: enable all
   int graph_optimization_level = -1;
-  /*
-   * @brief Number of threads to execute the operator, -1: default
-   */
+  /// Number of threads to execute the operator, -1: default
   int intra_op_num_threads = -1;
-  /*
-   * @brief Number of threads to execute the graph, -1: default. This parameter only will bring effects while the `OrtBackendOption::execution_mode` set to 1.
-   */
+  /// Number of threads to execute the graph, -1: default. This parameter only will bring effects while the `OrtBackendOption::execution_mode` set to 1.
   int inter_op_num_threads = -1;
-  /*
-   * @brief Execution mode for the graph, -1: default(Sequential mode)/0: Sequential mode, execute the operators in graph one by one. /1: Parallel mode, execute the operators in graph parallelly.
-   */
+  /// Execution mode for the graph, -1: default(Sequential mode)/0: Sequential mode, execute the operators in graph one by one. /1: Parallel mode, execute the operators in graph parallelly.
   int execution_mode = -1;
-  /// Inference device, OrtBackend supports CPU/GPU
+  // Inference device, OrtBackend supports CPU/GPU
   Device device = Device::CPU;
-  /// Inference device id
+  // Inference device id
   int device_id = 0;

   void* external_stream_ = nullptr;
 };
 } // namespace fastdeploy
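The ONNX Runtime hunk mostly collapses the /* ... */ blocks into single /// lines, but the options themselves deserve a quick illustration, since per the comment inter_op_num_threads only takes effect in parallel execution mode (execution_mode == 1). A hedged sketch with an assumed header path:

```cpp
#include "fastdeploy/runtime/backends/ort/option.h"  // assumed header path

int main() {
  fastdeploy::OrtBackendOption opt;
  opt.graph_optimization_level = 99;  // 99: enable all graph optimizations
  opt.intra_op_num_threads = 4;       // threads used inside a single operator
  opt.execution_mode = 1;             // 1: run independent operators in parallel
  opt.inter_op_num_threads = 2;       // only effective because execution_mode == 1
  return 0;
}
```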
@@ -54,6 +54,8 @@ struct PaddleBackendOption {
   bool enable_mkldnn = true;
   /// Use Paddle Inference + TensorRT to inference model on GPU
   bool enable_trt = false;
+  /// Whether enable memory optimize, default true
+  bool enable_memory_optimize = true;

   /*
    * @brief IPU option, this will configure the IPU hardware, if inference model in IPU
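The new enable_memory_optimize flag sits next to the existing MKLDNN/TensorRT switches and defaults to true; it is consumed in InitFromPaddle, shown two hunks below. A short hedged sketch of toggling it (header path is an assumption):

```cpp
#include "fastdeploy/runtime/backends/paddle/option.h"  // assumed header path

int main() {
  fastdeploy::PaddleBackendOption opt;
  opt.enable_mkldnn = true;            // MKLDNN/oneDNN acceleration on CPU
  opt.enable_trt = false;              // Paddle Inference + TensorRT on GPU
  opt.enable_memory_optimize = false;  // newly added; defaults to true
  return 0;
}
```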
@@ -41,6 +41,7 @@ void BindPaddleOption(pybind11::module& m) {
       .def_readwrite("enable_log_info", &PaddleBackendOption::enable_log_info)
       .def_readwrite("enable_mkldnn", &PaddleBackendOption::enable_mkldnn)
       .def_readwrite("enable_trt", &PaddleBackendOption::enable_trt)
+      .def_readwrite("enable_memory_optimize", &PaddleBackendOption::enable_memory_optimize)
       .def_readwrite("ipu_option", &PaddleBackendOption::ipu_option)
       .def_readwrite("collect_trt_shape",
                      &PaddleBackendOption::collect_trt_shape)
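The binding change is a single def_readwrite line; for readers unfamiliar with the pattern, here is a self-contained pybind11 sketch that exposes a made-up option struct the same way (module and struct names are invented).

```cpp
#include <pybind11/pybind11.h>

struct DemoOption {
  bool enable_memory_optimize = true;
  bool enable_trt = false;
};

PYBIND11_MODULE(demo_option, m) {
  pybind11::class_<DemoOption>(m, "DemoOption")
      .def(pybind11::init<>())
      // Each def_readwrite turns a public member into a read/write Python attribute.
      .def_readwrite("enable_memory_optimize", &DemoOption::enable_memory_optimize)
      .def_readwrite("enable_trt", &DemoOption::enable_trt);
}
```

Once bound, Python code can simply assign the attribute (e.g. `opt.enable_memory_optimize = False`) before the option is handed to the runtime.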
@@ -147,7 +147,9 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_buffer,
   }
   config_.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
                          params_buffer.c_str(), params_buffer.size());
-  config_.EnableMemoryOptim();
+  if (option.enable_memory_optimize) {
+    config_.EnableMemoryOptim();
+  }
   BuildOption(option);

   // The input/output information get from predictor is not right, use
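The effect of this hunk is that memory optimization becomes opt-out through the new flag instead of always being enabled. A hedged sketch of the same guarded pattern against Paddle Inference's C++ config (the header name is an assumption; SetModelBuffer and EnableMemoryOptim are the calls shown in the diff):

```cpp
#include <string>
#include "paddle_inference_api.h"  // assumed Paddle Inference header

void ConfigureFromBuffers(const std::string& model_buffer,
                          const std::string& params_buffer,
                          bool enable_memory_optimize) {
  paddle_infer::Config config;
  // Load model and params from in-memory buffers, as in InitFromPaddle above.
  config.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
                        params_buffer.c_str(), params_buffer.size());
  if (enable_memory_optimize) {
    // Only enabled when requested, matching the change in this hunk.
    config.EnableMemoryOptim();
  }
}
```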
@@ -33,9 +33,8 @@ struct TrtBackendOption {
   /// Enable log while converting onnx model to tensorrt
   bool enable_log_info = false;

-  /*
-   * @brief Enable half precison inference, on some device not support half precision, it will fallback to float32 mode
-   */
+  /// Enable half precison inference, on some device not support half precision, it will fallback to float32 mode
   bool enable_fp16 = false;

   /** \brief Set shape range of input tensor for the model that contain dynamic input shape while using TensorRT backend
@@ -64,9 +63,7 @@ struct TrtBackendOption {
       max_shape[tensor_name].assign(max.begin(), max.end());
     }
   }
-  /**
-   * @brief Set cache file path while use TensorRT backend. Loadding a Paddle/ONNX model and initialize TensorRT will take a long time, by this interface it will save the tensorrt engine to `cache_file_path`, and load it directly while execute the code again
-   */
+  /// Set cache file path while use TensorRT backend. Loadding a Paddle/ONNX model and initialize TensorRT will take a long time, by this interface it will save the tensorrt engine to `cache_file_path`, and load it directly while execute the code again
   std::string serialize_file = "";

   // The below parameters may be removed in next version, please do not
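Taken together, the two TrtBackendOption hunks cover the settings a typical TensorRT deployment touches: FP16, dynamic shape ranges via the SetShape member excerpted above, and engine caching via serialize_file. A hedged sketch follows; the header path and the exact SetShape parameter list are assumptions based on the doc comment and body fragment in the diff, and the input name is hypothetical.

```cpp
#include "fastdeploy/runtime/backends/trt/option.h"  // assumed header path

int main() {
  fastdeploy::TrtBackendOption opt;
  opt.enable_fp16 = true;              // falls back to FP32 where half is unsupported
  opt.serialize_file = "./model.trt";  // cache the built engine for later runs

  // Register min/opt/max shapes for a dynamic input tensor named "x" (hypothetical).
  opt.SetShape("x", {1, 3, 224, 224}, {4, 3, 224, 224}, {8, 3, 224, 224});
  return 0;
}
```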