// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*! \file runtime_option.h
    \brief Options used to configure a FastDeploy Runtime: model files,
    inference device, backend selection and backend-specific settings.
*/

#pragma once

#include <algorithm>
#include <map>
#include <vector>

#include "fastdeploy/runtime/enum_variables.h"
#include "fastdeploy/runtime/backends/lite/option.h"
#include "fastdeploy/runtime/backends/openvino/option.h"
#include "fastdeploy/runtime/backends/ort/option.h"
#include "fastdeploy/runtime/backends/paddle/option.h"
#include "fastdeploy/runtime/backends/poros/option.h"
#include "fastdeploy/runtime/backends/rknpu2/option.h"
#include "fastdeploy/runtime/backends/sophgo/option.h"
#include "fastdeploy/runtime/backends/tensorrt/option.h"
#include "fastdeploy/benchmark/option.h"

namespace fastdeploy {
/*! @brief Option object used when creating a new Runtime object
 */
struct FASTDEPLOY_DECL RuntimeOption {
  /** \brief Set the paths of the model file and the parameter file
   *
   * \param[in] model_path Path of the model file, e.g. ResNet50/model.pdmodel for a Paddle format model, or ResNet50/model.onnx for an ONNX format model
   * \param[in] params_path Path of the parameter file, only used when the model format is Paddle, e.g. ResNet50/model.pdiparams
   * \param[in] format Format of the loaded model
   */
  void SetModelPath(const std::string& model_path,
                    const std::string& params_path = "",
                    const ModelFormat& format = ModelFormat::PADDLE);
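
  // Example usage (illustrative sketch; the file paths below are placeholders,
  // not files shipped with FastDeploy):
  //   fastdeploy::RuntimeOption option;
  //   // Paddle format: both the model and the parameter file are required
  //   option.SetModelPath("ResNet50/model.pdmodel", "ResNet50/model.pdiparams");
  //   // ONNX format: only the model file is needed
  //   option.SetModelPath("ResNet50/model.onnx", "", fastdeploy::ModelFormat::ONNX);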

  /** \brief Specify the memory buffers of the model and the parameters. Used when the model and params are loaded directly from memory
   *
   * \param[in] model_buffer The string holding the model memory buffer
   * \param[in] params_buffer The string holding the parameters memory buffer
   * \param[in] format Format of the loaded model
   */
  void SetModelBuffer(const std::string& model_buffer,
                      const std::string& params_buffer = "",
                      const ModelFormat& format = ModelFormat::PADDLE);

  /** \brief When loading an encrypted model, encryption_key is required to decrypt the model
   *
   * \param[in] encryption_key The key for decrypting the model
   */
  void SetEncryptionKey(const std::string& encryption_key);

  /// Use CPU for inference; the runtime runs on CPU by default
  void UseCpu();
  /// Use NVIDIA GPU for inference
  void UseGpu(int gpu_id = 0);
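
  // Example usage (illustrative sketch; GPU id 0 and 8 CPU threads are just
  // assumptions about the local machine):
  //   fastdeploy::RuntimeOption option;
  //   option.UseGpu(0);           // run on NVIDIA GPU 0
  //   // option.UseCpu();         // or stay on CPU (the default)
  //   option.SetCpuThreadNum(8);  // threads used when running on CPU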
  /// Use RKNPU2 (e.g. RK3588/RK356X) for inference
  void UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name =
                     fastdeploy::rknpu2::CpuName::RK356X,
                 fastdeploy::rknpu2::CoreMask rknpu2_core =
                     fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO);
  /// Use TimVX (e.g. RV1126/A311D) for inference
  void UseTimVX();
  /// Use Huawei Ascend for inference
  void UseAscend();

  /// Use ONNX Runtime DirectML for inference
  void UseDirectML();

  /// Use Sophgo for inference
  void UseSophgo();
  /// \brief Turn on KunlunXin XPU.
  ///
  /// \param kunlunxin_id The KunlunXin XPU card to use (default is 0).
  /// \param l3_workspace_size The size of the video memory allocated by the L3
  ///        cache; the maximum is 16 MB.
  /// \param locked Whether the allocated L3 cache can be locked. If false,
  ///        the L3 cache is not locked and can be shared by multiple models;
  ///        models sharing the L3 cache are executed sequentially on the card.
  /// \param autotune Whether to autotune the conv operators in the model. If
  ///        true, when a conv operator of a certain shape is executed for the
  ///        first time, a better algorithm is automatically searched for to
  ///        improve the performance of subsequent conv operators of the same
  ///        shape.
  /// \param autotune_file Path of the autotune file. If autotune_file is
  ///        specified, the algorithms recorded in the file are used and
  ///        autotune is not performed again.
  /// \param precision Calculation precision of multi_encoder.
  /// \param adaptive_seqlen Whether the input of multi_encoder is variable
  ///        length.
  /// \param enable_multi_stream Whether to enable the multi stream of
  ///        KunlunXin XPU.
  ///
  void UseKunlunXin(int kunlunxin_id = 0, int l3_workspace_size = 0xfffc00,
                    bool locked = false, bool autotune = true,
                    const std::string& autotune_file = "",
                    const std::string& precision = "int16",
                    bool adaptive_seqlen = false,
                    bool enable_multi_stream = false);
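
  // Example usage (illustrative sketch; the workspace size and precision shown
  // here are just the documented defaults, not a tuning recommendation):
  //   fastdeploy::RuntimeOption option;
  //   option.UseKunlunXin(0,          // kunlunxin_id
  //                       0xfffc00,   // l3_workspace_size (max ~16 MB)
  //                       false,      // locked
  //                       true,       // autotune
  //                       "",         // autotune_file
  //                       "int16");   // precision of multi_encoder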

  void SetExternalStream(void* external_stream);

  /*
   * @brief Set the number of CPU threads used while inferring on CPU; by default it is decided by each backend
   */
  void SetCpuThreadNum(int thread_num);
  /// Set Paddle Inference as inference backend, supports CPU/GPU
  void UsePaddleInferBackend() { return UsePaddleBackend(); }
  /// Set ONNX Runtime as inference backend, supports CPU/GPU
  void UseOrtBackend();
  /// Set SOPHGO Runtime as inference backend, supports SOPHGO
  void UseSophgoBackend();
  /// Set TensorRT as inference backend, only supports GPU
  void UseTrtBackend();
  /// Set Poros backend as inference backend, supports CPU/GPU
  void UsePorosBackend();
  /// Set OpenVINO as inference backend, only supports CPU
  void UseOpenVINOBackend();
  /// Set Paddle Lite as inference backend, only supports ARM CPU
  void UsePaddleLiteBackend() { return UseLiteBackend(); }
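
  // Example usage (illustrative sketch; whether a backend is actually
  // available depends on how FastDeploy was compiled):
  //   fastdeploy::RuntimeOption option;
  //   option.UseGpu(0);
  //   option.UseTrtBackend();             // TensorRT, GPU only
  //   // option.UseOrtBackend();          // or ONNX Runtime, CPU/GPU
  //   // option.UsePaddleInferBackend();  // or Paddle Inference, CPU/GPU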
  /** \brief Use Graphcore IPU for inference.
   *
   * \param[in] device_num The number of IPUs.
   * \param[in] micro_batch_size The batch size in the graph; only used when the graph has no batch shape info.
   * \param[in] enable_pipelining Enable pipelining.
   * \param[in] batches_per_step The number of batches per run in pipelining.
   */
  void UseIpu(int device_num = 1, int micro_batch_size = 1,
              bool enable_pipelining = false, int batches_per_step = 1);
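
  // Example usage (illustrative sketch; the device count and batch settings
  // below are arbitrary):
  //   fastdeploy::RuntimeOption option;
  //   option.UseIpu(1,      // device_num
  //                 1,      // micro_batch_size
  //                 true,   // enable_pipelining
  //                 4);     // batches_per_step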

  /// Option to configure ONNX Runtime backend
  OrtBackendOption ort_option;
  /// Option to configure TensorRT backend
  TrtBackendOption trt_option;
  /// Option to configure Paddle Inference backend
  PaddleBackendOption paddle_infer_option;
  /// Option to configure Poros backend
  PorosBackendOption poros_option;
  /// Option to configure OpenVINO backend
  OpenVINOBackendOption openvino_option;
  /// Option to configure Paddle Lite backend
  LiteBackendOption paddle_lite_option;
  /// Option to configure RKNPU2 backend
  RKNPU2BackendOption rknpu2_option;
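
  // Example usage (illustrative sketch; it assumes TrtBackendOption exposes
  // enable_fp16 and serialize_file fields, as in recent FastDeploy releases,
  // and the cache path is a placeholder):
  //   fastdeploy::RuntimeOption option;
  //   option.UseGpu(0);
  //   option.UseTrtBackend();
  //   option.trt_option.enable_fp16 = true;              // build an FP16 engine
  //   option.trt_option.serialize_file = "./model.trt";  // cache the engine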

  // \brief Set the profile mode to 'true'.
  //
  // \param[in] include_h2d_d2h Whether to include the time of H2D/D2H
  //            transfers in the runtime time.
  // \param[in] repeat Repeat times for runtime inference.
  // \param[in] warmup Warmup times for runtime inference.
  //
  void EnableProfiling(bool include_h2d_d2h = false,
                       int repeat = 100, int warmup = 50) {
    benchmark_option.enable_profile = true;
    benchmark_option.warmup = warmup;
    benchmark_option.repeats = repeat;
    benchmark_option.include_h2d_d2h = include_h2d_d2h;
  }
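
  // Example usage (illustrative sketch; the repeat/warmup counts are just the
  // defaults spelled out explicitly):
  //   fastdeploy::RuntimeOption option;
  //   option.EnableProfiling(true,   // include H2D/D2H copy time
  //                          100,    // repeat
  //                          50);    // warmup
  //   // The profiling settings are stored in option.benchmark_option.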

  // \brief Set the profile mode to 'false'.
  //
  void DisableProfiling() {
    benchmark_option.enable_profile = false;
  }

  // \brief Enable checking whether the backend set by the user is listed in
  //        valid_xxx_backends.
  //
  void EnableValidBackendCheck() {
    enable_valid_backend_check = true;
  }
  // \brief Disable checking whether the backend set by the user is listed in
  //        valid_xxx_backends.
  //
  void DisableValidBackendCheck() {
    enable_valid_backend_check = false;
  }

  // Benchmark option
  benchmark::BenchmarkOption benchmark_option;
  // Enable the check for valid backends; defaults to true.
  bool enable_valid_backend_check = true;

  // If model_from_memory_ is true, model_file and params_file hold the
  // binary streams of the model and parameters in memory;
  // otherwise, model_file and params_file are file paths.
  std::string model_file = "";
  std::string params_file = "";
  bool model_from_memory_ = false;
  // Format of the input model
  ModelFormat model_format = ModelFormat::PADDLE;

  std::string encryption_key_ = "";

  // For CPU inference:
  // the default value -1 lets each backend choose its own default
  int cpu_thread_num = -1;
  int device_id = 0;
  Backend backend = Backend::UNKNOWN;

  Device device = Device::CPU;

  void* external_stream_ = nullptr;

  bool enable_pinned_memory = false;

  // *** The APIs below are deprecated and will be removed in v1.2.0
  // *** Do not use them anymore
  void SetPaddleMKLDNN(bool pd_mkldnn = true);
  void EnablePaddleToTrt();
  void DeletePaddleBackendPass(const std::string& delete_pass_name);
  void EnablePaddleLogInfo();
  void DisablePaddleLogInfo();
  void SetPaddleMKLDNNCacheSize(int size);
  void SetOpenVINODevice(const std::string& name = "CPU");
  void SetOpenVINOShapeInfo(
      const std::map<std::string, std::vector<int64_t>>& shape_info) {
    openvino_option.shape_infos = shape_info;
  }
  void SetOpenVINOCpuOperators(const std::vector<std::string>& operators) {
    openvino_option.SetCpuOperators(operators);
  }
  void SetLiteOptimizedModelDir(const std::string& optimized_model_dir);
  void SetLiteSubgraphPartitionPath(
      const std::string& nnadapter_subgraph_partition_config_path);
  void SetLiteSubgraphPartitionConfigBuffer(
      const std::string& nnadapter_subgraph_partition_config_buffer);
  void
  SetLiteContextProperties(const std::string& nnadapter_context_properties);
  void SetLiteModelCacheDir(const std::string& nnadapter_model_cache_dir);
  void SetLiteDynamicShapeInfo(
      const std::map<std::string, std::vector<std::vector<int64_t>>>&
          nnadapter_dynamic_shape_info);
  void SetLiteMixedPrecisionQuantizationConfigPath(
      const std::string& nnadapter_mixed_precision_quantization_config_path);
  void EnableLiteFP16();
  void DisableLiteFP16();
  void EnableLiteInt8();
  void DisableLiteInt8();
  void SetLitePowerMode(LitePowerMode mode);
  void SetTrtInputShape(
      const std::string& input_name, const std::vector<int32_t>& min_shape,
      const std::vector<int32_t>& opt_shape = std::vector<int32_t>(),
      const std::vector<int32_t>& max_shape = std::vector<int32_t>());
  void SetTrtMaxWorkspaceSize(size_t trt_max_workspace_size);
  void SetTrtMaxBatchSize(size_t max_batch_size);
  void EnableTrtFP16();
  void DisableTrtFP16();
  void SetTrtCacheFile(const std::string& cache_file_path);
  void EnablePinnedMemory();
  void DisablePinnedMemory();
  void EnablePaddleTrtCollectShape();
  void DisablePaddleTrtCollectShape();
  void DisablePaddleTrtOPs(const std::vector<std::string>& ops);
  void SetOpenVINOStreams(int num_streams);
  void SetOrtGraphOptLevel(int level = -1);
  void UsePaddleBackend();
  void UseLiteBackend();
};

}  // namespace fastdeploy