// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*! \file runtime_option.h
    \brief Options to configure the device and backend used by a FastDeploy Runtime.
 */
#pragma once

#include <map>
#include <string>
#include <vector>

#include "fastdeploy/runtime/enum_variables.h"
#include "fastdeploy/runtime/backends/lite/option.h"
#include "fastdeploy/runtime/backends/openvino/option.h"
#include "fastdeploy/runtime/backends/ort/option.h"
#include "fastdeploy/runtime/backends/paddle/option.h"
#include "fastdeploy/runtime/backends/poros/option.h"
#include "fastdeploy/runtime/backends/rknpu2/option.h"
#include "fastdeploy/runtime/backends/sophgo/option.h"
#include "fastdeploy/runtime/backends/tensorrt/option.h"
#include "fastdeploy/runtime/backends/tvm/option.h"
#include "fastdeploy/benchmark/option.h"

namespace fastdeploy {

/*! @brief Option object used when creating a new Runtime object
 */
struct FASTDEPLOY_DECL RuntimeOption {
  /** \brief Set the path of the model file and the parameter file
   *
   * \param[in] model_path Path of the model file, e.g. ResNet50/model.pdmodel for a Paddle format model, or ResNet50/model.onnx for an ONNX format model
   * \param[in] params_path Path of the parameter file; only used when the model format is Paddle, e.g. ResNet50/model.pdiparams
   * \param[in] format Format of the loaded model
   */
  void SetModelPath(const std::string& model_path,
                    const std::string& params_path = "",
                    const ModelFormat& format = ModelFormat::PADDLE);

  /** \brief Specify the memory buffers of the model and parameters. Used when the model and params are loaded directly from memory
   *
   * \param[in] model_buffer The string holding the model memory buffer
   * \param[in] params_buffer The string holding the parameters memory buffer
   * \param[in] format Format of the loaded model
   */
  void SetModelBuffer(const std::string& model_buffer,
                      const std::string& params_buffer = "",
                      const ModelFormat& format = ModelFormat::PADDLE);

  /** \brief When loading an encrypted model, encryption_key is required to decrypt it
   *
   * \param[in] encryption_key The key for decrypting the model
   */
  void SetEncryptionKey(const std::string& encryption_key);

  /// Use CPU for inference; the runtime runs on CPU by default
  void UseCpu();
  /// Use NVIDIA GPU for inference
  void UseGpu(int gpu_id = 0);
  /// Use RKNPU2, e.g. RK3588/RK356X, for inference
  void UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name =
                     fastdeploy::rknpu2::CpuName::RK356X,
                 fastdeploy::rknpu2::CoreMask rknpu2_core =
                     fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO);
  /// Use Horizon NPU for inference
  void UseHorizon();
  /// Use TimVX, e.g. RV1126/A311D, for inference
  void UseTimVX();
  /// Use Huawei Ascend for inference
  void UseAscend();
  /// Use ONNX Runtime DirectML for inference
  void UseDirectML(int adapter_id = 0);
  /// Use Sophgo for inference
  void UseSophgo();
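  // Illustrative sketch (not part of the API): a typical setup combines
  // SetModelPath() with one of the Use*() device selectors above, e.g.
  //
  //   fastdeploy::RuntimeOption option;
  //   option.SetModelPath("ResNet50/model.pdmodel", "ResNet50/model.pdiparams");
  //   option.UseGpu(0);  // or option.UseCpu();
  //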
  /// \brief Turn on KunlunXin XPU.
  ///
  /// \param kunlunxin_id the KunlunXin XPU card to use (default is 0).
  /// \param l3_workspace_size The size of the video memory allocated by the l3
  ///        cache, the maximum is 16M.
  /// \param locked Whether the allocated L3 cache can be locked. If false,
  ///        the L3 cache is not locked and the allocated L3 cache can be
  ///        shared by multiple models; multiple models sharing the L3 cache
  ///        will be executed sequentially on the card.
  /// \param autotune Whether to autotune the conv operators in the model. If
  ///        true, when a conv operator of a certain dimension is executed for
  ///        the first time, a better algorithm is searched for automatically
  ///        to improve the performance of subsequent conv operators of the
  ///        same dimension.
  /// \param autotune_file Path of the autotune file. If autotune_file is
  ///        specified, the algorithms recorded in the file are used and
  ///        autotune is not performed again.
  /// \param precision Calculation precision of multi_encoder
  /// \param adaptive_seqlen Whether the input of multi_encoder is variable length
  /// \param enable_multi_stream Whether to enable the multi stream of
  ///        KunlunXin XPU.
  /// \param gm_default_size The default size of global memory of KunlunXin XPU.
  ///
  void UseKunlunXin(int kunlunxin_id = 0, int l3_workspace_size = 0xfffc00,
                    bool locked = false, bool autotune = true,
                    const std::string& autotune_file = "",
                    const std::string& precision = "int16",
                    bool adaptive_seqlen = false,
                    bool enable_multi_stream = false,
                    int64_t gm_default_size = 0);

  void SetExternalStream(void* external_stream);

  /**
   * @brief Set the number of CPU threads used while running inference on CPU; by default it will be decided by the backend in use
   */
  void SetCpuThreadNum(int thread_num);

  /// Set Paddle Inference as the inference backend, supports CPU/GPU
  void UsePaddleInferBackend() { return UsePaddleBackend(); }

  /// Set ONNX Runtime as the inference backend, supports CPU/GPU
  void UseOrtBackend();

  /// Set SOPHGO Runtime as the inference backend, supports SOPHGO only
  void UseSophgoBackend();

  /// Set TensorRT as the inference backend, only supports GPU
  void UseTrtBackend();

  /// Set Poros backend as the inference backend, supports CPU/GPU
  void UsePorosBackend();

  /// Set OpenVINO as the inference backend, only supports CPU
  void UseOpenVINOBackend();

  /// Set Paddle Lite as the inference backend, only supports Arm CPU
  void UsePaddleLiteBackend() { return UseLiteBackend(); }

  /** \brief Use Graphcore IPU for inference.
   *
   * \param[in] device_num the number of IPUs.
   * \param[in] micro_batch_size the batch size in the graph, only takes effect when the graph has no batch shape info.
   * \param[in] enable_pipelining enable pipelining.
   * \param[in] batches_per_step the number of batches per run in pipelining.
   */
  void UseIpu(int device_num = 1, int micro_batch_size = 1,
              bool enable_pipelining = false, int batches_per_step = 1);

  /// Option to configure ONNX Runtime backend
  OrtBackendOption ort_option;
  /// Option to configure TensorRT backend
  TrtBackendOption trt_option;
  /// Option to configure Paddle Inference backend
  PaddleBackendOption paddle_infer_option;
  /// Option to configure Poros backend
  PorosBackendOption poros_option;
  /// Option to configure OpenVINO backend
  OpenVINOBackendOption openvino_option;
  /// Option to configure Paddle Lite backend
  LiteBackendOption paddle_lite_option;
  /// Option to configure RKNPU2 backend
  RKNPU2BackendOption rknpu2_option;
  /// Option to configure TVM backend
  TVMBackendOption tvm_option;
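  // Illustrative sketch (not part of the API): after picking a backend with
  // one of the Use*Backend() calls above, the matching *_option member can be
  // tuned directly. The field name shown below is an assumption; see
  // fastdeploy/runtime/backends/tensorrt/option.h for the actual fields.
  //
  //   option.UseTrtBackend();
  //   option.trt_option.enable_fp16 = true;  // assumed field name
  //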
  // \brief Set the profile mode as 'true'.
  //
  // \param[in] include_h2d_d2h Whether to include H2D/D2H transfer time in
  //            the measured runtime time.
  // \param[in] repeat Repeat times for runtime inference.
  // \param[in] warmup Warmup times for runtime inference.
  //
  void EnableProfiling(bool include_h2d_d2h = false,
                       int repeat = 100, int warmup = 50) {
    benchmark_option.enable_profile = true;
    benchmark_option.warmup = warmup;
    benchmark_option.repeats = repeat;
    benchmark_option.include_h2d_d2h = include_h2d_d2h;
  }

  // \brief Set the profile mode as 'false'.
  //
  void DisableProfiling() {
    benchmark_option.enable_profile = false;
  }
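  // Illustrative sketch (not part of the API) of the profiling switches
  // above, using the declared default values:
  //
  //   option.EnableProfiling(/*include_h2d_d2h=*/false, /*repeat=*/100,
  //                          /*warmup=*/50);
  //   // ... run inference ...
  //   option.DisableProfiling();
  //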
  // \brief Enable checking whether the backend set by the user can be found
  //        in valid_xxx_backend.
  //
  void EnableValidBackendCheck() {
    enable_valid_backend_check = true;
  }

  // \brief Disable checking whether the backend set by the user can be found
  //        in valid_xxx_backend.
  //
  void DisableValidBackendCheck() {
    enable_valid_backend_check = false;
  }

  // Benchmark option
  benchmark::BenchmarkOption benchmark_option;
  // Enable the check for a valid backend, default true.
  bool enable_valid_backend_check = true;

  // If model_from_memory_ is true, model_file and params_file hold binary
  // streams in memory; otherwise they hold the paths of the model files.
  std::string model_file = "";
  std::string params_file = "";
  bool model_from_memory_ = false;
  // Format of the input model
  ModelFormat model_format = ModelFormat::PADDLE;

  std::string encryption_key_ = "";

  // For CPU inference: the default -1 lets each backend choose its own
  // default number of threads.
  int cpu_thread_num = -1;
  int device_id = 0;
  Backend backend = Backend::UNKNOWN;
  Device device = Device::CPU;

  void* external_stream_ = nullptr;

  bool enable_pinned_memory = false;

  // *** The APIs below are deprecated and will be removed in v1.2.0
  // *** Do not use them anymore
  void SetPaddleMKLDNN(bool pd_mkldnn = true);
  void EnablePaddleToTrt();
  void DeletePaddleBackendPass(const std::string& delete_pass_name);
  void EnablePaddleLogInfo();
  void DisablePaddleLogInfo();
  void SetPaddleMKLDNNCacheSize(int size);
  void SetOpenVINODevice(const std::string& name = "CPU");
  void SetOpenVINOShapeInfo(
      const std::map<std::string, std::vector<int64_t>>& shape_info) {
    openvino_option.shape_infos = shape_info;
  }
  void SetOpenVINOCpuOperators(const std::vector<std::string>& operators) {
    openvino_option.SetCpuOperators(operators);
  }
  void SetLiteOptimizedModelDir(const std::string& optimized_model_dir);
  void SetLiteSubgraphPartitionPath(
      const std::string& nnadapter_subgraph_partition_config_path);
  void SetLiteSubgraphPartitionConfigBuffer(
      const std::string& nnadapter_subgraph_partition_config_buffer);
  void SetLiteContextProperties(
      const std::string& nnadapter_context_properties);
  void SetLiteModelCacheDir(const std::string& nnadapter_model_cache_dir);
  void SetLiteDynamicShapeInfo(
      const std::map<std::string, std::vector<std::vector<int64_t>>>&
          nnadapter_dynamic_shape_info);
  void SetLiteMixedPrecisionQuantizationConfigPath(
      const std::string& nnadapter_mixed_precision_quantization_config_path);
  void EnableLiteFP16();
  void DisableLiteFP16();
  void EnableLiteInt8();
  void DisableLiteInt8();
  void SetLitePowerMode(LitePowerMode mode);
  void SetTrtInputShape(
      const std::string& input_name, const std::vector<int32_t>& min_shape,
      const std::vector<int32_t>& opt_shape = std::vector<int32_t>(),
      const std::vector<int32_t>& max_shape = std::vector<int32_t>());
  void SetTrtInputData(
      const std::string& input_name, const std::vector<float>& min_shape_data,
      const std::vector<float>& opt_shape_data = std::vector<float>(),
      const std::vector<float>& max_shape_data = std::vector<float>());
  void SetTrtMaxWorkspaceSize(size_t trt_max_workspace_size);
  void SetTrtMaxBatchSize(size_t max_batch_size);
  void EnableTrtFP16();
  void DisableTrtFP16();
  void SetTrtCacheFile(const std::string& cache_file_path);
  void EnablePinnedMemory();
  void DisablePinnedMemory();
  void EnablePaddleTrtCollectShape();
  void DisablePaddleTrtCollectShape();
  void DisablePaddleTrtOPs(const std::vector<std::string>& ops);
  void SetOpenVINOStreams(int num_streams);
  void SetOrtGraphOptLevel(int level = -1);
  void UsePaddleBackend();
  void UseLiteBackend();
  void UseHorizonNPUBackend();
  void UseTVMBackend();
};

}  // namespace fastdeploy
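// Illustrative end-to-end sketch (not part of this header). It assumes the
// fastdeploy::Runtime class declared in fastdeploy/runtime/runtime.h with a
// bool Init(const RuntimeOption&) method; treat the exact call names as
// assumptions and check that header.
//
//   fastdeploy::RuntimeOption option;
//   option.SetModelPath("ResNet50/model.pdmodel", "ResNet50/model.pdiparams");
//   option.UseCpu();
//   option.UseOrtBackend();
//
//   fastdeploy::Runtime runtime;
//   if (!runtime.Init(option)) {
//     // handle initialization failure
//   }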