mirror of
				https://github.com/PaddlePaddle/FastDeploy.git
				synced 2025-10-31 11:56:44 +08:00 
			
		
		
		
	 c25d1cc1bc
			
		
	
	c25d1cc1bc
	
	
	
		
			
			* add GPL lisence * add GPL-3.0 lisence * add GPL-3.0 lisence * add GPL-3.0 lisence * support yolov8 * add pybind for yolov8 * add yolov8 readme * add cpp benchmark * add cpu and gpu mem * public part split * add runtime mode * fixed bugs * add cpu_thread_nums * deal with comments * deal with comments * deal with comments * rm useless code * add FASTDEPLOY_DECL * add FASTDEPLOY_DECL * fixed for windows * mv rss to pss * mv rss to pss * Update utils.cc * use thread to collect mem * Add ResourceUsageMonitor * rm useless code * fixed bug * fixed typo * update ResourceUsageMonitor * fixed bug * fixed bug * add note for ResourceUsageMonitor * deal with comments * add macros * deal with comments * deal with comments * deal with comments * re-lint * rm pmap and use mem api * rm pmap and use mem api * add mem api * Add PrintBenchmarkInfo func * Add PrintBenchmarkInfo func * Add PrintBenchmarkInfo func * deal with comments * fixed enable_paddle_to_trt * add log for paddle_trt --------- Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com>
		
			
				
	
	
		
			427 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			C++
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			427 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			C++
		
	
	
		
			Executable File
		
	
	
	
	
| // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| //     http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| #include "fastdeploy/runtime/backends/paddle/paddle_backend.h"
 | |
| 
 | |
| #include <sstream>
 | |
| 
 | |
| #include "fastdeploy/utils/path.h"
 | |
| 
 | |
| namespace fastdeploy {
 | |
| 
 | |
| void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
 | |
|   option_ = option;
 | |
|   if (option.device == Device::GPU) {
 | |
|     config_.EnableUseGpu(option.gpu_mem_init_size, option.device_id);
 | |
|     if (option_.external_stream_) {
 | |
|       FDINFO << "Will use external stream for Paddle Backend." << std::endl;
 | |
|       config_.SetExecStream(option_.external_stream_);
 | |
|     }
 | |
|     if (option.enable_trt) {
 | |
|       if (!option.trt_option.enable_fp16) {
 | |
|         FDINFO << "Will try to use tensorrt inference with Paddle Backend."
 | |
|                << std::endl;
 | |
|       }
 | |
|       config_.Exp_DisableTensorRtOPs(option.trt_disabled_ops_);
 | |
|       auto precision = paddle_infer::PrecisionType::kFloat32;
 | |
|       if (option.trt_option.enable_fp16) {
 | |
|         FDINFO << "Will try to use tensorrt fp16 inference with Paddle Backend."
 | |
|                << std::endl;
 | |
|         precision = paddle_infer::PrecisionType::kHalf;
 | |
|       }
 | |
|       bool use_static = false;
 | |
|       if (option.trt_option.serialize_file != "") {
 | |
|         FDWARNING
 | |
|             << "Detect that tensorrt cache file has been set to "
 | |
|             << option.trt_option.serialize_file
 | |
|             << ", but while enable paddle2trt, please notice that the cache "
 | |
|                "file will save to the directory where paddle model saved."
 | |
|             << std::endl;
 | |
|         use_static = true;
 | |
|         std::string opt_cache_dir =
 | |
|             GetDirFromPath(option.trt_option.serialize_file);
 | |
| 
 | |
|         config_.SetOptimCacheDir(opt_cache_dir);
 | |
|       }
 | |
|       config_.EnableTensorRtEngine(option.trt_option.max_workspace_size,
 | |
|                                    option.trt_option.max_batch_size, 3,
 | |
|                                    precision, use_static);
 | |
|       SetTRTDynamicShapeToConfig(option);
 | |
|     }
 | |
|   } else if (option.device == Device::IPU) {
 | |
| #ifdef WITH_IPU
 | |
|     config_.EnableIpu(option.ipu_option.ipu_device_num,
 | |
|                       option.ipu_option.ipu_micro_batch_size,
 | |
|                       option.ipu_option.ipu_enable_pipelining,
 | |
|                       option.ipu_option.ipu_batches_per_step);
 | |
|     config_.SetIpuConfig(option.ipu_option.ipu_enable_fp16,
 | |
|                          option.ipu_option.ipu_replica_num,
 | |
|                          option.ipu_option.ipu_available_memory_proportion,
 | |
|                          option.ipu_option.ipu_enable_half_partial);
 | |
| #else
 | |
|     FDWARNING << "The FastDeploy is not compiled with IPU backend, so will "
 | |
|                  "fallback to CPU with Paddle Inference Backend."
 | |
|               << std::endl;
 | |
| #endif
 | |
|   } else {
 | |
|     config_.DisableGpu();
 | |
|     if (option.enable_mkldnn) {
 | |
|       config_.EnableMKLDNN();
 | |
|       config_.SetMkldnnCacheCapacity(option.mkldnn_cache_size);
 | |
|     }
 | |
|   }
 | |
|   if (!option.enable_log_info) {
 | |
|     config_.DisableGlogInfo();
 | |
|   }
 | |
|   if (!option.delete_pass_names.empty()) {
 | |
|     auto pass_builder = config_.pass_builder();
 | |
|     for (int i = 0; i < option.delete_pass_names.size(); i++) {
 | |
|       FDINFO << "Delete pass : " << option.delete_pass_names[i] << std::endl;
 | |
|       pass_builder->DeletePass(option.delete_pass_names[i]);
 | |
|     }
 | |
|   }
 | |
|   if (option.cpu_thread_num <= 0) {
 | |
|     config_.SetCpuMathLibraryNumThreads(8);
 | |
|   } else {
 | |
|     config_.SetCpuMathLibraryNumThreads(option.cpu_thread_num);
 | |
|   }
 | |
| }
 | |
| 
 | |
| bool PaddleBackend::InitFromPaddle(const std::string& model_buffer,
 | |
|                                    const std::string& params_buffer,
 | |
|                                    const PaddleBackendOption& option) {
 | |
|   if (initialized_) {
 | |
|     FDERROR << "PaddleBackend is already initlized, cannot initialize again."
 | |
|             << std::endl;
 | |
|     return false;
 | |
|   }
 | |
|   config_.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
 | |
|                          params_buffer.c_str(), params_buffer.size());
 | |
|   config_.EnableMemoryOptim();
 | |
|   BuildOption(option);
 | |
| 
 | |
|   // The input/output information get from predictor is not right, use
 | |
|   // PaddleReader instead now
 | |
|   auto reader =
 | |
|       paddle2onnx::PaddleReader(model_buffer.c_str(), model_buffer.size());
 | |
|   // If it's a quantized model, and use cpu with mkldnn, automaticaly switch to
 | |
|   // int8 mode
 | |
|   if (reader.is_quantize_model) {
 | |
|     if (option.device == Device::GPU) {
 | |
|       FDWARNING << "The loaded model is a quantized model, while inference on "
 | |
|                    "GPU, please use TensorRT backend to get better performance."
 | |
|                 << std::endl;
 | |
|       if (option.enable_trt) {
 | |
|         bool use_static = false;
 | |
|         if (option.trt_option.serialize_file != "") {
 | |
|           FDWARNING
 | |
|               << "Detect that tensorrt cache file has been set to "
 | |
|               << option.trt_option.serialize_file
 | |
|               << ", but while enable paddle2trt, please notice that the cache "
 | |
|                  "file will save to the directory where paddle model saved."
 | |
|               << std::endl;
 | |
|           use_static = true;
 | |
|         }
 | |
|         config_.EnableTensorRtEngine(option.trt_option.max_workspace_size,
 | |
|                                      option.trt_option.max_batch_size, 3,
 | |
|                                      paddle_infer::PrecisionType::kInt8,
 | |
|                                      use_static, false);
 | |
|         SetTRTDynamicShapeToConfig(option);
 | |
|       }
 | |
|     }
 | |
|     if (option.enable_mkldnn) {
 | |
|       config_.EnableMkldnnInt8();
 | |
|     } else {
 | |
|       FDWARNING << "The loaded model is a quantized model, while inference on "
 | |
|                    "CPU, please enable MKLDNN to get better performance."
 | |
|                 << std::endl;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   inputs_desc_.resize(reader.num_inputs);
 | |
|   for (int i = 0; i < reader.num_inputs; ++i) {
 | |
|     std::string name(reader.inputs[i].name);
 | |
|     std::vector<int64_t> shape(reader.inputs[i].shape,
 | |
|                                reader.inputs[i].shape + reader.inputs[i].rank);
 | |
|     inputs_desc_[i].name = name;
 | |
|     inputs_desc_[i].shape.assign(shape.begin(), shape.end());
 | |
|     inputs_desc_[i].dtype = ReaderDataTypeToFD(reader.inputs[i].dtype);
 | |
|   }
 | |
|   outputs_desc_.resize(reader.num_outputs);
 | |
|   for (int i = 0; i < reader.num_outputs; ++i) {
 | |
|     std::string name(reader.outputs[i].name);
 | |
|     std::vector<int64_t> shape(
 | |
|         reader.outputs[i].shape,
 | |
|         reader.outputs[i].shape + reader.outputs[i].rank);
 | |
|     outputs_desc_[i].name = name;
 | |
|     outputs_desc_[i].shape.assign(shape.begin(), shape.end());
 | |
|     outputs_desc_[i].dtype = ReaderDataTypeToFD(reader.outputs[i].dtype);
 | |
|   }
 | |
|   if (option.collect_trt_shape) {
 | |
|     // Set the shape info file.
 | |
|     std::string curr_model_dir = "./";
 | |
|     if (!option.model_from_memory_) {
 | |
|       curr_model_dir = GetDirFromPath(option.model_file);
 | |
|     }
 | |
|     std::string shape_range_info =
 | |
|         PathJoin(curr_model_dir, "shape_range_info.pbtxt");
 | |
|     if (!CheckFileExists(shape_range_info)) {
 | |
|       FDINFO << "Start generating shape range info file." << std::endl;
 | |
|       paddle_infer::Config analysis_config;
 | |
|       analysis_config.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
 | |
|                                      params_buffer.c_str(),
 | |
|                                      params_buffer.size());
 | |
|       analysis_config.CollectShapeRangeInfo(shape_range_info);
 | |
|       auto predictor_tmp = paddle_infer::CreatePredictor(analysis_config);
 | |
|       std::map<std::string, std::vector<int>> max_shape;
 | |
|       std::map<std::string, std::vector<int>> min_shape;
 | |
|       std::map<std::string, std::vector<int>> opt_shape;
 | |
|       GetDynamicShapeFromOption(option, &max_shape, &min_shape, &opt_shape);
 | |
|       // Need to run once to get the shape range info file.
 | |
|       CollectShapeRun(predictor_tmp.get(), max_shape);
 | |
|       CollectShapeRun(predictor_tmp.get(), min_shape);
 | |
|       CollectShapeRun(predictor_tmp.get(), opt_shape);
 | |
|       FDINFO << "Finish generating shape range info file." << std::endl;
 | |
|     }
 | |
|     FDINFO << "Start loading shape range info file " << shape_range_info
 | |
|            << " to set TensorRT dynamic shape." << std::endl;
 | |
|     config_.EnableTunedTensorRtDynamicShape(shape_range_info, false);
 | |
|   }
 | |
|   predictor_ = paddle_infer::CreatePredictor(config_);
 | |
|   initialized_ = true;
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| TensorInfo PaddleBackend::GetInputInfo(int index) {
 | |
|   FDASSERT(index < NumInputs(),
 | |
|            "The index: %d should less than the number of inputs: %d.", index,
 | |
|            NumInputs());
 | |
|   return inputs_desc_[index];
 | |
| }
 | |
| 
 | |
| std::vector<TensorInfo> PaddleBackend::GetInputInfos() { return inputs_desc_; }
 | |
| 
 | |
| TensorInfo PaddleBackend::GetOutputInfo(int index) {
 | |
|   FDASSERT(index < NumOutputs(),
 | |
|            "The index: %d should less than the number of outputs %d.", index,
 | |
|            NumOutputs());
 | |
|   return outputs_desc_[index];
 | |
| }
 | |
| 
 | |
| std::vector<TensorInfo> PaddleBackend::GetOutputInfos() {
 | |
|   return outputs_desc_;
 | |
| }
 | |
| 
 | |
| bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
 | |
|                           std::vector<FDTensor>* outputs, bool copy_to_fd) {
 | |
|   if (inputs.size() != inputs_desc_.size()) {
 | |
|     FDERROR << "[PaddleBackend] Size of inputs(" << inputs.size()
 | |
|             << ") should keep same with the inputs of this model("
 | |
|             << inputs_desc_.size() << ")." << std::endl;
 | |
|     return false;
 | |
|   }
 | |
|   // output share backend memory only support CPU or GPU
 | |
|   if (option_.device == Device::IPU) {
 | |
|     copy_to_fd = true;
 | |
|   }
 | |
| 
 | |
|   RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
 | |
|   for (size_t i = 0; i < inputs.size(); ++i) {
 | |
|     auto handle = predictor_->GetInputHandle(inputs[i].name);
 | |
|     ShareTensorFromFDTensor(handle.get(), inputs[i]);
 | |
|   }
 | |
|   std::unordered_set<std::string> prebinded_output_name;
 | |
|   // prebinded output only support for GPU
 | |
|   if (!copy_to_fd) {
 | |
|     for (size_t i = 0; i < (*outputs).size(); ++i) {
 | |
|       auto output_name = (*outputs)[i].name;
 | |
|       // if a output is not prebinded,
 | |
|       // the name of output is expected to be empty.
 | |
|       // We skip here
 | |
|       if (output_name.empty()) {
 | |
|         continue;
 | |
|       }
 | |
|       // Record the prebinded output_name.
 | |
|       // Those outputs do not need PaddleTensorToFDTensor
 | |
|       // after predictor_.Run()
 | |
|       prebinded_output_name.insert(output_name);
 | |
|       auto handle = predictor_->GetOutputHandle(output_name);
 | |
|       ShareOutTensorFromFDTensor(handle.get(), (*outputs)[i]);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   RUNTIME_PROFILE_LOOP_BEGIN(1)
 | |
|   predictor_->Run();
 | |
|   RUNTIME_PROFILE_LOOP_END
 | |
| 
 | |
|   outputs->resize(outputs_desc_.size());
 | |
|   for (size_t i = 0; i < outputs_desc_.size(); ++i) {
 | |
|     // skip prebinded output
 | |
|     if (copy_to_fd == false &&
 | |
|         prebinded_output_name.count(outputs_desc_[i].name)) {
 | |
|       continue;
 | |
|     }
 | |
|     auto handle = predictor_->GetOutputHandle(outputs_desc_[i].name);
 | |
|     if (copy_to_fd) {
 | |
|       (*outputs)[i].is_pinned_memory = option_.enable_pinned_memory;
 | |
|     }
 | |
|     PaddleTensorToFDTensor(handle, &((*outputs)[i]), copy_to_fd);
 | |
|   }
 | |
|   RUNTIME_PROFILE_LOOP_H2D_D2H_END
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| std::unique_ptr<BaseBackend> PaddleBackend::Clone(RuntimeOption& runtime_option,
 | |
|                                                   void* stream, int device_id) {
 | |
|   std::unique_ptr<BaseBackend> new_backend =
 | |
|       utils::make_unique<PaddleBackend>();
 | |
|   auto casted_backend = dynamic_cast<PaddleBackend*>(new_backend.get());
 | |
|   if (device_id > 0 && (option_.device == Device::GPU) &&
 | |
|       device_id != option_.device_id) {
 | |
|     auto clone_option = option_;
 | |
|     clone_option.device_id = device_id;
 | |
|     clone_option.external_stream_ = stream;
 | |
|     if (runtime_option.model_from_memory_) {
 | |
|       FDASSERT(
 | |
|           casted_backend->InitFromPaddle(runtime_option.model_file,
 | |
|                                          runtime_option.params_file,
 | |
|                                          clone_option),
 | |
|           "Clone model from Paddle failed while initialize PaddleBackend.");
 | |
|     } else {
 | |
|       std::string model_buffer = "";
 | |
|       std::string params_buffer = "";
 | |
|       FDASSERT(
 | |
|           ReadBinaryFromFile(clone_option.model_file, &model_buffer),
 | |
|           "Fail to read binary from model file while cloning PaddleBackend");
 | |
|       FDASSERT(ReadBinaryFromFile(clone_option.params_file, ¶ms_buffer),
 | |
|                "Fail to read binary from parameter file while cloning "
 | |
|                "PaddleBackend");
 | |
|       FDASSERT(
 | |
|           casted_backend->InitFromPaddle(model_buffer, params_buffer,
 | |
|                                          clone_option),
 | |
|           "Clone model from Paddle failed while initialize PaddleBackend.");
 | |
|     }
 | |
| 
 | |
|     FDWARNING << "The target device id:" << device_id
 | |
|               << " is different from current device id:" << option_.device_id
 | |
|               << ", cannot share memory with current engine." << std::endl;
 | |
|     return new_backend;
 | |
|   }
 | |
|   casted_backend->inputs_desc_.assign(inputs_desc_.begin(), inputs_desc_.end());
 | |
|   casted_backend->outputs_desc_.assign(outputs_desc_.begin(),
 | |
|                                        outputs_desc_.end());
 | |
|   casted_backend->predictor_ = std::move(predictor_->Clone(stream));
 | |
|   return new_backend;
 | |
| }
 | |
| 
 | |
| void PaddleBackend::SetTRTDynamicShapeToConfig(
 | |
|     const PaddleBackendOption& option) {
 | |
|   std::map<std::string, std::vector<int>> max_shape;
 | |
|   std::map<std::string, std::vector<int>> min_shape;
 | |
|   std::map<std::string, std::vector<int>> opt_shape;
 | |
|   GetDynamicShapeFromOption(option, &max_shape, &min_shape, &opt_shape);
 | |
|   if (min_shape.size() > 0) {
 | |
|     FDINFO << "Start setting trt dynamic shape." << std::endl;
 | |
|     config_.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);
 | |
|     FDINFO << "Finish setting trt dynamic shape." << std::endl;
 | |
|   }
 | |
| }
 | |
| 
 | |
| void PaddleBackend::GetDynamicShapeFromOption(
 | |
|     const PaddleBackendOption& option,
 | |
|     std::map<std::string, std::vector<int>>* max_shape,
 | |
|     std::map<std::string, std::vector<int>>* min_shape,
 | |
|     std::map<std::string, std::vector<int>>* opt_shape) const {
 | |
|   auto print_shape = [](const std::vector<int>& shape) -> std::string {
 | |
|     std::ostringstream oss;
 | |
|     oss << "[";
 | |
|     for (int i = 0; i < shape.size(); ++i) {
 | |
|       oss << shape[i];
 | |
|       if (i < shape.size() - 1) {
 | |
|         oss << ", ";
 | |
|       }
 | |
|     }
 | |
|     oss << "]";
 | |
|     return oss.str();
 | |
|   };
 | |
|   for (const auto& item : option.trt_option.min_shape) {
 | |
|     auto max_iter = option.trt_option.max_shape.find(item.first);
 | |
|     auto opt_iter = option.trt_option.opt_shape.find(item.first);
 | |
|     FDASSERT(max_iter != option.trt_option.max_shape.end(),
 | |
|              "Cannot find %s in TrtBackendOption::min_shape.",
 | |
|              item.first.c_str());
 | |
|     FDASSERT(opt_iter != option.trt_option.opt_shape.end(),
 | |
|              "Cannot find %s in TrtBackendOption::opt_shape.",
 | |
|              item.first.c_str());
 | |
|     (*max_shape)[item.first].assign(max_iter->second.begin(),
 | |
|                                     max_iter->second.end());
 | |
|     (*opt_shape)[item.first].assign(opt_iter->second.begin(),
 | |
|                                     opt_iter->second.end());
 | |
|     (*min_shape)[item.first].assign(item.second.begin(), item.second.end());
 | |
|     FDINFO << item.first
 | |
|            << ": the max shape = " << print_shape(max_iter->second)
 | |
|            << ", the min shape = " << print_shape(item.second)
 | |
|            << ", the opt shape = " << print_shape(opt_iter->second)
 | |
|            << std::endl;
 | |
|   }
 | |
| }
 | |
| 
 | |
| void PaddleBackend::CollectShapeRun(
 | |
|     paddle_infer::Predictor* predictor,
 | |
|     const std::map<std::string, std::vector<int>>& shape) const {
 | |
|   auto input_names = predictor->GetInputNames();
 | |
|   auto input_type = predictor->GetInputTypes();
 | |
|   for (const auto& name : input_names) {
 | |
|     FDASSERT(shape.find(name) != shape.end() &&
 | |
|                  input_type.find(name) != input_type.end(),
 | |
|              "When collect_trt_shape is true, please define max/opt/min shape "
 | |
|              "for model's input:[\"%s\"] by "
 | |
|              "(C++)RuntimeOption.trt_option.SetShape/"
 | |
|              "(Python)RuntimeOption.trt_option.set_shape.",
 | |
|              name.c_str());
 | |
|     auto tensor = predictor->GetInputHandle(name);
 | |
|     auto shape_value = shape.at(name);
 | |
|     int shape_num = std::accumulate(shape_value.begin(), shape_value.end(), 1,
 | |
|                                     std::multiplies<int>());
 | |
|     tensor->Reshape(shape_value);
 | |
|     auto dtype = input_type[name];
 | |
|     switch (dtype) {
 | |
|       case paddle_infer::DataType::FLOAT32: {
 | |
|         std::vector<float> input_data(shape_num, 1.0);
 | |
|         tensor->CopyFromCpu(input_data.data());
 | |
|         break;
 | |
|       }
 | |
|       case paddle_infer::DataType::INT32: {
 | |
|         std::vector<int> input_data(shape_num, 1);
 | |
|         tensor->CopyFromCpu(input_data.data());
 | |
|         break;
 | |
|       }
 | |
|       case paddle_infer::DataType::INT64: {
 | |
|         std::vector<int64_t> input_data(shape_num, 1);
 | |
|         tensor->CopyFromCpu(input_data.data());
 | |
|         break;
 | |
|       }
 | |
|       default: {
 | |
|         FDASSERT(false,
 | |
|                  "Input data Paddle backend only supports "
 | |
|                  "FP32/INT32/INT64 currently.");
 | |
|         break;
 | |
|       }
 | |
|     }
 | |
|   }
 | |
|   predictor->Run();
 | |
| }
 | |
| 
 | |
| }  // namespace fastdeploy
 |