diff --git a/fastdeploy/backends/backend.h b/fastdeploy/backends/backend.h
index 02c94875d..5affeb756 100644
--- a/fastdeploy/backends/backend.h
+++ b/fastdeploy/backends/backend.h
@@ -19,7 +19,6 @@
 #include
 #include
 
-#include "fastdeploy/backends/common/multiclass_nms.h"
 #include "fastdeploy/core/fd_tensor.h"
 #include "fastdeploy/core/fd_type.h"
diff --git a/fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.cu b/fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.cu
index 2fa63f36d..3f1abb894 100755
--- a/fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.cu
+++ b/fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.cu
@@ -2,7 +2,9 @@
 
 namespace fastdeploy {
 
-__global__ void CudaCastKernel(const float* in, float* out, int edge, int out_bc_offset, int in_bc_offset, int ih, int iw, int oh, int ow, bool is_avg) {
+__global__ void CudaCastKernel(const float* in, float* out, int edge,
+                               int out_bc_offset, int in_bc_offset, int ih,
+                               int iw, int oh, int ow, bool is_avg) {
   int position = blockDim.x * blockIdx.x + threadIdx.x;
   if (position >= edge) {
     return;
@@ -14,38 +16,41 @@ __global__ void CudaCastKernel(const float* in, float* out, int edge, int out_b
   int hend = ceilf(static_cast<float>((h + 1) * ih) / oh);
   int wstart = floorf(static_cast<float>(w * iw) / ow);
   int wend = ceilf(static_cast<float>((w + 1) * iw) / ow);
-  if(is_avg) {
+  if (is_avg) {
     out[position] = 0.0;
   } else {
     out[position] = in[offset * in_bc_offset + hstart * iw + wstart];
   }
   for (int h = hstart; h < hend; ++h) {
-    for (int w = wstart; w < wend; ++w) {
+    for (int w = wstart; w < wend; ++w) {
       int input_idx = h * iw + w;
-      if(is_avg) {
+      if (is_avg) {
         out[position] = out[position] + in[offset * in_bc_offset + input_idx];
       } else {
-        out[position] = max(out[position], in[offset * in_bc_offset + input_idx]);
+        out[position] =
+            max(out[position], in[offset * in_bc_offset + input_idx]);
       }
     }
   }
   out[position] = out[position] / ((hend - hstart) * (wend - wstart));
 }
 
-void CudaAdaptivePool(const std::vector<int64_t>& input_dims, const std::vector<int64_t>& output_dims, float* output, const float* input, void* compute_stream, const std::string& pooling_type){
+void CudaAdaptivePool(const std::vector<int64_t>& input_dims,
+                      const std::vector<int64_t>& output_dims, float* output,
+                      const float* input, void* compute_stream,
+                      const std::string& pooling_type) {
   auto casted_compute_stream = reinterpret_cast<cudaStream_t>(compute_stream);
   int out_bc_offset = output_dims[2] * output_dims[3];
   int in_bc_offset = input_dims[2] * input_dims[3];
   int jobs = 1;
-  for(int i : output_dims) {
+  for (int i : output_dims) {
     jobs *= i;
   }
   bool is_avg = pooling_type == "avg";
   int threads = 256;
   int blocks = ceil(jobs / static_cast<float>(threads));
   CudaCastKernel<<<blocks, threads, 0, casted_compute_stream>>>(
-      input,
-      output,
-      jobs, out_bc_offset, in_bc_offset, int(input_dims[2]), int(input_dims[3]), int(output_dims[2]), int(output_dims[3]), is_avg);
+      input, output, jobs, out_bc_offset, in_bc_offset, int(input_dims[2]),
+      int(input_dims[3]), int(output_dims[2]), int(output_dims[3]), is_avg);
 }
 }  // namespace fastdeploy
\ No newline at end of file
diff --git a/fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.h b/fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.h
index 3e68908ed..dc29c07dc 100755
--- a/fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.h
+++ b/fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.h
@@ -15,21 +15,18 @@
 
 #pragma once
 
+#include
 #include
 #include
-#include
 #include
-#include
 #include
+#include
 
 namespace fastdeploy {
 void CudaAdaptivePool(const std::vector<int64_t>& input_dims,
-                      const std::vector<int64_t>& output_dims,
-                      float* output,
-                      const float* input,
-                      void* compute_stream,
+                      const std::vector<int64_t>& output_dims, float* output,
+                      const float* input, void* compute_stream,
                       const std::string& pooling_type);
-
 }  // namespace fastdeploy
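The reformatted kernel above runs one thread per output element, with "avg" dividing each bin by its own area; note that, as written, the trailing division in CudaCastKernel also runs on the "max" path, which only makes sense for averaging. A minimal host-side sketch of driving CudaAdaptivePool — the buffer shapes, sizes, and the re-declared prototype are illustrative assumptions, not FastDeploy call sites:

```cpp
#include <cuda_runtime_api.h>
#include <cstdint>
#include <string>
#include <vector>

namespace fastdeploy {
// Re-declared here for a self-contained sketch; real code includes
// fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.h instead.
void CudaAdaptivePool(const std::vector<int64_t>& input_dims,
                      const std::vector<int64_t>& output_dims, float* output,
                      const float* input, void* compute_stream,
                      const std::string& pooling_type);
}  // namespace fastdeploy

int main() {
  std::vector<int64_t> in_dims{1, 64, 32, 32};  // NCHW input (illustrative)
  std::vector<int64_t> out_dims{1, 64, 7, 7};   // adaptive 7x7 target
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, 1 * 64 * 32 * 32 * sizeof(float));
  cudaMalloc(&d_out, 1 * 64 * 7 * 7 * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // cudaStream_t is a pointer type, so it converts to the void* parameter.
  fastdeploy::CudaAdaptivePool(in_dims, out_dims, d_out, d_in, stream, "avg");
  cudaStreamSynchronize(stream);
  cudaFree(d_in);
  cudaFree(d_out);
  cudaStreamDestroy(stream);
  return 0;
}
```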
diff --git a/fastdeploy/backends/openvino/ov_backend.cc b/fastdeploy/backends/openvino/ov_backend.cc
index 6858f8547..553d116e0 100755
--- a/fastdeploy/backends/openvino/ov_backend.cc
+++ b/fastdeploy/backends/openvino/ov_backend.cc
@@ -341,8 +341,7 @@ int OpenVINOBackend::NumInputs() const { return input_infos_.size(); }
 int OpenVINOBackend::NumOutputs() const { return output_infos_.size(); }
 
 bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
-                            std::vector<FDTensor>* outputs,
-                            bool copy_to_fd) {
+                            std::vector<FDTensor>* outputs, bool copy_to_fd) {
   if (inputs.size() != input_infos_.size()) {
     FDERROR << "[OpenVINOBackend] Size of the inputs(" << inputs.size()
             << ") should keep same with the inputs of this model("
@@ -365,19 +364,17 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
     auto out_tensor_shape = out_tensor.get_shape();
     std::vector<int64_t> shape(out_tensor_shape.begin(),
                                out_tensor_shape.end());
-    if(copy_to_fd) {
-      (*outputs)[i].Resize(shape,
+    if (copy_to_fd) {
+      (*outputs)[i].Resize(shape,
                            OpenVINODataTypeToFD(out_tensor.get_element_type()),
-                          output_infos_[i].name,
-                          Device::CPU);
+                           output_infos_[i].name, Device::CPU);
       memcpy((*outputs)[i].MutableData(), out_tensor.data(),
-            (*outputs)[i].Nbytes());
+             (*outputs)[i].Nbytes());
     } else {
       (*outputs)[i].name = output_infos_[i].name;
-      (*outputs)[i].SetExternalData(shape,
-                      OpenVINODataTypeToFD(out_tensor.get_element_type()),
-                      out_tensor.data(),
-                      Device::CPU);
+      (*outputs)[i].SetExternalData(
+          shape, OpenVINODataTypeToFD(out_tensor.get_element_type()),
+          out_tensor.data(), Device::CPU);
     }
   }
   return true;
diff --git a/fastdeploy/backends/openvino/ov_backend.h b/fastdeploy/backends/openvino/ov_backend.h
index 2dadab29d..dca33823b 100644
--- a/fastdeploy/backends/openvino/ov_backend.h
+++ b/fastdeploy/backends/openvino/ov_backend.h
@@ -47,8 +47,7 @@ class OpenVINOBackend : public BaseBackend {
   InitFromOnnx(const std::string& model_file,
                const OpenVINOBackendOption& option = OpenVINOBackendOption());
 
-  bool Infer(std::vector<FDTensor>& inputs,
-             std::vector<FDTensor>* outputs,
+  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
              bool copy_to_fd = true) override;
 
   int NumInputs() const override;
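The copy_to_fd flag threaded through Infer above switches between an owning copy and a zero-copy view of the backend's output buffer. A hedged sketch of the caller-side contract (backend setup elided; names follow the diff, not a documented FastDeploy example):

```cpp
#include <vector>
// #include "fastdeploy/backends/openvino/ov_backend.h"

void RunBothModes(fastdeploy::OpenVINOBackend& backend,
                  std::vector<fastdeploy::FDTensor>& inputs) {
  std::vector<fastdeploy::FDTensor> outputs;
  // copy_to_fd == false: each output aliases OpenVINO-owned memory via
  // SetExternalData(), so it is only safe to read until the next Infer().
  backend.Infer(inputs, &outputs, /*copy_to_fd=*/false);
  // copy_to_fd == true (the default): Resize() + memcpy yields owning copies
  // that outlive the request.
  backend.Infer(inputs, &outputs, /*copy_to_fd=*/true);
}
```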
output_size[3]); - int64_t wend = std::ceil(static_cast((w + 1) * input_size[3]) / output_size[3]); + for (int64_t b = 0; b < output_size[0]; b++) { + for (int64_t c = 0; c < output_size[1]; c++) { + for (int64_t h = 0; h < output_size[2]; h++) { + int64_t hstart = + std::floor(static_cast(h * input_size[2]) / output_size[2]); + int64_t hend = std::ceil(static_cast((h + 1) * input_size[2]) / + output_size[2]); + for (int64_t w = 0; w < output_size[3]; w++) { + int64_t wstart = std::floor(static_cast(w * input_size[3]) / + output_size[3]); + int64_t wend = std::ceil(static_cast((w + 1) * input_size[3]) / + output_size[3]); int64_t out_offset = h * output_size[3] + w; output_data[out_offset] = 0; - for(auto i = hstart; i < hend; i++){ - for(auto j = wstart; j< wend; j++){ - if(pooling_type_ == "avg"){ + for (auto i = hstart; i < hend; i++) { + for (auto j = wstart; j < wend; j++) { + if (pooling_type_ == "avg") { output_data[out_offset] += input_data[i * input_size[3] + j]; } - if(pooling_type_ == "max"){ - output_data[out_offset] = std::max(output_data[out_offset], input_data[i * input_size[3] + j]); + if (pooling_type_ == "max") { + output_data[out_offset] = std::max( + output_data[out_offset], input_data[i * input_size[3] + j]); } } } - if(pooling_type_ == "avg"){ + if (pooling_type_ == "avg") { output_data[out_offset] /= ((hend - hstart) * (wend - wstart)); } } @@ -64,26 +72,27 @@ void AdaptivePool2dKernel::Compute(OrtKernelContext* context) { const float* input_data = reinterpret_cast(ort_.GetTensorData(input)); - + OrtTensorDimensions input_dim(ort_, input); output_size_[0] = input_dim[0]; std::vector input_size; - for(auto i: input_dim){ + for (auto i : input_dim) { input_size.push_back(i); } - + OrtValue* output = ort_.KernelContext_GetOutput( context, 0, output_size_.data(), output_size_.size()); - + float* output_data = ort_.GetTensorMutableData(output); - if(!strcmp(this->provider_, "CUDAExecutionProvider")){ + if (!strcmp(this->provider_, "CUDAExecutionProvider")) { #ifdef WITH_GPU auto compute_stream = ort_.KernelContext_GetGPUComputeStream(context); - CudaAdaptivePool(input_size, output_size_, output_data, input_data, compute_stream, pooling_type_); + CudaAdaptivePool(input_size, output_size_, output_data, input_data, + compute_stream, pooling_type_); #else - FDWARNING << "FastDeploy didn't compile with WITH_GPU. " - << "Will force to use CPU to run." << std::endl; - CpuAdaptivePool(input_size, output_size_, input_data, output_data); + FDWARNING << "FastDeploy didn't compile with WITH_GPU. " + << "Will force to use CPU to run." 
<< std::endl; + CpuAdaptivePool(input_size, output_size_, input_data, output_data); #endif } else { CpuAdaptivePool(input_size, output_size_, input_data, output_data); @@ -91,9 +100,13 @@ void AdaptivePool2dKernel::Compute(OrtKernelContext* context) { } void AdaptivePool2dKernel::GetAttribute(const OrtKernelInfo* info) { - pooling_type_ = ort_.KernelInfoGetAttribute(info, "pooling_type"); - output_size_ = ort_.KernelInfoGetAttribute>(info, "output_size"); - FDASSERT(output_size_.size() == 4 && output_size_[2] > 0 && output_size_[3] > 0, "The output size of adaptive pool must be positive."); + pooling_type_ = + ort_.KernelInfoGetAttribute(info, "pooling_type"); + output_size_ = + ort_.KernelInfoGetAttribute>(info, "output_size"); + FDASSERT(output_size_.size() == 4 && output_size_[2] > 0 && + output_size_[3] > 0, + "The output size of adaptive pool must be positive."); } } // namespace fastdeploy diff --git a/fastdeploy/backends/ort/ops/adaptive_pool2d.h b/fastdeploy/backends/ort/ops/adaptive_pool2d.h index 556ca033b..7d0acda10 100755 --- a/fastdeploy/backends/ort/ops/adaptive_pool2d.h +++ b/fastdeploy/backends/ort/ops/adaptive_pool2d.h @@ -14,12 +14,12 @@ #pragma once -#include -#include -#include -#include #include "fastdeploy/core/fd_tensor.h" #include "fastdeploy/utils/utils.h" +#include +#include +#include +#include #ifndef NON_64_PLATFORM #include "onnxruntime_cxx_api.h" // NOLINT @@ -38,9 +38,8 @@ struct AdaptivePool2dKernel { const char* provider_; public: - AdaptivePool2dKernel(Ort::CustomOpApi ort, - const OrtKernelInfo* info, - const char* provider) + AdaptivePool2dKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info, + const char* provider) : ort_(ort) { GetAttribute(info); provider_ = provider; @@ -51,9 +50,8 @@ struct AdaptivePool2dKernel { void Compute(OrtKernelContext* context); void CpuAdaptivePool(const std::vector& input_size, - const std::vector& output_size, - const float* input_data, - float* output_data); + const std::vector& output_size, + const float* input_data, float* output_data); }; struct AdaptivePool2dOp @@ -77,9 +75,8 @@ struct AdaptivePool2dOp return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; } - const char* GetExecutionProviderType() const { - return provider_; - } + const char* GetExecutionProviderType() const { return provider_; } + private: const char* provider_; }; diff --git a/fastdeploy/backends/ort/ops/multiclass_nms.cc b/fastdeploy/backends/ort/ops/multiclass_nms.cc index 36bc5dadf..590dc29a8 100644 --- a/fastdeploy/backends/ort/ops/multiclass_nms.cc +++ b/fastdeploy/backends/ort/ops/multiclass_nms.cc @@ -15,9 +15,9 @@ #ifndef NON_64_PLATFORM #include "fastdeploy/backends/ort/ops/multiclass_nms.h" -#include #include "fastdeploy/core/fd_tensor.h" #include "fastdeploy/utils/utils.h" +#include namespace fastdeploy { diff --git a/fastdeploy/backends/ort/ort_backend.cc b/fastdeploy/backends/ort/ort_backend.cc index 1e6d8bfb5..5ea47d6e4 100755 --- a/fastdeploy/backends/ort/ort_backend.cc +++ b/fastdeploy/backends/ort/ort_backend.cc @@ -16,8 +16,8 @@ #include -#include "fastdeploy/backends/ort/ops/multiclass_nms.h" #include "fastdeploy/backends/ort/ops/adaptive_pool2d.h" +#include "fastdeploy/backends/ort/ops/multiclass_nms.h" #include "fastdeploy/backends/ort/utils.h" #include "fastdeploy/core/float16.h" #include "fastdeploy/utils/utils.h" @@ -64,7 +64,7 @@ void OrtBackend::BuildOption(const OrtBackendOption& option) { } else { OrtCUDAProviderOptions cuda_options; cuda_options.device_id = option.gpu_id; - if(option.external_stream_) { + if 
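Both CpuAdaptivePool above and the CUDA kernel earlier derive each output cell's input window with the same floor/ceil arithmetic. A self-contained illustration of that arithmetic (the helper names are mine, not FastDeploy's):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Output cell h covers input rows [floor(h*ih/oh), ceil((h+1)*ih/oh)).
int64_t BinStart(int64_t h, int64_t ih, int64_t oh) {
  return static_cast<int64_t>(std::floor(static_cast<float>(h * ih) / oh));
}
int64_t BinEnd(int64_t h, int64_t ih, int64_t oh) {
  return static_cast<int64_t>(std::ceil(static_cast<float>((h + 1) * ih) / oh));
}

int main() {
  // ih = 5 rows pooled to oh = 2 gives bins [0, 3) and [2, 5). The bins can
  // overlap, which is why "avg" divides by the per-cell area (hend - hstart).
  for (int64_t h = 0; h < 2; ++h) {
    std::printf("cell %lld -> [%lld, %lld)\n", (long long)h,
                (long long)BinStart(h, 5, 2), (long long)BinEnd(h, 5, 2));
  }
  return 0;
}
```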
diff --git a/fastdeploy/backends/ort/ort_backend.cc b/fastdeploy/backends/ort/ort_backend.cc
index 1e6d8bfb5..5ea47d6e4 100755
--- a/fastdeploy/backends/ort/ort_backend.cc
+++ b/fastdeploy/backends/ort/ort_backend.cc
@@ -16,8 +16,8 @@
 
 #include
 
-#include "fastdeploy/backends/ort/ops/multiclass_nms.h"
 #include "fastdeploy/backends/ort/ops/adaptive_pool2d.h"
+#include "fastdeploy/backends/ort/ops/multiclass_nms.h"
 #include "fastdeploy/backends/ort/utils.h"
 #include "fastdeploy/core/float16.h"
 #include "fastdeploy/utils/utils.h"
@@ -64,7 +64,7 @@ void OrtBackend::BuildOption(const OrtBackendOption& option) {
   } else {
     OrtCUDAProviderOptions cuda_options;
     cuda_options.device_id = option.gpu_id;
-    if(option.external_stream_) {
+    if (option.external_stream_) {
       cuda_options.has_user_compute_stream = 1;
       cuda_options.user_compute_stream = option.external_stream_;
     }
@@ -91,11 +91,11 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
   strcpy(ops[0].export_op_name, "MultiClassNMS");
   strcpy(ops[1].op_name, "pool2d");
   strcpy(ops[1].export_op_name, "AdaptivePool2d");
-  
+
   if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
                            &model_content_ptr, &model_content_size, 11, true,
-                           verbose, true, true, true, ops.data(),
-                           2, "onnxruntime", nullptr, 0, "", &save_external)) {
+                           verbose, true, true, true, ops.data(), 2,
+                           "onnxruntime", nullptr, 0, "", &save_external)) {
     FDERROR << "Error occured while export PaddlePaddle to ONNX format."
             << std::endl;
     return false;
@@ -105,11 +105,11 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
                                model_content_ptr + model_content_size);
   delete[] model_content_ptr;
   model_content_ptr = nullptr;
-  if(save_external){
+  if (save_external) {
     std::string model_file_name = "model.onnx";
     std::fstream f(model_file_name, std::ios::out);
     FDASSERT(f.is_open(), "Can not open file: %s to save model.",
-            model_file_name.c_str());
+             model_file_name.c_str());
     f << onnx_model_proto;
     f.close();
     return InitFromOnnx(model_file_name, option, false);
@@ -182,7 +182,7 @@ bool OrtBackend::InitFromOnnx(const std::string& model_file,
 }
 
 void OrtBackend::OrtValueToFDTensor(const Ort::Value& value, FDTensor* tensor,
-                                   const std::string& name, bool copy_to_fd) {
+                                    const std::string& name, bool copy_to_fd) {
   const auto info = value.GetTensorTypeAndShapeInfo();
   const auto data_type = info.GetElementType();
   size_t numel = info.GetElementCount();
@@ -216,15 +216,13 @@ void OrtBackend::OrtValueToFDTensor(const Ort::Value& value, FDTensor* tensor,
     memcpy(tensor->MutableData(), value_ptr, numel);
   } else {
     tensor->name = name;
-    tensor->SetExternalData(
-        shape, dtype,
-        const_cast<void*>(value_ptr), Device::CPU);
+    tensor->SetExternalData(shape, dtype, const_cast<void*>(value_ptr),
+                            Device::CPU);
   }
 }
 
 bool OrtBackend::Infer(std::vector<FDTensor>& inputs,
-                       std::vector<FDTensor>* outputs,
-                       bool copy_to_fd) {
+                       std::vector<FDTensor>* outputs, bool copy_to_fd) {
   if (inputs.size() != inputs_desc_.size()) {
     FDERROR << "[OrtBackend] Size of the inputs(" << inputs.size()
             << ") should keep same with the inputs of this model("
@@ -256,8 +254,8 @@ bool OrtBackend::Infer(std::vector<FDTensor>& inputs,
   std::vector<Ort::Value> ort_outputs = binding_->GetOutputValues();
   outputs->resize(ort_outputs.size());
   for (size_t i = 0; i < ort_outputs.size(); ++i) {
-    OrtValueToFDTensor(ort_outputs[i], &((*outputs)[i]),
-                       outputs_desc_[i].name, copy_to_fd);
+    OrtValueToFDTensor(ort_outputs[i], &((*outputs)[i]), outputs_desc_[i].name,
+                       copy_to_fd);
   }
 
   return true;
@@ -310,11 +308,13 @@ void OrtBackend::InitCustomOperators() {
   if (custom_operators_.size() == 0) {
     MultiClassNmsOp* multiclass_nms = new MultiClassNmsOp{};
     custom_operators_.push_back(multiclass_nms);
-    if(option_.use_gpu){
-      AdaptivePool2dOp* adaptive_pool2d = new AdaptivePool2dOp{"CUDAExecutionProvider"};
+    if (option_.use_gpu) {
+      AdaptivePool2dOp* adaptive_pool2d =
+          new AdaptivePool2dOp{"CUDAExecutionProvider"};
       custom_operators_.push_back(adaptive_pool2d);
-    }else{
-      AdaptivePool2dOp* adaptive_pool2d = new AdaptivePool2dOp{"CPUExecutionProvider"};
+    } else {
+      AdaptivePool2dOp* adaptive_pool2d =
+          new AdaptivePool2dOp{"CPUExecutionProvider"};
       custom_operators_.push_back(adaptive_pool2d);
     }
   }
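BuildOption above hands an externally owned CUDA stream to ONNX Runtime when one is set. A hedged sketch of the option-side wiring — the field spellings are taken from the diff itself, and the surrounding setup is assumed:

```cpp
#include <cuda_runtime_api.h>
// #include "fastdeploy/backends/ort/ort_backend.h"

void ConfigureExternalStream(fastdeploy::OrtBackendOption& option,
                             cudaStream_t stream) {
  option.use_gpu = true;             // checked by InitCustomOperators() above
  option.gpu_id = 0;                 // becomes cuda_options.device_id
  option.external_stream_ = stream;  // triggers has_user_compute_stream = 1
}
```

With this set, OrtBackend submits work on the caller's stream instead of creating its own, which matters when FastDeploy shares a GPU pipeline with other CUDA code.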
diff --git a/fastdeploy/backends/ort/ort_backend.h b/fastdeploy/backends/ort/ort_backend.h
index ab5f38e61..d485a442a 100644
--- a/fastdeploy/backends/ort/ort_backend.h
+++ b/fastdeploy/backends/ort/ort_backend.h
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 
 #include "fastdeploy/backends/backend.h"
 #include "onnxruntime_cxx_api.h"  // NOLINT
@@ -67,8 +68,7 @@ class OrtBackend : public BaseBackend {
                     const OrtBackendOption& option = OrtBackendOption(),
                     bool from_memory_buffer = false);
 
-  bool Infer(std::vector<FDTensor>& inputs,
-             std::vector<FDTensor>* outputs,
+  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
              bool copy_to_fd = true) override;
 
   int NumInputs() const override { return inputs_desc_.size(); }
diff --git a/fastdeploy/backends/paddle/paddle_backend.cc b/fastdeploy/backends/paddle/paddle_backend.cc
index 67f1eb762..a0a821782 100644
--- a/fastdeploy/backends/paddle/paddle_backend.cc
+++ b/fastdeploy/backends/paddle/paddle_backend.cc
@@ -104,7 +104,8 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_file,
 
   std::string contents;
   if (option.model_from_memory_) {
-    config_.SetModelBuffer(model_file.c_str(), option.model_buffer_size_, params_file.c_str(), option.params_buffer_size_);
+    config_.SetModelBuffer(model_file.c_str(), option.model_buffer_size_,
+                           params_file.c_str(), option.params_buffer_size_);
     contents = model_file;
   } else {
     config_.SetModel(model_file, params_file);
@@ -182,7 +183,9 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_file,
     FDINFO << "Start generating shape range info file." << std::endl;
     paddle_infer::Config analysis_config;
     if (option.model_from_memory_) {
-      analysis_config.SetModelBuffer(model_file.c_str(), option.model_buffer_size_, params_file.c_str(), option.params_buffer_size_);
+      analysis_config.SetModelBuffer(
+          model_file.c_str(), option.model_buffer_size_, params_file.c_str(),
+          option.params_buffer_size_);
     } else {
       analysis_config.SetModel(model_file, params_file);
    }
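When model_from_memory_ is set, the hunks above pass raw buffers plus byte sizes to Paddle Inference instead of file paths. A minimal sketch of preparing such buffers — file names are illustrative:

```cpp
#include <fstream>
#include <iterator>
#include <string>

#include "paddle_inference_api.h"  // paddle_infer::Config

std::string ReadFile(const std::string& path) {
  std::ifstream f(path, std::ios::binary);
  return std::string(std::istreambuf_iterator<char>(f),
                     std::istreambuf_iterator<char>());
}

void ConfigFromMemory(paddle_infer::Config* config) {
  std::string model = ReadFile("model.pdmodel");    // illustrative paths
  std::string params = ReadFile("model.pdiparams");
  // The same call the diff reformats: program buffer, its size, then the
  // parameter buffer and its size.
  config->SetModelBuffer(model.c_str(), model.size(), params.c_str(),
                         params.size());
}
```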
diff --git a/fastdeploy/backends/paddle/util.cc b/fastdeploy/backends/paddle/util.cc
index eff6a361f..fa6e757f3 100644
--- a/fastdeploy/backends/paddle/util.cc
+++ b/fastdeploy/backends/paddle/util.cc
@@ -30,24 +30,24 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
   auto place = ConvertFDDeviceToPlace(fd_tensor.device);
   if (fd_tensor.dtype == FDDataType::FP32) {
     if (place == paddle_infer::PlaceType::kGPU) {
-      tensor->ShareExternalData(static_cast<const float*>(fd_tensor.Data()),
-                               shape, place);
+      tensor->ShareExternalData(static_cast<const float*>(fd_tensor.Data()),
+                                shape, place);
     } else {
       tensor->CopyFromCpu(static_cast<const float*>(fd_tensor.Data()));
     }
     return;
   } else if (fd_tensor.dtype == FDDataType::INT32) {
     if (place == paddle_infer::PlaceType::kGPU) {
-      tensor->ShareExternalData(static_cast<const int32_t*>(fd_tensor.Data()),
-                               shape, place);
+      tensor->ShareExternalData(static_cast<const int32_t*>(fd_tensor.Data()),
+                                shape, place);
     } else {
       tensor->CopyFromCpu(static_cast<const int32_t*>(fd_tensor.Data()));
     }
     return;
   } else if (fd_tensor.dtype == FDDataType::INT64) {
     if (place == paddle_infer::PlaceType::kGPU) {
-      tensor->ShareExternalData(static_cast<const int64_t*>(fd_tensor.Data()),
-                               shape, place);
+      tensor->ShareExternalData(static_cast<const int64_t*>(fd_tensor.Data()),
+                                shape, place);
     } else {
       tensor->CopyFromCpu(static_cast<const int64_t*>(fd_tensor.Data()));
     }
@@ -62,13 +62,12 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
 }
 
 void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
-                            FDTensor* fd_tensor,
-                            bool copy_to_fd) {
+                            FDTensor* fd_tensor, bool copy_to_fd) {
   auto fd_dtype = PaddleDataTypeToFD(tensor->type());
   std::vector<int64_t> shape;
   auto tmp_shape = tensor->shape();
   shape.assign(tmp_shape.begin(), tmp_shape.end());
-  if(copy_to_fd) {
+  if (copy_to_fd) {
     fd_tensor->Resize(shape, fd_dtype, tensor->name());
     if (fd_tensor->dtype == FDDataType::FP32) {
       tensor->CopyToCpu(static_cast<float*>(fd_tensor->MutableData()));
@@ -79,9 +78,9 @@ void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
     } else if (fd_tensor->dtype == FDDataType::INT64) {
       tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor->MutableData()));
       return;
-    } 
+    }
     FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
-            Str(fd_tensor->dtype).c_str());
+             Str(fd_tensor->dtype).c_str());
   } else {
     paddle_infer::PlaceType place;
     int size = 0;
@@ -99,17 +98,17 @@ void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
     } else if (fd_dtype == FDDataType::UINT8) {
       out_data = tensor->data<uint8_t>(&place, &size);
     } else {
-      FDASSERT(false, "Unexpected data type(%s) while infer shared with PaddleBackend.",
+      FDASSERT(
+          false,
+          "Unexpected data type(%s) while infer shared with PaddleBackend.",
           Str(fd_dtype).c_str());
     }
     Device device = Device::CPU;
-    if(place == paddle_infer::PlaceType::kGPU) {
+    if (place == paddle_infer::PlaceType::kGPU) {
       device = Device::GPU;
     }
     fd_tensor->name = tensor->name();
-    fd_tensor->SetExternalData(
-        shape, fd_dtype,
-        out_data, device);
+    fd_tensor->SetExternalData(shape, fd_dtype, out_data, device);
   }
 }
 
@@ -153,7 +152,10 @@ FDDataType ReaderDataTypeToFD(int32_t dtype) {
   } else if (dtype == 6) {
     fd_dtype = FDDataType::FP16;
   } else {
-    FDASSERT(false, "Unexpected data type: %d while call ReaderDataTypeToFD in PaddleBackend.", dtype);
+    FDASSERT(false,
+             "Unexpected data type: %d while call ReaderDataTypeToFD in "
+             "PaddleBackend.",
+             dtype);
   }
   return fd_dtype;
 }
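As with the other backends, PaddleTensorToFDTensor's copy_to_fd == false branch only wraps memory that Paddle still owns (the tensor->data<T>() pointer). A hedged caller sketch; the predictor plumbing is assumed, not shown in the diff:

```cpp
#include <memory>
#include <string>

// #include "paddle_inference_api.h"
// #include "fastdeploy/backends/paddle/util.h"

void FetchOutput(paddle_infer::Predictor* predictor, const std::string& name,
                 fastdeploy::FDTensor* out) {
  std::unique_ptr<paddle_infer::Tensor> handle =
      predictor->GetOutputHandle(name);
  // copy_to_fd == false: 'out' aliases Paddle-owned memory and stays valid
  // only until the next predictor->Run(); pass true to CopyToCpu instead.
  fastdeploy::PaddleTensorToFDTensor(handle, out, /*copy_to_fd=*/false);
}
```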
diff --git a/fastdeploy/backends/poros/common/compile.h b/fastdeploy/backends/poros/common/compile.h
index c7cbc6756..8e09c3664 100755
--- a/fastdeploy/backends/poros/common/compile.h
+++ b/fastdeploy/backends/poros/common/compile.h
@@ -14,14 +14,14 @@
 
 #pragma once
 
-#include
 #include
-#include
 #include
+#include
+#include
 
-#include "torch/script.h"
 #include "iengine.h"
 #include "poros_module.h"
+#include "torch/script.h"
 
 namespace baidu {
 namespace mirana {
@@ -36,28 +36,29 @@ namespace poros {
 * @return porosmodule
 * @retval !nullptr => succeed  nullptr => failed
 **/
-std::unique_ptr<PorosModule> Compile(const torch::jit::Module& module,
-                const std::vector<std::vector<c10::IValue> >& prewarm_datas,
+std::unique_ptr<PorosModule>
+Compile(const torch::jit::Module& module,
+        const std::vector<std::vector<c10::IValue>>& prewarm_datas,
         const PorosOptions& options);
 
 class Compiler {
-public:
-    typedef std::unordered_map engine_map_t;
-    typedef std::vector<std::vector<c10::IValue> > ivalue_vec_t;
+ public:
+  typedef std::unordered_map engine_map_t;
+  typedef std::vector<std::vector<c10::IValue>> ivalue_vec_t;
 
-    Compiler() : _origin_module(NULL) {}
-    ~Compiler();
+  Compiler() : _origin_module(NULL) {}
+  ~Compiler();
 
-    /**
+  /**
 * @brief initial Compiler
 *
 * @param [in] options : poros options
 * @return int
 * @retval 0 => succeed  <0 => failed
 **/
-    int init(const PorosOptions& options);
+  int init(const PorosOptions& options);
 
-    /**
+  /**
 * @brief compile whole graph
 *
 * @param [in] origin_module
@@ -66,13 +67,12 @@ public:
 * @return int
 * @retval 0 => succeed  <0 => failed
 **/
-    int compile(const torch::jit::Module& origin_module,
-            const ivalue_vec_t& prewarm_datas,
-            torch::jit::Module* optimized_module);
+  int compile(const torch::jit::Module& origin_module,
+              const ivalue_vec_t& prewarm_datas,
+              torch::jit::Module* optimized_module);
 
-private:
-
-    /**
+ private:
+  /**
 * @brief preprocess this calculation graph
 *
 * @param [in] prewarm_datas : ivalue_vec_t, vector of IValue
@@ -80,23 +80,25 @@ private:
 * @return int
 * @retval 0 => succeed  <0 => failed
 **/
-    int preprocess_graph(const ivalue_vec_t& prewarm_datas, std::shared_ptr<torch::jit::Graph>& graph);
+  int preprocess_graph(const ivalue_vec_t& prewarm_datas,
+                       std::shared_ptr<torch::jit::Graph>& graph);
 
-    /**
+  /**
 * @brief segement this calculation graph
 *
 * @param [in/out] graph
 * @return int
 * @retval 0 => succeed  <0 => failed
 **/
-    int segment_graph(std::shared_ptr<torch::jit::Graph>& graph);
+  int segment_graph(std::shared_ptr<torch::jit::Graph>& graph);
 
-    // Split subgraph(block)
-    // The divided subgraph, as a subgraph, is associated with the block
-    int segment_block(torch::jit::Block& block, IEngine* engine, int current_depth);
+  // Split subgraph(block)
+  // The divided subgraph, as a subgraph, is associated with the block
+  int segment_block(torch::jit::Block& block, IEngine* engine,
+                    int current_depth);
 
-    // Subgraph optimization
-    /**
+  // Subgraph optimization
+  /**
 * @brief Subgraph optimization
 *
 * @param [in] prewarm_datas : ivalue_vec_t, vector of IValue
@@ -105,15 +107,15 @@ private:
 * @return int
 * @retval 0 => succeed  <0 => failed
 **/
-    int optimize_subgraph(const ivalue_vec_t& prewarm_datas,
-            const std::shared_ptr<torch::jit::Graph>& opt_graph,
-            torch::jit::Module* optimized_module);
+  int optimize_subgraph(const ivalue_vec_t& prewarm_datas,
+                        const std::shared_ptr<torch::jit::Graph>& opt_graph,
+                        torch::jit::Module* optimized_module);
 
-    // Subgraph optimization(block)
-    int optimize_subblock(torch::jit::Block* block,
-            torch::jit::Module* optimized_module);
+  // Subgraph optimization(block)
+  int optimize_subblock(torch::jit::Block* block,
+                        torch::jit::Module* optimized_module);
 
-    /**
+  /**
 * @brief Compile the subgraph into a new graph based on the engine
 *
 * @param [in] engine : The engine used by the subgraph
@@ -121,32 +123,32 @@ private:
 * @return [out] module : Transformed model
 * @retval 0 => succeed  <0 => failed
 **/
-    int transform(IEngine* engine, torch::jit::Node& subgraph_node,
-            torch::jit::Module& module);
+  int transform(IEngine* engine, torch::jit::Node& subgraph_node,
+                torch::jit::Module& module);
 
-    /**
+  /**
 * @brief Select engine based on subgraph and options
 *
 * @param [in] node : Jit Node
 * @return int
 * @retval 0 => succeed  <0 => failed
 **/
-    IEngine* select_engine(const torch::jit::Node* n);
+  IEngine* select_engine(const torch::jit::Node* n);
 
-    /**
+  /**
 * @brief destory
 *
 * @return void
 **/
-    void close();
+  void close();
 
-private:
-    int _max_segment_depth{5};  // Maximum subgraph segmentation depth
-    ivalue_vec_t _prewarm_datas;  // Prewarm datas
-    PorosOptions _options;
-    engine_map_t _engine_map;  // The engine used to record the subgraph
-    const torch::jit::Module* _origin_module;  // Origin_module
-    std::atomic<int> _engine_index = {0};  // Record engine index
+ private:
+  int _max_segment_depth{5};    // Maximum subgraph segmentation depth
+  ivalue_vec_t _prewarm_datas;  // Prewarm datas
+  PorosOptions _options;
+  engine_map_t _engine_map;  // The engine used to record the subgraph
+  const torch::jit::Module* _origin_module;  // Origin_module
+  std::atomic<int> _engine_index = {0};      // Record engine index
 };
 
 /**
@@ -158,9 +160,10 @@ private:
 * @return optimized_module
 * @retval !nullptr => succeed  nullptr => failed
 **/
-std::unique_ptr<PorosModule> CompileGraph(const torch::jit::Module& module,
-                const std::vector<std::vector<c10::IValue> >& prewarm_datas,
-                const PorosOptions& options);
+std::unique_ptr<PorosModule>
+CompileGraph(const torch::jit::Module& module,
+             const std::vector<std::vector<c10::IValue>>& prewarm_datas,
+             const PorosOptions& options);
 
 }  // namespace poros
 }  // namespace mirana
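A hedged sketch of driving the Compile entry point declared above — the model path and prewarm shapes are illustrative, and error handling is reduced to the documented nullptr-on-failure contract:

```cpp
#include <memory>
#include <vector>

#include "torch/script.h"
// #include "compile.h"  // the header shown above

int main() {
  using namespace baidu::mirana::poros;
  torch::jit::Module module = torch::jit::load("model.pt");  // illustrative
  // One prewarm sample; dynamic-shape use would pass several.
  std::vector<std::vector<c10::IValue>> prewarm_datas = {
      {torch::randn({1, 3, 224, 224})}};
  PorosOptions options;  // defaults are defined in poros_module.h below
  std::unique_ptr<PorosModule> poros_module =
      Compile(module, prewarm_datas, options);
  return poros_module == nullptr ? 1 : 0;  // nullptr signals failure
}
```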
diff --git a/fastdeploy/backends/poros/common/iengine.h b/fastdeploy/backends/poros/common/iengine.h
index c945621c1..e53685117 100755
--- a/fastdeploy/backends/poros/common/iengine.h
+++ b/fastdeploy/backends/poros/common/iengine.h
@@ -17,9 +17,9 @@
 #include
 
 //from pytorch
-#include "torch/script.h"
-#include "torch/csrc/jit/ir/ir.h"
 #include "ATen/core/interned_strings.h"
+#include "torch/csrc/jit/ir/ir.h"
+#include "torch/script.h"
 
 #include "plugin_create.h"
 
@@ -28,50 +28,51 @@ namespace mirana {
 namespace poros {
 
 struct PorosGraph {
-    torch::jit::Graph* graph = NULL;
-    torch::jit::Node* node = NULL;
+  torch::jit::Graph* graph = NULL;
+  torch::jit::Node* node = NULL;
 };
 
 typedef uint64_t EngineID;
 
-class IEngine : public IPlugin, public torch::CustomClassHolder{
-public:
-    virtual ~IEngine() {}
+class IEngine : public IPlugin, public torch::CustomClassHolder {
+ public:
+  virtual ~IEngine() {}
 
-    /**
+  /**
 * @brief init, initialization must be successful if the init is successful
 * @return int
 * @retval 0 => success, <0 => fail
 **/
-    virtual int init() = 0;
+  virtual int init() = 0;
 
-    /**
+  /**
 * @brief During compilation, the subgraph is converted into the graph structure of the corresponding engine and stored inside the engine, so that the execute_engine at runtime can be called
 * @param [in] sub_graph :  subgraph
 * @return [res]int
 * @retval 0 => success, <0 => fail
**/
-    virtual int transform(const PorosGraph& sub_graph) = 0;
+  virtual int transform(const PorosGraph& sub_graph) = 0;
 
-    /**
+  /**
 * @brief Subgraph execution period logic
 * @param [in] inputs : input tensor
 * @return [res] output tensor
 **/
-    virtual std::vector<at::Tensor> excute_engine(const std::vector<at::Tensor>& inputs) = 0;
+  virtual std::vector<at::Tensor>
+  excute_engine(const std::vector<at::Tensor>& inputs) = 0;
 
-    virtual void register_module_attribute(const std::string& name, torch::jit::Module& module) = 0;
+  virtual void register_module_attribute(const std::string& name,
+                                          torch::jit::Module& module) = 0;
 
-    // Logo
-    virtual const std::string who_am_i() = 0;
+  // Logo
+  virtual const std::string who_am_i() = 0;
 
-    // Whether the node is supported by the current engine
-    bool is_node_supported(const torch::jit::Node* node);
-
-public:
-    std::pair _num_io;  // Number of input/output parameters
-    EngineID _id;
+  // Whether the node is supported by the current engine
+  bool is_node_supported(const torch::jit::Node* node);
 
+ public:
+  std::pair _num_io;  // Number of input/output parameters
+  EngineID _id;
 };
 
 }  // namespace poros
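To make the IEngine contract concrete, here is a skeletal, entirely hypothetical implementation against the interface above. The interface's own spelling excute_engine (sic) has to be matched for the override to compile:

```cpp
#include <string>
#include <vector>

// #include "iengine.h"  // the header shown above

class MyEngine : public baidu::mirana::poros::IEngine {
 public:
  int init() override { return 0; }

  int transform(const baidu::mirana::poros::PorosGraph& sub_graph) override {
    // A real engine would lower sub_graph.graph into its own program here.
    return 0;
  }

  // Identity passthrough, just to satisfy the runtime hook.
  std::vector<at::Tensor>
  excute_engine(const std::vector<at::Tensor>& inputs) override {
    return inputs;
  }

  void register_module_attribute(const std::string& name,
                                 torch::jit::Module& module) override {}

  const std::string who_am_i() override { return "my_engine"; }
};
```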
diff --git a/fastdeploy/backends/poros/common/plugin_create.h b/fastdeploy/backends/poros/common/plugin_create.h
index d160f2440..61b5e8da1 100755
--- a/fastdeploy/backends/poros/common/plugin_create.h
+++ b/fastdeploy/backends/poros/common/plugin_create.h
@@ -14,52 +14,56 @@
 
 #pragma once
 
-#include
 #include
+#include
 
 namespace baidu {
 namespace mirana {
 namespace poros {
 
 class IPlugin {
-public:
-    virtual ~IPlugin() {}
-    virtual const std::string who_am_i() = 0;
+ public:
+  virtual ~IPlugin() {}
+  virtual const std::string who_am_i() = 0;
 };
 
 typedef IPlugin* (*plugin_creator_t)();
 typedef std::unordered_map<std::string, plugin_creator_t> plugin_creator_map_t;
 
 IPlugin* create_plugin(const std::string& plugin_name);
-IPlugin* create_plugin(const std::string& plugin_name, const plugin_creator_map_t& plugin_creator_map);
+IPlugin* create_plugin(const std::string& plugin_name,
+                       const plugin_creator_map_t& plugin_creator_map);
 
 void create_all_plugins(const plugin_creator_map_t& plugin_creator_map,
-        std::unordered_map<std::string, IPlugin*>& plugin_m);
+                        std::unordered_map<std::string, IPlugin*>& plugin_m);
 //void create_all_plugins(std::unordered_map<std::string, IPlugin*>& plugin_m);
 
-template <typename PluginType>
-IPlugin* default_plugin_creator() {
-    return new (std::nothrow)PluginType;
+template <typename PluginType> IPlugin* default_plugin_creator() {
+  return new (std::nothrow) PluginType;
 }
 
-void register_plugin_creator(const std::string& plugin_name, plugin_creator_t creator);
 void register_plugin_creator(const std::string& plugin_name,
-        plugin_creator_t creator, plugin_creator_map_t& plugin_creator_map);
+                             plugin_creator_t creator);
+void register_plugin_creator(const std::string& plugin_name,
+                             plugin_creator_t creator,
+                             plugin_creator_map_t& plugin_creator_map);
 
 template <typename PluginType>
 void register_plugin_class(const std::string& plugin_name) {
-    return register_plugin_creator(plugin_name, default_plugin_creator<PluginType>);
+  return register_plugin_creator(plugin_name,
+                                 default_plugin_creator<PluginType>);
 }
 
 // This version is recommended
 template <typename PluginType>
-void register_plugin_class(const std::string& plugin_name, plugin_creator_map_t& plugin_creator_map) {
-    return register_plugin_creator(plugin_name, default_plugin_creator<PluginType>, plugin_creator_map);
+void register_plugin_class(const std::string& plugin_name,
+                           plugin_creator_map_t& plugin_creator_map) {
+  return register_plugin_creator(
+      plugin_name, default_plugin_creator<PluginType>, plugin_creator_map);
 }
 
-}//poros
-}//mirana
-}//baidu
-
+}  // namespace poros
+}  // namespace mirana
+}  // namespace baidu
 
 /* vim: set ts=4 sw=4 sts=4 tw=100 */
diff --git a/fastdeploy/backends/poros/common/poros_module.h b/fastdeploy/backends/poros/common/poros_module.h
index 74ba485d4..71cabc8e6 100755
--- a/fastdeploy/backends/poros/common/poros_module.h
+++ b/fastdeploy/backends/poros/common/poros_module.h
@@ -14,53 +14,45 @@
 
 #pragma once
 
-#include
-#include "torch/script.h"
 #include "torch/csrc/jit/jit_log.h"
+#include "torch/script.h"
+#include
 // #include "ATen/Context.h"
 
 namespace baidu {
 namespace mirana {
 namespace poros {
 
-enum Device : int8_t {
-    GPU = 0,
-    CPU,
-    XPU,
-    UNKNOW
-};
+enum Device : int8_t { GPU = 0, CPU, XPU, UNKNOW };
 
 struct PorosOptions {
-    Device device = GPU;
-    bool debug = false;
-    bool use_fp16 = false;
-    bool is_dynamic = false;
-    bool long_to_int = true;
-    uint64_t max_workspace_size = 1ULL << 30;
-    int32_t device_id = -1;
-    int32_t unconst_ops_thres = -1;
-    bool use_nvidia_tf32 = false;
+  Device device = GPU;
+  bool debug = false;
+  bool use_fp16 = false;
+  bool is_dynamic = false;
+  bool long_to_int = true;
+  uint64_t max_workspace_size = 1ULL << 30;
+  int32_t device_id = -1;
+  int32_t unconst_ops_thres = -1;
+  bool use_nvidia_tf32 = false;
 };
 
 class PorosModule : public torch::jit::Module {
-public:
-    PorosModule(torch::jit::Module module) : torch::jit::Module(module) {
-    }
-    ~PorosModule() = default;
+ public:
+  PorosModule(torch::jit::Module module) : torch::jit::Module(module) {}
+  ~PorosModule() = default;
 
-    void to_device(Device device){
-        _options.device = device;
-    }
-
-    //c10::IValue forward(std::vector<c10::IValue> inputs);
-    //void save(const std::string& filename);
-public:
-    PorosOptions _options;
+  void to_device(Device device) { _options.device = device; }
 
+  //c10::IValue forward(std::vector<c10::IValue> inputs);
+  //void save(const std::string& filename);
+ public:
+  PorosOptions _options;
 };
 
 //via porosmodule.save
-std::unique_ptr<PorosModule> Load(const std::string& filename, const PorosOptions& options);
+std::unique_ptr<PorosModule> Load(const std::string& filename,
+                                  const PorosOptions& options);
 
 }  // namespace poros
 }  // namespace mirana
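Putting plugin_create.h to work: a hedged sketch using the map-based overloads the header comments mark as recommended. MyEngine is the hypothetical plugin type from the earlier sketch; nothing here is FastDeploy/Poros sample code:

```cpp
#include <string>

// #include "plugin_create.h"  // the header shown above

void RegisterAndCreate() {
  using namespace baidu::mirana::poros;
  plugin_creator_map_t creators;
  // Stores default_plugin_creator<MyEngine> under the given name.
  register_plugin_class<MyEngine>("my_engine", creators);
  IPlugin* plugin = create_plugin("my_engine", creators);
  if (plugin != nullptr) {
    // plugin->who_am_i() is expected to report "my_engine".
    delete plugin;
  }
}
```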
diff --git a/fastdeploy/backends/poros/poros_backend.cc b/fastdeploy/backends/poros/poros_backend.cc
index a7c96f7cd..26a167ebe 100755
--- a/fastdeploy/backends/poros/poros_backend.cc
+++ b/fastdeploy/backends/poros/poros_backend.cc
@@ -188,8 +188,7 @@ bool PorosBackend::InitFromPoros(const std::string& model_file,
 }
 
 bool PorosBackend::Infer(std::vector<FDTensor>& inputs,
-                         std::vector<FDTensor>* outputs,
-                         bool copy_to_fd) {
+                         std::vector<FDTensor>* outputs, bool copy_to_fd) {
   // Convert FD Tensor to PyTorch Tensor
   std::vector<torch::jit::IValue> poros_inputs;
   bool is_backend_cuda =
diff --git a/fastdeploy/backends/poros/poros_backend.h b/fastdeploy/backends/poros/poros_backend.h
index 00dfe4444..d391d9869 100755
--- a/fastdeploy/backends/poros/poros_backend.h
+++ b/fastdeploy/backends/poros/poros_backend.h
@@ -74,9 +74,9 @@ class PorosBackend : public BaseBackend {
 
   void BuildOption(const PorosBackendOption& option);
 
-  bool InitFromTorchScript(
-      const std::string& model_file,
-      const PorosBackendOption& option = PorosBackendOption());
+  bool
+  InitFromTorchScript(const std::string& model_file,
+                      const PorosBackendOption& option = PorosBackendOption());
 
   bool InitFromPoros(const std::string& model_file,
                      const PorosBackendOption& option = PorosBackendOption());
@@ -85,8 +85,7 @@ class PorosBackend : public BaseBackend {
       std::vector<std::vector<FDTensor>>& prewarm_tensors,
       const PorosBackendOption& option = PorosBackendOption());
 
-  bool Infer(std::vector<FDTensor>& inputs,
-             std::vector<FDTensor>* outputs,
+  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
              bool copy_to_fd = true) override;
 
   int NumInputs() const { return _numinputs; }
diff --git a/fastdeploy/backends/poros/utils.cc b/fastdeploy/backends/poros/utils.cc
index e7b749b58..ee4b5f681 100644
--- a/fastdeploy/backends/poros/utils.cc
+++ b/fastdeploy/backends/poros/utils.cc
@@ -23,32 +23,32 @@ namespace fastdeploy {
 std::string AtType2String(const at::ScalarType& dtype) {
   std::string out;
   switch (dtype) {
-  case at::kByte:
-    out = "at::kByte";
-    break;
-  case at::kChar:
-    out = "at::kChar";
-    break;
-  case at::kShort:
-    out = "at::kShort";
-    break;
-  case at::kInt:
-    out = "at::kInt";
-    break;
-  case at::kLong:
-    out = "at::kLong";
-    break;
-  case at::kHalf:
-    out = "at::kHalf";
-    break;
-  case at::kFloat:
-    out = "at::kFloat";
-    break;
-  case at::kDouble:
-    out = "at::kDouble";
-    break;
-  default:
-    out = "at::UNKNOWN";
+    case at::kByte:
+      out = "at::kByte";
+      break;
+    case at::kChar:
+      out = "at::kChar";
+      break;
+    case at::kShort:
+      out = "at::kShort";
+      break;
+    case at::kInt:
+      out = "at::kInt";
+      break;
+    case at::kLong:
+      out = "at::kLong";
+      break;
+    case at::kHalf:
+      out = "at::kHalf";
+      break;
+    case at::kFloat:
+      out = "at::kFloat";
+      break;
+    case at::kDouble:
+      out = "at::kDouble";
+      break;
+    default:
+      out = "at::UNKNOWN";
   }
   return out;
 }
@@ -129,9 +129,8 @@ at::Tensor CreatePorosValue(FDTensor& tensor, bool is_backend_cuda) {
                  numel * sizeof(double));
     }
   } else {
-    FDASSERT(false,
-             "Unrecognized data type while calling "
-             "PorosBackend::CreatePorosValue().");
+    FDASSERT(false, "Unrecognized data type while calling "
+                    "PorosBackend::CreatePorosValue().");
   }
   return poros_value;
 }
diff --git a/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc b/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc
index bfda43451..94a6d42d3 100644
--- a/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc
+++ b/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc
@@ -27,14 +27,14 @@ RKNPU2Backend::~RKNPU2Backend() {
   for (uint32_t i = 0; i < io_num.n_input; i++) {
     rknn_destroy_mem(ctx, input_mems_[i]);
   }
-  if(input_mems_ != nullptr){
+  if (input_mems_ != nullptr) {
     free(input_mems_);
   }
 
   for (uint32_t i = 0; i < io_num.n_output; i++) {
     rknn_destroy_mem(ctx, output_mems_[i]);
   }
-  if(output_mems_ != nullptr){
+  if (output_mems_ != nullptr) {
     free(output_mems_);
   }
 }
@@ -173,16 +173,15 @@ bool RKNPU2Backend::GetModelInputOutputInfos() {
 
   // create input tensor memory
   // rknn_tensor_mem* input_mems[io_num.n_input];
-  input_mems_ = (rknn_tensor_mem**)malloc(sizeof(rknn_tensor_mem*) * io_num.n_input);
+  input_mems_ =
+      (rknn_tensor_mem**)malloc(sizeof(rknn_tensor_mem*) * io_num.n_input);
 
   // get input info and copy to input tensor info
   for (uint32_t i = 0; i < io_num.n_input; i++) {
     input_attrs_[i].index = i;
     // query info
-    ret = rknn_query(ctx,
-                     RKNN_QUERY_INPUT_ATTR,
-                     &(input_attrs_[i]),
+    ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs_[i]),
                      sizeof(rknn_tensor_attr));
     DumpTensorAttr(input_attrs_[i]);
 
@@ -190,12 +189,12 @@ bool RKNPU2Backend::GetModelInputOutputInfos() {
       printf("rknn_init error! ret=%d\n", ret);
       return false;
     }
-    if((input_attrs_[i].fmt != RKNN_TENSOR_NHWC) &&
-       (input_attrs_[i].fmt != RKNN_TENSOR_UNDEFINED)){
-      FDERROR << "rknpu2_backend only support input format is NHWC or UNDEFINED" << std::endl;
+    if ((input_attrs_[i].fmt != RKNN_TENSOR_NHWC) &&
+        (input_attrs_[i].fmt != RKNN_TENSOR_UNDEFINED)) {
+      FDERROR << "rknpu2_backend only support input format is NHWC or UNDEFINED"
+              << std::endl;
     }
 
-
     // copy input_attrs_ to input tensor info
     std::string temp_name = input_attrs_[i].name;
     std::vector<int> temp_shape{};
@@ -203,25 +202,28 @@ bool RKNPU2Backend::GetModelInputOutputInfos() {
     for (int j = 0; j < input_attrs_[i].n_dims; j++) {
       temp_shape[j] = (int)input_attrs_[i].dims[j];
     }
-    FDDataType temp_dtype = fastdeploy::RKNPU2Backend::RknnTensorTypeToFDDataType(input_attrs_[i].type);
+    FDDataType temp_dtype =
+        fastdeploy::RKNPU2Backend::RknnTensorTypeToFDDataType(
+            input_attrs_[i].type);
     TensorInfo temp_input_info = {temp_name, temp_shape, temp_dtype};
     inputs_desc_[i] = temp_input_info;
   }
 
   // Get detailed output parameters
-  output_attrs_ = (rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num.n_output);
+  output_attrs_ =
+      (rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num.n_output);
   memset(output_attrs_, 0, io_num.n_output * sizeof(rknn_tensor_attr));
   outputs_desc_.resize(io_num.n_output);
 
   // Create output tensor memory
-  output_mems_ = (rknn_tensor_mem**)malloc(sizeof(rknn_tensor_mem*) * io_num.n_output);;
+  output_mems_ =
+      (rknn_tensor_mem**)malloc(sizeof(rknn_tensor_mem*) * io_num.n_output);
+  ;
 
   for (uint32_t i = 0; i < io_num.n_output; i++) {
     output_attrs_[i].index = i;
     // query info
-    ret = rknn_query(ctx,
-                     RKNN_QUERY_OUTPUT_ATTR,
-                     &(output_attrs_[i]),
+    ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs_[i]),
                      sizeof(rknn_tensor_attr));
     DumpTensorAttr(output_attrs_[i]);
 
@@ -233,7 +235,7 @@ bool RKNPU2Backend::GetModelInputOutputInfos() {
     // If the output dimension is 3, the runtime will automatically change it to 4.
     // Obviously, this is wrong, and manual correction is required here.
     int n_dims = output_attrs_[i].n_dims;
-    if((n_dims == 4) && (output_attrs_[i].dims[3] == 1)){
+    if ((n_dims == 4) && (output_attrs_[i].dims[3] == 1)) {
       n_dims--;
     }
 
@@ -292,8 +294,7 @@ std::vector<TensorInfo> RKNPU2Backend::GetOutputInfos() {
 }
 
 bool RKNPU2Backend::Infer(std::vector<FDTensor>& inputs,
-                          std::vector<FDTensor>* outputs,
-                          bool copy_to_fd) {
+                          std::vector<FDTensor>* outputs, bool copy_to_fd) {
   int ret = RKNN_SUCC;
   // Judge whether the input and output size are the same
   if (inputs.size() != inputs_desc_.size()) {
@@ -303,15 +304,17 @@ bool RKNPU2Backend::Infer(std::vector<FDTensor>& inputs,
     return false;
   }
 
-  if(!this->infer_init){
+  if (!this->infer_init) {
     for (uint32_t i = 0; i < io_num.n_input; i++) {
       // Judge whether the input and output types are the same
       rknn_tensor_type input_type =
-          fastdeploy::RKNPU2Backend::FDDataTypeToRknnTensorType(inputs[i].dtype);
+          fastdeploy::RKNPU2Backend::FDDataTypeToRknnTensorType(
+              inputs[i].dtype);
       if (input_type != input_attrs_[i].type) {
         FDWARNING << "The input tensor type != model's inputs type."
-                  << "The input_type need " << get_type_string(input_attrs_[i].type)
-                  << ",but inputs["<< i << "].type is " << get_type_string(input_type)
+                  << "The input_type need "
+                  << get_type_string(input_attrs_[i].type) << ",but inputs["
+                  << i << "].type is " << get_type_string(input_type)
                   << std::endl;
       }
 
@@ -319,10 +322,11 @@ bool RKNPU2Backend::Infer(std::vector<FDTensor>& inputs,
       input_attrs_[i].type = input_type;
       input_attrs_[i].size = inputs[0].Nbytes();
       input_attrs_[i].size_with_stride = inputs[0].Nbytes();
-      if(input_attrs_[i].type == RKNN_TENSOR_FLOAT16 ||
-         input_attrs_[i].type == RKNN_TENSOR_FLOAT32){
+      if (input_attrs_[i].type == RKNN_TENSOR_FLOAT16 ||
+          input_attrs_[i].type == RKNN_TENSOR_FLOAT32) {
         FDINFO << "The input model is not a quantitative model. "
-                  "Close the normalize operation." << std::endl;
+                  "Close the normalize operation."
+               << std::endl;
       }
 
       input_mems_[i] = rknn_create_mem(ctx, inputs[i].Nbytes());
@@ -474,4 +478,4 @@ RKNPU2Backend::FDDataTypeToRknnTensorType(fastdeploy::FDDataType type) {
   FDERROR << "rknn_tensor_type don't support this type" << std::endl;
   return RKNN_TENSOR_TYPE_MAX;
 }
-} // namespace fastdeploy
\ No newline at end of file
+}  // namespace fastdeploy
\ No newline at end of file
diff --git a/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h b/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h
index af28fdddf..33704679c 100644
--- a/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h
+++ b/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h
@@ -14,9 +14,9 @@
 #pragma once
 
 #include "fastdeploy/backends/backend.h"
-#include "fastdeploy/core/fd_tensor.h"
-#include "rknn_api.h"  // NOLINT
 #include "fastdeploy/backends/rknpu/rknpu2/rknpu2_config.h"
+#include "fastdeploy/core/fd_tensor.h"
+#include "rknn_api.h"  // NOLINT
 #include
 #include
 #include
@@ -71,8 +71,7 @@ class RKNPU2Backend : public BaseBackend {
   TensorInfo GetOutputInfo(int index) override;
   std::vector<TensorInfo> GetInputInfos() override;
   std::vector<TensorInfo> GetOutputInfos() override;
-  bool Infer(std::vector<FDTensor>& inputs,
-             std::vector<FDTensor>* outputs,
+  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
              bool copy_to_fd = true) override;
 
  private:
diff --git a/fastdeploy/backends/rknpu/rknpu2/rknpu2_config.h b/fastdeploy/backends/rknpu/rknpu2/rknpu2_config.h
index 60e1a76aa..7205d0bb4 100644
--- a/fastdeploy/backends/rknpu/rknpu2/rknpu2_config.h
+++ b/fastdeploy/backends/rknpu/rknpu2/rknpu2_config.h
@@ -24,9 +24,9 @@ typedef enum _rknpu2_cpu_name {
 /*! RKNPU2 core mask for mobile device. */
 typedef enum _rknpu2_core_mask {
   RKNN_NPU_CORE_AUTO = 0,  //< default, run on NPU core randomly.
-  RKNN_NPU_CORE_0 = 1,     //< run on NPU core 0.
-  RKNN_NPU_CORE_1 = 2,     //< run on NPU core 1.
-  RKNN_NPU_CORE_2 = 4,     //< run on NPU core 2.
+  RKNN_NPU_CORE_0 = 1,                                   //< run on NPU core 0.
+  RKNN_NPU_CORE_1 = 2,                                   //< run on NPU core 1.
+  RKNN_NPU_CORE_2 = 4,                                   //< run on NPU core 2.
   RKNN_NPU_CORE_0_1 =
       RKNN_NPU_CORE_0 | RKNN_NPU_CORE_1,  //< run on NPU core 1 and core 2.
   RKNN_NPU_CORE_0_1_2 =
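The core-mask values above are bit flags, so the multi-core enumerators are unions of the single-core ones; a compile-time check makes that explicit. (Note the trailing comment in the source reads "core 1 and core 2", though the mask actually selects cores 0 and 1.)

```cpp
static_assert((RKNN_NPU_CORE_0 | RKNN_NPU_CORE_1) == RKNN_NPU_CORE_0_1,
              "multi-core masks are ORs of the single-core bits");
```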
diff --git a/fastdeploy/backends/tensorrt/ops/adaptive_pool2d.cc b/fastdeploy/backends/tensorrt/ops/adaptive_pool2d.cc
index bfec5e356..191ac1560 100755
--- a/fastdeploy/backends/tensorrt/ops/adaptive_pool2d.cc
+++ b/fastdeploy/backends/tensorrt/ops/adaptive_pool2d.cc
@@ -17,108 +17,106 @@
 namespace fastdeploy {
 nvinfer1::PluginFieldCollection AdaptivePool2dPluginCreator::mFC{};
-std::vector<nvinfer1::PluginField> AdaptivePool2dPluginCreator::mPluginAttributes;
+std::vector<nvinfer1::PluginField>
+    AdaptivePool2dPluginCreator::mPluginAttributes;
 
-pluginStatus_t AdaptivePool2dInference(cudaStream_t stream, int32_t n, const void* input, void* output);
+pluginStatus_t AdaptivePool2dInference(cudaStream_t stream, int32_t n,
+                                       const void* input, void* output);
 
-AdaptivePool2d::AdaptivePool2d(std::vector<int64_t> output_size, std::string pooling_type) {
+AdaptivePool2d::AdaptivePool2d(std::vector<int64_t> output_size,
+                               std::string pooling_type) {
   output_size_ = output_size;
   pooling_type_ = pooling_type;
 }
 
 AdaptivePool2d::AdaptivePool2d(const void* buffer, size_t length) {
-    const char *d = reinterpret_cast<const char*>(buffer), *a = d;
-    output_size_.resize(4);
-    for(int64_t i =0 ; i < 4; i++){
-      output_size_[i] =read<int32_t>(d);
-    }
-    if(read<int32_t>(d) == 0){
-      pooling_type_ = "avg";
-    }else{
-      pooling_type_ = "max";
-    }
-    FDASSERT(d == a + length, "deserialize failed.");
+  const char *d = reinterpret_cast<const char*>(buffer), *a = d;
+  output_size_.resize(4);
+  for (int64_t i = 0; i < 4; i++) {
+    output_size_[i] = read<int32_t>(d);
+  }
+  if (read<int32_t>(d) == 0) {
+    pooling_type_ = "avg";
+  } else {
+    pooling_type_ = "max";
+  }
+  FDASSERT(d == a + length, "deserialize failed.");
 }
 
-int AdaptivePool2d::getNbOutputs() const noexcept {
-  return 1;
-}
+int AdaptivePool2d::getNbOutputs() const noexcept { return 1; }
 
 nvinfer1::DimsExprs AdaptivePool2d::getOutputDimensions(
-    int outputIndex, const nvinfer1::DimsExprs* inputs,
-    int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept {
+    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+    nvinfer1::IExprBuilder& exprBuilder) noexcept {
   try {
     nvinfer1::DimsExprs output(inputs[0]);
     output.d[2] = exprBuilder.constant(static_cast<int32_t>(output_size_[2]));
     output.d[3] = exprBuilder.constant(static_cast<int32_t>(output_size_[3]));
     return output;
-  }
-  catch (const std::exception& e) {
-    FDASSERT(false, "getOutputDimensions failed: %s.",e.what());
+  } catch (const std::exception& e) {
+    FDASSERT(false, "getOutputDimensions failed: %s.", e.what());
   }
   return nvinfer1::DimsExprs{};
 }
 
-int AdaptivePool2d::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
-                    const nvinfer1::PluginTensorDesc* outputDesc,
-                    const void* const* inputs,
-                    void* const* outputs,
-                    void* workspace,
-                    cudaStream_t stream) noexcept {
+int AdaptivePool2d::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                            const nvinfer1::PluginTensorDesc* outputDesc,
+                            const void* const* inputs, void* const* outputs,
+                            void* workspace, cudaStream_t stream) noexcept {
   if (inputDesc[0].type != nvinfer1::DataType::kFLOAT) {
-      return -1;
+    return -1;
   }
   auto const* data = static_cast<const float*>(inputs[0]);
   auto* result = static_cast<float*>(outputs[0]);
-  int nums = outputDesc[0].dims.d[0] * outputDesc[0].dims.d[1] * outputDesc[0].dims.d[2]* outputDesc[0].dims.d[3];
+  int nums = outputDesc[0].dims.d[0] * outputDesc[0].dims.d[1] *
+             outputDesc[0].dims.d[2] * outputDesc[0].dims.d[3];
   std::vector<int64_t> input_size, output_size;
-  for(int i =0; i< 4; i++){
+  for (int i = 0; i < 4; i++) {
     input_size.push_back(inputDesc[0].dims.d[i]);
     output_size.push_back(outputDesc[0].dims.d[i]);
   }
-  CudaAdaptivePool(input_size, output_size, result, data, stream, pooling_type_);
+  CudaAdaptivePool(input_size, output_size, result, data, stream,
+                   pooling_type_);
   return cudaPeekAtLastError();
 }
 
 size_t AdaptivePool2d::getSerializationSize() const noexcept {
-  return 5 * sizeof(int32_t) ;
+  return 5 * sizeof(int32_t);
 }
 
-void  AdaptivePool2d::serialize(void* buffer) const noexcept {
+void AdaptivePool2d::serialize(void* buffer) const noexcept {
   char *d = reinterpret_cast<char*>(buffer), *a = d;
-  for(int64_t i=0; i< 4; i++){
+  for (int64_t i = 0; i < 4; i++) {
     write(d, output_size_[i]);
   }
   int32_t pooling_type_val = 0;
-  if(pooling_type_ != "avg"){
+  if (pooling_type_ != "avg") {
     pooling_type_val = 1;
   }
   write(d, pooling_type_val);
   FDASSERT(d == a + getSerializationSize(), "d == a + getSerializationSize()");
 }
 
-nvinfer1::DataType AdaptivePool2d::getOutputDataType(
-    int index, const nvinfer1::DataType* inputType, int nbInputs) const noexcept {
+nvinfer1::DataType
+AdaptivePool2d::getOutputDataType(int index,
+                                  const nvinfer1::DataType* inputType,
+                                  int nbInputs) const noexcept {
   return inputType[0];
 }
 
 bool AdaptivePool2d::supportsFormatCombination(
-    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept {
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
+    int nbOutputs) noexcept {
   return (inOut[pos].format == nvinfer1::PluginFormat::kLINEAR);
 }
 
-int AdaptivePool2d::initialize() noexcept {
-  return 0;
-}
+int AdaptivePool2d::initialize() noexcept { return 0; }
 
-void AdaptivePool2d::terminate() noexcept {
-  return;
-}
+void AdaptivePool2d::terminate() noexcept { return; }
 
-size_t AdaptivePool2d::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
-                          int nbInputs,
-                          const nvinfer1::PluginTensorDesc* outputs,
-                          int nbOutputs) const noexcept {
+size_t AdaptivePool2d::getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
+    const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const noexcept {
   return 0;
 }
@@ -126,33 +124,32 @@ const char* AdaptivePool2d::getPluginType() const noexcept {
   return "AdaptivePool2d";
 }
 
-const char* AdaptivePool2d::getPluginVersion() const noexcept {
-  return "1";
-}
+const char* AdaptivePool2d::getPluginVersion() const noexcept { return "1"; }
 
-void AdaptivePool2d::destroy() noexcept {
+void AdaptivePool2d::destroy() noexcept { return; }
+
+void AdaptivePool2d::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept {
   return;
 }
 
-void AdaptivePool2d::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
-                    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept {
-  return;
-}
 nvinfer1::IPluginV2DynamicExt* AdaptivePool2d::clone() const noexcept {
-  try{
-    nvinfer1::IPluginV2DynamicExt* plugin = new AdaptivePool2d(output_size_, pooling_type_);
-    plugin->setPluginNamespace(mNamespace.c_str());
-    return plugin;
-  }
-  catch (std::exception const& e){
-    FDASSERT(false, "clone failed: %s.",e.what());
+  try {
+    nvinfer1::IPluginV2DynamicExt* plugin =
+        new AdaptivePool2d(output_size_, pooling_type_);
+    plugin->setPluginNamespace(mNamespace.c_str());
+    return plugin;
+  } catch (std::exception const& e) {
+    FDASSERT(false, "clone failed: %s.", e.what());
  }
   return nullptr;
 }
 
 AdaptivePool2dPluginCreator::AdaptivePool2dPluginCreator() {
   mPluginAttributes.clear();
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("output_size", nullptr, nvinfer1::PluginFieldType::kINT32, 4));
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("pooling_type", nullptr, nvinfer1::PluginFieldType::kCHAR, 3));
+  mPluginAttributes.emplace_back(nvinfer1::PluginField(
+      "output_size", nullptr, nvinfer1::PluginFieldType::kINT32, 4));
+  mPluginAttributes.emplace_back(nvinfer1::PluginField(
+      "pooling_type", nullptr, nvinfer1::PluginFieldType::kCHAR, 3));
 
   mFC.nbFields = mPluginAttributes.size();
   mFC.fields = mPluginAttributes.data();
@@ -166,17 +163,18 @@ const char* AdaptivePool2dPluginCreator::getPluginVersion() const noexcept {
   return "1";
 }
 
-const nvinfer1::PluginFieldCollection* AdaptivePool2dPluginCreator::getFieldNames() noexcept {
+const nvinfer1::PluginFieldCollection*
+AdaptivePool2dPluginCreator::getFieldNames() noexcept {
   return &mFC;
 }
 
-nvinfer1::IPluginV2DynamicExt* AdaptivePool2dPluginCreator::createPlugin(const char* name,
-                    const nvinfer1::PluginFieldCollection* fc) noexcept {
-  try{
+nvinfer1::IPluginV2DynamicExt* AdaptivePool2dPluginCreator::createPlugin(
+    const char* name, const nvinfer1::PluginFieldCollection* fc) noexcept {
+  try {
     const nvinfer1::PluginField* fields = fc->fields;
     auto const dims = static_cast<const int32_t*>(fields[0].data);
     output_size_.resize(4);
-    for(int64_t i = 0; i < 4; i++){
+    for (int64_t i = 0; i < 4; i++) {
       output_size_[i] = dims[i];
     }
 
@@ -184,23 +182,20 @@ nvinfer1::IPluginV2DynamicExt* AdaptivePool2dPluginCreator::createPlugin(const c
     std::string pooling_type(pooling_type_ptr, 3);
     pooling_type_ = pooling_type;
     return new AdaptivePool2d(output_size_, pooling_type_);
-  }
-  catch (std::exception const& e){
-    FDASSERT(false, "createPlugin failed: %s.",e.what());
+  } catch (std::exception const& e) {
+    FDASSERT(false, "createPlugin failed: %s.", e.what());
  }
   return nullptr;
 }
 
-nvinfer1::IPluginV2DynamicExt* AdaptivePool2dPluginCreator::deserializePlugin(const char* name,
-                    const void* serialData,
-                    size_t serialLength) noexcept {
-  try{
+nvinfer1::IPluginV2DynamicExt* AdaptivePool2dPluginCreator::deserializePlugin(
+    const char* name, const void* serialData, size_t serialLength) noexcept {
+  try {
     return new AdaptivePool2d(serialData, serialLength);
-  }
-  catch (std::exception const& e){
-    FDASSERT(false, "deserializePlugin failed: %s.",e.what());
+  } catch (std::exception const& e) {
+    FDASSERT(false, "deserializePlugin failed: %s.", e.what());
   }
   return nullptr;
 }
 
-} // namespace fastdeploy
\ No newline at end of file
+}  // namespace fastdeploy
\ No newline at end of file
#pragma once +#include "common.h" // NOLINT #include "fastdeploy/backends/op_cuda_kernels/adaptive_pool2d_kernel.h" -#include "common.h" // NOLINT namespace fastdeploy { class AdaptivePool2d : public BasePlugin { public: - AdaptivePool2d(std::vector output_size, std::string pooling_type); + AdaptivePool2d(std::vector output_size, std::string pooling_type); - AdaptivePool2d(const void* buffer, size_t length); + AdaptivePool2d(const void* buffer, size_t length); - ~AdaptivePool2d() override = default; + ~AdaptivePool2d() override = default; - int getNbOutputs() const noexcept override; + int getNbOutputs() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions( - int outputIndex, - const nvinfer1::DimsExprs* inputs, - int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; + nvinfer1::DimsExprs + getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept override; - nvinfer1::DataType getOutputDataType( - int index, - const nvinfer1::DataType* inputType, - int nbInputs) const noexcept override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputType, + int nbInputs) const noexcept override; - bool supportsFormatCombination( - int pos, - const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, - int nbOutputs) noexcept override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) noexcept override; - int initialize() noexcept override; + int initialize() noexcept override; - void terminate() noexcept override; + void terminate() noexcept override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const noexcept override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const noexcept override; - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, - const void* const* inputs, - void* const* outputs, - void* workspace, - cudaStream_t stream) noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept override; - size_t getSerializationSize() const noexcept override; + size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; + void serialize(void* buffer) const noexcept override; - const char* getPluginType() const noexcept override; + const char* getPluginType() const noexcept override; - const char* getPluginVersion() const noexcept override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) noexcept override; - void destroy() noexcept override; + const char* getPluginVersion() const noexcept override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) noexcept override; + void destroy() noexcept override; - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; private: - std::vector output_size_; - std::string pooling_type_; + std::vector output_size_; + std::string 
 };
 
 class AdaptivePool2dPluginCreator : public BaseCreator {
  public:
-    AdaptivePool2dPluginCreator();
+  AdaptivePool2dPluginCreator();
 
-    ~AdaptivePool2dPluginCreator() override = default;
+  ~AdaptivePool2dPluginCreator() override = default;
 
-    const char* getPluginName() const noexcept override;
+  const char* getPluginName() const noexcept override;
 
-    const char* getPluginVersion() const noexcept override;
+  const char* getPluginVersion() const noexcept override;
 
-    const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override;
+  const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override;
 
-    nvinfer1::IPluginV2DynamicExt* createPlugin(const char* name,
-        const nvinfer1::PluginFieldCollection* fc) noexcept override;
+  nvinfer1::IPluginV2DynamicExt*
+  createPlugin(const char* name,
+               const nvinfer1::PluginFieldCollection* fc) noexcept override;
 
-    nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name,
-        const void* serialData,
-        size_t serialLength) noexcept override;
+  nvinfer1::IPluginV2DynamicExt*
+  deserializePlugin(const char* name, const void* serialData,
+                    size_t serialLength) noexcept override;
 
  private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::vector<int32_t> output_size_;
-    std::string pooling_type_;
+  static nvinfer1::PluginFieldCollection mFC;
+  static std::vector<nvinfer1::PluginField> mPluginAttributes;
+  std::vector<int32_t> output_size_;
+  std::string pooling_type_;
 };
 
 REGISTER_TENSORRT_PLUGIN(AdaptivePool2dPluginCreator);
diff --git a/fastdeploy/backends/tensorrt/ops/common.h b/fastdeploy/backends/tensorrt/ops/common.h
index beada71ff..975582ffd 100755
--- a/fastdeploy/backends/tensorrt/ops/common.h
+++ b/fastdeploy/backends/tensorrt/ops/common.h
@@ -17,40 +17,40 @@
 #include "NvInferPlugin.h"
 #include "NvInferRuntimeCommon.h"
 #include "fastdeploy/utils/utils.h"
+#include <cstring>
 #include <iostream>
+#include <map>
+#include <memory>
 #include <string>
 #include <vector>
-#include <cstring>
-#include <map>
-#include <memory>
 
 namespace fastdeploy {
 
 class BasePlugin : public nvinfer1::IPluginV2DynamicExt {
 protected:
-    void setPluginNamespace(const char* libNamespace) noexcept override {
-        mNamespace = libNamespace;
-    }
+  void setPluginNamespace(const char* libNamespace) noexcept override {
+    mNamespace = libNamespace;
+  }
 
-    const char* getPluginNamespace() const noexcept override {
-        return mNamespace.c_str();
-    }
+  const char* getPluginNamespace() const noexcept override {
+    return mNamespace.c_str();
+  }
 
-    std::string mNamespace;
+  std::string mNamespace;
 };
 
 class BaseCreator : public nvinfer1::IPluginCreator {
 public:
-    void setPluginNamespace(const char* libNamespace) noexcept override {
-        mNamespace = libNamespace;
-    }
+  void setPluginNamespace(const char* libNamespace) noexcept override {
+    mNamespace = libNamespace;
+  }
 
-    const char* getPluginNamespace() const noexcept override {
-        return mNamespace.c_str();
-    }
+  const char* getPluginNamespace() const noexcept override {
+    return mNamespace.c_str();
+  }
 
 protected:
-    std::string mNamespace;
+  std::string mNamespace;
 };
 
 typedef enum {
@@ -62,19 +62,17 @@ typedef enum {
 } pluginStatus_t;
 
 // Write values into buffer
-template <typename T>
-void write(char*& buffer, const T& val) {
-    std::memcpy(buffer, &val, sizeof(T));
-    buffer += sizeof(T);
+template <typename T> void write(char*& buffer, const T& val) {
+  std::memcpy(buffer, &val, sizeof(T));
+  buffer += sizeof(T);
 }
 
 // Read values from buffer
-template <typename T>
-T read(const char*& buffer) {
-    T val{};
-    std::memcpy(&val, buffer, sizeof(T));
-    buffer += sizeof(T);
-    return val;
+template <typename T> T read(const char*& buffer) {
+  T val{};
+  std::memcpy(&val, buffer, sizeof(T));
+  buffer += sizeof(T);
+  return val;
 }
 
 }  // namespace fastdeploy
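The write/read helpers above are the standard raw-buffer (de)serialization idiom for TensorRT plugins: serialize() advances a char* while writing POD members in a fixed order, and the deserializing constructor reads them back in the same order. A self-contained sketch using the same helpers; the serialized values mirror AdaptivePool2d's output_size_ member, and the buffer size is illustrative:

#include <cstring>
#include <vector>

template <typename T> void write(char*& buffer, const T& val) {
  std::memcpy(buffer, &val, sizeof(T));
  buffer += sizeof(T);
}

template <typename T> T read(const char*& buffer) {
  T val{};
  std::memcpy(&val, buffer, sizeof(T));
  buffer += sizeof(T);
  return val;
}

int main() {
  const std::vector<int32_t> output_size = {1, 3, 2, 2};
  char storage[4 * sizeof(int32_t)];

  // serialize(): write each POD field in a fixed order.
  char* w = storage;
  for (int32_t v : output_size) write(w, v);

  // Deserializing constructor: read the fields back in the same order.
  const char* r = storage;
  std::vector<int32_t> restored;
  for (int i = 0; i < 4; ++i) restored.push_back(read<int32_t>(r));
  return restored == output_size ? 0 : 1;
}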
diff --git a/fastdeploy/backends/tensorrt/trt_backend.cc b/fastdeploy/backends/tensorrt/trt_backend.cc
index 3a8659ace..bdd23c8d6 100755
--- a/fastdeploy/backends/tensorrt/trt_backend.cc
+++ b/fastdeploy/backends/tensorrt/trt_backend.cc
@@ -134,9 +134,9 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
   int calibration_cache_size = 0;
   if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
                            &model_content_ptr, &model_content_size, 11, true,
-                           verbose, true, true, true, ops.data(),
-                           1, "tensorrt",
-                           &calibration_cache_ptr, &calibration_cache_size, "", &save_external_)) {
+                           verbose, true, true, true, ops.data(), 1, "tensorrt",
+                           &calibration_cache_ptr, &calibration_cache_size, "",
+                           &save_external_)) {
     FDERROR << "Error occurred while exporting PaddlePaddle to ONNX format."
             << std::endl;
     return false;
@@ -152,11 +152,11 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
     calibration_str_ = calibration_str;
     delete[] calibration_cache_ptr;
   }
-  if(save_external_){
+  if (save_external_) {
     model_file_name_ = "model.onnx";
     std::fstream f(model_file_name_, std::ios::out);
     FDASSERT(f.is_open(), "Cannot open file: %s to save model.",
-            model_file_name_.c_str());
+             model_file_name_.c_str());
     f << onnx_model_proto;
     f.close();
     return InitFromOnnx(model_file_name_, option, false);
@@ -215,13 +215,14 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
   outputs_desc_.resize(onnx_reader.num_outputs);
   for (int i = 0; i < onnx_reader.num_inputs; ++i) {
     std::string name(onnx_reader.inputs[i].name);
-    std::vector<int64_t> shape(
-        onnx_reader.inputs[i].shape,
-        onnx_reader.inputs[i].shape + onnx_reader.inputs[i].rank);
+    std::vector<int64_t> shape(onnx_reader.inputs[i].shape,
+                               onnx_reader.inputs[i].shape +
+                                   onnx_reader.inputs[i].rank);
     inputs_desc_[i].name = name;
     inputs_desc_[i].shape.assign(shape.begin(), shape.end());
     inputs_desc_[i].dtype = ReaderDtypeToTrtDtype(onnx_reader.inputs[i].dtype);
-    inputs_desc_[i].original_dtype = ReaderDtypeToFDDtype(onnx_reader.inputs[i].dtype);
+    inputs_desc_[i].original_dtype =
+        ReaderDtypeToFDDtype(onnx_reader.inputs[i].dtype);
     auto info = ShapeRangeInfo(shape);
     info.name = name;
     auto iter_min = option.min_shape.find(name);
@@ -237,9 +238,9 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
   for (int i = 0; i < onnx_reader.num_outputs; ++i) {
     std::string name(onnx_reader.outputs[i].name);
-    std::vector<int64_t> shape(
-        onnx_reader.outputs[i].shape,
-        onnx_reader.outputs[i].shape + onnx_reader.outputs[i].rank);
+    std::vector<int64_t> shape(onnx_reader.outputs[i].shape,
+                               onnx_reader.outputs[i].shape +
+                                   onnx_reader.outputs[i].rank);
     outputs_desc_[i].name = name;
     outputs_desc_[i].shape.assign(shape.begin(), shape.end());
     outputs_desc_[i].dtype =
@@ -252,10 +253,10 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
     stream_ = reinterpret_cast<cudaStream_t>(option_.external_stream_);
   } else {
     FDASSERT(cudaStreamCreate(&stream_) == 0,
-            "[ERROR] Error occurs while calling cudaStreamCreate().");
+             "[ERROR] Error occurs while calling cudaStreamCreate().");
   }
 
-  if(save_external_){
+  if (save_external_) {
     onnx_content.clear();
     onnx_content = model_file_name_;
   }
@@ -283,8 +284,7 @@ int TrtBackend::ShapeRangeInfoUpdated(const std::vector<FDTensor>& inputs) {
 }
 
 bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
-                       std::vector<FDTensor>* outputs,
-                       bool copy_to_fd) {
+                       std::vector<FDTensor>* outputs, bool copy_to_fd) {
   if (inputs.size() != NumInputs()) {
     FDERROR << "Require " << NumInputs() << " inputs, but got "
             << inputs.size() << "." << std::endl;
@@ -297,7 +297,8 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
         << "TensorRT engine will be rebuilt once shape range information "
           "changed, this may take lots of time, you can set a proper shape "
           "range before loading model to avoid rebuilding process. refer "
-          "https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/en/faq/"
+          "https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/en/"
+          "faq/"
           "tensorrt_tricks.md for more details."
         << std::endl;
     BuildTrtEngine();
@@ -314,38 +315,42 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
   for (size_t i = 0; i < outputs->size(); ++i) {
     // if the final output tensor's dtype is different from the model output tensor's dtype,
     // then we need to cast the data to the final output's dtype
-    auto model_output_dtype = GetFDDataType(outputs_device_buffer_[(*outputs)[i].name].dtype());
+    auto model_output_dtype =
+        GetFDDataType(outputs_device_buffer_[(*outputs)[i].name].dtype());
     if ((*outputs)[i].dtype != model_output_dtype) {
       FDTensor output_tensor;
-      output_tensor.SetExternalData((*outputs)[i].shape, model_output_dtype,
-                                    outputs_device_buffer_[(*outputs)[i].name].data(),
-                                    Device::GPU);
-
-      casted_output_tensors_[(*outputs)[i].name].Resize((*outputs)[i].shape, (*outputs)[i].dtype,
-                                                        (*outputs)[i].name, Device::GPU);
-      function::CudaCast(output_tensor, &casted_output_tensors_[(*outputs)[i].name], stream_);
-      if(!copy_to_fd) {
-        (*outputs)[i].SetExternalData((*outputs)[i].shape, model_output_dtype,
-                                      casted_output_tensors_[(*outputs)[i].name].MutableData(),
-                                      Device::GPU, option_.gpu_id);
+      output_tensor.SetExternalData(
+          (*outputs)[i].shape, model_output_dtype,
+          outputs_device_buffer_[(*outputs)[i].name].data(), Device::GPU);
+
+      casted_output_tensors_[(*outputs)[i].name].Resize(
+          (*outputs)[i].shape, (*outputs)[i].dtype, (*outputs)[i].name,
+          Device::GPU);
+      function::CudaCast(output_tensor,
+                         &casted_output_tensors_[(*outputs)[i].name], stream_);
+      if (!copy_to_fd) {
+        (*outputs)[i].SetExternalData(
+            (*outputs)[i].shape, model_output_dtype,
+            casted_output_tensors_[(*outputs)[i].name].MutableData(),
+            Device::GPU, option_.gpu_id);
       }
     } else {
       casted_output_tensors_[(*outputs)[i].name].SetExternalData(
           (*outputs)[i].shape, model_output_dtype,
-          outputs_device_buffer_[(*outputs)[i].name].data(),
-          Device::GPU);
+          outputs_device_buffer_[(*outputs)[i].name].data(), Device::GPU);
     }
   }
   if (copy_to_fd) {
     for (size_t i = 0; i < outputs->size(); ++i) {
-      FDASSERT(cudaMemcpyAsync((*outputs)[i].Data(),
-                               casted_output_tensors_[(*outputs)[i].name].Data(),
-                               (*outputs)[i].Nbytes(), cudaMemcpyDeviceToHost,
-                               stream_) == 0,
-               "[ERROR] Error occurs while copy memory from GPU to CPU.");
+      FDASSERT(
+          cudaMemcpyAsync((*outputs)[i].Data(),
+                          casted_output_tensors_[(*outputs)[i].name].Data(),
+                          (*outputs)[i].Nbytes(), cudaMemcpyDeviceToHost,
+                          stream_) == 0,
+          "[ERROR] Error occurs while copying memory from GPU to CPU.");
     }
     FDASSERT(cudaStreamSynchronize(stream_) == cudaSuccess,
-            "[ERROR] Error occurs while sync cuda stream.");
+             "[ERROR] Error occurs while synchronizing CUDA stream.");
   }
 
   return true;
@@ -356,10 +361,12 @@ void TrtBackend::GetInputOutputInfo() {
   std::unordered_map<std::string, FDDataType> inputs_original_dtype_map;
   std::unordered_map<std::string, FDDataType> outputs_original_dtype_map;
   for (size_t i = 0; i < inputs_desc_.size(); ++i) {
-    inputs_original_dtype_map[inputs_desc_[i].name] = inputs_desc_[i].original_dtype;
+    inputs_original_dtype_map[inputs_desc_[i].name] =
+        inputs_desc_[i].original_dtype;
   }
   for (size_t i = 0; i < outputs_desc_.size(); ++i) {
-    outputs_original_dtype_map[outputs_desc_[i].name] = outputs_desc_[i].original_dtype;
+    outputs_original_dtype_map[outputs_desc_[i].name] =
+        outputs_desc_[i].original_dtype;
   }
 
   // Re-read the tensor infos from TRT model and write into inputs_desc_ and outputs_desc_
@@ -373,12 +380,18 @@ void TrtBackend::GetInputOutputInfo() {
     auto shape = ToVec(engine_->getBindingDimensions(i));
     auto dtype = engine_->getBindingDataType(i);
     if (engine_->bindingIsInput(i)) {
-      auto original_dtype = inputs_original_dtype_map.count(name) ? inputs_original_dtype_map[name] : GetFDDataType(dtype);
-      inputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype, original_dtype});
+      auto original_dtype = inputs_original_dtype_map.count(name)
+                                ? inputs_original_dtype_map[name]
+                                : GetFDDataType(dtype);
+      inputs_desc_.emplace_back(
+          TrtValueInfo{name, shape, dtype, original_dtype});
       inputs_device_buffer_[name] = FDDeviceBuffer(dtype);
     } else {
-      auto original_dtype = outputs_original_dtype_map.count(name) ? outputs_original_dtype_map[name] : GetFDDataType(dtype);
-      outputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype, original_dtype});
+      auto original_dtype = outputs_original_dtype_map.count(name)
+                                ? outputs_original_dtype_map[name]
+                                : GetFDDataType(dtype);
+      outputs_desc_.emplace_back(
+          TrtValueInfo{name, shape, dtype, original_dtype});
       outputs_device_buffer_[name] = FDDeviceBuffer(dtype);
       casted_output_tensors_[name] = FDTensor();
     }
@@ -391,8 +404,9 @@ void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
   for (const auto& item : inputs) {
     // auto idx = engine_->getBindingIndex(item.name.c_str());
     auto iter = io_name_index_.find(item.name);
-    FDASSERT(iter != io_name_index_.end(), "TRTBackend SetInputs not find name:%s", item.name.c_str());
-    auto idx = iter->second;
+    FDASSERT(iter != io_name_index_.end(),
+             "TRTBackend SetInputs cannot find name: %s", item.name.c_str());
+    auto idx = iter->second;
     std::vector shape(item.shape.begin(), item.shape.end());
     auto dims = ToDims(shape);
     context_->setBindingDimensions(idx, dims);
@@ -424,9 +438,8 @@ void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
                "Error occurs while copying memory from CPU to GPU.");
     } else {
       FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
-                               item.Data(),
-                               item.Nbytes(), cudaMemcpyHostToDevice,
-                               stream_) == 0,
+                               item.Data(), item.Nbytes(),
+                               cudaMemcpyHostToDevice, stream_) == 0,
                "Error occurs while copying memory from CPU to GPU.");
     }
   }
@@ -443,8 +456,10 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs,
   for (size_t i = 0; i < outputs_desc_.size(); ++i) {
     // auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
     auto idx_iter = io_name_index_.find(outputs_desc_[i].name);
-    FDASSERT(idx_iter != io_name_index_.end(), "TRTBackend Outputs not find name:%s", outputs_desc_[i].name.c_str());
-    auto idx = idx_iter->second;
+    FDASSERT(idx_iter != io_name_index_.end(),
+             "TRTBackend Outputs cannot find name: %s",
+             outputs_desc_[i].name.c_str());
+    auto idx = idx_iter->second;
     auto output_dims = context_->getBindingDimensions(idx);
 
     // find the original index of output
@@ -457,23 +472,22 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs,
     // Allocate output buffer memory
     outputs_device_buffer_[outputs_desc_[i].name].resize(output_dims);
-    
+
     // binding output buffer
-    bindings_[idx] = outputs_device_buffer_[outputs_desc_[i].name].data(); 
-    
+    bindings_[idx] = outputs_device_buffer_[outputs_desc_[i].name].data();
+
     // set user's outputs info
     std::vector<int64_t> shape(output_dims.d,
                                output_dims.d + output_dims.nbDims);
-    if(copy_to_fd) {
+    if (copy_to_fd) {
       (*outputs)[ori_idx].is_pinned_memory = option_.enable_pinned_memory;
       (*outputs)[ori_idx].Resize(shape, outputs_desc_[i].original_dtype,
                                  outputs_desc_[i].name);
     } else {
       (*outputs)[ori_idx].name = outputs_desc_[i].name;
       (*outputs)[ori_idx].SetExternalData(
-          shape, outputs_desc_[i].original_dtype,
-          bindings_[idx], Device::GPU,
-          option_.gpu_id);
+          shape, outputs_desc_[i].original_dtype, bindings_[idx], Device::GPU,
+          option_.gpu_id);
     }
   }
 }
@@ -587,7 +601,8 @@ bool TrtBackend::BuildTrtEngine() {
   if (option_.serialize_file != "") {
     FDINFO << "Serialize TensorRTEngine to local file "
            << option_.serialize_file << "." << std::endl;
-    std::ofstream engine_file(option_.serialize_file.c_str(), std::ios::binary | std::ios::out);
+    std::ofstream engine_file(option_.serialize_file.c_str(),
+                              std::ios::binary | std::ios::out);
     if (!engine_file) {
       FDERROR << "Failed to open " << option_.serialize_file << " to write."
               << std::endl;
@@ -628,10 +643,11 @@ bool TrtBackend::CreateTrtEngineFromOnnx(const std::string& onnx_model_buffer) {
     return false;
   }
   bool model_parser;
-  if(save_external_){
-    model_parser=!parser_->parseFromFile(onnx_model_buffer.c_str(), 0);
-  }else{
-    model_parser = !parser_->parse(onnx_model_buffer.data(), onnx_model_buffer.size());
+  if (save_external_) {
+    model_parser = !parser_->parseFromFile(onnx_model_buffer.c_str(), 0);
+  } else {
+    model_parser =
+        !parser_->parse(onnx_model_buffer.data(), onnx_model_buffer.size());
   }
   if (model_parser) {
     FDERROR << "Failed to parse ONNX model by TensorRT." << std::endl;
@@ -665,7 +681,8 @@ bool TrtBackend::CreateTrtEngineFromOnnx(const std::string& onnx_model_buffer) {
           "should be noticed that FastDeploy will rebuild the engine while "
           "new input shape is out of the collected shape range, this may "
           "bring some time consuming problem, refer "
-          "https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/en/faq/"
+          "https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/en/"
+          "faq/"
           "tensorrt_tricks.md for more details."
         << std::endl;
   initialized_ = true;
@@ -721,27 +738,24 @@ std::vector<TensorInfo> TrtBackend::GetOutputInfos() {
   return infos;
 }
 
-std::unique_ptr<BaseBackend> TrtBackend::Clone(void *stream, int device_id) {
+std::unique_ptr<BaseBackend> TrtBackend::Clone(void* stream, int device_id) {
   std::unique_ptr<BaseBackend> new_backend = utils::make_unique<TrtBackend>();
   auto casted_backend = dynamic_cast<TrtBackend*>(new_backend.get());
-  if(device_id > 0 && device_id != option_.gpu_id) {
+  if (device_id > 0 && device_id != option_.gpu_id) {
    auto clone_option = option_;
    clone_option.gpu_id = device_id;
    clone_option.external_stream_ = stream;
    if (option_.model_format == ModelFormat::ONNX) {
      FDASSERT(casted_backend->InitFromOnnx(option_.model_file, clone_option),
-              "Clone model from ONNX failed while initialize TrtBackend.");
+              "Clone model from ONNX failed while initializing TrtBackend.");
    } else {
-     FDASSERT(casted_backend->InitFromPaddle(option_.model_file,
-                                             option_.params_file, clone_option),
-              "Clone model from Paddle failed while initialize TrtBackend.");
+     FDASSERT(casted_backend->InitFromPaddle(
+                  option_.model_file, option_.params_file, clone_option),
+              "Clone model from Paddle failed while initializing TrtBackend.");
    }
-    FDWARNING << "The target device id:"
-              << device_id
-              << " is different from current device id:"
-              << option_.gpu_id
-              << ", cannot share memory with current engine."
-              << std::endl;
+    FDWARNING << "The target device id:" << device_id
+              << " is different from current device id:" << option_.gpu_id
+              << ", cannot share memory with current engine." << std::endl;
     return new_backend;
   }
   cudaSetDevice(option_.gpu_id);
@@ -750,12 +764,15 @@ std::unique_ptr<BaseBackend> TrtBackend::Clone(void *stream, int device_id) {
     casted_backend->stream_ = reinterpret_cast<cudaStream_t>(stream);
   } else {
     FDASSERT(cudaStreamCreate(&casted_backend->stream_) == 0,
-            "[ERROR] Error occurs while clone calling cudaStreamCreate().");
+             "[ERROR] Error occurs while calling cudaStreamCreate() in Clone.");
   }
   casted_backend->inputs_desc_.assign(inputs_desc_.begin(), inputs_desc_.end());
-  casted_backend->outputs_desc_.assign(outputs_desc_.begin(), outputs_desc_.end());
-  casted_backend->outputs_order_.insert(outputs_order_.begin(), outputs_order_.end());
-  casted_backend->shape_range_info_.insert(shape_range_info_.begin(), shape_range_info_.end());
+  casted_backend->outputs_desc_.assign(outputs_desc_.begin(),
+                                       outputs_desc_.end());
+  casted_backend->outputs_order_.insert(outputs_order_.begin(),
+                                        outputs_order_.end());
+  casted_backend->shape_range_info_.insert(shape_range_info_.begin(),
+                                           shape_range_info_.end());
   casted_backend->engine_ = engine_;
   casted_backend->context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
       casted_backend->engine_->createExecutionContext());
diff --git a/fastdeploy/backends/tensorrt/trt_backend.h b/fastdeploy/backends/tensorrt/trt_backend.h
index 425087fad..c22f09787 100755
--- a/fastdeploy/backends/tensorrt/trt_backend.h
+++ b/fastdeploy/backends/tensorrt/trt_backend.h
@@ -58,7 +58,7 @@ namespace fastdeploy {
 struct TrtValueInfo {
   std::string name;
   std::vector shape;
-  nvinfer1::DataType dtype; // dtype of TRT model
+  nvinfer1::DataType dtype;   // dtype of TRT model
   FDDataType original_dtype;  // dtype of original ONNX/Paddle model
 };
@@ -97,8 +97,7 @@ class TrtBackend : public BaseBackend {
   bool InitFromOnnx(const std::string& model_file,
                     const TrtBackendOption& option = TrtBackendOption(),
                     bool from_memory_buffer = false);
-  bool Infer(std::vector<FDTensor>& inputs,
-             std::vector<FDTensor>* outputs,
+  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
              bool copy_to_fd = true) override;
 
   int NumInputs() const { return inputs_desc_.size(); }
@@ -107,7 +106,7 @@ class TrtBackend : public BaseBackend {
   TensorInfo GetOutputInfo(int index);
   std::vector<TensorInfo> GetInputInfos() override;
   std::vector<TensorInfo> GetOutputInfos() override;
-  std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
+  std::unique_ptr<BaseBackend> Clone(void* stream = nullptr,
                                      int device_id = -1) override;
 
   ~TrtBackend() {
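Clone() above takes two branches: a different target device cannot share GPU memory, so the clone is fully re-initialized from the original ONNX/Paddle model, while the default (same device) shares the built engine_ and only creates a fresh execution context and stream. A hedged usage sketch; the backend setup is elided and the argument values are illustrative:

#include "fastdeploy/backends/tensorrt/trt_backend.h"

// Assumes `backend` was already initialized on GPU 0.
void CloneBackends(fastdeploy::TrtBackend& backend, cudaStream_t worker_stream) {
  // Same device (device_id defaults to -1): shares engine_, gets its own
  // execution context and runs on the caller-provided stream.
  auto same_device = backend.Clone(worker_stream);

  // Different device id: rebuilds from the original model on GPU 1 and
  // logs the "cannot share memory" warning shown above.
  auto other_device = backend.Clone(/*stream=*/nullptr, /*device_id=*/1);
}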
diff --git a/fastdeploy/backends/tensorrt/utils.h b/fastdeploy/backends/tensorrt/utils.h
index af62c445e..3d4c11f31 100644
--- a/fastdeploy/backends/tensorrt/utils.h
+++ b/fastdeploy/backends/tensorrt/utils.h
@@ -32,17 +32,15 @@ namespace fastdeploy {
 
 struct FDInferDeleter {
-  template <typename T>
-  void operator()(T* obj) const {
+  template <typename T> void operator()(T* obj) const {
     if (obj) {
       delete obj;
-// obj->destroy();
+      // obj->destroy();
     }
   }
 };
 
-template <typename T>
-using FDUniquePtr = std::unique_ptr<T, FDInferDeleter>;
+template <typename T> using FDUniquePtr = std::unique_ptr<T, FDInferDeleter>;
 
 int64_t Volume(const nvinfer1::Dims& d);
 
@@ -72,17 +70,13 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& vec) {
   return out;
 }
 
-template <typename AllocFunc, typename FreeFunc>
-class FDGenericBuffer {
+template <typename AllocFunc, typename FreeFunc> class FDGenericBuffer {
  public:
   //!
   //! \brief Construct an empty buffer.
   //!
   explicit FDGenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT)
-      : mSize(0),
-        mCapacity(0),
-        mType(type),
-        mBuffer(nullptr),
+      : mSize(0), mCapacity(0), mType(type), mBuffer(nullptr),
         mExternal_buffer(nullptr) {}
 
   //!
@@ -104,9 +98,7 @@
   }
 
   FDGenericBuffer(FDGenericBuffer&& buf)
-      : mSize(buf.mSize),
-        mCapacity(buf.mCapacity),
-        mType(buf.mType),
+      : mSize(buf.mSize), mCapacity(buf.mCapacity), mType(buf.mType),
         mBuffer(buf.mBuffer) {
     buf.mSize = 0;
     buf.mCapacity = 0;
@@ -133,7 +125,8 @@
   //! \brief Returns pointer to underlying array.
   //!
   void* data() {
-    if (mExternal_buffer != nullptr) return mExternal_buffer;
+    if (mExternal_buffer != nullptr)
+      return mExternal_buffer;
     return mBuffer;
   }
 
   //!
   //! \brief Returns pointer to underlying array.
   //!
   const void* data() const {
-    if (mExternal_buffer != nullptr) return mExternal_buffer;
+    if (mExternal_buffer != nullptr)
+      return mExternal_buffer;
     return mBuffer;
   }
 
@@ -213,8 +207,8 @@
 };
 
 using FDDeviceBuffer = FDGenericBuffer;
-using FDDeviceHostBuffer = FDGenericBuffer;
+using FDDeviceHostBuffer =
+    FDGenericBuffer;
 
 class FDTrtLogger : public nvinfer1::ILogger {
 public:
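FDGenericBuffer follows the GenericBuffer pattern from the TensorRT samples: a dtype-tagged buffer whose data() prefers an externally attached pointer (mExternal_buffer) over the internally owned allocation. A sketch of how trt_backend.cc uses the FDDeviceBuffer alias for output bindings; the map-based bookkeeping mirrors outputs_device_buffer_, and the resize(nvinfer1::Dims) overload and float dtype are assumptions drawn from the calls shown above:

#include <map>
#include <string>
#include "fastdeploy/backends/tensorrt/utils.h"

// Mirrors the per-output bookkeeping in trt_backend.cc.
std::map<std::string, fastdeploy::FDDeviceBuffer> device_buffers;

void* PrepareOutputBinding(nvinfer1::IExecutionContext* context,
                           const std::string& name, int binding_index) {
  auto& buffer = device_buffers[name];  // default-constructed with kFLOAT
  // resize() grows the owned device allocation to the resolved binding shape.
  buffer.resize(context->getBindingDimensions(binding_index));
  // data() would return an external pointer if one were attached; here it
  // is the internally owned GPU allocation, suitable for bindings_[idx].
  return buffer.data();
}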
-#include "fastdeploy/backends/common/multiclass_nms.h" +#include "fastdeploy/vision/detection/ppdet/multiclass_nms.h" #include #include "fastdeploy/core/fd_tensor.h" #include "fastdeploy/utils/utils.h" namespace fastdeploy { -namespace backend { +namespace vision { +namespace detection { template bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2) { @@ -79,7 +80,7 @@ float JaccardOverlap(const float* box1, const float* box2, } } -void MultiClassNMS::FastNMS(const float* boxes, const float* scores, +void PaddleMultiClassNMS::FastNMS(const float* boxes, const float* scores, const int& num_boxes, std::vector* keep_indices) { std::vector> sorted_indices; @@ -109,7 +110,7 @@ void MultiClassNMS::FastNMS(const float* boxes, const float* scores, } } -int MultiClassNMS::NMSForEachSample( +int PaddleMultiClassNMS::NMSForEachSample( const float* boxes, const float* scores, int num_boxes, int num_classes, std::map>* keep_indices) { for (int i = 0; i < num_classes; ++i) { @@ -152,7 +153,7 @@ int MultiClassNMS::NMSForEachSample( return num_det; } -void MultiClassNMS::Compute(const float* boxes_data, const float* scores_data, +void PaddleMultiClassNMS::Compute(const float* boxes_data, const float* scores_data, const std::vector& boxes_dim, const std::vector& scores_dim) { int score_size = scores_dim.size(); @@ -220,5 +221,6 @@ void MultiClassNMS::Compute(const float* boxes_data, const float* scores_data, } } } -} // namespace backend +} // namespace detection +} // namespace vision } // namespace fastdeploy diff --git a/fastdeploy/backends/common/multiclass_nms.h b/fastdeploy/vision/detection/ppdet/multiclass_nms.h similarity index 92% rename from fastdeploy/backends/common/multiclass_nms.h rename to fastdeploy/vision/detection/ppdet/multiclass_nms.h index 48a3d9336..e7e66cae3 100644 --- a/fastdeploy/backends/common/multiclass_nms.h +++ b/fastdeploy/vision/detection/ppdet/multiclass_nms.h @@ -18,8 +18,9 @@ #include namespace fastdeploy { -namespace backend { -struct MultiClassNMS { +namespace vision { +namespace detection { +struct PaddleMultiClassNMS { int64_t background_label = -1; int64_t keep_top_k = -1; float nms_eta; @@ -40,6 +41,6 @@ struct MultiClassNMS { const std::vector& boxes_dim, const std::vector& scores_dim); }; -} // namespace backend - +} // namespace detection +} // namespace vision } // namespace fastdeploy diff --git a/fastdeploy/vision/detection/ppdet/postprocessor.cc b/fastdeploy/vision/detection/ppdet/postprocessor.cc index 7b72e24ba..f09a21556 100644 --- a/fastdeploy/vision/detection/ppdet/postprocessor.cc +++ b/fastdeploy/vision/detection/ppdet/postprocessor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "fastdeploy/vision/detection/ppdet/postprocessor.h" +#include "fastdeploy/vision/detection/ppdet/multiclass_nms.h" #include "fastdeploy/vision/utils/utils.h" namespace fastdeploy { @@ -176,7 +177,7 @@ bool PaddleDetPostprocessor::ProcessUnDecodeResults( return false; } - backend::MultiClassNMS nms; + PaddleMultiClassNMS nms; nms.background_label = -1; nms.keep_top_k = 100; nms.nms_eta = 1.0;