diff --git a/csrc/fastdeploy/CMakeLists.txt b/csrc/fastdeploy/CMakeLists.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/csrc/fastdeploy/backends/backend.h b/csrc/fastdeploy/backends/backend.h
new file mode 100644
index 000000000..de7b5a575
--- /dev/null
+++ b/csrc/fastdeploy/backends/backend.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "fastdeploy/backends/common/multiclass_nms.h"
+#include "fastdeploy/core/fd_tensor.h"
+
+namespace fastdeploy {
+
+struct TensorInfo {
+  std::string name;
+  std::vector<int> shape;
+  FDDataType dtype;
+};
+
+class BaseBackend {
+ public:
+  bool initialized_ = false;
+
+  BaseBackend() {}
+  virtual ~BaseBackend() = default;
+
+  virtual bool Initialized() const { return initialized_; }
+
+  virtual int NumInputs() const = 0;
+  virtual int NumOutputs() const = 0;
+  virtual TensorInfo GetInputInfo(int index) = 0;
+  virtual TensorInfo GetOutputInfo(int index) = 0;
+  virtual bool Infer(std::vector<FDTensor>& inputs,
+                     std::vector<FDTensor>* outputs) = 0;
+};
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/backends/common/multiclass_nms.cc b/csrc/fastdeploy/backends/common/multiclass_nms.cc
new file mode 100644
index 000000000..c3d65ec7d
--- /dev/null
+++ b/csrc/fastdeploy/backends/common/multiclass_nms.cc
@@ -0,0 +1,224 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/backends/common/multiclass_nms.h"
+#include <algorithm>
+#include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+namespace backend {
+template <class T>
+bool SortScorePairDescend(const std::pair<float, T>& pair1,
+                          const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+void GetMaxScoreIndex(const float* scores, const int& score_size,
+                      const float& threshold, const int& top_k,
+                      std::vector<std::pair<float, int>>* sorted_indices) {
+  for (size_t i = 0; i < score_size; ++i) {
+    if (scores[i] > threshold) {
+      sorted_indices->push_back(std::make_pair(scores[i], i));
+    }
+  }
+  // Sort the score pair according to the scores in descending order
+  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                   SortScorePairDescend<int>);
+  // Keep top_k scores if needed.
+  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
+    sorted_indices->resize(top_k);
+  }
+}
+
+float BBoxArea(const float* box, const bool& normalized) {
+  if (box[2] < box[0] || box[3] < box[1]) {
+    // If coordinate values are invalid
+    // (e.g. xmax < xmin or ymax < ymin), return 0.
+    return 0.f;
+  } else {
+    const float w = box[2] - box[0];
+    const float h = box[3] - box[1];
+    if (normalized) {
+      return w * h;
+    } else {
+      // If coordinate values are not within range [0, 1].
+      return (w + 1) * (h + 1);
+    }
+  }
+}
+
+float JaccardOverlap(const float* box1, const float* box2,
+                     const bool& normalized) {
+  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+      box2[3] < box1[1]) {
+    return 0.f;
+  } else {
+    const float inter_xmin = std::max(box1[0], box2[0]);
+    const float inter_ymin = std::max(box1[1], box2[1]);
+    const float inter_xmax = std::min(box1[2], box2[2]);
+    const float inter_ymax = std::min(box1[3], box2[3]);
+    float norm = normalized ? 0.0f : 1.0f;
+    float inter_w = inter_xmax - inter_xmin + norm;
+    float inter_h = inter_ymax - inter_ymin + norm;
+    const float inter_area = inter_w * inter_h;
+    const float bbox1_area = BBoxArea(box1, normalized);
+    const float bbox2_area = BBoxArea(box2, normalized);
+    return inter_area / (bbox1_area + bbox2_area - inter_area);
+  }
+}
+
+void MultiClassNMS::FastNMS(const float* boxes, const float* scores,
+                            const int& num_boxes,
+                            std::vector<int>* keep_indices) {
+  std::vector<std::pair<float, int>> sorted_indices;
+  GetMaxScoreIndex(scores, num_boxes, score_threshold, nms_top_k,
+                   &sorted_indices);
+
+  float adaptive_threshold = nms_threshold;
+  while (sorted_indices.size() != 0) {
+    const int idx = sorted_indices.front().second;
+    bool keep = true;
+    for (size_t k = 0; k < keep_indices->size(); ++k) {
+      if (!keep) {
+        break;
+      }
+      const int kept_idx = (*keep_indices)[k];
+      float overlap =
+          JaccardOverlap(boxes + idx * 4, boxes + kept_idx * 4, normalized);
+      keep = overlap <= adaptive_threshold;
+    }
+    if (keep) {
+      keep_indices->push_back(idx);
+    }
+    sorted_indices.erase(sorted_indices.begin());
+    if (keep && nms_eta < 1.0 && adaptive_threshold > 0.5) {
+      adaptive_threshold *= nms_eta;
+    }
+  }
+}
+
+int MultiClassNMS::NMSForEachSample(
+    const float* boxes, const float* scores, int num_boxes, int num_classes,
+    std::map<int, std::vector<int>>* keep_indices) {
+  for (int i = 0; i < num_classes; ++i) {
+    if (i == background_label) {
+      continue;
+    }
+    const float* score_for_class_i = scores + i * num_boxes;
+    FastNMS(boxes, score_for_class_i, num_boxes, &((*keep_indices)[i]));
+  }
+  int num_det = 0;
+  for (auto iter = keep_indices->begin(); iter != keep_indices->end();
+       ++iter) {
+    num_det += iter->second.size();
+  }
+
+  if (keep_top_k > -1 && num_det > keep_top_k) {
+    std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+    for (const auto& it : *keep_indices) {
+      int label = it.first;
+      const float* current_score = scores + label * num_boxes;
+      auto& label_indices = it.second;
+      for (size_t j = 0; j < label_indices.size(); ++j) {
+        int idx = label_indices[j];
+        score_index_pairs.push_back(
+            std::make_pair(current_score[idx], std::make_pair(label, idx)));
+      }
+    }
+    std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                     SortScorePairDescend<std::pair<int, int>>);
+    score_index_pairs.resize(keep_top_k);
+
+    std::map<int, std::vector<int>> new_indices;
+    for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+      int label = score_index_pairs[j].second.first;
+      int idx = score_index_pairs[j].second.second;
+      new_indices[label].push_back(idx);
+    }
+    new_indices.swap(*keep_indices);
+    num_det = keep_top_k;
+  }
+
+  return num_det;
+}
+
+void MultiClassNMS::Compute(const float* boxes_data, const float* scores_data,
+                            const std::vector<int64_t>& boxes_dim,
+                            const std::vector<int64_t>& scores_dim) {
+  int score_size = scores_dim.size();
+
+  int64_t batch_size = scores_dim[0];
+  int64_t box_dim = boxes_dim[2];
+  int64_t out_dim = box_dim + 2;
+
+  int num_nmsed_out = 0;
+  FDASSERT(score_size == 3,
+           "Require the rank of input scores to be 3, but now it's " +
+               std::to_string(score_size) + ".");
+  FDASSERT(boxes_dim[2] == 4,
+           "Require the 3rd dimension of input boxes to be 4, but now it's " +
+               std::to_string(boxes_dim[2]) + ".");
+  out_num_rois_data.resize(batch_size);
+
+  std::vector<std::map<int, std::vector<int>>> all_indices;
+  for (size_t i = 0; i < batch_size; ++i) {
+    std::map<int, std::vector<int>> indices;  // indices kept for each class
+    const float* current_boxes_ptr =
+        boxes_data + i * boxes_dim[1] * boxes_dim[2];
+    const float* current_scores_ptr =
+        scores_data + i * scores_dim[1] * scores_dim[2];
+    int num = NMSForEachSample(current_boxes_ptr, current_scores_ptr,
+                               boxes_dim[1], scores_dim[1], &indices);
+    num_nmsed_out += num;
+    out_num_rois_data[i] = num;
+    all_indices.emplace_back(indices);
+  }
+  std::vector<int64_t> out_box_dims = {num_nmsed_out, 6};
+  std::vector<int64_t> out_index_dims = {num_nmsed_out, 1};
+  if (num_nmsed_out == 0) {
+    for (size_t i = 0; i < batch_size; ++i) {
+      out_num_rois_data[i] = 0;
+    }
+    return;
+  }
+  out_box_data.resize(num_nmsed_out * 6);
+  out_index_data.resize(num_nmsed_out);
+
+  int count = 0;
+  for (size_t i = 0; i < batch_size; ++i) {
+    const float* current_boxes_ptr =
+        boxes_data + i * boxes_dim[1] * boxes_dim[2];
+    const float* current_scores_ptr =
+        scores_data + i * scores_dim[1] * scores_dim[2];
+    for (const auto& it : all_indices[i]) {
+      int label = it.first;
+      const auto& indices = it.second;
+      const float* current_scores_class_ptr =
+          current_scores_ptr + label * scores_dim[2];
+      for (size_t j = 0; j < indices.size(); ++j) {
+        int start = count * 6;
+        out_box_data[start] = label;
+        out_box_data[start + 1] = current_scores_class_ptr[indices[j]];
+        out_box_data[start + 2] = current_boxes_ptr[indices[j] * 4];
+        out_box_data[start + 3] = current_boxes_ptr[indices[j] * 4 + 1];
+        out_box_data[start + 4] = current_boxes_ptr[indices[j] * 4 + 2];
+        out_box_data[start + 5] = current_boxes_ptr[indices[j] * 4 + 3];
+        out_index_data[count] = i * boxes_dim[1] + indices[j];
+        count += 1;
+      }
+    }
+  }
+}
+}  // namespace backend
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/backends/common/multiclass_nms.h b/csrc/fastdeploy/backends/common/multiclass_nms.h
new file mode 100644
index 000000000..48a3d9336
--- /dev/null
+++ b/csrc/fastdeploy/backends/common/multiclass_nms.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
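+
+// Reference CPU implementation of Paddle's MultiClassNMS operator. For each
+// kept box, out_box_data stores [label, score, xmin, ymin, xmax, ymax],
+// out_index_data stores the box index within the batch, and
+// out_num_rois_data stores the number of kept boxes per input image.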
+
+#pragma once
+#include <cstdint>
+#include <map>
+#include <vector>
+
+namespace fastdeploy {
+namespace backend {
+struct MultiClassNMS {
+  int64_t background_label = -1;
+  int64_t keep_top_k = -1;
+  float nms_eta;
+  float nms_threshold = 0.7;
+  int64_t nms_top_k;
+  bool normalized;
+  float score_threshold;
+
+  std::vector<int32_t> out_num_rois_data;
+  std::vector<int32_t> out_index_data;
+  std::vector<float> out_box_data;
+  void FastNMS(const float* boxes, const float* scores, const int& num_boxes,
+               std::vector<int>* keep_indices);
+  int NMSForEachSample(const float* boxes, const float* scores, int num_boxes,
+                       int num_classes,
+                       std::map<int, std::vector<int>>* keep_indices);
+  void Compute(const float* boxes, const float* scores,
+               const std::vector<int64_t>& boxes_dim,
+               const std::vector<int64_t>& scores_dim);
+};
+}  // namespace backend
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/backends/ort/ops/multiclass_nms.cc b/csrc/fastdeploy/backends/ort/ops/multiclass_nms.cc
new file mode 100644
index 000000000..a132dbffc
--- /dev/null
+++ b/csrc/fastdeploy/backends/ort/ops/multiclass_nms.cc
@@ -0,0 +1,261 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NON_64_PLATFORM
+
+#include "fastdeploy/backends/ort/ops/multiclass_nms.h"
+#include <algorithm>
+#include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+
+struct OrtTensorDimensions : std::vector<int64_t> {
+  OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value) {
+    OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value);
+    std::vector<int64_t>::operator=(ort.GetTensorShape(info));
+    ort.ReleaseTensorTypeAndShapeInfo(info);
+  }
+};
+
+template <class T>
+bool SortScorePairDescend(const std::pair<float, T>& pair1,
+                          const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+void GetMaxScoreIndex(const float* scores, const int& score_size,
+                      const float& threshold, const int& top_k,
+                      std::vector<std::pair<float, int>>* sorted_indices) {
+  for (size_t i = 0; i < score_size; ++i) {
+    if (scores[i] > threshold) {
+      sorted_indices->push_back(std::make_pair(scores[i], i));
+    }
+  }
+  // Sort the score pair according to the scores in descending order
+  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                   SortScorePairDescend<int>);
+  // Keep top_k scores if needed.
+  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
+    sorted_indices->resize(top_k);
+  }
+}
+
+float BBoxArea(const float* box, const bool& normalized) {
+  if (box[2] < box[0] || box[3] < box[1]) {
+    // If coordinate values are invalid
+    // (e.g. xmax < xmin or ymax < ymin), return 0.
+    return 0.f;
+  } else {
+    const float w = box[2] - box[0];
+    const float h = box[3] - box[1];
+    if (normalized) {
+      return w * h;
+    } else {
+      // If coordinate values are not within range [0, 1].
+      return (w + 1) * (h + 1);
+    }
+  }
+}
+
+float JaccardOverlap(const float* box1, const float* box2,
+                     const bool& normalized) {
+  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+      box2[3] < box1[1]) {
+    return 0.f;
+  } else {
+    const float inter_xmin = std::max(box1[0], box2[0]);
+    const float inter_ymin = std::max(box1[1], box2[1]);
+    const float inter_xmax = std::min(box1[2], box2[2]);
+    const float inter_ymax = std::min(box1[3], box2[3]);
+    float norm = normalized ? 0.0f : 1.0f;
+    float inter_w = inter_xmax - inter_xmin + norm;
+    float inter_h = inter_ymax - inter_ymin + norm;
+    const float inter_area = inter_w * inter_h;
+    const float bbox1_area = BBoxArea(box1, normalized);
+    const float bbox2_area = BBoxArea(box2, normalized);
+    return inter_area / (bbox1_area + bbox2_area - inter_area);
+  }
+}
+
+void MultiClassNmsKernel::FastNMS(const float* boxes, const float* scores,
+                                  const int& num_boxes,
+                                  std::vector<int>* keep_indices) {
+  std::vector<std::pair<float, int>> sorted_indices;
+  GetMaxScoreIndex(scores, num_boxes, score_threshold, nms_top_k,
+                   &sorted_indices);
+
+  float adaptive_threshold = nms_threshold;
+  while (sorted_indices.size() != 0) {
+    const int idx = sorted_indices.front().second;
+    bool keep = true;
+    for (size_t k = 0; k < keep_indices->size(); ++k) {
+      if (!keep) {
+        break;
+      }
+      const int kept_idx = (*keep_indices)[k];
+      float overlap =
+          JaccardOverlap(boxes + idx * 4, boxes + kept_idx * 4, normalized);
+      keep = overlap <= adaptive_threshold;
+    }
+    if (keep) {
+      keep_indices->push_back(idx);
+    }
+    sorted_indices.erase(sorted_indices.begin());
+    if (keep && nms_eta < 1.0 && adaptive_threshold > 0.5) {
+      adaptive_threshold *= nms_eta;
+    }
+  }
+}
+
+int MultiClassNmsKernel::NMSForEachSample(
+    const float* boxes, const float* scores, int num_boxes, int num_classes,
+    std::map<int, std::vector<int>>* keep_indices) {
+  for (int i = 0; i < num_classes; ++i) {
+    if (i == background_label) {
+      continue;
+    }
+    const float* score_for_class_i = scores + i * num_boxes;
+    FastNMS(boxes, score_for_class_i, num_boxes, &((*keep_indices)[i]));
+  }
+  int num_det = 0;
+  for (auto iter = keep_indices->begin(); iter != keep_indices->end();
+       ++iter) {
+    num_det += iter->second.size();
+  }
+
+  if (keep_top_k > -1 && num_det > keep_top_k) {
+    std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+    for (const auto& it : *keep_indices) {
+      int label = it.first;
+      const float* current_score = scores + label * num_boxes;
+      auto& label_indices = it.second;
+      for (size_t j = 0; j < label_indices.size(); ++j) {
+        int idx = label_indices[j];
+        score_index_pairs.push_back(
+            std::make_pair(current_score[idx], std::make_pair(label, idx)));
+      }
+    }
+    std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                     SortScorePairDescend<std::pair<int, int>>);
+    score_index_pairs.resize(keep_top_k);
+
+    std::map<int, std::vector<int>> new_indices;
+    for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+      int label = score_index_pairs[j].second.first;
+      int idx = score_index_pairs[j].second.second;
+      new_indices[label].push_back(idx);
+    }
+    new_indices.swap(*keep_indices);
+    num_det = keep_top_k;
+  }
+  return num_det;
+}
+
+void MultiClassNmsKernel::Compute(OrtKernelContext* context) {
+  const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
+  const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
+  const float* boxes_data =
+      reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
+  const float* scores_data =
+      reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));
+  OrtTensorDimensions boxes_dim(ort_, boxes);
+  OrtTensorDimensions scores_dim(ort_, scores);
+  int score_size =
scores_dim.size(); + + int64_t batch_size = scores_dim[0]; + int64_t box_dim = boxes_dim[2]; + int64_t out_dim = box_dim + 2; + + int num_nmsed_out = 0; + FDASSERT(score_size == 3, "Require rank of input scores be 3, but now it's " + + std::to_string(score_size) + "."); + FDASSERT(boxes_dim[2] == 4, + "Require the 3-dimension of input boxes be 4, but now it's " + + std::to_string(boxes_dim[2]) + "."); + std::vector out_num_rois_dims = {batch_size}; + OrtValue* out_num_rois = ort_.KernelContext_GetOutput( + context, 2, out_num_rois_dims.data(), out_num_rois_dims.size()); + int32_t* out_num_rois_data = ort_.GetTensorMutableData(out_num_rois); + + std::vector>> all_indices; + for (size_t i = 0; i < batch_size; ++i) { + std::map> indices; // indices kept for each class + const float* current_boxes_ptr = + boxes_data + i * boxes_dim[1] * boxes_dim[2]; + const float* current_scores_ptr = + scores_data + i * scores_dim[1] * scores_dim[2]; + int num = NMSForEachSample(current_boxes_ptr, current_scores_ptr, + boxes_dim[1], scores_dim[1], &indices); + num_nmsed_out += num; + out_num_rois_data[i] = num; + all_indices.emplace_back(indices); + } + std::vector out_box_dims = {num_nmsed_out, 6}; + std::vector out_index_dims = {num_nmsed_out, 1}; + OrtValue* out_box = ort_.KernelContext_GetOutput( + context, 0, out_box_dims.data(), out_box_dims.size()); + OrtValue* out_index = ort_.KernelContext_GetOutput( + context, 1, out_index_dims.data(), out_index_dims.size()); + if (num_nmsed_out == 0) { + int32_t* out_num_rois_data = + ort_.GetTensorMutableData(out_num_rois); + for (size_t i = 0; i < batch_size; ++i) { + out_num_rois_data[i] = 0; + } + return; + } + float* out_box_data = ort_.GetTensorMutableData(out_box); + int32_t* out_index_data = ort_.GetTensorMutableData(out_index); + + int count = 0; + for (size_t i = 0; i < batch_size; ++i) { + const float* current_boxes_ptr = + boxes_data + i * boxes_dim[1] * boxes_dim[2]; + const float* current_scores_ptr = + scores_data + i * scores_dim[1] * scores_dim[2]; + for (const auto& it : all_indices[i]) { + int label = it.first; + const auto& indices = it.second; + const float* current_scores_class_ptr = + current_scores_ptr + label * scores_dim[2]; + for (size_t j = 0; j < indices.size(); ++j) { + int start = count * 6; + out_box_data[start] = label; + out_box_data[start + 1] = current_scores_class_ptr[indices[j]]; + + out_box_data[start + 2] = current_boxes_ptr[indices[j] * 4]; + out_box_data[start + 3] = current_boxes_ptr[indices[j] * 4 + 1]; + out_box_data[start + 4] = current_boxes_ptr[indices[j] * 4 + 2]; + + out_box_data[start + 5] = current_boxes_ptr[indices[j] * 4 + 3]; + out_index_data[count] = i * boxes_dim[1] + indices[j]; + count += 1; + } + } + } +} + +void MultiClassNmsKernel::GetAttribute(const OrtKernelInfo* info) { + background_label = + ort_.KernelInfoGetAttribute(info, "background_label"); + keep_top_k = ort_.KernelInfoGetAttribute(info, "keep_top_k"); + nms_eta = ort_.KernelInfoGetAttribute(info, "nms_eta"); + nms_threshold = ort_.KernelInfoGetAttribute(info, "nms_threshold"); + nms_top_k = ort_.KernelInfoGetAttribute(info, "nms_top_k"); + normalized = ort_.KernelInfoGetAttribute(info, "normalized"); + score_threshold = ort_.KernelInfoGetAttribute(info, "score_threshold"); +} +} // namespace fastdeploy + +#endif \ No newline at end of file diff --git a/csrc/fastdeploy/backends/ort/ops/multiclass_nms.h b/csrc/fastdeploy/backends/ort/ops/multiclass_nms.h new file mode 100644 index 000000000..4e167d669 --- /dev/null +++ 
b/csrc/fastdeploy/backends/ort/ops/multiclass_nms.h @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#ifndef NON_64_PLATFORM +#include "onnxruntime_cxx_api.h" // NOLINT + +namespace fastdeploy { + +struct MultiClassNmsKernel { + protected: + int64_t background_label = -1; + int64_t keep_top_k = -1; + float nms_eta; + float nms_threshold = 0.7; + int64_t nms_top_k; + bool normalized; + float score_threshold; + Ort::CustomOpApi ort_; + + public: + MultiClassNmsKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info) + : ort_(ort) { + GetAttribute(info); + } + + void GetAttribute(const OrtKernelInfo* info); + + void Compute(OrtKernelContext* context); + void FastNMS(const float* boxes, const float* scores, const int& num_boxes, + std::vector* keep_indices); + int NMSForEachSample(const float* boxes, const float* scores, int num_boxes, + int num_classes, + std::map>* keep_indices); +}; + +struct MultiClassNmsOp + : Ort::CustomOpBase { + void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const { + return new MultiClassNmsKernel(api, info); + } + + const char* GetName() const { return "MultiClassNMS"; } + + size_t GetInputTypeCount() const { return 2; } + + ONNXTensorElementDataType GetInputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } + + size_t GetOutputTypeCount() const { return 3; } + + ONNXTensorElementDataType GetOutputType(size_t index) const { + if (index == 0) { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } + return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; + } + + const char* GetExecutionProviderType() const { + return "CPUExecutionProvider"; + } +}; + +} // namespace fastdeploy + +#endif \ No newline at end of file diff --git a/csrc/fastdeploy/backends/ort/ort_backend.cc b/csrc/fastdeploy/backends/ort/ort_backend.cc new file mode 100644 index 000000000..c17890109 --- /dev/null +++ b/csrc/fastdeploy/backends/ort/ort_backend.cc @@ -0,0 +1,279 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
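+
+// OrtBackend executes models through onnxruntime. A model is loaded either
+// directly from ONNX, or converted from the Paddle format with paddle2onnx
+// when the library is built with ENABLE_PADDLE_FRONTEND.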
+
+#include "fastdeploy/backends/ort/ort_backend.h"
+#include <memory>
+#include "fastdeploy/backends/ort/ops/multiclass_nms.h"
+#include "fastdeploy/backends/ort/utils.h"
+#include "fastdeploy/utils/utils.h"
+#ifdef ENABLE_PADDLE_FRONTEND
+#include "paddle2onnx/converter.h"
+#endif
+
+namespace fastdeploy {
+
+std::vector<OrtCustomOp*> OrtBackend::custom_operators_ =
+    std::vector<OrtCustomOp*>();
+
+void OrtBackend::BuildOption(const OrtBackendOption& option) {
+  option_ = option;
+  if (option.graph_optimization_level >= 0) {
+    session_options_.SetGraphOptimizationLevel(
+        GraphOptimizationLevel(option.graph_optimization_level));
+  }
+  if (option.intra_op_num_threads >= 0) {
+    session_options_.SetIntraOpNumThreads(option.intra_op_num_threads);
+  }
+  if (option.inter_op_num_threads >= 0) {
+    session_options_.SetInterOpNumThreads(option.inter_op_num_threads);
+  }
+  if (option.execution_mode >= 0) {
+    session_options_.SetExecutionMode(ExecutionMode(option.execution_mode));
+  }
+  if (option.use_gpu) {
+    auto all_providers = Ort::GetAvailableProviders();
+    bool support_cuda = false;
+    std::string providers_msg = "";
+    for (size_t i = 0; i < all_providers.size(); ++i) {
+      providers_msg = providers_msg + all_providers[i] + ", ";
+      if (all_providers[i] == "CUDAExecutionProvider") {
+        support_cuda = true;
+      }
+    }
+    if (!support_cuda) {
+      FDWARNING << "Compiled fastdeploy with onnxruntime doesn't "
+                   "support GPU, the available providers are "
+                << providers_msg << "; will fall back to CPUExecutionProvider."
+                << std::endl;
+      option_.use_gpu = false;
+    } else {
+      FDASSERT(option.gpu_id == 0, "Requires gpu_id == 0, but now gpu_id = " +
+                                       std::to_string(option.gpu_id) + ".");
+      OrtCUDAProviderOptions cuda_options;
+      cuda_options.device_id = option.gpu_id;
+      session_options_.AppendExecutionProvider_CUDA(cuda_options);
+    }
+  }
+}
+
+bool OrtBackend::InitFromPaddle(const std::string& model_file,
+                                const std::string& params_file,
+                                const OrtBackendOption& option, bool verbose) {
+  if (initialized_) {
+    FDERROR << "OrtBackend is already initialized, cannot initialize again."
+            << std::endl;
+    return false;
+  }
+#ifdef ENABLE_PADDLE_FRONTEND
+  char* model_content_ptr;
+  int model_content_size = 0;
+
+  std::vector<paddle2onnx::CustomOp> custom_ops;
+  for (auto& item : option.custom_op_info_) {
+    paddle2onnx::CustomOp op;
+    strcpy(op.op_name, item.first.c_str());
+    strcpy(op.export_op_name, item.second.c_str());
+    custom_ops.emplace_back(op);
+  }
+  if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
+                           &model_content_ptr, &model_content_size, 11, true,
+                           verbose, true, true, true, custom_ops.data(),
+                           custom_ops.size())) {
+    FDERROR << "Error occurred while exporting the PaddlePaddle model to "
+               "ONNX format."
+            << std::endl;
+    return false;
+  }
+
+  std::string onnx_model_proto(model_content_ptr,
+                               model_content_ptr + model_content_size);
+  delete[] model_content_ptr;
+  model_content_ptr = nullptr;
+  return InitFromOnnx(onnx_model_proto, option, true);
+#else
+  FDERROR << "Didn't compile with PaddlePaddle frontend, you can try to "
+             "call `InitFromOnnx` instead."
+          << std::endl;
+#endif
+  return false;
+}
+
+bool OrtBackend::InitFromOnnx(const std::string& model_file,
+                              const OrtBackendOption& option,
+                              bool from_memory_buffer) {
+  if (initialized_) {
+    FDERROR << "OrtBackend is already initialized, cannot initialize again."
+ << std::endl; + return false; + } + + BuildOption(option); + InitCustomOperators(); + if (from_memory_buffer) { + session_ = {env_, model_file.data(), model_file.size(), session_options_}; + } else { +#ifdef _WIN32 + session_ = {env_, + std::wstring(model_file.begin(), model_file.end()).c_str(), + session_options_}; +#else + session_ = {env_, model_file.c_str(), session_options_}; +#endif + } + binding_ = std::make_shared(session_); + + Ort::MemoryInfo memory_info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + Ort::Allocator allocator(session_, memory_info); + size_t n_inputs = session_.GetInputCount(); + for (size_t i = 0; i < n_inputs; ++i) { + auto input_name = session_.GetInputName(i, allocator); + auto type_info = session_.GetInputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + inputs_desc_.emplace_back(OrtValueInfo{input_name, shape, data_type}); + allocator.Free(input_name); + } + + size_t n_outputs = session_.GetOutputCount(); + for (size_t i = 0; i < n_outputs; ++i) { + auto output_name = session_.GetOutputName(i, allocator); + auto type_info = session_.GetOutputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + outputs_desc_.emplace_back(OrtValueInfo{output_name, shape, data_type}); + + Ort::MemoryInfo out_memory_info("Cpu", OrtDeviceAllocator, 0, + OrtMemTypeDefault); + binding_->BindOutput(output_name, out_memory_info); + + allocator.Free(output_name); + } + initialized_ = true; + return true; +} + +void OrtBackend::CopyToCpu(const Ort::Value& value, FDTensor* tensor) { + const auto info = value.GetTensorTypeAndShapeInfo(); + const auto data_type = info.GetElementType(); + size_t numel = info.GetElementCount(); + tensor->shape = info.GetShape(); + + if (data_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + tensor->data.resize(numel * sizeof(float)); + memcpy(static_cast(tensor->Data()), value.GetTensorData(), + numel * sizeof(float)); + tensor->dtype = FDDataType::FP32; + } else if (data_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) { + tensor->data.resize(numel * sizeof(int32_t)); + memcpy(static_cast(tensor->Data()), value.GetTensorData(), + numel * sizeof(int32_t)); + tensor->dtype = FDDataType::INT32; + } else if (data_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { + tensor->data.resize(numel * sizeof(int64_t)); + memcpy(static_cast(tensor->Data()), value.GetTensorData(), + numel * sizeof(int64_t)); + tensor->dtype = FDDataType::INT64; + } else if (data_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { + tensor->data.resize(numel * sizeof(double)); + memcpy(static_cast(tensor->Data()), value.GetTensorData(), + numel * sizeof(double)); + tensor->dtype = FDDataType::FP64; + } else { + FDASSERT(false, "Unrecognized data type of " + std::to_string(data_type) + + " while calling OrtBackend::CopyToCpu()."); + } +} + +bool OrtBackend::Infer(std::vector& inputs, + std::vector* outputs) { + if (inputs.size() != inputs_desc_.size()) { + FDERROR << "[OrtBackend] Size of the inputs(" << inputs.size() + << ") should keep same with the inputs of this model(" + << inputs_desc_.size() << ")." 
<< std::endl; + return false; + } + + // from FDTensor to Ort Inputs + for (size_t i = 0; i < inputs.size(); ++i) { + auto ort_value = CreateOrtValue(inputs[i], option_.use_gpu); + binding_->BindInput(inputs[i].name.c_str(), ort_value); + } + + for (size_t i = 0; i < outputs_desc_.size(); ++i) { + Ort::MemoryInfo memory_info("Cpu", OrtDeviceAllocator, 0, + OrtMemTypeDefault); + binding_->BindOutput(outputs_desc_[i].name.c_str(), memory_info); + } + + // Inference with inputs + try { + session_.Run({}, *(binding_.get())); + } catch (const std::exception& e) { + FDERROR << "Failed to Infer: " << e.what() << std::endl; + return false; + } + + // Copy result after inference + std::vector ort_outputs = binding_->GetOutputValues(); + outputs->resize(ort_outputs.size()); + for (size_t i = 0; i < ort_outputs.size(); ++i) { + (*outputs)[i].name = outputs_desc_[i].name; + CopyToCpu(ort_outputs[i], &((*outputs)[i])); + } + + return true; +} + +TensorInfo OrtBackend::GetInputInfo(int index) { + FDASSERT(index < NumInputs(), "The index:" + std::to_string(index) + + " should less than the number of inputs:" + + std::to_string(NumInputs()) + "."); + TensorInfo info; + info.name = inputs_desc_[index].name; + info.shape.assign(inputs_desc_[index].shape.begin(), + inputs_desc_[index].shape.end()); + info.dtype = GetFdDtype(inputs_desc_[index].dtype); + return info; +} + +TensorInfo OrtBackend::GetOutputInfo(int index) { + FDASSERT(index < NumOutputs(), + "The index:" + std::to_string(index) + + " should less than the number of outputs:" + + std::to_string(NumOutputs()) + "."); + TensorInfo info; + info.name = outputs_desc_[index].name; + info.shape.assign(outputs_desc_[index].shape.begin(), + outputs_desc_[index].shape.end()); + info.dtype = GetFdDtype(outputs_desc_[index].dtype); + return info; +} + +void OrtBackend::InitCustomOperators() { +#ifndef NON_64_PLATFORM + if (custom_operators_.size() == 0) { + MultiClassNmsOp* custom_op = new MultiClassNmsOp{}; + custom_operators_.push_back(custom_op); + } + for (size_t i = 0; i < custom_operators_.size(); ++i) { + custom_op_domain_.Add(custom_operators_[i]); + } + session_options_.Add(custom_op_domain_); +#endif +} + +} // namespace fastdeploy diff --git a/csrc/fastdeploy/backends/ort/ort_backend.h b/csrc/fastdeploy/backends/ort/ort_backend.h new file mode 100644 index 000000000..5070934c6 --- /dev/null +++ b/csrc/fastdeploy/backends/ort/ort_backend.h @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
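+
+// A minimal usage sketch (hypothetical model path, error handling elided):
+//
+//   fastdeploy::OrtBackend backend;
+//   fastdeploy::OrtBackendOption option;
+//   option.use_gpu = false;
+//   if (backend.InitFromOnnx("model.onnx", option)) {
+//     std::vector<fastdeploy::FDTensor> inputs, outputs;
+//     // Fill `inputs` so each tensor matches backend.GetInputInfo(i).
+//     backend.Infer(inputs, &outputs);
+//   }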
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "fastdeploy/backends/backend.h"
+#include "onnxruntime_cxx_api.h"  // NOLINT
+
+namespace fastdeploy {
+
+struct OrtValueInfo {
+  std::string name;
+  std::vector<int64_t> shape;
+  ONNXTensorElementDataType dtype;
+};
+
+struct OrtBackendOption {
+  // -1 means default
+  // 0: ORT_DISABLE_ALL
+  // 1: ORT_ENABLE_BASIC
+  // 2: ORT_ENABLE_EXTENDED
+  // 99: ORT_ENABLE_ALL (enables some custom optimizations, e.g. for BERT)
+  int graph_optimization_level = -1;
+  int intra_op_num_threads = -1;
+  int inter_op_num_threads = -1;
+  // 0: ORT_SEQUENTIAL
+  // 1: ORT_PARALLEL
+  int execution_mode = -1;
+  bool use_gpu = false;
+  int gpu_id = 0;
+
+  // internal parameter, may be removed in the next version
+  bool remove_multiclass_nms_ = false;
+  std::map<std::string, std::string> custom_op_info_;
+};
+
+class OrtBackend : public BaseBackend {
+ public:
+  OrtBackend() {}
+  virtual ~OrtBackend() = default;
+
+  void BuildOption(const OrtBackendOption& option);
+
+  bool InitFromPaddle(const std::string& model_file,
+                      const std::string& params_file,
+                      const OrtBackendOption& option = OrtBackendOption(),
+                      bool verbose = false);
+
+  bool InitFromOnnx(const std::string& model_file,
+                    const OrtBackendOption& option = OrtBackendOption(),
+                    bool from_memory_buffer = false);
+
+  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs);
+
+  int NumInputs() const { return inputs_desc_.size(); }
+
+  int NumOutputs() const { return outputs_desc_.size(); }
+
+  TensorInfo GetInputInfo(int index);
+  TensorInfo GetOutputInfo(int index);
+  static std::vector<OrtCustomOp*> custom_operators_;
+  void InitCustomOperators();
+
+ private:
+  Ort::Env env_;
+  Ort::Session session_{nullptr};
+  Ort::SessionOptions session_options_;
+  std::shared_ptr<Ort::IoBinding> binding_;
+  std::vector<OrtValueInfo> inputs_desc_;
+  std::vector<OrtValueInfo> outputs_desc_;
+#ifndef NON_64_PLATFORM
+  Ort::CustomOpDomain custom_op_domain_ = Ort::CustomOpDomain("Paddle");
+#endif
+  OrtBackendOption option_;
+  void CopyToCpu(const Ort::Value& value, FDTensor* tensor);
+};
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/backends/ort/utils.cc b/csrc/fastdeploy/backends/ort/utils.cc
new file mode 100644
index 000000000..ae3e45b86
--- /dev/null
+++ b/csrc/fastdeploy/backends/ort/utils.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/backends/ort/utils.h"
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+
+ONNXTensorElementDataType GetOrtDtype(const FDDataType& fd_dtype) {
+  if (fd_dtype == FDDataType::FP32) {
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  } else if (fd_dtype == FDDataType::FP64) {
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE;
+  } else if (fd_dtype == FDDataType::INT32) {
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32;
+  } else if (fd_dtype == FDDataType::INT64) {
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  }
+  FDERROR << "Unrecognized fastdeploy data type:" << Str(fd_dtype) << "."
+ << std::endl; + return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; +} + +FDDataType GetFdDtype(const ONNXTensorElementDataType& ort_dtype) { + if (ort_dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + return FDDataType::FP32; + } else if (ort_dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { + return FDDataType::FP64; + } else if (ort_dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) { + return FDDataType::INT32; + } else if (ort_dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { + return FDDataType::INT64; + } + FDERROR << "Unrecognized ort data type:" << ort_dtype << "." << std::endl; + return FDDataType::FP32; +} + +Ort::Value CreateOrtValue(FDTensor& tensor, bool is_backend_cuda) { + FDASSERT(tensor.device == Device::GPU || tensor.device == Device::CPU, + "Only support tensor which device is CPU or GPU for OrtBackend."); + if (tensor.device == Device::GPU && is_backend_cuda) { + Ort::MemoryInfo memory_info("Cuda", OrtDeviceAllocator, 0, + OrtMemTypeDefault); + auto ort_value = Ort::Value::CreateTensor( + memory_info, tensor.MutableData(), tensor.Nbytes(), tensor.shape.data(), + tensor.shape.size(), GetOrtDtype(tensor.dtype)); + return ort_value; + } + Ort::MemoryInfo memory_info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + auto ort_value = Ort::Value::CreateTensor( + memory_info, tensor.Data(), tensor.Nbytes(), tensor.shape.data(), + tensor.shape.size(), GetOrtDtype(tensor.dtype)); + return ort_value; +} + +} // namespace fastdeploy diff --git a/csrc/fastdeploy/backends/ort/utils.h b/csrc/fastdeploy/backends/ort/utils.h new file mode 100644 index 000000000..e2912ad38 --- /dev/null +++ b/csrc/fastdeploy/backends/ort/utils.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "fastdeploy/backends/backend.h" +#include "onnxruntime_cxx_api.h" // NOLINT + +namespace fastdeploy { + +// Convert FDDataType to OrtDataType +ONNXTensorElementDataType GetOrtDtype(const FDDataType& fd_dtype); + +// Convert OrtDataType to FDDataType +FDDataType GetFdDtype(const ONNXTensorElementDataType& ort_dtype); + +// Create Ort::Value +// is_backend_cuda specify if the onnxruntime use CUDAExectionProvider +// While is_backend_cuda = true, and tensor.device = Device::GPU +// Will directly share the cuda data in tensor to OrtValue +Ort::Value CreateOrtValue(FDTensor& tensor, bool is_backend_cuda = false); + +} // namespace fastdeploy diff --git a/csrc/fastdeploy/backends/paddle/paddle_backend.cc b/csrc/fastdeploy/backends/paddle/paddle_backend.cc new file mode 100644 index 000000000..2fae38937 --- /dev/null +++ b/csrc/fastdeploy/backends/paddle/paddle_backend.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/backends/paddle/paddle_backend.h"
+
+namespace fastdeploy {
+
+void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
+  if (option.use_gpu) {
+    config_.EnableUseGpu(option.gpu_mem_init_size, option.gpu_id);
+  } else {
+    config_.DisableGpu();
+    if (option.enable_mkldnn) {
+      config_.EnableMKLDNN();
+      config_.SetMkldnnCacheCapacity(option.mkldnn_cache_size);
+    }
+  }
+  config_.SetCpuMathLibraryNumThreads(option.cpu_thread_num);
+}
+
+bool PaddleBackend::InitFromPaddle(const std::string& model_file,
+                                   const std::string& params_file,
+                                   const PaddleBackendOption& option) {
+  if (initialized_) {
+    FDERROR << "PaddleBackend is already initialized, cannot initialize again."
+            << std::endl;
+    return false;
+  }
+  config_.SetModel(model_file, params_file);
+  BuildOption(option);
+  predictor_ = paddle_infer::CreatePredictor(config_);
+  std::vector<std::string> input_names = predictor_->GetInputNames();
+  std::vector<std::string> output_names = predictor_->GetOutputNames();
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    auto handle = predictor_->GetInputHandle(input_names[i]);
+    TensorInfo info;
+    auto shape = handle->shape();
+    info.shape.assign(shape.begin(), shape.end());
+    info.dtype = PaddleDataTypeToFD(handle->type());
+    info.name = input_names[i];
+    inputs_desc_.emplace_back(info);
+  }
+  for (size_t i = 0; i < output_names.size(); ++i) {
+    auto handle = predictor_->GetOutputHandle(output_names[i]);
+    TensorInfo info;
+    auto shape = handle->shape();
+    info.shape.assign(shape.begin(), shape.end());
+    info.dtype = PaddleDataTypeToFD(handle->type());
+    info.name = output_names[i];
+    outputs_desc_.emplace_back(info);
+  }
+  initialized_ = true;
+  return true;
+}
+
+TensorInfo PaddleBackend::GetInputInfo(int index) {
+  FDASSERT(index < NumInputs(),
+           "The index:" + std::to_string(index) +
+               " should be less than the number of inputs:" +
+               std::to_string(NumInputs()) + ".");
+  return inputs_desc_[index];
+}
+
+TensorInfo PaddleBackend::GetOutputInfo(int index) {
+  FDASSERT(index < NumOutputs(),
+           "The index:" + std::to_string(index) +
+               " should be less than the number of outputs:" +
+               std::to_string(NumOutputs()) + ".");
+  return outputs_desc_[index];
+}
+
+bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
+                          std::vector<FDTensor>* outputs) {
+  if (inputs.size() != inputs_desc_.size()) {
+    FDERROR << "[PaddleBackend] Size of inputs(" << inputs.size()
+            << ") should match the inputs of this model("
+            << inputs_desc_.size() << ")."
<< std::endl; + return false; + } + + for (size_t i = 0; i < inputs.size(); ++i) { + auto handle = predictor_->GetInputHandle(inputs[i].name); + ShareTensorFromCpu(handle.get(), inputs[i]); + } + + predictor_->Run(); + outputs->resize(outputs_desc_.size()); + for (size_t i = 0; i < outputs_desc_.size(); ++i) { + auto handle = predictor_->GetOutputHandle(outputs_desc_[i].name); + CopyTensorToCpu(handle, &((*outputs)[i])); + } + return true; +} + +} // namespace fastdeploy diff --git a/csrc/fastdeploy/backends/paddle/paddle_backend.h b/csrc/fastdeploy/backends/paddle/paddle_backend.h new file mode 100644 index 000000000..99ca5eb1b --- /dev/null +++ b/csrc/fastdeploy/backends/paddle/paddle_backend.h @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "fastdeploy/backends/backend.h" +#include "paddle_inference_api.h" // NOLINT + +namespace fastdeploy { + +struct PaddleBackendOption { +#ifdef WITH_GPU + bool use_gpu = true; +#else + bool use_gpu = false; +#endif + bool enable_mkldnn = true; + + int mkldnn_cache_size = 1; + int cpu_thread_num = 8; + // initialize memory size(MB) for GPU + int gpu_mem_init_size = 100; + // gpu device id + int gpu_id = 0; +}; + +// Share memory buffer with paddle_infer::Tensor from fastdeploy::FDTensor +void ShareTensorFromCpu(paddle_infer::Tensor* tensor, FDTensor& fd_tensor); + +// Copy memory data from paddle_infer::Tensor to fastdeploy::FDTensor +void CopyTensorToCpu(std::unique_ptr& tensor, + FDTensor* fd_tensor); + +// Convert data type from paddle inference to fastdeploy +FDDataType PaddleDataTypeToFD(const paddle_infer::DataType& dtype); + +class PaddleBackend : public BaseBackend { + public: + PaddleBackend() {} + virtual ~PaddleBackend() = default; + void BuildOption(const PaddleBackendOption& option); + + bool InitFromPaddle( + const std::string& model_file, const std::string& params_file, + const PaddleBackendOption& option = PaddleBackendOption()); + + bool Infer(std::vector& inputs, std::vector* outputs); + + int NumInputs() const { return inputs_desc_.size(); } + + int NumOutputs() const { return outputs_desc_.size(); } + + TensorInfo GetInputInfo(int index); + TensorInfo GetOutputInfo(int index); + + private: + paddle_infer::Config config_; + std::shared_ptr predictor_; + std::vector inputs_desc_; + std::vector outputs_desc_; +}; +} // namespace fastdeploy diff --git a/csrc/fastdeploy/backends/paddle/util.cc b/csrc/fastdeploy/backends/paddle/util.cc new file mode 100644 index 000000000..1ae5b3553 --- /dev/null +++ b/csrc/fastdeploy/backends/paddle/util.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/backends/paddle/paddle_backend.h" + +namespace fastdeploy { +void ShareTensorFromCpu(paddle_infer::Tensor* tensor, FDTensor& fd_tensor) { + std::vector shape(fd_tensor.shape.begin(), fd_tensor.shape.end()); + tensor->Reshape(shape); + if (fd_tensor.dtype == FDDataType::FP32) { + tensor->ShareExternalData(static_cast(fd_tensor.Data()), + shape, paddle_infer::PlaceType::kCPU); + return; + } else if (fd_tensor.dtype == FDDataType::INT32) { + tensor->ShareExternalData(static_cast(fd_tensor.Data()), + shape, paddle_infer::PlaceType::kCPU); + return; + } else if (fd_tensor.dtype == FDDataType::INT64) { + tensor->ShareExternalData(static_cast(fd_tensor.Data()), + shape, paddle_infer::PlaceType::kCPU); + return; + } + FDASSERT(false, "Unexpected data type(" + Str(fd_tensor.dtype) + + ") while infer with PaddleBackend."); +} + +void CopyTensorToCpu(std::unique_ptr& tensor, + FDTensor* fd_tensor) { + auto fd_dtype = PaddleDataTypeToFD(tensor->type()); + std::vector shape; + auto tmp_shape = tensor->shape(); + shape.assign(tmp_shape.begin(), tmp_shape.end()); + fd_tensor->Allocate(shape, fd_dtype, tensor->name()); + if (fd_tensor->dtype == FDDataType::FP32) { + tensor->CopyToCpu(static_cast(fd_tensor->MutableData())); + return; + } else if (fd_tensor->dtype == FDDataType::INT32) { + tensor->CopyToCpu(static_cast(fd_tensor->MutableData())); + return; + } else if (fd_tensor->dtype == FDDataType::INT64) { + tensor->CopyToCpu(static_cast(fd_tensor->MutableData())); + return; + } + FDASSERT(false, "Unexpected data type(" + Str(fd_tensor->dtype) + + ") while infer with PaddleBackend."); +} + +FDDataType PaddleDataTypeToFD(const paddle_infer::DataType& dtype) { + auto fd_dtype = FDDataType::FP32; + if (dtype == paddle_infer::FLOAT32) { + fd_dtype = FDDataType::FP32; + } else if (dtype == paddle_infer::INT64) { + fd_dtype = FDDataType::INT64; + } else if (dtype == paddle_infer::INT32) { + fd_dtype = FDDataType::INT32; + } else if (dtype == paddle_infer::UINT8) { + fd_dtype = FDDataType::UINT8; + } else { + FDASSERT(false, "Unexpected data type:" + std::to_string(int(dtype)) + + " while call CopyTensorToCpu in PaddleBackend."); + } + return fd_dtype; +} + +} // namespace fastdeploy diff --git a/csrc/fastdeploy/backends/tensorrt/common/BatchStream.h b/csrc/fastdeploy/backends/tensorrt/common/BatchStream.h new file mode 100644 index 000000000..2484ccc68 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/BatchStream.h @@ -0,0 +1,342 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef BATCH_STREAM_H +#define BATCH_STREAM_H + +#include "NvInfer.h" +#include "common.h" +#include +#include +#include + +class IBatchStream { + public: + virtual void reset(int firstBatch) = 0; + virtual bool next() = 0; + virtual void skip(int skipCount) = 0; + virtual float* getBatch() = 0; + virtual float* getLabels() = 0; + virtual int getBatchesRead() const = 0; + virtual int getBatchSize() const = 0; + virtual nvinfer1::Dims getDims() const = 0; +}; + +class MNISTBatchStream : public IBatchStream { + public: + MNISTBatchStream(int batchSize, int maxBatches, const std::string& dataFile, + const std::string& labelsFile, + const std::vector& directories) + : mBatchSize{batchSize}, mMaxBatches{maxBatches}, mDims{3, {1, 28, 28}} + //!< We already know the dimensions of MNIST images. + { + readDataFile(locateFile(dataFile, directories)); + readLabelsFile(locateFile(labelsFile, directories)); + } + + void reset(int firstBatch) override { mBatchCount = firstBatch; } + + bool next() override { + if (mBatchCount >= mMaxBatches) { + return false; + } + ++mBatchCount; + return true; + } + + void skip(int skipCount) override { mBatchCount += skipCount; } + + float* getBatch() override { + return mData.data() + + (mBatchCount * mBatchSize * samplesCommon::volume(mDims)); + } + + float* getLabels() override { + return mLabels.data() + (mBatchCount * mBatchSize); + } + + int getBatchesRead() const override { return mBatchCount; } + + int getBatchSize() const override { return mBatchSize; } + + nvinfer1::Dims getDims() const override { + return Dims{4, {mBatchSize, mDims.d[0], mDims.d[1], mDims.d[2]}}; + } + + private: + void readDataFile(const std::string& dataFilePath) { + std::ifstream file{dataFilePath.c_str(), std::ios::binary}; + + int magicNumber, numImages, imageH, imageW; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. + magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2051 && + "Magic Number does not match the expected value for an MNIST image " + "set"); + + // Read number of images and dimensions + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + file.read(reinterpret_cast(&imageH), sizeof(imageH)); + file.read(reinterpret_cast(&imageW), sizeof(imageW)); + + numImages = samplesCommon::swapEndianness(numImages); + imageH = samplesCommon::swapEndianness(imageH); + imageW = samplesCommon::swapEndianness(imageW); + + // The MNIST data is made up of unsigned bytes, so we need to cast to float + // and normalize. + int numElements = numImages * imageH * imageW; + std::vector rawData(numElements); + file.read(reinterpret_cast(rawData.data()), + numElements * sizeof(uint8_t)); + mData.resize(numElements); + std::transform(rawData.begin(), rawData.end(), mData.begin(), + [](uint8_t val) { return static_cast(val) / 255.f; }); + } + + void readLabelsFile(const std::string& labelsFilePath) { + std::ifstream file{labelsFilePath.c_str(), std::ios::binary}; + int magicNumber, numImages; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. 
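+    // 2049 is the expected magic number for MNIST label files
+    // (2051 identifies image files).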
+ magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2049 && + "Magic Number does not match the expected value for an MNIST labels " + "file"); + + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + numImages = samplesCommon::swapEndianness(numImages); + + std::vector rawLabels(numImages); + file.read(reinterpret_cast(rawLabels.data()), + numImages * sizeof(uint8_t)); + mLabels.resize(numImages); + std::transform(rawLabels.begin(), rawLabels.end(), mLabels.begin(), + [](uint8_t val) { return static_cast(val); }); + } + + int mBatchSize{0}; + int mBatchCount{ + 0}; //!< The batch that will be read on the next invocation of next() + int mMaxBatches{0}; + Dims mDims{}; + std::vector mData{}; + std::vector mLabels{}; +}; + +class BatchStream : public IBatchStream { + public: + BatchStream(int batchSize, int maxBatches, std::string prefix, + std::string suffix, std::vector directories) + : mBatchSize(batchSize), mMaxBatches(maxBatches), mPrefix(prefix), + mSuffix(suffix), mDataDir(directories) { + FILE* file = fopen( + locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), + "rb"); + ASSERT(file != nullptr); + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + ASSERT(readSize == 4); + mDims.nbDims = 4; // The number of dimensions. + mDims.d[0] = d[0]; // Batch Size + mDims.d[1] = d[1]; // Channels + mDims.d[2] = d[2]; // Height + mDims.d[3] = d[3]; // Width + ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && + mDims.d[3] > 0); + fclose(file); + + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + BatchStream(int batchSize, int maxBatches, std::string prefix, + std::vector directories) + : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) {} + + BatchStream(int batchSize, int maxBatches, nvinfer1::Dims dims, + std::string listFile, std::vector directories) + : mBatchSize(batchSize), mMaxBatches(maxBatches), mDims(dims), + mListFile(listFile), mDataDir(directories) { + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + // Resets data members + void reset(int firstBatch) override { + mBatchCount = 0; + mFileCount = 0; + mFileBatchPos = mDims.d[0]; + skip(firstBatch); + } + + // Advance to next batch and return true, or return false if there is no batch + // left. + bool next() override { + if (mBatchCount == mMaxBatches) { + return false; + } + + for (int csize = 1, batchPos = 0; batchPos < mBatchSize; + batchPos += csize, mFileBatchPos += csize) { + ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); + if (mFileBatchPos == mDims.d[0] && !update()) { + return false; + } + + // copy the smaller of: elements left to fulfill the request, or elements + // left in the file buffer. 
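+      // A batch may span several .batch files; update() refills the file
+      // buffer once mFileBatchPos reaches the end of the current file.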
+ csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, + csize * mImageSize, getBatch() + batchPos * mImageSize); + std::copy_n(getFileLabels() + mFileBatchPos, csize, + getLabels() + batchPos); + } + mBatchCount++; + return true; + } + + // Skips the batches + void skip(int skipCount) override { + if (mBatchSize >= mDims.d[0] && mBatchSize % mDims.d[0] == 0 && + mFileBatchPos == mDims.d[0]) { + mFileCount += skipCount * mBatchSize / mDims.d[0]; + return; + } + + int x = mBatchCount; + for (int i = 0; i < skipCount; i++) { + next(); + } + mBatchCount = x; + } + + float* getBatch() override { return mBatch.data(); } + + float* getLabels() override { return mLabels.data(); } + + int getBatchesRead() const override { return mBatchCount; } + + int getBatchSize() const override { return mBatchSize; } + + nvinfer1::Dims getDims() const override { return mDims; } + + private: + float* getFileBatch() { return mFileBatch.data(); } + + float* getFileLabels() { return mFileLabels.data(); } + + bool update() { + if (mListFile.empty()) { + std::string inputFileName = locateFile( + mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); + FILE* file = fopen(inputFileName.c_str(), "rb"); + if (!file) { + return false; + } + + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + ASSERT(readSize == 4); + ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && + mDims.d[3] == d[3]); + size_t readInputCount = + fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); + ASSERT(readInputCount == size_t(mDims.d[0] * mImageSize)); + size_t readLabelCount = + fread(getFileLabels(), sizeof(float), mDims.d[0], file); + ASSERT(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0])); + + fclose(file); + } else { + std::vector fNames; + std::ifstream file(locateFile(mListFile, mDataDir), std::ios::binary); + if (!file) { + return false; + } + + sample::gLogInfo << "Batch #" << mFileCount << std::endl; + file.seekg(((mBatchCount * mBatchSize)) * 7); + + for (int i = 1; i <= mBatchSize; i++) { + std::string sName; + std::getline(file, sName); + sName = sName + ".ppm"; + sample::gLogInfo << "Calibrating with file " << sName << std::endl; + fNames.emplace_back(sName); + } + + mFileCount++; + + const int imageC = 3; + const int imageH = 300; + const int imageW = 300; + std::vector> ppms( + fNames.size()); + for (uint32_t i = 0; i < fNames.size(); ++i) { + readPPMFile(locateFile(fNames[i], mDataDir), ppms[i]); + } + + std::vector data(samplesCommon::volume(mDims)); + const float scale = 2.0 / 255.0; + const float bias = 1.0; + long int volChl = mDims.d[2] * mDims.d[3]; + + // Normalize input data + for (int i = 0, volImg = mDims.d[1] * mDims.d[2] * mDims.d[3]; + i < mBatchSize; ++i) { + for (int c = 0; c < mDims.d[1]; ++c) { + for (int j = 0; j < volChl; ++j) { + data[i * volImg + c * volChl + j] = + scale * float(ppms[i].buffer[j * mDims.d[1] + c]) - bias; + } + } + } + + std::copy_n(data.data(), mDims.d[0] * mImageSize, getFileBatch()); + } + + mFileBatchPos = 0; + return true; + } + + int mBatchSize{0}; + int mMaxBatches{0}; + int mBatchCount{0}; + int mFileCount{0}; + int mFileBatchPos{0}; + int mImageSize{0}; + std::vector mBatch; //!< Data for the batch + std::vector mLabels; //!< Labels for the batch + std::vector mFileBatch; //!< List of image files + std::vector mFileLabels; //!< List of label files + std::string mPrefix; //!< Batch file name prefix + std::string mSuffix; //!< Batch file name 
suffix + nvinfer1::Dims mDims; //!< Input dimensions + std::string mListFile; //!< File name of the list of image names + std::vector + mDataDir; //!< Directories where the files can be found +}; + +#endif diff --git a/csrc/fastdeploy/backends/tensorrt/common/CPPLINT.cfg b/csrc/fastdeploy/backends/tensorrt/common/CPPLINT.cfg new file mode 100644 index 000000000..51ff339c1 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/CPPLINT.cfg @@ -0,0 +1 @@ +exclude_files=.* diff --git a/csrc/fastdeploy/backends/tensorrt/common/EntropyCalibrator.h b/csrc/fastdeploy/backends/tensorrt/common/EntropyCalibrator.h new file mode 100644 index 000000000..40eb8f13e --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/EntropyCalibrator.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ENTROPY_CALIBRATOR_H +#define ENTROPY_CALIBRATOR_H + +#include "BatchStream.h" +#include "NvInfer.h" + +//! \class EntropyCalibratorImpl +//! +//! \brief Implements common functionality for Entropy calibrators. +//! +template class EntropyCalibratorImpl { + public: + EntropyCalibratorImpl(TBatchStream stream, int firstBatch, + std::string networkName, const char* inputBlobName, + bool readCache = true) + : mStream{stream}, + mCalibrationTableName("CalibrationTable" + networkName), + mInputBlobName(inputBlobName), mReadCache(readCache) { + nvinfer1::Dims dims = mStream.getDims(); + mInputCount = samplesCommon::volume(dims); + CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float))); + mStream.reset(firstBatch); + } + + virtual ~EntropyCalibratorImpl() { CHECK(cudaFree(mDeviceInput)); } + + int getBatchSize() const noexcept { return mStream.getBatchSize(); } + + bool getBatch(void* bindings[], const char* names[], + int nbBindings) noexcept { + if (!mStream.next()) { + return false; + } + CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), + mInputCount * sizeof(float), cudaMemcpyHostToDevice)); + ASSERT(!strcmp(names[0], mInputBlobName)); + bindings[0] = mDeviceInput; + return true; + } + + const void* readCalibrationCache(size_t& length) noexcept { + mCalibrationCache.clear(); + std::ifstream input(mCalibrationTableName, std::ios::binary); + input >> std::noskipws; + if (mReadCache && input.good()) { + std::copy(std::istream_iterator(input), + std::istream_iterator(), + std::back_inserter(mCalibrationCache)); + } + length = mCalibrationCache.size(); + return length ? mCalibrationCache.data() : nullptr; + } + + void writeCalibrationCache(const void* cache, size_t length) noexcept { + std::ofstream output(mCalibrationTableName, std::ios::binary); + output.write(reinterpret_cast(cache), length); + } + + private: + TBatchStream mStream; + size_t mInputCount; + std::string mCalibrationTableName; + const char* mInputBlobName; + bool mReadCache{true}; + void* mDeviceInput{nullptr}; + std::vector mCalibrationCache; +}; + +//! \class Int8EntropyCalibrator2 +//! +//! 
\brief Implements Entropy calibrator 2. +//! CalibrationAlgoType is kENTROPY_CALIBRATION_2. +//! +template +class Int8EntropyCalibrator2 : public IInt8EntropyCalibrator2 { + public: + Int8EntropyCalibrator2(TBatchStream stream, int firstBatch, + const char* networkName, const char* inputBlobName, + bool readCache = true) + : mImpl(stream, firstBatch, networkName, inputBlobName, readCache) {} + + int getBatchSize() const noexcept override { return mImpl.getBatchSize(); } + + bool getBatch(void* bindings[], const char* names[], + int nbBindings) noexcept override { + return mImpl.getBatch(bindings, names, nbBindings); + } + + const void* readCalibrationCache(size_t& length) noexcept override { + return mImpl.readCalibrationCache(length); + } + + void writeCalibrationCache(const void* cache, + size_t length) noexcept override { + mImpl.writeCalibrationCache(cache, length); + } + + private: + EntropyCalibratorImpl mImpl; +}; + +#endif // ENTROPY_CALIBRATOR_H diff --git a/csrc/fastdeploy/backends/tensorrt/common/ErrorRecorder.h b/csrc/fastdeploy/backends/tensorrt/common/ErrorRecorder.h new file mode 100644 index 000000000..e13b55bd9 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/ErrorRecorder.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ERROR_RECORDER_H +#define ERROR_RECORDER_H +#include "NvInferRuntimeCommon.h" +#include "logger.h" +#include +#include +#include +#include +#include + +using nvinfer1::ErrorCode; +using nvinfer1::IErrorRecorder; + +//! +//! A simple implementation of the IErrorRecorder interface for +//! use by samples. This interface also can be used as a reference +//! implementation. +//! The sample Error recorder is based on a vector that pairs the error +//! code and the error string into a single element. It also uses +//! standard mutex's and atomics in order to make sure that the code +//! works in a multi-threaded environment. +//! +class SampleErrorRecorder : public IErrorRecorder { + using errorPair = std::pair; + using errorStack = std::vector; + + public: + SampleErrorRecorder() = default; + + virtual ~SampleErrorRecorder() noexcept {} + int32_t getNbErrors() const noexcept final { return mErrorStack.size(); } + ErrorCode getErrorCode(int32_t errorIdx) const noexcept final { + return invalidIndexCheck(errorIdx) ? ErrorCode::kINVALID_ARGUMENT + : (*this)[errorIdx].first; + }; + IErrorRecorder::ErrorDesc + getErrorDesc(int32_t errorIdx) const noexcept final { + return invalidIndexCheck(errorIdx) ? "errorIdx out of range." + : (*this)[errorIdx].second.c_str(); + } + // This class can never overflow since we have dynamic resize via std::vector + // usage. + bool hasOverflowed() const noexcept final { return false; } + + // Empty the errorStack. + void clear() noexcept final { + try { + // grab a lock so that there is no addition while clearing. 
+      std::lock_guard<std::mutex> guard(mStackLock);
+      mErrorStack.clear();
+    } catch (const std::exception& e) {
+      sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
+    }
+  };
+
+  //! Simple helper function that returns true if the error stack is empty.
+  bool empty() const noexcept { return mErrorStack.empty(); }
+
+  bool reportError(ErrorCode val,
+                   IErrorRecorder::ErrorDesc desc) noexcept final {
+    try {
+      std::lock_guard<std::mutex> guard(mStackLock);
+      sample::gLogError << "Error[" << static_cast<int32_t>(val)
+                        << "]: " << desc << std::endl;
+      mErrorStack.push_back(errorPair(val, desc));
+    } catch (const std::exception& e) {
+      sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
+    }
+    // All errors are considered fatal.
+    return true;
+  }
+
+  // Atomically increment or decrement the ref counter.
+  IErrorRecorder::RefCount incRefCount() noexcept final { return ++mRefCount; }
+  IErrorRecorder::RefCount decRefCount() noexcept final { return --mRefCount; }
+
+ private:
+  // Simple helper functions.
+  const errorPair& operator[](size_t index) const noexcept {
+    return mErrorStack[index];
+  }
+
+  bool invalidIndexCheck(int32_t index) const noexcept {
+    // By converting signed to unsigned, we only need a single check since
+    // negative numbers turn into large positive values greater than the size.
+    size_t sIndex = index;
+    return sIndex >= mErrorStack.size();
+  }
+  // Mutex to hold when locking mErrorStack.
+  std::mutex mStackLock;
+
+  // Reference count of the class. Destruction of the class when mRefCount
+  // is not zero causes undefined behavior.
+  std::atomic<int32_t> mRefCount{0};
+
+  // The error stack that holds the errors recorded by TensorRT.
+  errorStack mErrorStack;
+}; // class SampleErrorRecorder
+#endif // ERROR_RECORDER_H
diff --git a/csrc/fastdeploy/backends/tensorrt/common/README.md b/csrc/fastdeploy/backends/tensorrt/common/README.md
new file mode 100644
index 000000000..0ed86b17a
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/README.md
@@ -0,0 +1 @@
+The code in this directory comes from https://github.com/NVIDIA/TensorRT
diff --git a/csrc/fastdeploy/backends/tensorrt/common/argsParser.h b/csrc/fastdeploy/backends/tensorrt/common/argsParser.h
new file mode 100644
index 000000000..e2e1b1e95
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/argsParser.h
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef TENSORRT_ARGS_PARSER_H
+#define TENSORRT_ARGS_PARSER_H
+
+#include <string>
+#include <vector>
+#ifdef _MSC_VER
+#include ".\windows\getopt.h"
+#else
+#include <getopt.h>
+#endif
+#include <iostream>
+
+namespace samplesCommon {
+
+//!
+//! \brief The SampleParams structure groups the basic parameters required by
+//!        all sample networks.
+//!
+struct SampleParams {
+  int32_t batchSize{1}; //!< Number of inputs in a batch
+  int32_t dlaCore{-1};  //!< Specify the DLA core to run network on.
+  bool int8{false};     //!< Allow running the network in Int8 mode.
+  bool fp16{false};     //!< Allow running the network in FP16 mode.
+ std::vector + dataDirs; //!< Directory paths where sample data files are stored + std::vector inputTensorNames; + std::vector outputTensorNames; +}; + +//! +//! \brief The CaffeSampleParams structure groups the additional parameters +//! required by +//! networks that use caffe +//! +struct CaffeSampleParams : public SampleParams { + std::string + prototxtFileName; //!< Filename of prototxt design file of a network + std::string + weightsFileName; //!< Filename of trained weights file of a network + std::string meanFileName; //!< Filename of mean file of a network +}; + +//! +//! \brief The OnnxSampleParams structure groups the additional parameters +//! required by +//! networks that use ONNX +//! +struct OnnxSampleParams : public SampleParams { + std::string onnxFileName; //!< Filename of ONNX file of a network +}; + +//! +//! \brief The UffSampleParams structure groups the additional parameters +//! required by +//! networks that use Uff +//! +struct UffSampleParams : public SampleParams { + std::string uffFileName; //!< Filename of uff file of a network +}; + +//! +//! /brief Struct to maintain command-line arguments. +//! +struct Args { + bool runInInt8{false}; + bool runInFp16{false}; + bool help{false}; + int32_t useDLACore{-1}; + int32_t batch{1}; + std::vector dataDirs; + std::string saveEngine; + std::string loadEngine; + bool useILoop{false}; +}; + +//! +//! \brief Populates the Args struct with the provided command-line parameters. +//! +//! \throw invalid_argument if any of the arguments are not valid +//! +//! \return boolean If return value is true, execution can continue, otherwise +//! program should exit +//! +inline bool parseArgs(Args& args, int32_t argc, char* argv[]) { + while (1) { + int32_t arg; + static struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"datadir", required_argument, 0, 'd'}, + {"int8", no_argument, 0, 'i'}, + {"fp16", no_argument, 0, 'f'}, + {"useILoop", no_argument, 0, 'l'}, + {"saveEngine", required_argument, 0, 's'}, + {"loadEngine", no_argument, 0, 'o'}, + {"useDLACore", required_argument, 0, 'u'}, + {"batch", required_argument, 0, 'b'}, + {nullptr, 0, nullptr, 0}}; + int32_t option_index = 0; + arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index); + if (arg == -1) { + break; + } + + switch (arg) { + case 'h': + args.help = true; + return true; + case 'd': + if (optarg) { + args.dataDirs.push_back(optarg); + } else { + std::cerr << "ERROR: --datadir requires option argument" << std::endl; + return false; + } + break; + case 's': + if (optarg) { + args.saveEngine = optarg; + } + break; + case 'o': + if (optarg) { + args.loadEngine = optarg; + } + break; + case 'i': + args.runInInt8 = true; + break; + case 'f': + args.runInFp16 = true; + break; + case 'l': + args.useILoop = true; + break; + case 'u': + if (optarg) { + args.useDLACore = std::stoi(optarg); + } + break; + case 'b': + if (optarg) { + args.batch = std::stoi(optarg); + } + break; + default: + return false; + } + } + return true; +} + +} // namespace samplesCommon + +#endif // TENSORRT_ARGS_PARSER_H diff --git a/csrc/fastdeploy/backends/tensorrt/common/buffers.h b/csrc/fastdeploy/backends/tensorrt/common/buffers.h new file mode 100644 index 000000000..8061ee33d --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/buffers.h @@ -0,0 +1,426 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_BUFFERS_H +#define TENSORRT_BUFFERS_H + +#include "NvInfer.h" +#include "common.h" +#include "half.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace samplesCommon { + +//! +//! \brief The GenericBuffer class is a templated class for buffers. +//! +//! \details This templated RAII (Resource Acquisition Is Initialization) class +//! handles the allocation, +//! deallocation, querying of buffers on both the device and the host. +//! It can handle data of arbitrary types because it stores byte +//! buffers. +//! The template parameters AllocFunc and FreeFunc are used for the +//! allocation and deallocation of the buffer. +//! AllocFunc must be a functor that takes in (void** ptr, size_t size) +//! and returns bool. ptr is a pointer to where the allocated buffer +//! address should be stored. +//! size is the amount of memory in bytes to allocate. +//! The boolean indicates whether or not the memory allocation was +//! successful. +//! FreeFunc must be a functor that takes in (void* ptr) and returns +//! void. +//! ptr is the allocated buffer address. It must work with nullptr +//! input. +//! +template class GenericBuffer { + public: + //! + //! \brief Construct an empty buffer. + //! + GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT) + : mSize(0), mCapacity(0), mType(type), mBuffer(nullptr) {} + + //! + //! \brief Construct a buffer with the specified allocation size in bytes. + //! + GenericBuffer(size_t size, nvinfer1::DataType type) + : mSize(size), mCapacity(size), mType(type) { + if (!allocFn(&mBuffer, this->nbBytes())) { + throw std::bad_alloc(); + } + } + + GenericBuffer(GenericBuffer&& buf) + : mSize(buf.mSize), mCapacity(buf.mCapacity), mType(buf.mType), + mBuffer(buf.mBuffer) { + buf.mSize = 0; + buf.mCapacity = 0; + buf.mType = nvinfer1::DataType::kFLOAT; + buf.mBuffer = nullptr; + } + + GenericBuffer& operator=(GenericBuffer&& buf) { + if (this != &buf) { + freeFn(mBuffer); + mSize = buf.mSize; + mCapacity = buf.mCapacity; + mType = buf.mType; + mBuffer = buf.mBuffer; + // Reset buf. + buf.mSize = 0; + buf.mCapacity = 0; + buf.mBuffer = nullptr; + } + return *this; + } + + //! + //! \brief Returns pointer to underlying array. + //! + void* data() { return mBuffer; } + + //! + //! \brief Returns pointer to underlying array. + //! + const void* data() const { return mBuffer; } + + //! + //! \brief Returns the size (in number of elements) of the buffer. + //! + size_t size() const { return mSize; } + + //! + //! \brief Returns the size (in bytes) of the buffer. + //! + size_t nbBytes() const { + return this->size() * samplesCommon::getElementSize(mType); + } + + //! + //! \brief Resizes the buffer. This is a no-op if the new size is smaller than + //! or equal to the current capacity. + //! 
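+  //! A minimal usage sketch (illustrative only; HostBuffer is the alias
+  //! defined later in this file). Note that when the buffer grows, the old
+  //! allocation is freed before the new one is made, so resize() does not
+  //! preserve existing contents:
+  //!
+  //! \code
+  //!   samplesCommon::HostBuffer buf(16, nvinfer1::DataType::kFLOAT);
+  //!   buf.resize(8);   // no reallocation: 8 <= current capacity of 16
+  //!   buf.resize(32);  // reallocates; previous contents are not preserved
+  //! \endcode
+  //!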
+ void resize(size_t newSize) { + mSize = newSize; + if (mCapacity < newSize) { + freeFn(mBuffer); + if (!allocFn(&mBuffer, this->nbBytes())) { + throw std::bad_alloc{}; + } + mCapacity = newSize; + } + } + + //! + //! \brief Overload of resize that accepts Dims + //! + void resize(const nvinfer1::Dims& dims) { + return this->resize(samplesCommon::volume(dims)); + } + + ~GenericBuffer() { freeFn(mBuffer); } + + private: + size_t mSize{0}, mCapacity{0}; + nvinfer1::DataType mType; + void* mBuffer; + AllocFunc allocFn; + FreeFunc freeFn; +}; + +class DeviceAllocator { + public: + bool operator()(void** ptr, size_t size) const { + return cudaMalloc(ptr, size) == cudaSuccess; + } +}; + +class DeviceFree { + public: + void operator()(void* ptr) const { cudaFree(ptr); } +}; + +class HostAllocator { + public: + bool operator()(void** ptr, size_t size) const { + *ptr = malloc(size); + return *ptr != nullptr; + } +}; + +class HostFree { + public: + void operator()(void* ptr) const { free(ptr); } +}; + +using DeviceBuffer = GenericBuffer; +using HostBuffer = GenericBuffer; + +//! +//! \brief The ManagedBuffer class groups together a pair of corresponding +//! device and host buffers. +//! +class ManagedBuffer { + public: + DeviceBuffer deviceBuffer; + HostBuffer hostBuffer; +}; + +//! +//! \brief The BufferManager class handles host and device buffer allocation +//! and deallocation. +//! +//! \details This RAII class handles host and device buffer allocation and +//! deallocation, +//! memcpy between host and device buffers to aid with inference, +//! and debugging dumps to validate inference. The BufferManager class +//! is meant to be +//! used to simplify buffer management and any interactions between +//! buffers and the engine. +//! +class BufferManager { + public: + static const size_t kINVALID_SIZE_VALUE = ~size_t(0); + + //! + //! \brief Create a BufferManager for handling buffer interactions with + //! engine. + //! + BufferManager(std::shared_ptr engine, + const int batchSize = 0, + const nvinfer1::IExecutionContext* context = nullptr) + : mEngine(engine), mBatchSize(batchSize) { + // Full Dims implies no batch size. + assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); + // Create host and device buffers + for (int i = 0; i < mEngine->getNbBindings(); i++) { + auto dims = context ? context->getBindingDimensions(i) + : mEngine->getBindingDimensions(i); + size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); + nvinfer1::DataType type = mEngine->getBindingDataType(i); + int vecDim = mEngine->getBindingVectorizedDim(i); + if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector + { + int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); + vol *= scalarsPerVec; + } + vol *= samplesCommon::volume(dims); + std::unique_ptr manBuf{new ManagedBuffer()}; + manBuf->deviceBuffer = DeviceBuffer(vol, type); + manBuf->hostBuffer = HostBuffer(vol, type); + mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); + mManagedBuffers.emplace_back(std::move(manBuf)); + } + } + + //! + //! \brief Returns a vector of device buffers that you can use directly as + //! bindings for the execute and enqueue methods of IExecutionContext. + //! + std::vector& getDeviceBindings() { return mDeviceBindings; } + + //! + //! \brief Returns a vector of device buffers. + //! + const std::vector& getDeviceBindings() const { + return mDeviceBindings; + } + + //! + //! \brief Returns the device buffer corresponding to tensorName. + //! 
Returns nullptr if no such tensor can be found. + //! + void* getDeviceBuffer(const std::string& tensorName) const { + return getBuffer(false, tensorName); + } + + //! + //! \brief Returns the host buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getHostBuffer(const std::string& tensorName) const { + return getBuffer(true, tensorName); + } + + //! + //! \brief Returns the size of the host and device buffers that correspond to + //! tensorName. + //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. + //! + size_t size(const std::string& tensorName) const { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return kINVALID_SIZE_VALUE; + return mManagedBuffers[index]->hostBuffer.nbBytes(); + } + + //! + //! \brief Dump host buffer with specified tensorName to ostream. + //! Prints error message to std::ostream if no such tensor can be + //! found. + //! + void dumpBuffer(std::ostream& os, const std::string& tensorName) { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) { + os << "Invalid tensor name" << std::endl; + return; + } + void* buf = mManagedBuffers[index]->hostBuffer.data(); + size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); + nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); + size_t rowCount = static_cast( + bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); + int leadDim = mBatchSize; + int* trailDims = bufDims.d; + int nbDims = bufDims.nbDims; + + // Fix explicit Dimension networks + if (!leadDim && nbDims > 0) { + leadDim = bufDims.d[0]; + ++trailDims; + --nbDims; + } + + os << "[" << leadDim; + for (int i = 0; i < nbDims; i++) + os << ", " << trailDims[i]; + os << "]" << std::endl; + switch (mEngine->getBindingDataType(index)) { + case nvinfer1::DataType::kINT32: + print(os, buf, bufSize, rowCount); + break; + case nvinfer1::DataType::kFLOAT: + print(os, buf, bufSize, rowCount); + break; + case nvinfer1::DataType::kHALF: + print(os, buf, bufSize, rowCount); + break; + case nvinfer1::DataType::kINT8: + assert(0 && "Int8 network-level input and output is not supported"); + break; + case nvinfer1::DataType::kBOOL: + assert(0 && "Bool network-level input and output are not supported"); + break; + } + } + + //! + //! \brief Templated print function that dumps buffers of arbitrary type to + //! std::ostream. + //! rowCount parameter controls how many elements are on each line. + //! A rowCount of 1 means that there is only 1 element on each line. + //! + template + void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount) { + assert(rowCount != 0); + assert(bufSize % sizeof(T) == 0); + T* typedBuf = static_cast(buf); + size_t numItems = bufSize / sizeof(T); + for (int i = 0; i < static_cast(numItems); i++) { + // Handle rowCount == 1 case + if (rowCount == 1 && i != static_cast(numItems) - 1) + os << typedBuf[i] << std::endl; + else if (rowCount == 1) + os << typedBuf[i]; + // Handle rowCount > 1 case + else if (i % rowCount == 0) + os << typedBuf[i]; + else if (i % rowCount == rowCount - 1) + os << " " << typedBuf[i] << std::endl; + else + os << " " << typedBuf[i]; + } + } + + //! + //! \brief Copy the contents of input host buffers to input device buffers + //! synchronously. + //! + void copyInputToDevice() { memcpyBuffers(true, false, false); } + + //! + //! \brief Copy the contents of output device buffers to output host buffers + //! synchronously. + //! 
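+  //! A typical inference round trip with BufferManager (sketch only; the
+  //! engine/context setup is omitted and the tensor names "input"/"output"
+  //! are placeholders for the network's actual binding names):
+  //!
+  //! \code
+  //!   samplesCommon::BufferManager buffers(engine);
+  //!   float* in = static_cast<float*>(buffers.getHostBuffer("input"));
+  //!   // ... fill `in` with preprocessed input data ...
+  //!   buffers.copyInputToDevice();
+  //!   context->executeV2(buffers.getDeviceBindings().data());
+  //!   buffers.copyOutputToHost();
+  //!   float* out = static_cast<float*>(buffers.getHostBuffer("output"));
+  //! \endcode
+  //!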
+ void copyOutputToHost() { memcpyBuffers(false, true, false); } + + //! + //! \brief Copy the contents of input host buffers to input device buffers + //! asynchronously. + //! + void copyInputToDeviceAsync(const cudaStream_t& stream = 0) { + memcpyBuffers(true, false, true, stream); + } + + //! + //! \brief Copy the contents of output device buffers to output host buffers + //! asynchronously. + //! + void copyOutputToHostAsync(const cudaStream_t& stream = 0) { + memcpyBuffers(false, true, true, stream); + } + + ~BufferManager() = default; + + private: + void* getBuffer(const bool isHost, const std::string& tensorName) const { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return nullptr; + return (isHost ? mManagedBuffers[index]->hostBuffer.data() + : mManagedBuffers[index]->deviceBuffer.data()); + } + + void memcpyBuffers(const bool copyInput, const bool deviceToHost, + const bool async, const cudaStream_t& stream = 0) { + for (int i = 0; i < mEngine->getNbBindings(); i++) { + void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() + : mManagedBuffers[i]->deviceBuffer.data(); + const void* srcPtr = deviceToHost + ? mManagedBuffers[i]->deviceBuffer.data() + : mManagedBuffers[i]->hostBuffer.data(); + const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes(); + const cudaMemcpyKind memcpyType = + deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice; + if ((copyInput && mEngine->bindingIsInput(i)) || + (!copyInput && !mEngine->bindingIsInput(i))) { + if (async) + CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream)); + else + CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType)); + } + } + } + + std::shared_ptr mEngine; //!< The pointer to the engine + int mBatchSize; //!< The batch size for legacy networks, 0 otherwise. + std::vector> + mManagedBuffers; //!< The vector of pointers to managed buffers + std::vector mDeviceBindings; //!< The vector of device buffers needed + //! for engine execution +}; + +} // namespace samplesCommon + +#endif // TENSORRT_BUFFERS_H diff --git a/csrc/fastdeploy/backends/tensorrt/common/common.h b/csrc/fastdeploy/backends/tensorrt/common/common.h new file mode 100644 index 000000000..ad3af72a2 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/common.h @@ -0,0 +1,844 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TENSORRT_COMMON_H +#define TENSORRT_COMMON_H + +// For loadLibrary +#ifdef _MSC_VER +// Needed so that the max/min definitions in windows.h do not conflict with +// std::max/min. 
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#else
+#include <dlfcn.h>
+#endif
+
+#include "NvInfer.h"
+#include "NvInferPlugin.h"
+#include "logger.h"
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <cuda_runtime_api.h>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <ratio>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "safeCommon.h"
+
+using namespace nvinfer1;
+using namespace plugin;
+
+#ifdef _MSC_VER
+#define FN_NAME __FUNCTION__
+#else
+#define FN_NAME __func__
+#endif
+
+#if defined(__aarch64__) || defined(__QNX__)
+#define ENABLE_DLA_API 1
+#endif
+
+#define CHECK_RETURN_W_MSG(status, val, errMsg)                                \
+  do {                                                                         \
+    if (!(status)) {                                                           \
+      sample::gLogError << errMsg << " Error in " << __FILE__ << ", function " \
+                        << FN_NAME << "(), line " << __LINE__ << std::endl;    \
+      return val;                                                              \
+    }                                                                          \
+  } while (0)
+
+#undef ASSERT
+#define ASSERT(condition)                                                     \
+  do {                                                                        \
+    if (!(condition)) {                                                       \
+      sample::gLogError << "Assertion failure: " << #condition << std::endl;  \
+      abort();                                                                \
+    }                                                                         \
+  } while (0)
+
+#define CHECK_RETURN(status, val) CHECK_RETURN_W_MSG(status, val, "")
+
+#define OBJ_GUARD(A) std::unique_ptr<A, void (*)(A*)>
+
+template <typename T, typename T_> OBJ_GUARD(T) makeObjGuard(T_* t) {
+  CHECK(!(std::is_base_of<T, T_>::value || std::is_same<T, T_>::value));
+  auto deleter = [](T* t) { t->destroy(); };
+  return std::unique_ptr<T, void (*)(T*)>{static_cast<T*>(t), deleter};
+}
+
+constexpr long double operator"" _GiB(long double val) {
+  return val * (1 << 30);
+}
+constexpr long double operator"" _MiB(long double val) {
+  return val * (1 << 20);
+}
+constexpr long double operator"" _KiB(long double val) {
+  return val * (1 << 10);
+}
+
+// These are necessary if we want to be able to write 1_GiB instead of 1.0_GiB.
+// Since the return type is signed, -1_GiB will work as expected.
+constexpr long long int operator"" _GiB(unsigned long long val) {
+  return val * (1 << 30);
+}
+constexpr long long int operator"" _MiB(unsigned long long val) {
+  return val * (1 << 20);
+}
+constexpr long long int operator"" _KiB(unsigned long long val) {
+  return val * (1 << 10);
+}
+
+struct SimpleProfiler : public nvinfer1::IProfiler {
+  struct Record {
+    float time{0};
+    int count{0};
+  };
+
+  virtual void reportLayerTime(const char* layerName, float ms) noexcept {
+    mProfile[layerName].count++;
+    mProfile[layerName].time += ms;
+    if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) ==
+        mLayerNames.end()) {
+      mLayerNames.push_back(layerName);
+    }
+  }
+
+  SimpleProfiler(const char* name,
+                 const std::vector<SimpleProfiler>& srcProfilers =
+                     std::vector<SimpleProfiler>())
+      : mName(name) {
+    for (const auto& srcProfiler : srcProfilers) {
+      for (const auto& rec : srcProfiler.mProfile) {
+        auto it = mProfile.find(rec.first);
+        if (it == mProfile.end()) {
+          mProfile.insert(rec);
+        } else {
+          it->second.time += rec.second.time;
+          it->second.count += rec.second.count;
+        }
+      }
+    }
+  }
+
+  friend std::ostream& operator<<(std::ostream& out,
+                                  const SimpleProfiler& value) {
+    out << "========== " << value.mName << " profile ==========" << std::endl;
+    float totalTime = 0;
+    std::string layerNameStr = "TensorRT layer name";
+    int maxLayerNameLength =
+        std::max(static_cast<int>(layerNameStr.size()), 70);
+    for (const auto& elem : value.mProfile) {
+      totalTime += elem.second.time;
+      maxLayerNameLength =
+          std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
+    }
+
+    auto old_settings = out.flags();
+    auto old_precision = out.precision();
+    // Output header
+    {
+      out << std::setw(maxLayerNameLength) << layerNameStr << " ";
+      out << std::setw(12) << "Runtime, "
+          << "%"
+          << " ";
+      out << 
std::setw(12) << "Invocations" + << " "; + out << std::setw(12) << "Runtime, ms" << std::endl; + } + for (size_t i = 0; i < value.mLayerNames.size(); i++) { + const std::string layerName = value.mLayerNames[i]; + auto elem = value.mProfile.at(layerName); + out << std::setw(maxLayerNameLength) << layerName << " "; + out << std::setw(12) << std::fixed << std::setprecision(1) + << (elem.time * 100.0F / totalTime) << "%" + << " "; + out << std::setw(12) << elem.count << " "; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time + << std::endl; + } + out.flags(old_settings); + out.precision(old_precision); + out << "========== " << value.mName << " total runtime = " << totalTime + << " ms ==========" << std::endl; + + return out; + } + + private: + std::string mName; + std::vector mLayerNames; + std::map mProfile; +}; + +//! Locate path to file, given its filename or filepath suffix and possible dirs +//! it might lie in. +//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a +//! file path. +inline std::string locateFile(const std::string& filepathSuffix, + const std::vector& directories, + bool reportError = true) { + const int MAX_DEPTH{10}; + bool found{false}; + std::string filepath; + + for (auto& dir : directories) { + if (!dir.empty() && dir.back() != '/') { +#ifdef _MSC_VER + filepath = dir + "\\" + filepathSuffix; +#else + filepath = dir + "/" + filepathSuffix; +#endif + } else { + filepath = dir + filepathSuffix; + } + + for (int i = 0; i < MAX_DEPTH && !found; i++) { + const std::ifstream checkFile(filepath); + found = checkFile.is_open(); + if (found) { + break; + } + + filepath = "../" + filepath; // Try again in parent dir + } + + if (found) { + break; + } + + filepath.clear(); + } + + // Could not find the file + if (filepath.empty()) { + const std::string dirList = std::accumulate( + directories.begin() + 1, directories.end(), directories.front(), + [](const std::string& a, const std::string& b) { + return a + "\n\t" + b; + }); + std::cout << "Could not find " << filepathSuffix + << " in data directories:\n\t" << dirList << std::endl; + + if (reportError) { + std::cout << "&&&& FAILED" << std::endl; + exit(EXIT_FAILURE); + } + } + + return filepath; +} + +inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, + int inW) { + std::ifstream infile(fileName, std::ifstream::binary); + assert(infile.is_open() && + "Attempting to read from a file that is not open."); + std::string magic, h, w, max; + infile >> magic >> h >> w >> max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(buffer), inH * inW); +} + +namespace samplesCommon { + +// Swaps endianness of an integral type. 
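+// For example, swapEndianness<uint32_t>(0x11223344u) returns 0x44332211u:
+// the bytes of the value are copied into the result in reverse order.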
+template ::value, int>::type = 0> +inline T swapEndianness(const T& value) { + uint8_t bytes[sizeof(T)]; + for (int i = 0; i < static_cast(sizeof(T)); ++i) { + bytes[sizeof(T) - 1 - i] = *(reinterpret_cast(&value) + i); + } + return *reinterpret_cast(bytes); +} + +class HostMemory { + public: + HostMemory() = delete; + virtual void* data() const noexcept { return mData; } + virtual std::size_t size() const noexcept { return mSize; } + virtual DataType type() const noexcept { return mType; } + virtual ~HostMemory() {} + + protected: + HostMemory(std::size_t size, DataType type) + : mData{nullptr}, mSize(size), mType(type) {} + void* mData; + std::size_t mSize; + DataType mType; +}; + +template +class TypedHostMemory : public HostMemory { + public: + explicit TypedHostMemory(std::size_t size) : HostMemory(size, dataType) { + mData = new ElemType[size]; + }; + ~TypedHostMemory() noexcept { delete[](ElemType*) mData; } + ElemType* raw() noexcept { return static_cast(data()); } +}; + +using FloatMemory = TypedHostMemory; +using HalfMemory = TypedHostMemory; +using ByteMemory = TypedHostMemory; + +inline void* safeCudaMalloc(size_t memSize) { + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + return deviceMem; +} + +inline bool isDebug() { return (std::getenv("TENSORRT_DEBUG") ? true : false); } + +struct InferDeleter { + template void operator()(T* obj) const { delete obj; } +}; + +template using SampleUniquePtr = std::unique_ptr; + +static auto StreamDeleter = [](cudaStream_t* pStream) { + if (pStream) { + cudaStreamDestroy(*pStream); + delete pStream; + } +}; + +inline std::unique_ptr makeCudaStream() { + std::unique_ptr pStream( + new cudaStream_t, StreamDeleter); + if (cudaStreamCreateWithFlags(pStream.get(), cudaStreamNonBlocking) != + cudaSuccess) { + pStream.reset(nullptr); + } + + return pStream; +} + +//! Return vector of indices that puts magnitudes of sequence in descending +//! order. +template +std::vector argMagnitudeSort(Iter begin, Iter end) { + std::vector indices(end - begin); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&begin](size_t i, size_t j) { + return std::abs(begin[j]) < std::abs(begin[i]); + }); + return indices; +} + +inline bool readReferenceFile(const std::string& fileName, + std::vector& refVector) { + std::ifstream infile(fileName); + if (!infile.is_open()) { + std::cout << "ERROR: readReferenceFile: Attempting to read from a file " + "that is not open." + << std::endl; + return false; + } + std::string line; + while (std::getline(infile, line)) { + if (line.empty()) + continue; + refVector.push_back(line); + } + infile.close(); + return true; +} + +template +std::vector classify(const std::vector& refVector, + const std::vector& output, + const size_t topK) { + const auto inds = + samplesCommon::argMagnitudeSort(output.cbegin(), output.cend()); + std::vector result; + result.reserve(topK); + for (size_t k = 0; k < topK; ++k) { + result.push_back(refVector[inds[k]]); + } + return result; +} + +// Returns indices of highest K magnitudes in v. 
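+// For example (illustrative): topKMagnitudes(std::vector<float>{1.f, -5.f, 3.f}, 2)
+// returns the indices {1, 2}, since |-5| > |3| > |1|.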
+template <typename T>
+std::vector<size_t> topKMagnitudes(const std::vector<T>& v, const size_t k) {
+  std::vector<size_t> indices =
+      samplesCommon::argMagnitudeSort(v.cbegin(), v.cend());
+  indices.resize(k);
+  return indices;
+}
+
+template <typename T>
+bool readASCIIFile(const std::string& fileName, const size_t size,
+                   std::vector<T>& out) {
+  std::ifstream infile(fileName);
+  if (!infile.is_open()) {
+    std::cout << "ERROR: readASCIIFile: Attempting to read from a file that is "
+                 "not open."
+              << std::endl;
+    return false;
+  }
+  out.clear();
+  out.reserve(size);
+  out.assign(std::istream_iterator<T>(infile), std::istream_iterator<T>());
+  infile.close();
+  return true;
+}
+
+template <typename T>
+bool writeASCIIFile(const std::string& fileName, const std::vector<T>& in) {
+  std::ofstream outfile(fileName);
+  if (!outfile.is_open()) {
+    std::cout << "ERROR: writeASCIIFile: Attempting to write to a file that is "
+                 "not open."
+              << std::endl;
+    return false;
+  }
+  for (auto fn : in) {
+    outfile << fn << "\n";
+  }
+  outfile.close();
+  return true;
+}
+
+inline void print_version() {
+  std::cout << " TensorRT version: " << NV_TENSORRT_MAJOR << "."
+            << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH << "."
+            << NV_TENSORRT_BUILD << std::endl;
+}
+
+inline std::string getFileType(const std::string& filepath) {
+  return filepath.substr(filepath.find_last_of(".") + 1);
+}
+
+inline std::string toLower(const std::string& inp) {
+  std::string out = inp;
+  std::transform(out.begin(), out.end(), out.begin(), ::tolower);
+  return out;
+}
+
+inline float getMaxValue(const float* buffer, int64_t size) {
+  assert(buffer != nullptr);
+  assert(size > 0);
+  return *std::max_element(buffer, buffer + size);
+}
+
+// Ensures that every tensor used by a network has a dynamic range set.
+//
+// All tensors in a network must have a dynamic range specified if a calibrator
+// is not used.
+// This function is just a utility to globally fill in missing scales and
+// zero-points for the entire network.
+//
+// If a tensor does not have a dynamic range set, it is assigned inRange or
+// outRange as follows:
+//
+// * If the tensor is the input to a layer or output of a pooling node, its
+//   dynamic range is derived from inRange.
+// * Otherwise its dynamic range is derived from outRange.
+//
+// The default parameter values are intended to demonstrate, for final layers
+// in the network, cases where dynamic ranges are asymmetric.
+//
+// The default parameter values were chosen arbitrarily. Range values should be
+// chosen such that we avoid underflow or overflow. Range values should also be
+// non-zero to avoid a uniform zero-scale tensor.
+inline void setAllDynamicRanges(INetworkDefinition* network,
+                                float inRange = 2.0f, float outRange = 4.0f) {
+  // Ensure that all layer inputs have a scale.
+  for (int i = 0; i < network->getNbLayers(); i++) {
+    auto layer = network->getLayer(i);
+    for (int j = 0; j < layer->getNbInputs(); j++) {
+      ITensor* input{layer->getInput(j)};
+      // Optional inputs are nullptr here and are from RNN layers.
+      if (input != nullptr && !input->dynamicRangeIsSet()) {
+        ASSERT(input->setDynamicRange(-inRange, inRange));
+      }
+    }
+  }
+
+  // Ensure that all layer outputs have a scale.
+  // Tensors that are also inputs to layers are ignored here
+  // since the previous loop nest assigned scales to them.
+  for (int i = 0; i < network->getNbLayers(); i++) {
+    auto layer = network->getLayer(i);
+    for (int j = 0; j < layer->getNbOutputs(); j++) {
+      ITensor* output{layer->getOutput(j)};
+      // Optional outputs are nullptr here and are from RNN layers.
+      if (output != nullptr && !output->dynamicRangeIsSet()) {
+        // Pooling must have the same input and output scales.
+        if (layer->getType() == LayerType::kPOOLING) {
+          ASSERT(output->setDynamicRange(-inRange, inRange));
+        } else {
+          ASSERT(output->setDynamicRange(-outRange, outRange));
+        }
+      }
+    }
+  }
+}
+
+inline void setDummyInt8DynamicRanges(const IBuilderConfig* c,
+                                      INetworkDefinition* n) {
+  // Set dummy per-tensor dynamic range if Int8 mode is requested.
+  if (c->getFlag(BuilderFlag::kINT8)) {
+    sample::gLogWarning << "Int8 calibrator not provided. Generating dummy "
+                           "per-tensor dynamic range. Int8 accuracy is not "
+                           "guaranteed."
+                        << std::endl;
+    setAllDynamicRanges(n);
+  }
+}
+
+inline void enableDLA(IBuilder* builder, IBuilderConfig* config, int useDLACore,
+                      bool allowGPUFallback = true) {
+  if (useDLACore >= 0) {
+    if (builder->getNbDLACores() == 0) {
+      std::cerr << "Trying to use DLA core " << useDLACore
+                << " on a platform that doesn't have any DLA cores"
+                << std::endl;
+      assert(
+          "Error: use DLA core on a platform that doesn't have any DLA cores" &&
+          false);
+    }
+    if (allowGPUFallback) {
+      config->setFlag(BuilderFlag::kGPU_FALLBACK);
+    }
+    if (!config->getFlag(BuilderFlag::kINT8)) {
+      // User has not requested INT8 Mode.
+      // By default run in FP16 mode. FP32 mode is not permitted.
+      config->setFlag(BuilderFlag::kFP16);
+    }
+    config->setDefaultDeviceType(DeviceType::kDLA);
+    config->setDLACore(useDLACore);
+  }
+}
+
+inline int32_t parseDLA(int32_t argc, char** argv) {
+  for (int32_t i = 1; i < argc; i++) {
+    if (strncmp(argv[i], "--useDLACore=", 13) == 0) {
+      return std::stoi(argv[i] + 13);
+    }
+  }
+  return -1;
+}
+
+inline uint32_t getElementSize(nvinfer1::DataType t) noexcept {
+  switch (t) {
+    case nvinfer1::DataType::kINT32:
+      return 4;
+    case nvinfer1::DataType::kFLOAT:
+      return 4;
+    case nvinfer1::DataType::kHALF:
+      return 2;
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kINT8:
+      return 1;
+  }
+  return 0;
+}
+
+inline int64_t volume(const nvinfer1::Dims& d) {
+  // Use an int64_t initial value so the product does not overflow int.
+  return std::accumulate(d.d, d.d + d.nbDims, int64_t{1},
+                         std::multiplies<int64_t>());
+}
+
+template <int C, int H, int W> struct PPM {
+  std::string magic, fileName;
+  int h, w, max;
+  uint8_t buffer[C * H * W];
+};
+
+// New vPPM(variable sized PPM) class with variable dimensions.
+struct vPPM { + std::string magic, fileName; + int h, w, max; + std::vector buffer; +}; + +struct BBox { + float x1, y1, x2, y2; +}; + +template +void readPPMFile(const std::string& filename, + samplesCommon::PPM& ppm) { + ppm.fileName = filename; + std::ifstream infile(filename, std::ifstream::binary); + assert(infile.is_open() && + "Attempting to read from a file that is not open."); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void readPPMFile(const std::string& filename, vPPM& ppm, + std::vector& input_dir) { + ppm.fileName = filename; + std::ifstream infile(locateFile(filename, input_dir), std::ifstream::binary); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + + for (int i = 0; i < ppm.w * ppm.h * 3; ++i) { + ppm.buffer.push_back(0); + } + + infile.read(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +template +void writePPMFileWithBBox(const std::string& filename, PPM& ppm, + const BBox& bbox) { + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); + const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); + const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); + const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1); + + for (int x = x1; x <= x2; ++x) { + // bbox top border + ppm.buffer[(y1 * ppm.w + x) * 3] = 255; + ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(y2 * ppm.w + x) * 3] = 255; + ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = y1; y <= y2; ++y) { + // bbox left border + ppm.buffer[(y * ppm.w + x1) * 3] = 255; + ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + x2) * 3] = 255; + ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0; + } + + outfile.write(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, + std::vector& dets) { + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + + for (auto bbox : dets) { + for (int x = int(bbox.x1); x < int(bbox.x2); ++x) { + // bbox top border + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = int(bbox.y1); y < int(bbox.y2); ++y) { + // bbox left border + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 1] = 0; 
+ ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 2] = 0; + } + } + + outfile.write(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +class TimerBase { + public: + virtual void start() {} + virtual void stop() {} + float microseconds() const noexcept { return mMs * 1000.f; } + float milliseconds() const noexcept { return mMs; } + float seconds() const noexcept { return mMs / 1000.f; } + void reset() noexcept { mMs = 0.f; } + + protected: + float mMs{0.0f}; +}; + +class GpuTimer : public TimerBase { + public: + explicit GpuTimer(cudaStream_t stream) : mStream(stream) { + CHECK(cudaEventCreate(&mStart)); + CHECK(cudaEventCreate(&mStop)); + } + ~GpuTimer() { + CHECK(cudaEventDestroy(mStart)); + CHECK(cudaEventDestroy(mStop)); + } + void start() { CHECK(cudaEventRecord(mStart, mStream)); } + void stop() { + CHECK(cudaEventRecord(mStop, mStream)); + float ms{0.0f}; + CHECK(cudaEventSynchronize(mStop)); + CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); + mMs += ms; + } + + private: + cudaEvent_t mStart, mStop; + cudaStream_t mStream; +}; // class GpuTimer + +template class CpuTimer : public TimerBase { + public: + using clock_type = Clock; + + void start() { mStart = Clock::now(); } + void stop() { + mStop = Clock::now(); + mMs += std::chrono::duration{mStop - mStart}.count(); + } + + private: + std::chrono::time_point mStart, mStop; +}; // class CpuTimer + +using PreciseCpuTimer = CpuTimer; + +inline std::vector splitString(std::string str, + char delimiter = ',') { + std::vector splitVect; + std::stringstream ss(str); + std::string substr; + + while (ss.good()) { + getline(ss, substr, delimiter); + splitVect.emplace_back(std::move(substr)); + } + return splitVect; +} + +// Return m rounded up to nearest multiple of n +inline int roundUp(int m, int n) { return ((m + n - 1) / n) * n; } + +inline int getC(const Dims& d) { return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; } + +inline int getH(const Dims& d) { return d.nbDims >= 2 ? d.d[d.nbDims - 2] : 1; } + +inline int getW(const Dims& d) { return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; } + +inline void loadLibrary(const std::string& path) { +#ifdef _MSC_VER + void* handle = LoadLibrary(path.c_str()); +#else + int32_t flags{RTLD_LAZY}; +#if ENABLE_ASAN + // https://github.com/google/sanitizers/issues/89 + // asan doesn't handle module unloading correctly and there are no plans on + // doing + // so. In order to get proper stack traces, don't delete the shared library on + // close so that asan can resolve the symbols correctly. 
+ flags |= RTLD_NODELETE; +#endif // ENABLE_ASAN + + void* handle = dlopen(path.c_str(), flags); +#endif + if (handle == nullptr) { +#ifdef _MSC_VER + sample::gLogError << "Could not load plugin library: " << path << std::endl; +#else + sample::gLogError << "Could not load plugin library: " << path + << ", due to: " << dlerror() << std::endl; +#endif + } +} + +inline int32_t getSMVersion() { + int32_t deviceIndex = 0; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t major, minor; + CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, + deviceIndex)); + CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, + deviceIndex)); + + return ((major << 8) | minor); +} + +inline bool isSMSafe() { + const int32_t smVersion = getSMVersion(); + return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || + smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; +} + +inline bool isDataTypeSupported(DataType dataType) { + auto builder = SampleUniquePtr( + nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); + if (!builder) { + return false; + } + + if ((dataType == DataType::kINT8 && !builder->platformHasFastInt8()) || + (dataType == DataType::kHALF && !builder->platformHasFastFp16())) { + return false; + } + + return true; +} + +} // namespace samplesCommon + +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) { + os << "("; + for (int i = 0; i < dims.nbDims; ++i) { + os << (i ? ", " : "") << dims.d[i]; + } + return os << ")"; +} + +#endif // TENSORRT_COMMON_H diff --git a/csrc/fastdeploy/backends/tensorrt/common/getOptions.cpp b/csrc/fastdeploy/backends/tensorrt/common/getOptions.cpp new file mode 100644 index 000000000..84b06581a --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/getOptions.cpp @@ -0,0 +1,223 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "getOptions.h" +#include "logger.h" + +#include +#include +#include +#include +#include + +namespace nvinfer1 { +namespace utility { + +//! Matching for TRTOptions is defined as follows: +//! +//! If A and B both have longName set, A matches B if and only if A.longName == +//! B.longName and (A.shortName == B.shortName if both have short name set). +//! +//! If A only has shortName set and B only has longName set, then A does not +//! match B. It is assumed that when 2 TRTOptions are compared, one of them is +//! the definition of a TRTOption in the input to getOptions. As such, if the +//! definition only has shortName set, it will never be equal to a TRTOption +//! that does not have shortName set (and same for longName). +//! +//! If A and B both have shortName set but B does not have longName set, A +//! matches B if and only if A.shortName == B.shortName. +//! +//! If A has neither long or short name set, A matches B if and only if B has +//! neither long or short name set. 
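+//!
+//! For example (illustrative): {'d', "dir"} matches both {'d', "dir"} and
+//! {0, "dir"}, while {'d', ""} does not match {0, "dir"}.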
+bool matches(const TRTOption& a, const TRTOption& b) { + if (!a.longName.empty() && !b.longName.empty()) { + if (a.shortName && b.shortName) { + return (a.longName == b.longName) && (a.shortName == b.shortName); + } + return a.longName == b.longName; + } + + // If only one of them is not set, this will return false anyway. + return a.shortName == b.shortName; +} + +//! getTRTOptionIndex returns the index of a TRTOption in a vector of +//! TRTOptions, -1 if not found. +int getTRTOptionIndex(const std::vector& options, + const TRTOption& opt) { + for (size_t i = 0; i < options.size(); ++i) { + if (matches(opt, options[i])) { + return i; + } + } + return -1; +} + +//! validateTRTOption will return a string containing an error message if +//! options +//! contain non-numeric characters, or if there are duplicate option names +//! found. +//! Otherwise, returns the empty string. +std::string validateTRTOption(const std::set& seenShortNames, + const std::set& seenLongNames, + const TRTOption& opt) { + if (opt.shortName != 0) { + if (!std::isalnum(opt.shortName)) { + return "Short name '" + std::to_string(opt.shortName) + + "' is non-alphanumeric"; + } + + if (seenShortNames.find(opt.shortName) != seenShortNames.end()) { + return "Short name '" + std::to_string(opt.shortName) + + "' is a duplicate"; + } + } + + if (!opt.longName.empty()) { + for (const char& c : opt.longName) { + if (!std::isalnum(c) && c != '-' && c != '_') { + return "Long name '" + opt.longName + + "' contains characters that are not '-', '_', or alphanumeric"; + } + } + + if (seenLongNames.find(opt.longName) != seenLongNames.end()) { + return "Long name '" + opt.longName + "' is a duplicate"; + } + } + return ""; +} + +//! validateTRTOptions will return a string containing an error message if any +//! options contain non-numeric characters, or if there are duplicate option +//! names found. Otherwise, returns the empty string. +std::string validateTRTOptions(const std::vector& options) { + std::set seenShortNames; + std::set seenLongNames; + for (size_t i = 0; i < options.size(); ++i) { + const std::string errMsg = + validateTRTOption(seenShortNames, seenLongNames, options[i]); + if (!errMsg.empty()) { + return "Error '" + errMsg + "' at TRTOption " + std::to_string(i); + } + + seenShortNames.insert(options[i].shortName); + seenLongNames.insert(options[i].longName); + } + return ""; +} + +//! parseArgs parses an argument list and returns a TRTParsedArgs with the +//! fields set accordingly. Assumes that options is validated. +//! ErrMsg will be set if: +//! - an argument is null +//! - an argument is empty +//! - an argument does not have option (i.e. "-" and "--") +//! - a short argument has more than 1 character +//! 
- the last argument in the list requires a value +TRTParsedArgs parseArgs(int argc, const char* const* argv, + const std::vector& options) { + TRTParsedArgs parsedArgs; + parsedArgs.values.resize(options.size()); + + for (int i = 1; i < argc; ++i) // index of current command-line argument + { + if (argv[i] == nullptr) { + return TRTParsedArgs{"Null argument at index " + std::to_string(i)}; + } + + const std::string argStr(argv[i]); + if (argStr.empty()) { + return TRTParsedArgs{"Empty argument at index " + std::to_string(i)}; + } + + // No starting hyphen means it is a positional argument + if (argStr[0] != '-') { + parsedArgs.positionalArgs.push_back(argStr); + continue; + } + + if (argStr == "-" || argStr == "--") { + return TRTParsedArgs{"Argument does not specify an option at index " + + std::to_string(i)}; + } + + // If only 1 hyphen, char after is the flag. + TRTOption opt{' ', "", false, ""}; + std::string value; + if (argStr[1] != '-') { + // Must only have 1 char after the hyphen + if (argStr.size() > 2) { + return TRTParsedArgs{ + "Short arg contains more than 1 character at index " + + std::to_string(i)}; + } + opt.shortName = argStr[1]; + } else { + opt.longName = argStr.substr(2); + + // We need to support --foo=bar syntax, so look for '=' + const size_t eqIndex = opt.longName.find('='); + if (eqIndex < opt.longName.size()) { + value = opt.longName.substr(eqIndex + 1); + opt.longName = opt.longName.substr(0, eqIndex); + } + } + + const int idx = getTRTOptionIndex(options, opt); + if (idx < 0) { + continue; + } + + if (options[idx].valueRequired) { + if (!value.empty()) { + parsedArgs.values[idx].second.push_back(value); + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + continue; + } + + if (i + 1 >= argc) { + return TRTParsedArgs{"Last argument requires value, but none given"}; + } + + const std::string nextArg(argv[i + 1]); + if (nextArg.size() >= 1 && nextArg[0] == '-') { + sample::gLogWarning << "Warning: Using '" << nextArg + << "' as a value for '" << argStr + << "', Should this be its own flag?" << std::endl; + } + + parsedArgs.values[idx].second.push_back(nextArg); + i += 1; // Next argument already consumed + + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + } else { + parsedArgs.values[idx].first += 1; + } + } + return parsedArgs; +} + +TRTParsedArgs getOptions(int argc, const char* const* argv, + const std::vector& options) { + const std::string errMsg = validateTRTOptions(options); + if (!errMsg.empty()) { + return TRTParsedArgs{errMsg}; + } + return parseArgs(argc, argv, options); +} +} // namespace utility +} // namespace nvinfer1 diff --git a/csrc/fastdeploy/backends/tensorrt/common/getOptions.h b/csrc/fastdeploy/backends/tensorrt/common/getOptions.h new file mode 100644 index 000000000..efe466632 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/getOptions.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef TRT_GET_OPTIONS_H
+#define TRT_GET_OPTIONS_H
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace nvinfer1 {
+namespace utility {
+
+//! TRTOption defines a command line option. At least 1 of shortName and
+//! longName must be defined.
+//! If bool initialization is undefined behavior on your system,
+//! valueRequired must also be explicitly defined.
+//! helpText is optional.
+struct TRTOption {
+  char shortName;        //!< Option name in short (single hyphen) form
+                         //!< (i.e. -a, -b)
+  std::string longName;  //!< Option name in long (double hyphen) form
+                         //!< (i.e. --foo, --bar)
+  bool valueRequired;    //!< True if a value is needed for an option
+                         //!< (i.e. -N 4, --foo bar)
+  std::string helpText;  //!< Text to show when printing out the command usage
+};
+
+//! TRTParsedArgs is returned by getOptions after it has parsed a command line
+//! argument list (argv).
+//!
+//! errMsg is a string containing an error message if any errors occurred. If
+//! it is empty, no errors occurred.
+//!
+//! values stores a vector of pairs for each option (ordered by order in the
+//! input). Each pair contains an int (the number of occurrences) and a vector
+//! of strings (a list of values). The user should know which of these to use,
+//! and which options required values. For non-value options, only occurrences
+//! is populated. For value-required options, occurrences == # of values.
+//! Values do not need to be unique.
+//!
+//! positionalArgs stores additional arguments that are passed in without an
+//! option (these must not start with a hyphen).
+struct TRTParsedArgs {
+  std::string errMsg;
+  std::vector<std::pair<int, std::vector<std::string>>> values;
+  std::vector<std::string> positionalArgs;
+};
+
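+//! Sketch of intended usage (editor's illustration; this option set is made
+//! up and not part of the original header):
+//!
+//!   std::vector<TRTOption> options{{'h', "help", false, "show usage"},
+//!                                  {'m', "model", true, "model path"}};
+//!   TRTParsedArgs args = getOptions(argc, argv, options);
+//!   if (!args.errMsg.empty()) { /* report error and exit */ }
+//!   bool wantHelp = args.values[0].first > 0;  // occurrence count of -h
+//!   if (!args.values[1].second.empty()) {
+//!     const std::string& model = args.values[1].second.back();  // last -m
+//!   }
+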
+//! Parse the input arguments passed to main() and extract options as well as
+//! positional arguments.
+//!
+//! Options are supposed to be passed to main() with a preceding hyphen '-'.
+//!
+//! If there is a single preceding hyphen, there should be exactly 1 character
+//! after the hyphen, which is interpreted as the option.
+//!
+//! If there are 2 preceding hyphens, the entire argument (without the hyphens)
+//! is interpreted as the option.
+//!
+//! If the option requires a value, the next argument is used as the value.
+//!
+//! Positional arguments must not start with a hyphen.
+//!
+//! If an argument requires a value, the next argument is interpreted as the
+//! value, even if it is the form of a valid option (i.e. --foo --bar will
+//! store "--bar" as a value for option "foo" if "foo" requires a value).
+//! We also support --name=value syntax. In this case, 'value' would be used
+//! as the value, NOT the next argument.
+//!
+//! For options:
+//! { { 'a', "", false },
+//!   { 'b', "", false },
+//!   { 0, "cee", false },
+//!   { 'd', "", true },
+//!   { 'e', "", true },
+//!   { 'f', "foo", true } }
+//!
+//! ./main hello world -a -a --cee -d 12 -f 34
+//! and
+//! ./main hello world -a -a --cee -d 12 --foo 34
+//!
+//! will result in:
+//!
+//! TRTParsedArgs {
+//!   errMsg: "",
+//!   values: { { 2, {} },
+//!             { 0, {} },
+//!             { 1, {} },
+//!             { 1, {"12"} },
+//!             { 0, {} },
+//!             { 1, {"34"} } }
+//!   positionalArgs: {"hello", "world"},
+//! }
+//!
+//! Non-POSIX behavior:
+//! - Does not support "-abcde" as a shorthand for "-a -b -c -d -e". Each
+//!   option must have its own hyphen prefix.
+//! - Does not support "-e12" as a shorthand for "-e 12". Values MUST be
+//!   whitespace-separated from the option they are for.
+//!
+//! @param[in] argc The number of arguments passed to main (including the
+//!                 file name, which is disregarded)
+//! @param[in] argv The arguments passed to main (including the file name,
+//!                 which is disregarded)
+//! @param[in] options List of TRTOptions to parse
+//! @return TRTParsedArgs. See TRTParsedArgs documentation for descriptions of
+//!         the fields.
+TRTParsedArgs getOptions(int argc, const char* const* argv,
+                         const std::vector<TRTOption>& options);
+} // namespace utility
+} // namespace nvinfer1
+
+#endif // TRT_GET_OPTIONS_H
diff --git a/csrc/fastdeploy/backends/tensorrt/common/half.h b/csrc/fastdeploy/backends/tensorrt/common/half.h
new file mode 100644
index 000000000..5ca797000
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/half.h
@@ -0,0 +1,3787 @@
+// half - IEEE 754-based half-precision floating point library.
+//
+// Copyright (c) 2012-2017 Christian Rau
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated
+// documentation files (the "Software"), to deal in the Software without
+// restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to the
+// following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the
+// Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR
+// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Version 1.12.0
+
+/// \file
+/// Main header file for half precision functionality.
+
+#ifndef HALF_HALF_HPP
+#define HALF_HALF_HPP
+
+/// Combined gcc version number.
+#define HALF_GNUC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// check C++11 language features +#if defined(__clang__) // clang +#if __has_feature(cxx_static_assert) && \ + !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if __has_feature(cxx_user_literals) && \ + !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && \ + !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +/*#elif defined(__INTEL_COMPILER) + //Intel C++ + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) + ???????? + #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) + ???????? + #define HALF_ENABLE_CPP11_CONSTEXPR 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) + ???????? + #define HALF_ENABLE_CPP11_NOEXCEPT 1 + #endif + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG) + ???????? + #define HALF_ENABLE_CPP11_LONG_LONG 1 + #endif*/ +#elif defined(__GNUC__) // gcc +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#endif +#elif defined(_MSC_VER) // Visual C++ +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#define HALF_POP_WARNINGS 1 +#pragma warning(push) +#pragma warning(disable : 4099 4127 4146) // struct vs class, constant in if, +// negative unsigned +#endif + +// check C++11 library features +#include +#if defined(_LIBCPP_VERSION) // libc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#elif defined(__GLIBCXX__) // libstdc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifdef __clang__ +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 
+#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#else +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#endif +#elif defined(_CPPLIB_VER) // Dinkumware/Visual C++ +#if _CPPLIB_VER >= 520 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#if _CPPLIB_VER >= 610 +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#endif +#endif +#undef HALF_GNUC_VERSION + +// support constexpr +#if HALF_ENABLE_CPP11_CONSTEXPR +#define HALF_CONSTEXPR constexpr +#define HALF_CONSTEXPR_CONST constexpr +#else +#define HALF_CONSTEXPR +#define HALF_CONSTEXPR_CONST const +#endif + +// support noexcept +#if HALF_ENABLE_CPP11_NOEXCEPT +#define HALF_NOEXCEPT noexcept +#define HALF_NOTHROW noexcept +#else +#define HALF_NOEXCEPT +#define HALF_NOTHROW throw() +#endif + +#include +#include +#include +#include +#include +#include +#if HALF_ENABLE_CPP11_TYPE_TRAITS +#include +#endif +#if HALF_ENABLE_CPP11_CSTDINT +#include +#endif +#if HALF_ENABLE_CPP11_HASH +#include +#endif + +/// Default rounding mode. +/// This specifies the rounding mode used for all conversions between +/// [half](\ref half_float::half)s and `float`s as +/// well as for the half_cast() if not specifying a rounding mode explicitly. It +/// can be redefined (before including +/// half.hpp) to one of the standard rounding modes using their respective +/// constants or the equivalent values of +/// `std::float_round_style`: +/// +/// `std::float_round_style` | value | rounding +/// ---------------------------------|-------|------------------------- +/// `std::round_indeterminate` | -1 | fastest (default) +/// `std::round_toward_zero` | 0 | toward zero +/// `std::round_to_nearest` | 1 | to nearest +/// `std::round_toward_infinity` | 2 | toward positive infinity +/// `std::round_toward_neg_infinity` | 3 | toward negative infinity +/// +/// By default this is set to `-1` (`std::round_indeterminate`), which uses +/// truncation (round toward zero, but with +/// overflows set to infinity) and is the fastest rounding mode possible. It can +/// even be set to +/// `std::numeric_limits::round_style` to synchronize the rounding mode +/// with that of the underlying +/// single-precision implementation. +#ifndef HALF_ROUND_STYLE +#define HALF_ROUND_STYLE 1 // = std::round_to_nearest +#endif + +/// Tie-breaking behaviour for round to nearest. +/// This specifies if ties in round to nearest should be resolved by rounding to +/// the nearest even value. By default this +/// is defined to `0` resulting in the faster but slightly more biased behaviour +/// of rounding away from zero in half-way +/// cases (and thus equal to the round() function), but can be redefined to `1` +/// (before including half.hpp) if more +/// IEEE-conformant behaviour is needed. 
+#ifndef HALF_ROUND_TIES_TO_EVEN
+#define HALF_ROUND_TIES_TO_EVEN 0 // ties away from zero
+#endif
+
+/// Value signaling overflow.
+/// In correspondence with `HUGE_VAL[F|L]` from `<cmath>` this symbol expands
+/// to a positive value signaling the overflow of an operation, in particular
+/// it just evaluates to positive infinity.
+#define HUGE_VALH std::numeric_limits<half_float::half>::infinity()
+
+/// Fast half-precision fma function.
+/// This symbol is only defined if the fma() function generally executes as
+/// fast as, or faster than, a separate half-precision multiplication followed
+/// by an addition. Due to the internal single-precision implementation of all
+/// arithmetic operations, this is in fact always the case.
+#define FP_FAST_FMAH 1
+
+#ifndef FP_ILOGB0
+#define FP_ILOGB0 INT_MIN
+#endif
+#ifndef FP_ILOGBNAN
+#define FP_ILOGBNAN INT_MAX
+#endif
+#ifndef FP_SUBNORMAL
+#define FP_SUBNORMAL 0
+#endif
+#ifndef FP_ZERO
+#define FP_ZERO 1
+#endif
+#ifndef FP_NAN
+#define FP_NAN 2
+#endif
+#ifndef FP_INFINITE
+#define FP_INFINITE 3
+#endif
+#ifndef FP_NORMAL
+#define FP_NORMAL 4
+#endif
+
+/// Main namespace for half precision functionality.
+/// This namespace contains all the functionality provided by the library.
+namespace half_float {
+class half;
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+/// Library-defined half-precision literals.
+/// Import this namespace to enable half-precision floating point literals:
+/// ~~~~{.cpp}
+/// using namespace half_float::literal;
+/// half_float::half f = 4.2_h;
+/// ~~~~
+namespace literal {
+half operator"" _h(long double);
+}
+#endif
+
+/// \internal
+/// \brief Implementation details.
+namespace detail {
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+/// Conditional type.
+template <bool B, typename T, typename F>
+struct conditional : std::conditional<B, T, F> {};
+
+/// Helper for tag dispatching.
+template <bool B> struct bool_type : std::integral_constant<bool, B> {};
+using std::false_type;
+using std::true_type;
+
+/// Type traits for floating point types.
+template <typename T> struct is_float : std::is_floating_point<T> {};
+#else
+/// Conditional type.
+template <bool, typename T, typename> struct conditional { typedef T type; };
+template <typename T, typename F> struct conditional<false, T, F> {
+  typedef F type;
+};
+
+/// Helper for tag dispatching.
+template <bool> struct bool_type {};
+typedef bool_type<true> true_type;
+typedef bool_type<false> false_type;
+
+/// Type traits for floating point types.
+template <typename> struct is_float : false_type {};
+template <typename T> struct is_float<const T> : is_float<T> {};
+template <typename T> struct is_float<volatile T> : is_float<T> {};
+template <typename T> struct is_float<const volatile T> : is_float<T> {};
+template <> struct is_float<float> : true_type {};
+template <> struct is_float<double> : true_type {};
+template <> struct is_float<long double> : true_type {};
+#endif
+
+/// Type traits for floating point bits.
+template <typename T> struct bits { typedef unsigned char type; };
+template <typename T> struct bits<const T> : bits<T> {};
+template <typename T> struct bits<volatile T> : bits<T> {};
+template <typename T> struct bits<const volatile T> : bits<T> {};
+
+#if HALF_ENABLE_CPP11_CSTDINT
+/// Unsigned integer of (at least) 16 bits width.
+typedef std::uint_least16_t uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <> struct bits<float> { typedef std::uint_least32_t type; };
+
+/// Unsigned integer of (at least) 64 bits width.
+template <> struct bits<double> { typedef std::uint_least64_t type; };
+#else
+/// Unsigned integer of (at least) 16 bits width.
+typedef unsigned short uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <>
+struct bits<float>
+    : conditional<std::numeric_limits<unsigned int>::digits >= 32,
+                  unsigned int, unsigned long> {};
+
+#if HALF_ENABLE_CPP11_LONG_LONG
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double>
+    : conditional<std::numeric_limits<unsigned long>::digits >= 64,
+                  unsigned long, unsigned long long> {};
+#else
+/// Unsigned integer of (at least) 64 bits width.
+template <> struct bits<double> { typedef unsigned long type; };
+#endif
+#endif
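+
+// Editor's illustration (not part of the original half library): on common
+// platforms the traits above give `bits<float>::type` an unsigned width of
+// at least 32 bits and `bits<double>::type` at least 64 bits, which is what
+// lets the conversion routines below reinterpret raw float/double bit
+// patterns through std::memcpy instead of strict-aliasing-violating casts.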
+
+/// Tag type for binary construction.
+struct binary_t {};
+
+/// Tag for binary construction.
+HALF_CONSTEXPR_CONST binary_t binary = binary_t();
+
+/// Temporary half-precision expression.
+/// This class represents a half-precision expression which just stores a
+/// single-precision value internally.
+struct expr {
+  /// Conversion constructor.
+  /// \param f single-precision value to convert
+  explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {}
+
+  /// Conversion to single-precision.
+  /// \return single precision value representing expression value
+  HALF_CONSTEXPR operator float() const HALF_NOEXCEPT { return value_; }
+
+ private:
+  /// Internal expression value stored in single-precision.
+  float value_;
+};
+
+/// SFINAE helper for generic half-precision functions.
+/// This class template has to be specialized for each valid combination of
+/// argument types to provide a corresponding `type` member equivalent to \a T.
+/// \tparam T type to return
+template <typename T, typename, typename = void, typename = void>
+struct enable {};
+template <typename T> struct enable<T, half, void, void> { typedef T type; };
+template <typename T> struct enable<T, expr, void, void> { typedef T type; };
+template <typename T> struct enable<T, half, half, void> { typedef T type; };
+template <typename T> struct enable<T, half, expr, void> { typedef T type; };
+template <typename T> struct enable<T, expr, half, void> { typedef T type; };
+template <typename T> struct enable<T, expr, expr, void> { typedef T type; };
+template <typename T> struct enable<T, half, half, half> { typedef T type; };
+template <typename T> struct enable<T, half, half, expr> { typedef T type; };
+template <typename T> struct enable<T, half, expr, half> { typedef T type; };
+template <typename T> struct enable<T, half, expr, expr> { typedef T type; };
+template <typename T> struct enable<T, expr, half, half> { typedef T type; };
+template <typename T> struct enable<T, expr, half, expr> { typedef T type; };
+template <typename T> struct enable<T, expr, expr, half> { typedef T type; };
+template <typename T> struct enable<T, expr, expr, expr> { typedef T type; };
+
+/// Return type for specialized generic 2-argument half-precision functions.
+/// This class template has to be specialized for each valid combination of
+/// argument types to provide a corresponding `type` member denoting the
+/// appropriate return type.
+/// \tparam T first argument type
+/// \tparam U second argument type
+template <typename T, typename U> struct result : enable<expr, T, U> {};
+template <> struct result<half, half> { typedef half type; };
+
+/// \name Classification helpers
+/// \{
+
+/// Check for infinity.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if infinity
+/// \retval false else
+template <typename T> bool builtin_isinf(T arg) {
+#if HALF_ENABLE_CPP11_CMATH
+  return std::isinf(arg);
+#elif defined(_MSC_VER)
+  return !::_finite(static_cast<double>(arg)) &&
+         !::_isnan(static_cast<double>(arg));
+#else
+  return arg == std::numeric_limits<T>::infinity() ||
+         arg == -std::numeric_limits<T>::infinity();
+#endif
+}
+
+/// Check for NaN.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if not a number
+/// \retval false else
+template <typename T> bool builtin_isnan(T arg) {
+#if HALF_ENABLE_CPP11_CMATH
+  return std::isnan(arg);
+#elif defined(_MSC_VER)
+  return ::_isnan(static_cast<double>(arg)) != 0;
+#else
+  return arg != arg;
+#endif
+}
+
+/// Check sign.
+/// \tparam T argument type (builtin floating point type) +/// \param arg value to query +/// \retval true if signbit set +/// \retval false else +template bool builtin_signbit(T arg) { +#if HALF_ENABLE_CPP11_CMATH + return std::signbit(arg); +#else + return arg < T() || (arg == T() && T(1) / arg < T()); +#endif +} + +/// \} +/// \name Conversion +/// \{ + +/// Convert IEEE single-precision to half-precision. +/// Credit for this goes to [Jeroen van der +/// Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest +/// rounding +/// \param value single-precision value +/// \return binary representation of half-precision value +template +uint16 float2half_impl(float value, true_type) { + typedef bits::type uint32; + uint32 bits; // = *reinterpret_cast(&value); + // //violating + // strict aliasing! + std::memcpy(&bits, &value, sizeof(float)); + /* uint16 hbits = (bits>>16) & 0x8000; + bits &= 0x7FFFFFFF; + int exp = bits >> 23; + if(exp == 255) + return hbits | 0x7C00 | + (0x3FF&-static_cast((bits&0x7FFFFF)!=0)); + if(exp > 142) + { + if(R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits>>15); + if(R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits>>15); + return hbits | 0x7BFF + (R!=std::round_toward_zero); + } + int g, s; + if(exp > 112) + { + g = (bits>>12) & 1; + s = (bits&0xFFF) != 0; + hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF); + } + else if(exp > 101) + { + int i = 125 - exp; + bits = (bits&0x7FFFFF) | 0x800000; + g = (bits>>i) & 1; + s = (bits&((1L<> (i+1); + } + else + { + g = 0; + s = bits != 0; + } + if(R == std::round_to_nearest) + #if HALF_ROUND_TIES_TO_EVEN + hbits += g & (s|hbits); + #else + hbits += g; + #endif + else if(R == std::round_toward_infinity) + hbits += ~(hbits>>15) & (s|g); + else if(R == std::round_toward_neg_infinity) + hbits += (hbits>>15) & (g|s); + */ + static const uint16 base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, + 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, + 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, + 0x3800, 0x3C00, 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, + 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, + 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, + 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, + 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 0xC000, 0xC400, 0xC800, + 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, + 0xF000, 0xF400, 0xF800, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00}; + static const unsigned char shift_table[512] = { + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, + 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, + 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 13}; + uint16 hbits = + base_table[bits >> 23] + + static_cast((bits & 0x7FFFFF) >> shift_table[bits >> 23]); + if (R == std::round_to_nearest) + hbits += + (((bits & 0x7FFFFF) >> (shift_table[bits >> 23] - 1)) | + (((bits >> 23) & 0xFF) == 102)) & + ((hbits & 0x7C00) != 0x7C00) +#if HALF_ROUND_TIES_TO_EVEN + & (((((static_cast(1) << (shift_table[bits >> 23] - 1)) - 1) & + bits) != 0) | + hbits) +#endif + ; + else if (R == std::round_toward_zero) + hbits -= ((hbits & 0x7FFF) == 0x7C00) & ~shift_table[bits >> 23]; + else if (R == std::round_toward_infinity) + hbits += + ((((bits & 0x7FFFFF & + ((static_cast(1) << (shift_table[bits >> 23])) - 1)) != 0) | + (((bits >> 23) <= 102) & ((bits >> 23) != 0))) & + (hbits < 0x7C00)) - + ((hbits == 0xFC00) & ((bits >> 23) != 511)); + else if (R == std::round_toward_neg_infinity) + hbits += + ((((bits & 0x7FFFFF & + ((static_cast(1) << (shift_table[bits >> 23])) - 1)) != 0) | + (((bits >> 23) <= 358) & ((bits >> 23) != 256))) & + (hbits < 0xFC00) & (hbits >> 15)) - + ((hbits == 0x7C00) & ((bits >> 23) != 255)); + return hbits; +} + +/// Convert IEEE double-precision to half-precision. +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest +/// rounding +/// \param value double-precision value +/// \return binary representation of half-precision value +template +uint16 float2half_impl(double value, true_type) { + typedef bits::type uint32; + typedef bits::type uint64; + uint64 bits; // = *reinterpret_cast(&value); + // //violating + // strict aliasing! 
+ std::memcpy(&bits, &value, sizeof(double)); + uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF; + uint16 hbits = (hi >> 16) & 0x8000; + hi &= 0x7FFFFFFF; + int exp = hi >> 20; + if (exp == 2047) + return hbits | 0x7C00 | + (0x3FF & -static_cast((bits & 0xFFFFFFFFFFFFF) != 0)); + if (exp > 1038) { + if (R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits >> 15); + if (R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits >> 15); + return hbits | 0x7BFF + (R != std::round_toward_zero); + } + int g, s = lo != 0; + if (exp > 1008) { + g = (hi >> 9) & 1; + s |= (hi & 0x1FF) != 0; + hbits |= ((exp - 1008) << 10) | ((hi >> 10) & 0x3FF); + } else if (exp > 997) { + int i = 1018 - exp; + hi = (hi & 0xFFFFF) | 0x100000; + g = (hi >> i) & 1; + s |= (hi & ((1L << i) - 1)) != 0; + hbits |= hi >> (i + 1); + } else { + g = 0; + s |= hi != 0; + } + if (R == std::round_to_nearest) +#if HALF_ROUND_TIES_TO_EVEN + hbits += g & (s | hbits); +#else + hbits += g; +#endif + else if (R == std::round_toward_infinity) + hbits += ~(hbits >> 15) & (s | g); + else if (R == std::round_toward_neg_infinity) + hbits += (hbits >> 15) & (g | s); + return hbits; +} + +/// Convert non-IEEE floating point to half-precision. +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest +/// rounding +/// \tparam T source type (builtin floating point type) +/// \param value floating point value +/// \return binary representation of half-precision value +template +uint16 float2half_impl(T value, ...) { + uint16 hbits = static_cast(builtin_signbit(value)) << 15; + if (value == T()) + return hbits; + if (builtin_isnan(value)) + return hbits | 0x7FFF; + if (builtin_isinf(value)) + return hbits | 0x7C00; + int exp; + std::frexp(value, &exp); + if (exp > 16) { + if (R == std::round_toward_infinity) + return hbits | (0x7C00 - (hbits >> 15)); + else if (R == std::round_toward_neg_infinity) + return hbits | (0x7BFF + (hbits >> 15)); + return hbits | (0x7BFF + (R != std::round_toward_zero)); + } + if (exp < -13) + value = std::ldexp(value, 24); + else { + value = std::ldexp(value, 11 - exp); + hbits |= ((exp + 13) << 10); + } + T ival, frac = std::modf(value, &ival); + hbits += static_cast(std::abs(static_cast(ival))); + if (R == std::round_to_nearest) { + frac = std::abs(frac); +#if HALF_ROUND_TIES_TO_EVEN + hbits += (frac > T(0.5)) | ((frac == T(0.5)) & hbits); +#else + hbits += frac >= T(0.5); +#endif + } else if (R == std::round_toward_infinity) + hbits += frac > T(); + else if (R == std::round_toward_neg_infinity) + hbits += frac < T(); + return hbits; +} + +/// Convert floating point to half-precision. +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest +/// rounding +/// \tparam T source type (builtin floating point type) +/// \param value floating point value +/// \return binary representation of half-precision value +template uint16 float2half(T value) { + return float2half_impl( + value, bool_type < std::numeric_limits::is_iec559 && + sizeof(typename bits::type) == sizeof(T) > ()); +} + +/// Convert integer to half-precision floating point. 
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest
+/// rounding
+/// \tparam S `true` if value negative, `false` else
+/// \tparam T type to convert (builtin integer type)
+/// \param value non-negative integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, bool S, typename T>
+uint16 int2half_impl(T value) {
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_integral<T>::value,
+                "int to half conversion only supports builtin integer types");
+#endif
+  if (S)
+    value = -value;
+  uint16 bits = S << 15;
+  if (value > 0xFFFF) {
+    if (R == std::round_toward_infinity)
+      bits |= 0x7C00 - S;
+    else if (R == std::round_toward_neg_infinity)
+      bits |= 0x7BFF + S;
+    else
+      bits |= 0x7BFF + (R != std::round_toward_zero);
+  } else if (value) {
+    uint32_t m = value, exp = 24;
+    for (; m < 0x400; m <<= 1, --exp)
+      ;
+    for (; m > 0x7FF; m >>= 1, ++exp)
+      ;
+    bits |= (exp << 10) + m;
+    if (exp > 24) {
+      if (R == std::round_to_nearest)
+        bits += (value >> (exp - 25)) & 1
+#if HALF_ROUND_TIES_TO_EVEN
+                & (((((1 << (exp - 25)) - 1) & value) != 0) | bits)
+#endif
+            ;
+      else if (R == std::round_toward_infinity)
+        bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & !S;
+      else if (R == std::round_toward_neg_infinity)
+        bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & S;
+    }
+  }
+  return bits;
+}
+
+/// Convert integer to half-precision floating point.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest
+/// rounding
+/// \tparam T type to convert (builtin integer type)
+/// \param value integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T> uint16 int2half(T value) {
+  return (value < 0) ? int2half_impl<R, true, T>(value)
+                     : int2half_impl<R, false, T>(value);
+}
+
+/// Convert half-precision to IEEE single-precision.
+/// Credit for this goes to [Jeroen van der
+/// Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+/// \param value binary representation of half-precision value +/// \return single-precision value +inline float half2float_impl(uint16 value, float, true_type) { + typedef bits::type uint32; + /* uint32 bits = static_cast(value&0x8000) << 16; + int abs = value & 0x7FFF; + if(abs) + { + bits |= 0x38000000 << static_cast(abs>=0x7C00); + for(; abs<0x400; abs<<=1,bits-=0x800000) ; + bits += static_cast(abs) << 13; + } + */ + static const uint32 mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, + 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, + 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, + 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, + 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, + 0x35F00000, 0x35F80000, 0x36000000, 0x36040000, 0x36080000, 0x360C0000, + 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, + 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, + 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, + 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 0x36800000, 0x36820000, + 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, + 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, + 0x369C0000, 0x369E0000, 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, + 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, + 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, + 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, + 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 0x36E00000, 0x36E20000, + 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, + 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, + 0x36FC0000, 0x36FE0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, + 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, + 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, 0x37200000, 0x37210000, + 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, + 0x372E0000, 0x372F0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, + 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, + 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, + 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 0x37500000, 0x37510000, + 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, + 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, + 0x375E0000, 0x375F0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, + 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, + 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, + 0x37760000, 0x37770000, 0x37780000, 
0x37790000, 0x377A0000, 0x377B0000, + 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, + 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, + 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, + 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, + 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, + 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, + 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, + 0x379F0000, 0x379F8000, 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, + 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, + 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, + 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, + 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 0x37B00000, 0x37B08000, + 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, + 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, + 0x37B70000, 0x37B78000, 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, + 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, + 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, + 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, + 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 0x37C80000, 0x37C88000, + 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, + 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, + 0x37CF0000, 0x37CF8000, 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, + 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, + 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, + 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, + 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, + 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, 0x37E00000, 0x37E08000, + 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, + 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, + 0x37E70000, 0x37E78000, 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, + 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, + 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, + 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, + 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 0x37F80000, 0x37F88000, + 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, + 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, + 0x37FF0000, 0x37FF8000, 0x38000000, 0x38004000, 0x38008000, 0x3800C000, + 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, + 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, + 0x38058000, 0x3805C000, 0x38060000, 
0x38064000, 0x38068000, 0x3806C000, + 0x38070000, 0x38074000, 0x38078000, 0x3807C000, 0x38080000, 0x38084000, + 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, + 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, + 0x380B8000, 0x380BC000, 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, + 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, + 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, + 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, + 0x38130000, 0x38134000, 0x38138000, 0x3813C000, 0x38140000, 0x38144000, + 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, + 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, + 0x38178000, 0x3817C000, 0x38180000, 0x38184000, 0x38188000, 0x3818C000, + 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, + 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, + 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, + 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 0x38200000, 0x38204000, + 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, + 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, + 0x38238000, 0x3823C000, 0x38240000, 0x38244000, 0x38248000, 0x3824C000, + 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, + 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, + 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, + 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 0x382C0000, 0x382C4000, + 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, + 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, + 0x382F8000, 0x382FC000, 0x38300000, 0x38304000, 0x38308000, 0x3830C000, + 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, + 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, + 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, + 0x38370000, 0x38374000, 0x38378000, 0x3837C000, 0x38380000, 0x38384000, + 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, + 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, + 0x383B8000, 0x383BC000, 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, + 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, + 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, + 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, + 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 0x38440000, 0x38444000, + 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, + 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, + 0x38478000, 0x3847C000, 0x38480000, 0x38484000, 0x38488000, 0x3848C000, + 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, + 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, + 0x384D8000, 0x384DC000, 0x384E0000, 
0x384E4000, 0x384E8000, 0x384EC000, + 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 0x38500000, 0x38504000, + 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, + 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, + 0x38538000, 0x3853C000, 0x38540000, 0x38544000, 0x38548000, 0x3854C000, + 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, + 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, + 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, + 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, 0x385C0000, 0x385C4000, + 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, + 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, + 0x385F8000, 0x385FC000, 0x38600000, 0x38604000, 0x38608000, 0x3860C000, + 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, + 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, + 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, + 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 0x38680000, 0x38684000, + 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, + 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, + 0x386B8000, 0x386BC000, 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, + 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, + 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, + 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, + 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 0x38740000, 0x38744000, + 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, + 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, + 0x38778000, 0x3877C000, 0x38780000, 0x38784000, 0x38788000, 0x3878C000, + 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, + 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, + 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, + 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, 0x38000000, 0x38002000, + 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, + 0x3801C000, 0x3801E000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, + 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, + 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 0x38060000, 0x38062000, + 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, + 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, + 0x3807C000, 0x3807E000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, + 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, + 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, + 0x380AC000, 0x380AE000, 0x380B0000, 
0x380B2000, 0x380B4000, 0x380B6000, + 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 0x380C0000, 0x380C2000, + 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, + 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, + 0x380DC000, 0x380DE000, 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, + 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, + 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, + 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, + 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, 0x38120000, 0x38122000, + 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, + 0x3813C000, 0x3813E000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, + 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, + 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 0x38180000, 0x38182000, + 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, + 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, + 0x3819C000, 0x3819E000, 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, + 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, + 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, + 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, + 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 0x381E0000, 0x381E2000, + 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, + 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, + 0x381FC000, 0x381FE000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, + 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, + 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, + 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, 0x38240000, 0x38242000, + 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, + 0x3825C000, 0x3825E000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, + 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, + 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 0x382A0000, 0x382A2000, + 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, + 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, + 0x382BC000, 0x382BE000, 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, + 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, + 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, + 0x382EC000, 0x382EE000, 0x382F0000, 
0x382F2000, 0x382F4000, 0x382F6000, + 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, + 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, + 0x3831C000, 0x3831E000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, + 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, + 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, + 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, 0x38360000, 0x38362000, + 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, + 0x3837C000, 0x3837E000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, + 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, + 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, + 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 0x383C0000, 0x383C2000, + 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, + 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, + 0x383DC000, 0x383DE000, 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, + 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, + 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, + 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, + 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, + 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, + 0x3843C000, 0x3843E000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, + 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, + 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, + 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, 0x38480000, 0x38482000, + 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, + 0x3849C000, 0x3849E000, 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, + 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, + 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, + 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, + 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 0x384E0000, 0x384E2000, + 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, + 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, + 0x384FC000, 0x384FE000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, + 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, + 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, + 0x3852C000, 0x3852E000, 0x38530000, 
0x38532000, 0x38534000, 0x38536000, + 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, + 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, + 0x3855C000, 0x3855E000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, + 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, + 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, + 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, 0x385A0000, 0x385A2000, + 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, + 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, + 0x385BC000, 0x385BE000, 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, + 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, + 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, + 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, + 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 0x38600000, 0x38602000, + 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, + 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, + 0x3861C000, 0x3861E000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, + 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, + 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, + 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, + 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, + 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, + 0x3867C000, 0x3867E000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, + 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, + 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, + 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, 0x386C0000, 0x386C2000, + 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, + 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, + 0x386DC000, 0x386DE000, 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, + 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, + 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, + 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 0x38720000, 0x38722000, + 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, + 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, + 0x3873C000, 0x3873E000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, + 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, + 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, + 0x3876C000, 0x3876E000, 0x38770000, 
0x38772000, 0x38774000, 0x38776000,
+      0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 0x38780000, 0x38782000,
+      0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000,
+      0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000,
+      0x3879C000, 0x3879E000, 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000,
+      0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000,
+      0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000,
+      0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000,
+      0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000,
+      0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, 0x387E0000, 0x387E2000,
+      0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000,
+      0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000,
+      0x387FC000, 0x387FE000};
+  static const uint32 exponent_table[64] = {
+      0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000,
+      0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000,
+      0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000,
+      0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000,
+      0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000,
+      0x0F000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000,
+      0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000,
+      0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
+      0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000,
+      0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000,
+      0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000};
+  static const unsigned short offset_table[64] = {
+      0,    1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+      1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+      1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 0,
+      1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+      1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+      1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024};
+  uint32 bits = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] +
+                exponent_table[value >> 10];
+  // return *reinterpret_cast<float*>(&bits);  // violating strict aliasing!
+  float out;
+  std::memcpy(&out, &bits, sizeof(float));
+  return out;
+}
+
+/// Convert half-precision to IEEE double-precision.
+/// \param value binary representation of half-precision value
+/// \return double-precision value
+inline double half2float_impl(uint16 value, double, true_type) {
+  typedef bits<float>::type uint32;
+  typedef bits<double>::type uint64;
+  uint32 hi = static_cast<uint32>(value & 0x8000) << 16;
+  int abs = value & 0x7FFF;
+  if (abs) {
+    hi |= 0x3F000000 << static_cast<unsigned>(abs >= 0x7C00);
+    for (; abs < 0x400; abs <<= 1, hi -= 0x100000)
+      ;
+    hi += static_cast<uint32>(abs) << 10;
+  }
+  uint64 bits = static_cast<uint64>(hi) << 32;
+  // return *reinterpret_cast<double*>(&bits);  // violating strict aliasing!
+  double out;
+  std::memcpy(&out, &bits, sizeof(double));
+  return out;
+}
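+
+// Example (illustrative sketch, not part of the vendored header): the
+// std::memcpy above is the standard-conformant way to reinterpret an
+// integer bit pattern as a float; dereferencing a casted pointer would
+// violate strict aliasing.
+#if 0  // illustration only
+inline float bits_to_float(unsigned int bits32) {
+  float out;
+  std::memcpy(&out, &bits32, sizeof(float));  // well-defined type punning
+  return out;  // bits_to_float(0x3F800000) == 1.0f
+}
+#endif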
+
+/// Convert half-precision to non-IEEE floating point.
+/// \tparam T type to convert to (builtin floating point type)
+/// \param value binary representation of half-precision value
+/// \return floating point value
+template <typename T> T half2float_impl(uint16 value, T, ...) {
+  T out;
+  int abs = value & 0x7FFF;
+  if (abs > 0x7C00)
+    out = std::numeric_limits<T>::has_quiet_NaN
+              ? std::numeric_limits<T>::quiet_NaN()
+              : T();
+  else if (abs == 0x7C00)
+    out = std::numeric_limits<T>::has_infinity
+              ? std::numeric_limits<T>::infinity()
+              : std::numeric_limits<T>::max();
+  else if (abs > 0x3FF)
+    out = std::ldexp(static_cast<T>((abs & 0x3FF) | 0x400), (abs >> 10) - 25);
+  else
+    out = std::ldexp(static_cast<T>(abs), -24);
+  return (value & 0x8000) ? -out : out;
+}
+
+/// Convert half-precision to floating point.
+/// \tparam T type to convert to (builtin floating point type)
+/// \param value binary representation of half-precision value
+/// \return floating point value
+template <typename T> T half2float(uint16 value) {
+  return half2float_impl(
+      value, T(), bool_type<std::numeric_limits<T>::is_iec559 &&
+                            sizeof(typename bits<T>::type) == sizeof(T)>());
+}
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest
+/// rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \tparam T type to convert to (builtin integer type with at least 16 bits
+/// precision, excluding any implicit sign bits)
+/// \param value binary representation of half-precision value
+/// \return integral value
+template <std::float_round_style R, bool E, typename T>
+T half2int_impl(uint16 value) {
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_integral<T>::value,
+                "half to int conversion only supports builtin integer types");
+#endif
+  uint32_t e = value & 0x7FFF;
+  if (e >= 0x7C00)
+    return (value & 0x8000) ? std::numeric_limits<T>::min()
+                            : std::numeric_limits<T>::max();
+  if (e < 0x3800) {
+    if (R == std::round_toward_infinity)
+      return T(~(value >> 15) & (e != 0));
+    else if (R == std::round_toward_neg_infinity)
+      return -T(value > 0x8000);
+    return T();
+  }
+  uint32_t m = (value & 0x3FF) | 0x400;
+  e >>= 10;
+  if (e < 25) {
+    if (R == std::round_to_nearest)
+      m += (1 << (24 - e)) - (~(m >> (25 - e)) & E);
+    else if (R == std::round_toward_infinity)
+      m += ((value >> 15) - 1) & ((1 << (25 - e)) - 1U);
+    else if (R == std::round_toward_neg_infinity)
+      m += -(value >> 15) & ((1 << (25 - e)) - 1U);
+    m >>= 25 - e;
+  } else
+    m <<= e - 25;
+  return (value & 0x8000) ? -static_cast<T>(m) : static_cast<T>(m);
+}
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest
+/// rounding
+/// \tparam T type to convert to (builtin integer type with at least 16 bits
+/// precision, excluding any implicit sign bits)
+/// \param value binary representation of half-precision value
+/// \return integral value
+template <std::float_round_style R, typename T> T half2int(uint16 value) {
+  return half2int_impl<R, HALF_ROUND_TIES_TO_EVEN, T>(value);
+}
+
+/// Convert half-precision floating point to integer using
+/// round-to-nearest-away-from-zero.
+/// \tparam T type to convert to (builtin integer type with at least 16 bits
+/// precision, excluding any implicit sign bits)
+/// \param value binary representation of half-precision value
+/// \return integral value
+template <typename T> T half2int_up(uint16 value) {
+  return half2int_impl<std::round_to_nearest, 0, T>(value);
+}
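+
+// Example (illustrative sketch, not part of the vendored header): 2.5 has
+// the half-precision bit pattern 0x4100; the rounding mode and the tie
+// policy E decide which integer it becomes.
+#if 0  // illustration only
+int a = half2int<std::round_to_nearest, int>(0x4100);   // 2 if ties-to-even
+int b = half2int_up<int>(0x4100);                       // 3, ties away from 0
+int c = half2int<std::round_toward_zero, int>(0x4100);  // 2, truncation
+#endif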
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest
+/// rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R, bool E>
+uint16 round_half_impl(uint16 value) {
+  uint32_t e = value & 0x7FFF;
+  uint16 result = value;
+  if (e < 0x3C00) {
+    result &= 0x8000;
+    if (R == std::round_to_nearest)
+      result |= 0x3C00U & -(e >= (0x3800 + E));
+    else if (R == std::round_toward_infinity)
+      result |= 0x3C00U & -(~(value >> 15) & (e != 0));
+    else if (R == std::round_toward_neg_infinity)
+      result |= 0x3C00U & -(value > 0x8000);
+  } else if (e < 0x6400) {
+    e = 25 - (e >> 10);
+    uint32_t mask = (1 << e) - 1;
+    if (R == std::round_to_nearest)
+      result += (1 << (e - 1)) - (~(result >> e) & E);
+    else if (R == std::round_toward_infinity)
+      result += mask & ((value >> 15) - 1);
+    else if (R == std::round_toward_neg_infinity)
+      result += mask & -(value >> 15);
+    result &= ~mask;
+  }
+  return result;
+}
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest
+/// rounding
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R> uint16 round_half(uint16 value) {
+  return round_half_impl<R, HALF_ROUND_TIES_TO_EVEN>(value);
+}
+
+/// Round half-precision number to nearest integer value using
+/// round-to-nearest-away-from-zero.
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+inline uint16 round_half_up(uint16 value) {
+  return round_half_impl<std::round_to_nearest, 0>(value);
+}
+/// \}
+
+struct functions;
+template <typename T> struct unary_specialized;
+template <typename T, typename U> struct binary_specialized;
+template <typename T, typename U, std::float_round_style R> struct half_caster;
+} // namespace detail
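+
+// Example (illustrative sketch, not part of the vendored header): rounding
+// happens directly on the bit pattern, without converting to float. 1.5 is
+// 0x3E00 in half precision:
+#if 0  // illustration only
+detail::uint16 f =
+    detail::round_half<std::round_toward_neg_infinity>(0x3E00);  // 0x3C00, 1.0
+detail::uint16 c =
+    detail::round_half<std::round_toward_infinity>(0x3E00);      // 0x4000, 2.0
+#endif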
+
+/// Half-precision floating point type.
+/// This class implements an IEEE-conformant half-precision floating point
+/// type with the usual arithmetic operators and conversions. It is implicitly
+/// convertible to single-precision floating point, which makes arithmetic
+/// expressions and functions with mixed-type operands be of the most precise
+/// operand type. Additionally all arithmetic operations (and many
+/// mathematical functions) are carried out in single-precision internally.
+/// All conversions from single- to half-precision are done using the
+/// library's default rounding mode, but temporary results inside chained
+/// arithmetic expressions are kept in single-precision as long as possible
+/// (while of course still maintaining a strong half-precision type).
+///
+/// According to the C++98/03 definition, the half type is not a POD type.
+/// But according to C++11's less strict and extended definitions it is both
+/// a standard layout type and a trivially copyable type (even if not a POD
+/// type), which means it can be standard-conformantly copied using raw
+/// binary copies. In this context a few more words about the actual size of
+/// the type are in order. Although the half is representing an IEEE 16-bit
+/// type, it does not necessarily have to be of exactly 16 bits size. But on
+/// any reasonable implementation the actual binary representation of this
+/// type will most probably not involve any additional "magic" or padding
+/// beyond the simple binary representation of the underlying 16-bit IEEE
+/// number. It only has an actual size of 16 bits if your C++ implementation
+/// supports an unsigned integer type of exactly 16 bits width, but this
+/// should be the case on nearly any reasonable platform.
+///
+/// So if your C++ implementation is not totally exotic or imposes special
+/// alignment requirements, it is a reasonable assumption that the data of a
+/// half is just comprised of the 2 bytes of the underlying IEEE
+/// representation.
+class half {
+  friend struct detail::functions;
+  friend struct detail::unary_specialized<half>;
+  friend struct detail::binary_specialized<half, half>;
+  template <typename, typename, std::float_round_style>
+  friend struct detail::half_caster;
+  friend class std::numeric_limits<half>;
+#if HALF_ENABLE_CPP11_HASH
+  friend struct std::hash<half>;
+#endif
+#if HALF_ENABLE_CPP11_USER_LITERALS
+  friend half literal::operator"" _h(long double);
+#endif
+
+ public:
+  /// Default constructor.
+  /// This initializes the half to 0. Although this does not match the
+  /// builtin types' default-initialization semantics and may be less
+  /// efficient than no initialization, it is needed to provide proper
+  /// value-initialization semantics.
+  HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
+
+  /// Copy constructor.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to copy from
+  half(detail::expr rhs)
+      : data_(detail::float2half<round_style>(static_cast<float>(rhs))) {}
+
+  /// Conversion constructor.
+  /// \param rhs float to convert
+  explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
+
+  /// Conversion to single-precision.
+  /// \return single precision value representing expression value
+  operator float() const { return detail::half2float<float>(data_); }
+
+  /// Assignment operator.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to copy from
+  /// \return reference to this half
+  half& operator=(detail::expr rhs) { return *this = static_cast<float>(rhs); }
+
+  /// Arithmetic assignment.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to add
+  /// \return reference to this half
+  template <typename T>
+  typename detail::enable<half&, T>::type operator+=(T rhs) {
+    return *this += static_cast<float>(rhs);
+  }
+
+  /// Arithmetic assignment.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to subtract
+  /// \return reference to this half
+  template <typename T>
+  typename detail::enable<half&, T>::type operator-=(T rhs) {
+    return *this -= static_cast<float>(rhs);
+  }
+
+  /// Arithmetic assignment.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to multiply with
+  /// \return reference to this half
+  template <typename T>
+  typename detail::enable<half&, T>::type operator*=(T rhs) {
+    return *this *= static_cast<float>(rhs);
+  }
+
+  /// Arithmetic assignment.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to divide by
+  /// \return reference to this half
+  template <typename T>
+  typename detail::enable<half&, T>::type operator/=(T rhs) {
+    return *this /= static_cast<float>(rhs);
+  }
+
+  /// Assignment operator.
+  /// \param rhs single-precision value to copy from
+  /// \return reference to this half
+  half& operator=(float rhs) {
+    data_ = detail::float2half<round_style>(rhs);
+    return *this;
+  }
+
+  /// Arithmetic assignment.
+  /// \param rhs single-precision value to add
+  /// \return reference to this half
+  half& operator+=(float rhs) {
+    data_ =
+        detail::float2half<round_style>(detail::half2float<float>(data_) + rhs);
+    return *this;
+  }
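+
+  // Example (illustrative sketch, not part of the vendored header):
+  // arithmetic runs in single precision internally and rounds back to half
+  // on assignment.
+#if 0  // illustration only
+  half h(3.5f);  // explicit conversion from float
+  h += 1.25f;    // computed as float, stored back as half
+  float f = h;   // implicit conversion back, f == 4.75f
+#endif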
+
+  /// Arithmetic assignment.
+  /// \param rhs single-precision value to subtract
+  /// \return reference to this half
+  half& operator-=(float rhs) {
+    data_ =
+        detail::float2half<round_style>(detail::half2float<float>(data_) - rhs);
+    return *this;
+  }
+
+  /// Arithmetic assignment.
+  /// \param rhs single-precision value to multiply with
+  /// \return reference to this half
+  half& operator*=(float rhs) {
+    data_ =
+        detail::float2half<round_style>(detail::half2float<float>(data_) * rhs);
+    return *this;
+  }
+
+  /// Arithmetic assignment.
+  /// \param rhs single-precision value to divide by
+  /// \return reference to this half
+  half& operator/=(float rhs) {
+    data_ =
+        detail::float2half<round_style>(detail::half2float<float>(data_) / rhs);
+    return *this;
+  }
+
+  /// Prefix increment.
+  /// \return incremented half value
+  half& operator++() { return *this += 1.0f; }
+
+  /// Prefix decrement.
+  /// \return decremented half value
+  half& operator--() { return *this -= 1.0f; }
+
+  /// Postfix increment.
+  /// \return non-incremented half value
+  half operator++(int) {
+    half out(*this);
+    ++*this;
+    return out;
+  }
+
+  /// Postfix decrement.
+  /// \return non-decremented half value
+  half operator--(int) {
+    half out(*this);
+    --*this;
+    return out;
+  }
+
+ private:
+  /// Rounding mode to use
+  static const std::float_round_style round_style =
+      (std::float_round_style)(HALF_ROUND_STYLE);
+
+  /// Constructor.
+  /// \param bits binary representation to set half to
+  HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) HALF_NOEXCEPT
+      : data_(bits) {}
+
+  /// Internal binary representation
+  detail::uint16 data_;
+};
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+namespace literal {
+/// Half literal.
+/// While this returns an actual half-precision value, half literals can
+/// unfortunately not be constant expressions due to rather involved
+/// conversions.
+/// \param value literal value
+/// \return half with given value (if representable)
+inline half operator"" _h(long double value) {
+  return half(detail::binary, detail::float2half<half::round_style>(value));
+}
+} // namespace literal
+#endif
+
+namespace detail {
+/// Wrapper implementing unspecialized half-precision functions.
+struct functions {
+  /// Addition implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision sum stored in single-precision
+  static expr plus(float x, float y) { return expr(x + y); }
+
+  /// Subtraction implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision difference stored in single-precision
+  static expr minus(float x, float y) { return expr(x - y); }
+
+  /// Multiplication implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision product stored in single-precision
+  static expr multiplies(float x, float y) { return expr(x * y); }
+
+  /// Division implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision quotient stored in single-precision
+  static expr divides(float x, float y) { return expr(x / y); }
+
+  /// Output implementation.
+  /// \param out stream to write to
+  /// \param arg value to write
+  /// \return reference to stream
+  template <typename charT, typename traits>
+  static std::basic_ostream<charT, traits>&
+  write(std::basic_ostream<charT, traits>& out, float arg) {
+    return out << arg;
+  }
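+
+  // Example (illustrative sketch, not part of the vendored header; assumes
+  // this copy keeps the usual half_float namespace and that user literals
+  // are enabled): stream output goes through write(), i.e. the value is
+  // printed as a float.
+#if 0  // illustration only
+  using namespace half_float::literal;
+  half_float::half h = 0.25_h;  // user-defined literal, not constexpr
+  std::cout << h;               // prints 0.25
+#endif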
+
+  /// Input implementation.
+  /// \param in stream to read from
+  /// \param arg half to read into
+  /// \return reference to stream
+  template <typename charT, typename traits>
+  static std::basic_istream<charT, traits>&
+  read(std::basic_istream<charT, traits>& in, half& arg) {
+    float f;
+    if (in >> f)
+      arg = f;
+    return in;
+  }
+
+  /// Modulo implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision division remainder stored in single-precision
+  static expr fmod(float x, float y) { return expr(std::fmod(x, y)); }
+
+  /// Remainder implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision division remainder stored in single-precision
+  static expr remainder(float x, float y) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::remainder(x, y));
+#else
+    if (builtin_isnan(x) || builtin_isnan(y))
+      return expr(std::numeric_limits<float>::quiet_NaN());
+    float ax = std::fabs(x), ay = std::fabs(y);
+    if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24))
+      return expr(std::numeric_limits<float>::quiet_NaN());
+    if (ay >= 65536.0f)
+      return expr(x);
+    if (ax == ay)
+      return expr(builtin_signbit(x) ? -0.0f : 0.0f);
+    ax = std::fmod(ax, ay + ay);
+    float y2 = 0.5f * ay;
+    if (ax > y2) {
+      ax -= ay;
+      if (ax >= y2)
+        ax -= ay;
+    }
+    return expr(builtin_signbit(x) ? -ax : ax);
+#endif
+  }
+
+  /// Remainder implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \param quo address to store quotient bits at
+  /// \return Half-precision division remainder stored in single-precision
+  static expr remquo(float x, float y, int* quo) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::remquo(x, y, quo));
+#else
+    if (builtin_isnan(x) || builtin_isnan(y))
+      return expr(std::numeric_limits<float>::quiet_NaN());
+    bool sign = builtin_signbit(x),
+         qsign = static_cast<bool>(sign ^ builtin_signbit(y));
+    float ax = std::fabs(x), ay = std::fabs(y);
+    if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24))
+      return expr(std::numeric_limits<float>::quiet_NaN());
+    if (ay >= 65536.0f)
+      return expr(x);
+    if (ax == ay)
+      return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f);
+    ax = std::fmod(ax, 8.0f * ay);
+    int cquo = 0;
+    if (ax >= 4.0f * ay) {
+      ax -= 4.0f * ay;
+      cquo += 4;
+    }
+    if (ax >= 2.0f * ay) {
+      ax -= 2.0f * ay;
+      cquo += 2;
+    }
+    float y2 = 0.5f * ay;
+    if (ax > y2) {
+      ax -= ay;
+      ++cquo;
+      if (ax >= y2) {
+        ax -= ay;
+        ++cquo;
+      }
+    }
+    return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax);
+#endif
+  }
+
+  /// Positive difference implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Positive difference stored in single-precision
+  static expr fdim(float x, float y) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::fdim(x, y));
+#else
+    return expr((x <= y) ? 0.0f : (x - y));
+#endif
+  }
+
+  /// Fused multiply-add implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \param z third operand
+  /// \return \a x * \a y + \a z stored in single-precision
+  static expr fma(float x, float y, float z) {
+#if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF)
+    return expr(std::fma(x, y, z));
+#else
+    return expr(x * y + z);
+#endif
+  }
+
+  /// Get NaN.
+  /// \return Half-precision quiet NaN
+  static half nanh() { return half(binary, 0x7FFF); }
+
+  /// Exponential implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr exp(float arg) { return expr(std::exp(arg)); }
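+
+  // Example (illustrative sketch, not part of the vendored header): the
+  // remquo() fallback reduces modulo 8|y| first, so quo receives the low
+  // bits of the correctly signed, rounded quotient.
+#if 0  // illustration only
+  int q;
+  float r = remquo(7.0f, 2.0f, &q);  // 7 = 4*2 - 1, so r == -1.0f, q == 4
+#endif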
+
+  /// Exponential implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr expm1(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::expm1(arg));
+#else
+    return expr(static_cast<float>(std::exp(static_cast<double>(arg)) - 1.0));
+#endif
+  }
+
+  /// Binary exponential implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr exp2(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::exp2(arg));
+#else
+    return expr(
+        static_cast<float>(std::exp(arg * 0.69314718055994530941723212145818)));
+#endif
+  }
+
+  /// Logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr log(float arg) { return expr(std::log(arg)); }
+
+  /// Common logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr log10(float arg) { return expr(std::log10(arg)); }
+
+  /// Logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr log1p(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::log1p(arg));
+#else
+    return expr(static_cast<float>(std::log(1.0 + arg)));
+#endif
+  }
+
+  /// Binary logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr log2(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::log2(arg));
+#else
+    return expr(static_cast<float>(std::log(static_cast<double>(arg)) *
+                                   1.4426950408889634073599246810019));
+#endif
+  }
+
+  /// Square root implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr sqrt(float arg) { return expr(std::sqrt(arg)); }
+
+  /// Cubic root implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr cbrt(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::cbrt(arg));
+#else
+    if (builtin_isnan(arg) || builtin_isinf(arg))
+      return expr(arg);
+    return expr(builtin_signbit(arg)
+                    ? -static_cast<float>(
+                          std::pow(-static_cast<double>(arg), 1.0 / 3.0))
+                    : static_cast<float>(
+                          std::pow(static_cast<double>(arg), 1.0 / 3.0)));
+#endif
+  }
+
+  /// Hypotenuse implementation.
+  /// \param x first argument
+  /// \param y second argument
+  /// \return function value stored in single-precision
+  static expr hypot(float x, float y) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::hypot(x, y));
+#else
+    return expr(
+        (builtin_isinf(x) || builtin_isinf(y))
+            ? std::numeric_limits<float>::infinity()
+            : static_cast<float>(std::sqrt(static_cast<double>(x) * x +
+                                           static_cast<double>(y) * y)));
+#endif
+  }
+
+  /// Power implementation.
+  /// \param base value to exponentiate
+  /// \param exp power to exponentiate to
+  /// \return function value stored in single-precision
+  static expr pow(float base, float exp) { return expr(std::pow(base, exp)); }
+
+  /// Sine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr sin(float arg) { return expr(std::sin(arg)); }
+
+  /// Cosine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr cos(float arg) { return expr(std::cos(arg)); }
+
+  /// Tangent implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr tan(float arg) { return expr(std::tan(arg)); }
+
+  /// Arc sine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr asin(float arg) { return expr(std::asin(arg)); }
+
+  /// Arc cosine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr acos(float arg) { return expr(std::acos(arg)); }
+
+  /// Arc tangent implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr atan(float arg) { return expr(std::atan(arg)); }
+
+  /// Arc tangent implementation.
+  /// \param x first argument
+  /// \param y second argument
+  /// \return function value stored in single-precision
+  static expr atan2(float x, float y) { return expr(std::atan2(x, y)); }
+
+  /// Hyperbolic sine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr sinh(float arg) { return expr(std::sinh(arg)); }
+
+  /// Hyperbolic cosine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr cosh(float arg) { return expr(std::cosh(arg)); }
+
+  /// Hyperbolic tangent implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr tanh(float arg) { return expr(std::tanh(arg)); }
+
+  /// Hyperbolic area sine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr asinh(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::asinh(arg));
+#else
+    return expr(
+        (arg == -std::numeric_limits<float>::infinity())
+            ? arg
+            : static_cast<float>(std::log(arg + std::sqrt(arg * arg + 1.0))));
+#endif
+  }
+
+  /// Hyperbolic area cosine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr acosh(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::acosh(arg));
+#else
+    return expr((arg < -1.0f) ? std::numeric_limits<float>::quiet_NaN()
+                              : static_cast<float>(std::log(
+                                    arg + std::sqrt(arg * arg - 1.0))));
+#endif
+  }
+
+  /// Hyperbolic area tangent implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr atanh(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::atanh(arg));
+#else
+    return expr(static_cast<float>(0.5 * std::log((1.0 + arg) / (1.0 - arg))));
+#endif
+  }
+
+  /// Error function implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr erf(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::erf(arg));
+#else
+    return expr(static_cast<float>(erf(static_cast<double>(arg))));
+#endif
+  }
+
+  /// Complementary error function implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr erfc(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::erfc(arg));
+#else
+    return expr(static_cast<float>(1.0 - erf(static_cast<double>(arg))));
+#endif
+  }
+
+  /// Gamma logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr lgamma(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::lgamma(arg));
+#else
+    if (builtin_isinf(arg))
+      return expr(std::numeric_limits<float>::infinity());
+    if (arg < 0.0f) {
+      float i, f = std::modf(-arg, &i);
+      if (f == 0.0f)
+        return expr(std::numeric_limits<float>::infinity());
+      return expr(static_cast<float>(
+          1.1447298858494001741434273513531 -
+          std::log(std::abs(std::sin(3.1415926535897932384626433832795 * f))) -
+          lgamma(1.0 - arg)));
+    }
+    return expr(static_cast<float>(lgamma(static_cast<double>(arg))));
+#endif
+  }
+
+  /// Gamma implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr tgamma(float arg) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::tgamma(arg));
+#else
+    if (arg == 0.0f)
+      return builtin_signbit(arg)
+                 ? expr(-std::numeric_limits<float>::infinity())
+                 : expr(std::numeric_limits<float>::infinity());
+    if (arg < 0.0f) {
+      float i, f = std::modf(-arg, &i);
+      if (f == 0.0f)
+        return expr(std::numeric_limits<float>::quiet_NaN());
+      double value = 3.1415926535897932384626433832795 /
+                     (std::sin(3.1415926535897932384626433832795 * f) *
+                      std::exp(lgamma(1.0 - arg)));
+      return expr(
+          static_cast<float>((std::fmod(i, 2.0f) == 0.0f) ? -value : value));
+    }
+    if (builtin_isinf(arg))
+      return expr(arg);
+    return expr(static_cast<float>(std::exp(lgamma(static_cast<double>(arg)))));
+#endif
+  }
+
+  /// Floor implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half floor(half arg) {
+    return half(binary, round_half<std::round_toward_neg_infinity>(arg.data_));
+  }
+
+  /// Ceiling implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half ceil(half arg) {
+    return half(binary, round_half<std::round_toward_infinity>(arg.data_));
+  }
+
+  /// Truncation implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half trunc(half arg) {
+    return half(binary, round_half<std::round_toward_zero>(arg.data_));
+  }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half round(half arg) { return half(binary, round_half_up(arg.data_)); }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static long lround(half arg) { return detail::half2int_up<long>(arg.data_); }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half rint(half arg) {
+    return half(binary, round_half<half::round_style>(arg.data_));
+  }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static long lrint(half arg) {
+    return detail::half2int<half::round_style, long>(arg.data_);
+  }
+
+#if HALF_ENABLE_CPP11_LONG_LONG
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static long long llround(half arg) {
+    return detail::half2int_up<long long>(arg.data_);
+  }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static long long llrint(half arg) {
+    return detail::half2int<half::round_style, long long>(arg.data_);
+  }
+#endif
+
+  /// Decompression implementation.
+  /// \param arg number to decompress
+  /// \param exp address to store exponent at
+  /// \return normalized significand
+  static half frexp(half arg, int* exp) {
+    int m = arg.data_ & 0x7FFF, e = -14;
+    if (m >= 0x7C00 || !m)
+      return *exp = 0, arg;
+    for (; m < 0x400; m <<= 1, --e)
+      ;
+    return *exp = e + (m >> 10),
+           half(binary, (arg.data_ & 0x8000) | 0x3800 | (m & 0x3FF));
+  }
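+
+  // Example (illustrative sketch, not part of the vendored header): the
+  // rounding functions above never leave 16-bit space; 2.7 is 0x4166 in
+  // half precision and floor() masks its fraction bits away.
+#if 0  // illustration only
+  half two = floor(half(2.7f));  // 2.0, i.e. bits 0x4000
+  long n = lround(half(2.5f));   // 3, round-half-away-from-zero
+#endif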
+
+  /// Decompression implementation.
+  /// \param arg number to decompress
+  /// \param iptr address to store integer part at
+  /// \return fractional part
+  static half modf(half arg, half* iptr) {
+    uint32_t e = arg.data_ & 0x7FFF;
+    if (e >= 0x6400)
+      return *iptr = arg, half(binary, arg.data_ & (0x8000U | -(e > 0x7C00)));
+    if (e < 0x3C00)
+      return iptr->data_ = arg.data_ & 0x8000, arg;
+    e >>= 10;
+    uint32_t mask = (1 << (25 - e)) - 1, m = arg.data_ & mask;
+    iptr->data_ = arg.data_ & ~mask;
+    if (!m)
+      return half(binary, arg.data_ & 0x8000);
+    for (; m < 0x400; m <<= 1, --e)
+      ;
+    return half(binary, static_cast<uint16>((arg.data_ & 0x8000) | (e << 10) |
+                                            (m & 0x3FF)));
+  }
+
+  /// Scaling implementation.
+  /// \param arg number to scale
+  /// \param exp power of two to scale by
+  /// \return scaled number
+  static half scalbln(half arg, long exp) {
+    uint32_t m = arg.data_ & 0x7FFF;
+    if (m >= 0x7C00 || !m)
+      return arg;
+    for (; m < 0x400; m <<= 1, --exp)
+      ;
+    exp += m >> 10;
+    uint16 value = arg.data_ & 0x8000;
+    if (exp > 30) {
+      if (half::round_style == std::round_toward_zero)
+        value |= 0x7BFF;
+      else if (half::round_style == std::round_toward_infinity)
+        value |= 0x7C00 - (value >> 15);
+      else if (half::round_style == std::round_toward_neg_infinity)
+        value |= 0x7BFF + (value >> 15);
+      else
+        value |= 0x7C00;
+    } else if (exp > 0)
+      value |= (exp << 10) | (m & 0x3FF);
+    else if (exp > -11) {
+      m = (m & 0x3FF) | 0x400;
+      if (half::round_style == std::round_to_nearest) {
+        m += 1 << -exp;
+#if HALF_ROUND_TIES_TO_EVEN
+        m -= (m >> (1 - exp)) & 1;
+#endif
+      } else if (half::round_style == std::round_toward_infinity)
+        m += ((value >> 15) - 1) & ((1 << (1 - exp)) - 1U);
+      else if (half::round_style == std::round_toward_neg_infinity)
+        m += -(value >> 15) & ((1 << (1 - exp)) - 1U);
+      value |= m >> (1 - exp);
+    } else if (half::round_style == std::round_toward_infinity)
+      value -= (value >> 15) - 1;
+    else if (half::round_style == std::round_toward_neg_infinity)
+      value += value >> 15;
+    return half(binary, value);
+  }
+
+  /// Exponent implementation.
+  /// \param arg number to query
+  /// \return floating point exponent
+  static int ilogb(half arg) {
+    int abs = arg.data_ & 0x7FFF;
+    if (!abs)
+      return FP_ILOGB0;
+    if (abs < 0x7C00) {
+      int exp = (abs >> 10) - 15;
+      if (abs < 0x400)
+        for (; abs < 0x200; abs <<= 1, --exp)
+          ;
+      return exp;
+    }
+    if (abs > 0x7C00)
+      return FP_ILOGBNAN;
+    return INT_MAX;
+  }
+
+  /// Exponent implementation.
+  /// \param arg number to query
+  /// \return floating point exponent
+  static half logb(half arg) {
+    int abs = arg.data_ & 0x7FFF;
+    if (!abs)
+      return half(binary, 0xFC00);
+    if (abs < 0x7C00) {
+      int exp = (abs >> 10) - 15;
+      if (abs < 0x400)
+        for (; abs < 0x200; abs <<= 1, --exp)
+          ;
+      uint16 bits = (exp < 0) << 15;
+      if (exp) {
+        uint32_t m = std::abs(exp) << 6, e = 18;
+        for (; m < 0x400; m <<= 1, --e)
+          ;
+        bits |= (e << 10) + m;
+      }
+      return half(binary, bits);
+    }
+    if (abs > 0x7C00)
+      return arg;
+    return half(binary, 0x7C00);
+  }
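+
+  // Example (illustrative sketch, not part of the vendored header): the
+  // exponent helpers read the raw exponent field, so no float conversion
+  // is involved.
+#if 0  // illustration only
+  int e = ilogb(half(8.0f));   // 3
+  half l = logb(half(0.25f));  // -2.0
+#endif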
+
+  /// Enumeration implementation.
+  /// \param from number to increase/decrease
+  /// \param to direction to enumerate into
+  /// \return next representable number
+  static half nextafter(half from, half to) {
+    uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF;
+    if (fabs > 0x7C00)
+      return from;
+    if (tabs > 0x7C00 || from.data_ == to.data_ || !(fabs | tabs))
+      return to;
+    if (!fabs)
+      return half(binary, (to.data_ & 0x8000) + 1);
+    bool lt =
+        ((fabs == from.data_) ? static_cast<int>(fabs)
+                              : -static_cast<int>(fabs)) <
+        ((tabs == to.data_) ? static_cast<int>(tabs) : -static_cast<int>(tabs));
+    return half(binary,
+                from.data_ +
+                    (((from.data_ >> 15) ^ static_cast<uint16>(lt)) << 1) - 1);
+  }
+
+  /// Enumeration implementation.
+  /// \param from number to increase/decrease
+  /// \param to direction to enumerate into
+  /// \return next representable number
+  static half nexttoward(half from, long double to) {
+    if (isnan(from))
+      return from;
+    long double lfrom = static_cast<long double>(from);
+    if (builtin_isnan(to) || lfrom == to)
+      return half(static_cast<float>(to));
+    if (!(from.data_ & 0x7FFF))
+      return half(binary,
+                  (static_cast<uint16>(builtin_signbit(to)) << 15) + 1);
+    return half(
+        binary,
+        from.data_ +
+            (((from.data_ >> 15) ^ static_cast<uint16>(lfrom < to)) << 1) -
+            1);
+  }
+
+  /// Sign implementation
+  /// \param x first operand
+  /// \param y second operand
+  /// \return composed value
+  static half copysign(half x, half y) {
+    return half(binary, x.data_ ^ ((x.data_ ^ y.data_) & 0x8000));
+  }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \return classification code (FP_ZERO, FP_SUBNORMAL, FP_NORMAL,
+  /// FP_INFINITE or FP_NAN)
+  static int fpclassify(half arg) {
+    uint32_t abs = arg.data_ & 0x7FFF;
+    return abs ? ((abs > 0x3FF) ? ((abs >= 0x7C00)
+                                       ? ((abs > 0x7C00) ? FP_NAN : FP_INFINITE)
+                                       : FP_NORMAL)
+                                : FP_SUBNORMAL)
+               : FP_ZERO;
+  }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \retval true if finite number
+  /// \retval false else
+  static bool isfinite(half arg) { return (arg.data_ & 0x7C00) != 0x7C00; }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \retval true if infinite number
+  /// \retval false else
+  static bool isinf(half arg) { return (arg.data_ & 0x7FFF) == 0x7C00; }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \retval true if not a number
+  /// \retval false else
+  static bool isnan(half arg) { return (arg.data_ & 0x7FFF) > 0x7C00; }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \retval true if normal number
+  /// \retval false else
+  static bool isnormal(half arg) {
+    return ((arg.data_ & 0x7C00) != 0) & ((arg.data_ & 0x7C00) != 0x7C00);
+  }
+
+  /// Sign bit implementation.
+  /// \param arg value to check
+  /// \retval true if signed
+  /// \retval false if unsigned
+  static bool signbit(half arg) { return (arg.data_ & 0x8000) != 0; }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if operands equal
+  /// \retval false else
+  static bool isequal(half x, half y) {
+    return (x.data_ == y.data_ || !((x.data_ | y.data_) & 0x7FFF)) && !isnan(x);
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if operands not equal
+  /// \retval false else
+  static bool isnotequal(half x, half y) {
+    return (x.data_ != y.data_ && ((x.data_ | y.data_) & 0x7FFF)) || isnan(x);
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if \a x > \a y
+  /// \retval false else
+  static bool isgreater(half x, half y) {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    return xabs <= 0x7C00 && yabs <= 0x7C00 &&
+           (((xabs == x.data_) ? xabs : -xabs) >
+            ((yabs == y.data_) ? yabs : -yabs));
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if \a x >= \a y
+  /// \retval false else
+  static bool isgreaterequal(half x, half y) {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    return xabs <= 0x7C00 && yabs <= 0x7C00 &&
+           (((xabs == x.data_) ? xabs : -xabs) >=
+            ((yabs == y.data_) ? yabs : -yabs));
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if \a x < \a y
+  /// \retval false else
+  static bool isless(half x, half y) {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    return xabs <= 0x7C00 && yabs <= 0x7C00 &&
+           (((xabs == x.data_) ? xabs : -xabs) <
+            ((yabs == y.data_) ? yabs : -yabs));
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if \a x <= \a y
+  /// \retval false else
+  static bool islessequal(half x, half y) {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    return xabs <= 0x7C00 && yabs <= 0x7C00 &&
+           (((xabs == x.data_) ? xabs : -xabs) <=
+            ((yabs == y.data_) ? yabs : -yabs));
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if \a x < \a y or \a x > \a y
+  /// \retval false else
+  static bool islessgreater(half x, half y) {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    if (xabs > 0x7C00 || yabs > 0x7C00)
+      return false;
+    int a = (xabs == x.data_) ? xabs : -xabs,
+        b = (yabs == y.data_) ? yabs : -yabs;
+    return a < b || a > b;
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if operand unordered
+  /// \retval false else
+  static bool isunordered(half x, half y) { return isnan(x) || isnan(y); }
+
+ private:
+  static double erf(double arg) {
+    if (builtin_isinf(arg))
+      return (arg < 0.0) ? -1.0 : 1.0;
+    double x2 = arg * arg, ax2 = 0.147 * x2,
+           value = std::sqrt(
+               1.0 - std::exp(-x2 * (1.2732395447351626861510701069801 + ax2) /
+                              (1.0 + ax2)));
+    return builtin_signbit(arg) ? -value : value;
+  }
+
+  static double lgamma(double arg) {
+    double v = 1.0;
+    for (; arg < 8.0; ++arg)
+      v *= arg;
+    double w = 1.0 / (arg * arg);
+    return (((((((-0.02955065359477124183006535947712 * w +
+                  0.00641025641025641025641025641026) *
+                     w +
+                 -0.00191752691752691752691752691753) *
+                    w +
+                8.4175084175084175084175084175084e-4) *
+                   w +
+               -5.952380952380952380952380952381e-4) *
+                  w +
+              7.9365079365079365079365079365079e-4) *
+                 w +
+             -0.00277777777777777777777777777778) *
+                w +
+            0.08333333333333333333333333333333) /
+               arg +
+           0.91893853320467274178032973640562 - std::log(v) - arg +
+           (arg - 0.5) * std::log(arg);
+  }
+};
+
+/// Wrapper for unary half-precision functions needing specialization for
+/// individual argument types.
+/// \tparam T argument type
+template <typename T> struct unary_specialized {
+  /// Negation implementation.
+  /// \param arg value to negate
+  /// \return negated value
+  static HALF_CONSTEXPR half negate(half arg) {
+    return half(binary, arg.data_ ^ 0x8000);
+  }
+
+  /// Absolute value implementation.
+  /// \param arg function argument
+  /// \return absolute value
+  static half fabs(half arg) { return half(binary, arg.data_ & 0x7FFF); }
+};
+template <> struct unary_specialized<expr> {
+  static HALF_CONSTEXPR expr negate(float arg) { return expr(-arg); }
+  static expr fabs(float arg) { return expr(std::fabs(arg)); }
+};
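+
+// Note (editorial, not part of the vendored header): the private erf()
+// above is Winitzki's approximation
+//   erf(x) ~= sgn(x) * sqrt(1 - exp(-x^2 * (4/pi + a*x^2) / (1 + a*x^2)))
+// with a = 0.147 (the constant 1.27323954... is 4/pi). Its absolute error
+// of roughly 1e-4 is well below the resolution of half precision.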
+
+/// Wrapper for binary half-precision functions needing specialization for
+/// individual argument types.
+/// \tparam T first argument type
+/// \tparam U second argument type
+template <typename T, typename U> struct binary_specialized {
+  /// Minimum implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return minimum value
+  static expr fmin(float x, float y) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::fmin(x, y));
+#else
+    if (builtin_isnan(x))
+      return expr(y);
+    if (builtin_isnan(y))
+      return expr(x);
+    return expr(std::min(x, y));
+#endif
+  }
+
+  /// Maximum implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return maximum value
+  static expr fmax(float x, float y) {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::fmax(x, y));
+#else
+    if (builtin_isnan(x))
+      return expr(y);
+    if (builtin_isnan(y))
+      return expr(x);
+    return expr(std::max(x, y));
+#endif
+  }
+};
+template <> struct binary_specialized<half, half> {
+  static half fmin(half x, half y) {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    if (xabs > 0x7C00)
+      return y;
+    if (yabs > 0x7C00)
+      return x;
+    return (((xabs == x.data_) ? xabs : -xabs) >
+            ((yabs == y.data_) ? yabs : -yabs))
+               ? y
+               : x;
+  }
+  static half fmax(half x, half y) {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    if (xabs > 0x7C00)
+      return y;
+    if (yabs > 0x7C00)
+      return x;
+    return (((xabs == x.data_) ? xabs : -xabs) <
+            ((yabs == y.data_) ? yabs : -yabs))
+               ? y
+               : x;
+  }
+};
+
+/// Helper class for half casts.
+/// This class template has to be specialized for all valid cast arguments to
+/// define an appropriate static `cast` member function and a corresponding
+/// `type` member denoting its return type.
+/// \tparam T destination type
+/// \tparam U source type
+/// \tparam R rounding mode to use
+template <typename T, typename U, std::float_round_style R>
+struct half_caster {};
+template <typename U, std::float_round_style R> struct half_caster<half, U, R> {
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_arithmetic<U>::value,
+                "half_cast from non-arithmetic type unsupported");
+#endif
+
+  static half cast(U arg) { return cast_impl(arg, is_float<U>()); }
+
+ private:
+  static half cast_impl(U arg, true_type) {
+    return half(binary, float2half<R>(arg));
+  }
+  static half cast_impl(U arg, false_type) {
+    return half(binary, int2half<R>(arg));
+  }
+};
+template <typename T, std::float_round_style R> struct half_caster<T, half, R> {
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_arithmetic<T>::value,
+                "half_cast to non-arithmetic type unsupported");
+#endif
+
+  static T cast(half arg) { return cast_impl(arg, is_float<T>()); }
+
+ private:
+  static T cast_impl(half arg, true_type) { return half2float<T>(arg.data_); }
+  static T cast_impl(half arg, false_type) { return half2int<R, T>(arg.data_); }
+};
+template <typename T, std::float_round_style R> struct half_caster<T, expr, R> {
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_arithmetic<T>::value,
+                "half_cast to non-arithmetic type unsupported");
+#endif
+
+  static T cast(expr arg) { return cast_impl(arg, is_float<T>()); }
+
+ private:
+  static T cast_impl(float arg, true_type) { return static_cast<T>(arg); }
+  static T cast_impl(half arg, false_type) { return half2int<R, T>(arg.data_); }
+};
+template <std::float_round_style R> struct half_caster<half, half, R> {
+  static half cast(half arg) { return arg; }
+};
+template <std::float_round_style R>
+struct half_caster<half, expr, R> : half_caster<half, float, R> {};
+
+/// \name Comparison operators
+/// \{
+
+/// Comparison for equality.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if operands equal
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator==(T x, U y) {
+  return functions::isequal(x, y);
+}
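+
+// Example (illustrative sketch, not part of the vendored header): these
+// specializations back the half_cast<> interface declared further down in
+// this header, which casts with an explicit rounding mode.
+#if 0  // illustration only
+half h(2.75f);
+int n = half_cast<int, std::round_to_nearest>(h);   // 3
+int t = half_cast<int, std::round_toward_zero>(h);  // 2, truncated
+#endif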
+
+/// Comparison for inequality.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if operands not equal
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator!=(T x, U y) {
+  return functions::isnotequal(x, y);
+}
+
+/// Comparison for less than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less than \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator<(T x, U y) {
+  return functions::isless(x, y);
+}
+
+/// Comparison for greater than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater than \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator>(T x, U y) {
+  return functions::isgreater(x, y);
+}
+
+/// Comparison for less equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less equal \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator<=(T x, U y) {
+  return functions::islessequal(x, y);
+}
+
+/// Comparison for greater equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater equal \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator>=(T x, U y) {
+  return functions::isgreaterequal(x, y);
+}
+
+/// \}
+/// \name Arithmetic operators
+/// \{
+
+/// Add halves.
+/// \param x left operand
+/// \param y right operand
+/// \return sum of half expressions
+template <typename T, typename U>
+typename enable<expr, T, U>::type operator+(T x, U y) {
+  return functions::plus(x, y);
+}
+
+/// Subtract halves.
+/// \param x left operand
+/// \param y right operand
+/// \return difference of half expressions
+template <typename T, typename U>
+typename enable<expr, T, U>::type operator-(T x, U y) {
+  return functions::minus(x, y);
+}
+
+/// Multiply halves.
+/// \param x left operand
+/// \param y right operand
+/// \return product of half expressions
+template <typename T, typename U>
+typename enable<expr, T, U>::type operator*(T x, U y) {
+  return functions::multiplies(x, y);
+}
+
+/// Divide halves.
+/// \param x left operand
+/// \param y right operand
+/// \return quotient of half expressions
+template <typename T, typename U>
+typename enable<expr, T, U>::type operator/(T x, U y) {
+  return functions::divides(x, y);
+}
+
+/// Identity.
+/// \param arg operand
+/// \return unchanged operand
+template <typename T>
+HALF_CONSTEXPR typename enable<T, T>::type operator+(T arg) {
+  return arg;
+}
+
+/// Negation.
+/// \param arg operand
+/// \return negated operand
+template <typename T>
+HALF_CONSTEXPR typename enable<T, T>::type operator-(T arg) {
+  return unary_specialized<T>::negate(arg);
+}
+
+/// \}
+/// \name Input and output
+/// \{
+
+/// Output operator.
+/// \param out output stream to write into
+/// \param arg half expression to write
+/// \return reference to output stream
+template <typename T, typename charT, typename traits>
+typename enable<std::basic_ostream<charT, traits>&, T>::type
+operator<<(std::basic_ostream<charT, traits>& out, T arg) {
+  return functions::write(out, arg);
+}
+
+/// Input operator.
+/// \param in input stream to read from
+/// \param arg half to read into
+/// \return reference to input stream
+template <typename charT, typename traits>
+std::basic_istream<charT, traits>&
+operator>>(std::basic_istream<charT, traits>& in, half& arg) {
+  return functions::read(in, arg);
+}
+
+/// \}
+/// \name Basic mathematical operations
+/// \{
+
+/// Absolute value.
+/// \param arg operand
+/// \return absolute value of \a arg
+// template <typename T> typename enable<T, T>::type abs(T arg) {
+//   return unary_specialized<T>::fabs(arg); }
+inline half abs(half arg) { return unary_specialized<half>::fabs(arg); }
+inline expr abs(expr arg) { return unary_specialized<expr>::fabs(arg); }
+
+/// Absolute value.
+/// \param arg operand
+/// \return absolute value of \a arg
+// template <typename T> typename enable<T, T>::type fabs(T arg) {
+//   return unary_specialized<T>::fabs(arg); }
+inline half fabs(half arg) { return unary_specialized<half>::fabs(arg); }
+inline expr fabs(expr arg) { return unary_specialized<expr>::fabs(arg); }
+
+/// Remainder of division.
+/// \param x first operand
+/// \param y second operand
+/// \return remainder of floating point division.
+// template <typename T, typename U> typename enable<expr, T, U>::type
+// fmod(T x, U y) { return functions::fmod(x, y); }
+inline expr fmod(half x, half y) { return functions::fmod(x, y); }
+inline expr fmod(half x, expr y) { return functions::fmod(x, y); }
+inline expr fmod(expr x, half y) { return functions::fmod(x, y); }
+inline expr fmod(expr x, expr y) { return functions::fmod(x, y); }
+
+/// Remainder of division.
+/// \param x first operand
+/// \param y second operand
+/// \return remainder of floating point division.
+// template <typename T, typename U> typename enable<expr, T, U>::type
+// remainder(T x, U y) { return functions::remainder(x, y); }
+inline expr remainder(half x, half y) { return functions::remainder(x, y); }
+inline expr remainder(half x, expr y) { return functions::remainder(x, y); }
+inline expr remainder(expr x, half y) { return functions::remainder(x, y); }
+inline expr remainder(expr x, expr y) { return functions::remainder(x, y); }
+
+/// Remainder of division.
+/// \param x first operand
+/// \param y second operand
+/// \param quo address to store some bits of quotient at
+/// \return remainder of floating point division.
+// template <typename T, typename U> typename enable<expr, T, U>::type
+// remquo(T x, U y, int *quo) { return functions::remquo(x, y, quo); }
+inline expr remquo(half x, half y, int* quo) {
+  return functions::remquo(x, y, quo);
+}
+inline expr remquo(half x, expr y, int* quo) {
+  return functions::remquo(x, y, quo);
+}
+inline expr remquo(expr x, half y, int* quo) {
+  return functions::remquo(x, y, quo);
+}
+inline expr remquo(expr x, expr y, int* quo) {
+  return functions::remquo(x, y, quo);
+}
+
+/// Fused multiply add.
+/// \param x first operand
+/// \param y second operand
+/// \param z third operand
+/// \return ( \a x * \a y ) + \a z rounded as one operation.
+// template <typename T, typename U, typename V> typename
+// enable<expr, T, U, V>::type fma(T x, U y, V z) { return
+// functions::fma(x, y, z); }
+inline expr fma(half x, half y, half z) { return functions::fma(x, y, z); }
+inline expr fma(half x, half y, expr z) { return functions::fma(x, y, z); }
+inline expr fma(half x, expr y, half z) { return functions::fma(x, y, z); }
+inline expr fma(half x, expr y, expr z) { return functions::fma(x, y, z); }
+inline expr fma(expr x, half y, half z) { return functions::fma(x, y, z); }
+inline expr fma(expr x, half y, expr z) { return functions::fma(x, y, z); }
+inline expr fma(expr x, expr y, half z) { return functions::fma(x, y, z); }
+inline expr fma(expr x, expr y, expr z) { return functions::fma(x, y, z); }
+
+/// Maximum of half expressions.
+/// \param x first operand
+/// \param y second operand
+/// \return maximum of operands
+// template <typename T, typename U> typename result<T, U>::type
+// fmax(T x, U y) { return binary_specialized<T, U>::fmax(x, y); }
+inline half fmax(half x, half y) {
+  return binary_specialized<half, half>::fmax(x, y);
+}
+inline expr fmax(half x, expr y) {
+  return binary_specialized<half, expr>::fmax(x, y);
+}
+inline expr fmax(expr x, half y) {
+  return binary_specialized<expr, half>::fmax(x, y);
+}
+inline expr fmax(expr x, expr y) {
+  return binary_specialized<expr, expr>::fmax(x, y);
+}
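+
+// Example (illustrative sketch, not part of the vendored header): as with
+// the C99 functions, fmax/fmin return the non-NaN operand when exactly one
+// operand is NaN.
+#if 0  // illustration only
+half q(std::numeric_limits<float>::quiet_NaN());
+half m = fmax(q, half(1.0f));  // 1.0, the NaN operand is ignored
+#endif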
+
+/// Minimum of half expressions.
+/// \param x first operand
+/// \param y second operand
+/// \return minimum of operands
+// template <typename T, typename U> typename result<T, U>::type
+// fmin(T x, U y) { return binary_specialized<T, U>::fmin(x, y); }
+inline half fmin(half x, half y) {
+  return binary_specialized<half, half>::fmin(x, y);
+}
+inline expr fmin(half x, expr y) {
+  return binary_specialized<half, expr>::fmin(x, y);
+}
+inline expr fmin(expr x, half y) {
+  return binary_specialized<expr, half>::fmin(x, y);
+}
+inline expr fmin(expr x, expr y) {
+  return binary_specialized<expr, expr>::fmin(x, y);
+}
+
+/// Positive difference.
+/// \param x first operand
+/// \param y second operand
+/// \return \a x - \a y or 0 if difference negative
+// template <typename T, typename U> typename enable<expr, T, U>::type
+// fdim(T x, U y) { return functions::fdim(x, y); }
+inline expr fdim(half x, half y) { return functions::fdim(x, y); }
+inline expr fdim(half x, expr y) { return functions::fdim(x, y); }
+inline expr fdim(expr x, half y) { return functions::fdim(x, y); }
+inline expr fdim(expr x, expr y) { return functions::fdim(x, y); }
+
+/// Get NaN value.
+/// \return quiet NaN
+inline half nanh(const char*) { return functions::nanh(); }
+
+/// \}
+/// \name Exponential functions
+/// \{
+
+/// Exponential function.
+/// \param arg function argument
+/// \return e raised to \a arg
+// template <typename T> typename enable<expr, T>::type exp(T arg) {
+//   return functions::exp(arg); }
+inline expr exp(half arg) { return functions::exp(arg); }
+inline expr exp(expr arg) { return functions::exp(arg); }
+
+/// Exponential minus one.
+/// \param arg function argument
+/// \return e raised to \a arg subtracted by 1
+// template <typename T> typename enable<expr, T>::type expm1(T arg) {
+//   return functions::expm1(arg); }
+inline expr expm1(half arg) { return functions::expm1(arg); }
+inline expr expm1(expr arg) { return functions::expm1(arg); }
+
+/// Binary exponential.
+/// \param arg function argument
+/// \return 2 raised to \a arg
+// template <typename T> typename enable<expr, T>::type exp2(T arg) {
+//   return functions::exp2(arg); }
+inline expr exp2(half arg) { return functions::exp2(arg); }
+inline expr exp2(expr arg) { return functions::exp2(arg); }
+
+/// Natural logarithm.
+/// \param arg function argument
+/// \return logarithm of \a arg to base e
+// template <typename T> typename enable<expr, T>::type log(T arg) {
+//   return functions::log(arg); }
+inline expr log(half arg) { return functions::log(arg); }
+inline expr log(expr arg) { return functions::log(arg); }
+
+/// Common logarithm.
+/// \param arg function argument
+/// \return logarithm of \a arg to base 10
+// template <typename T> typename enable<expr, T>::type log10(T arg) {
+//   return functions::log10(arg); }
+inline expr log10(half arg) { return functions::log10(arg); }
+inline expr log10(expr arg) { return functions::log10(arg); }
+
+/// Natural logarithm.
+/// \param arg function argument
+/// \return logarithm of \a arg plus 1 to base e
+// template <typename T> typename enable<expr, T>::type log1p(T arg) {
+//   return functions::log1p(arg); }
+inline expr log1p(half arg) { return functions::log1p(arg); }
+inline expr log1p(expr arg) { return functions::log1p(arg); }
+
+/// Binary logarithm.
+/// \param arg function argument
+/// \return logarithm of \a arg to base 2
+// template <typename T> typename enable<expr, T>::type log2(T arg) {
+//   return functions::log2(arg); }
+inline expr log2(half arg) { return functions::log2(arg); }
+inline expr log2(expr arg) { return functions::log2(arg); }
+
+/// \}
+/// \name Power functions
+/// \{
+
+/// Square root.
+/// \param arg function argument
+/// \return square root of \a arg
+// template <typename T> typename enable<expr, T>::type sqrt(T arg) {
+//   return functions::sqrt(arg); }
+inline expr sqrt(half arg) { return functions::sqrt(arg); }
+inline expr sqrt(expr arg) { return functions::sqrt(arg); }
+
+/// Cubic root.
+/// \param arg function argument
+/// \return cubic root of \a arg
+// template <typename T> typename enable<expr, T>::type cbrt(T arg) {
+//   return functions::cbrt(arg); }
+inline expr cbrt(half arg) { return functions::cbrt(arg); }
+inline expr cbrt(expr arg) { return functions::cbrt(arg); }
+
+/// Hypotenuse function.
+/// \param x first argument
+/// \param y second argument
+/// \return square root of sum of squares without internal over- or underflows
+// template <typename T, typename U> typename enable<expr, T, U>::type
+// hypot(T x, U y) { return functions::hypot(x, y); }
+inline expr hypot(half x, half y) { return functions::hypot(x, y); }
+inline expr hypot(half x, expr y) { return functions::hypot(x, y); }
+inline expr hypot(expr x, half y) { return functions::hypot(x, y); }
+inline expr hypot(expr x, expr y) { return functions::hypot(x, y); }
+
+/// Power function.
+/// \param base first argument
+/// \param exp second argument
+/// \return \a base raised to \a exp
+// template <typename T, typename U> typename enable<expr, T, U>::type
+// pow(T base, U exp) { return functions::pow(base, exp); }
+inline expr pow(half base, half exp) { return functions::pow(base, exp); }
+inline expr pow(half base, expr exp) { return functions::pow(base, exp); }
+inline expr pow(expr base, half exp) { return functions::pow(base, exp); }
+inline expr pow(expr base, expr exp) { return functions::pow(base, exp); }
+
+/// \}
+/// \name Trigonometric functions
+/// \{
+
+/// Sine function.
+/// \param arg function argument
+/// \return sine value of \a arg
+// template <typename T> typename enable<expr, T>::type sin(T arg) {
+//   return functions::sin(arg); }
+inline expr sin(half arg) { return functions::sin(arg); }
+inline expr sin(expr arg) { return functions::sin(arg); }
+
+/// Cosine function.
+/// \param arg function argument
+/// \return cosine value of \a arg
+// template <typename T> typename enable<expr, T>::type cos(T arg) {
+//   return functions::cos(arg); }
+inline expr cos(half arg) { return functions::cos(arg); }
+inline expr cos(expr arg) { return functions::cos(arg); }
+
+/// Tangent function.
+/// \param arg function argument
+/// \return tangent value of \a arg
+// template <typename T> typename enable<expr, T>::type tan(T arg) {
+//   return functions::tan(arg); }
+inline expr tan(half arg) { return functions::tan(arg); }
+inline expr tan(expr arg) { return functions::tan(arg); }
+
+/// Arc sine.
+/// \param arg function argument
+/// \return arc sine value of \a arg
+// template <typename T> typename enable<expr, T>::type asin(T arg) {
+//   return functions::asin(arg); }
+inline expr asin(half arg) { return functions::asin(arg); }
+inline expr asin(expr arg) { return functions::asin(arg); }
+
+/// Arc cosine function.
+/// \param arg function argument
+/// \return arc cosine value of \a arg
+// template <typename T> typename enable<expr, T>::type acos(T arg) {
+//   return functions::acos(arg); }
+inline expr acos(half arg) { return functions::acos(arg); }
+inline expr acos(expr arg) { return functions::acos(arg); }
+
+/// Arc tangent function.
+/// \param arg function argument
+/// \return arc tangent value of \a arg
+// template <typename T> typename enable<expr, T>::type atan(T arg) {
+//   return functions::atan(arg); }
+inline expr atan(half arg) { return functions::atan(arg); }
+inline expr atan(expr arg) { return functions::atan(arg); }
+
+/// Arc tangent function.
+/// Arc tangent function.
+/// \param x first argument
+/// \param y second argument
+/// \return arc tangent value
+// template<typename T,typename U> typename enable<expr,T,U>::type atan2(T x, U y) { return functions::atan2(x, y); }
+inline expr atan2(half x, half y) { return functions::atan2(x, y); }
+inline expr atan2(half x, expr y) { return functions::atan2(x, y); }
+inline expr atan2(expr x, half y) { return functions::atan2(x, y); }
+inline expr atan2(expr x, expr y) { return functions::atan2(x, y); }
+
+/// \}
+/// \name Hyperbolic functions
+/// \{
+
+/// Hyperbolic sine.
+/// \param arg function argument
+/// \return hyperbolic sine value of \a arg
+// template<typename T> typename enable<expr,T>::type sinh(T arg) { return functions::sinh(arg); }
+inline expr sinh(half arg) { return functions::sinh(arg); }
+inline expr sinh(expr arg) { return functions::sinh(arg); }
+
+/// Hyperbolic cosine.
+/// \param arg function argument
+/// \return hyperbolic cosine value of \a arg
+// template<typename T> typename enable<expr,T>::type cosh(T arg) { return functions::cosh(arg); }
+inline expr cosh(half arg) { return functions::cosh(arg); }
+inline expr cosh(expr arg) { return functions::cosh(arg); }
+
+/// Hyperbolic tangent.
+/// \param arg function argument
+/// \return hyperbolic tangent value of \a arg
+// template<typename T> typename enable<expr,T>::type tanh(T arg) { return functions::tanh(arg); }
+inline expr tanh(half arg) { return functions::tanh(arg); }
+inline expr tanh(expr arg) { return functions::tanh(arg); }
+
+/// Hyperbolic area sine.
+/// \param arg function argument
+/// \return area sine value of \a arg
+// template<typename T> typename enable<expr,T>::type asinh(T arg) { return functions::asinh(arg); }
+inline expr asinh(half arg) { return functions::asinh(arg); }
+inline expr asinh(expr arg) { return functions::asinh(arg); }
+
+/// Hyperbolic area cosine.
+/// \param arg function argument
+/// \return area cosine value of \a arg
+// template<typename T> typename enable<expr,T>::type acosh(T arg) { return functions::acosh(arg); }
+inline expr acosh(half arg) { return functions::acosh(arg); }
+inline expr acosh(expr arg) { return functions::acosh(arg); }
+
+/// Hyperbolic area tangent.
+/// \param arg function argument
+/// \return area tangent value of \a arg
+// template<typename T> typename enable<expr,T>::type atanh(T arg) { return functions::atanh(arg); }
+inline expr atanh(half arg) { return functions::atanh(arg); }
+inline expr atanh(expr arg) { return functions::atanh(arg); }
+
+/// \}
+/// \name Error and gamma functions
+/// \{
+
+/// Error function.
+/// \param arg function argument
+/// \return error function value of \a arg
+// template<typename T> typename enable<expr,T>::type erf(T arg) { return functions::erf(arg); }
+inline expr erf(half arg) { return functions::erf(arg); }
+inline expr erf(expr arg) { return functions::erf(arg); }
+
+/// Complementary error function.
+/// \param arg function argument
+/// \return 1 minus error function value of \a arg
+// template<typename T> typename enable<expr,T>::type erfc(T arg) { return functions::erfc(arg); }
+inline expr erfc(half arg) { return functions::erfc(arg); }
+inline expr erfc(expr arg) { return functions::erfc(arg); }
+
+/// Natural logarithm of gamma function.
+/// \param arg function argument
+/// \return natural logarithm of gamma function for \a arg
+// template<typename T> typename enable<expr,T>::type lgamma(T arg) { return functions::lgamma(arg); }
+inline expr lgamma(half arg) { return functions::lgamma(arg); }
+inline expr lgamma(expr arg) { return functions::lgamma(arg); }
+
+/// Gamma function.
+/// \param arg function argument
+/// \return gamma function value of \a arg
+// template<typename T> typename enable<expr,T>::type tgamma(T arg) { return functions::tgamma(arg); }
+inline expr tgamma(half arg) { return functions::tgamma(arg); }
+inline expr tgamma(expr arg) { return functions::tgamma(arg); }
+
+/// \}
+/// \name Rounding
+/// \{
+
+/// Nearest integer not less than half value.
+/// \param arg half to round
+/// \return nearest integer not less than \a arg
+// template<typename T> typename enable<half,T>::type ceil(T arg) { return functions::ceil(arg); }
+inline half ceil(half arg) { return functions::ceil(arg); }
+inline half ceil(expr arg) { return functions::ceil(arg); }
+
+/// Nearest integer not greater than half value.
+/// \param arg half to round
+/// \return nearest integer not greater than \a arg
+// template<typename T> typename enable<half,T>::type floor(T arg) { return functions::floor(arg); }
+inline half floor(half arg) { return functions::floor(arg); }
+inline half floor(expr arg) { return functions::floor(arg); }
+
+/// Nearest integer not greater in magnitude than half value.
+/// \param arg half to round
+/// \return nearest integer not greater in magnitude than \a arg
+// template<typename T> typename enable<half,T>::type trunc(T arg) { return functions::trunc(arg); }
+inline half trunc(half arg) { return functions::trunc(arg); }
+inline half trunc(expr arg) { return functions::trunc(arg); }
+
+/// Nearest integer.
+/// \param arg half to round
+/// \return nearest integer, rounded away from zero in half-way cases
+// template<typename T> typename enable<half,T>::type round(T arg) { return functions::round(arg); }
+inline half round(half arg) { return functions::round(arg); }
+inline half round(expr arg) { return functions::round(arg); }
+
+/// Nearest integer.
+/// \param arg half to round
+/// \return nearest integer, rounded away from zero in half-way cases
+// template<typename T> typename enable<long,T>::type lround(T arg) { return functions::lround(arg); }
+inline long lround(half arg) { return functions::lround(arg); }
+inline long lround(expr arg) { return functions::lround(arg); }
+
+/// Nearest integer using half's internal rounding mode.
+/// \param arg half expression to round
+/// \return nearest integer using default rounding mode
+// template<typename T> typename enable<half,T>::type nearbyint(T arg) { return functions::nearbyint(arg); }
+inline half nearbyint(half arg) { return functions::rint(arg); }
+inline half nearbyint(expr arg) { return functions::rint(arg); }
+
+/// Nearest integer using half's internal rounding mode.
+/// \param arg half expression to round
+/// \return nearest integer using default rounding mode
+// template<typename T> typename enable<half,T>::type rint(T arg) { return functions::rint(arg); }
+inline half rint(half arg) { return functions::rint(arg); }
+inline half rint(expr arg) { return functions::rint(arg); }
+
+/// Nearest integer using half's internal rounding mode.
+/// \param arg half expression to round
+/// \return nearest integer using default rounding mode
+// template<typename T> typename enable<long,T>::type lrint(T arg) { return functions::lrint(arg); }
+inline long lrint(half arg) { return functions::lrint(arg); }
+inline long lrint(expr arg) { return functions::lrint(arg); }
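The rounding family differs in tie-breaking: `round`/`lround` round half-way cases away from zero, while `rint`/`nearbyint` honor the type's configured internal rounding mode. A short sketch:

    using half_float::half;
    half v(2.5f);
    half r = half_float::round(v);   // 3.0 (half-way rounds away from zero)
    long n = half_float::lround(v);  // 3
    half i = half_float::rint(v);    // follows half's internal rounding mode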
+#if HALF_ENABLE_CPP11_LONG_LONG
+/// Nearest integer.
+/// \param arg half to round
+/// \return nearest integer, rounded away from zero in half-way cases
+// template<typename T> typename enable<long long,T>::type llround(T arg) { return functions::llround(arg); }
+inline long long llround(half arg) { return functions::llround(arg); }
+inline long long llround(expr arg) { return functions::llround(arg); }
+
+/// Nearest integer using half's internal rounding mode.
+/// \param arg half expression to round
+/// \return nearest integer using default rounding mode
+// template<typename T> typename enable<long long,T>::type llrint(T arg) { return functions::llrint(arg); }
+inline long long llrint(half arg) { return functions::llrint(arg); }
+inline long long llrint(expr arg) { return functions::llrint(arg); }
+#endif
+
+/// \}
+/// \name Floating point manipulation
+/// \{
+
+/// Decompress floating point number.
+/// \param arg number to decompress
+/// \param exp address to store exponent at
+/// \return significand in range [0.5, 1)
+// template<typename T> typename enable<half,T>::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); }
+inline half frexp(half arg, int* exp) { return functions::frexp(arg, exp); }
+inline half frexp(expr arg, int* exp) { return functions::frexp(arg, exp); }
+
+/// Multiply by power of two.
+/// \param arg number to modify
+/// \param exp power of two to multiply with
+/// \return \a arg multiplied by 2 raised to \a exp
+// template<typename T> typename enable<half,T>::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); }
+inline half ldexp(half arg, int exp) { return functions::scalbln(arg, exp); }
+inline half ldexp(expr arg, int exp) { return functions::scalbln(arg, exp); }
+
+/// Extract integer and fractional parts.
+/// \param arg number to decompress
+/// \param iptr address to store integer part at
+/// \return fractional part
+// template<typename T> typename enable<half,T>::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); }
+inline half modf(half arg, half* iptr) { return functions::modf(arg, iptr); }
+inline half modf(expr arg, half* iptr) { return functions::modf(arg, iptr); }
+
+/// Multiply by power of two.
+/// \param arg number to modify
+/// \param exp power of two to multiply with
+/// \return \a arg multiplied by 2 raised to \a exp
+// template<typename T> typename enable<half,T>::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); }
+inline half scalbn(half arg, int exp) { return functions::scalbln(arg, exp); }
+inline half scalbn(expr arg, int exp) { return functions::scalbln(arg, exp); }
+
+/// Multiply by power of two.
+/// \param arg number to modify
+/// \param exp power of two to multiply with
+/// \return \a arg multiplied by 2 raised to \a exp
+// template<typename T> typename enable<half,T>::type scalbln(T arg, long exp) { return functions::scalbln(arg, exp); }
+inline half scalbln(half arg, long exp) { return functions::scalbln(arg, exp); }
+inline half scalbln(expr arg, long exp) { return functions::scalbln(arg, exp); }
+
+/// Extract exponent.
+/// \param arg number to query
+/// \return floating point exponent
+/// \retval FP_ILOGB0 for zero
+/// \retval FP_ILOGBNAN for NaN
+/// \retval INT_MAX for infinity
+// template<typename T> typename enable<int,T>::type ilogb(T arg) { return functions::ilogb(arg); }
+inline int ilogb(half arg) { return functions::ilogb(arg); }
+inline int ilogb(expr arg) { return functions::ilogb(arg); }
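`frexp`/`ldexp` decompose and reassemble a value exactly, so they are safe for scaling without rounding error. A sketch:

    using half_float::half;
    int e = 0;
    half s = half_float::frexp(half(8.0f), &e);  // s = 0.5, e = 4
    half v = half_float::ldexp(s, e);            // back to 8.0 exactly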
+/// Extract exponent.
+/// \param arg number to query
+/// \return floating point exponent
+// template<typename T> typename enable<half,T>::type logb(T arg) { return functions::logb(arg); }
+inline half logb(half arg) { return functions::logb(arg); }
+inline half logb(expr arg) { return functions::logb(arg); }
+
+/// Next representable value.
+/// \param from value to compute next representable value for
+/// \param to direction towards which to compute next value
+/// \return next representable value after \a from in direction towards \a to
+// template<typename T,typename U> typename enable<half,T,U>::type nextafter(T from, U to) { return functions::nextafter(from, to); }
+inline half nextafter(half from, half to) {
+  return functions::nextafter(from, to);
+}
+inline half nextafter(half from, expr to) {
+  return functions::nextafter(from, to);
+}
+inline half nextafter(expr from, half to) {
+  return functions::nextafter(from, to);
+}
+inline half nextafter(expr from, expr to) {
+  return functions::nextafter(from, to);
+}
+
+/// Next representable value.
+/// \param from value to compute next representable value for
+/// \param to direction towards which to compute next value
+/// \return next representable value after \a from in direction towards \a to
+// template<typename T> typename enable<half,T>::type nexttoward(T from, long double to) { return functions::nexttoward(from, to); }
+inline half nexttoward(half from, long double to) {
+  return functions::nexttoward(from, to);
+}
+inline half nexttoward(expr from, long double to) {
+  return functions::nexttoward(from, to);
+}
+
+/// Take sign.
+/// \param x value to change sign for
+/// \param y value to take sign from
+/// \return value equal to \a x in magnitude and to \a y in sign
+// template<typename T,typename U> typename enable<half,T,U>::type copysign(T x, U y) { return functions::copysign(x, y); }
+inline half copysign(half x, half y) { return functions::copysign(x, y); }
+inline half copysign(half x, expr y) { return functions::copysign(x, y); }
+inline half copysign(expr x, half y) { return functions::copysign(x, y); }
+inline half copysign(expr x, expr y) { return functions::copysign(x, y); }
+
+/// \}
+/// \name Floating point classification
+/// \{
+
+/// Classify floating point value.
+/// \param arg number to classify
+/// \retval FP_ZERO for positive and negative zero
+/// \retval FP_SUBNORMAL for subnormal numbers
+/// \retval FP_INFINITE for positive and negative infinity
+/// \retval FP_NAN for NaNs
+/// \retval FP_NORMAL for all other (normal) values
+// template<typename T> typename enable<int,T>::type fpclassify(T arg) { return functions::fpclassify(arg); }
+inline int fpclassify(half arg) { return functions::fpclassify(arg); }
+inline int fpclassify(expr arg) { return functions::fpclassify(arg); }
+
+/// Check if finite number.
+/// \param arg number to check
+/// \retval true if neither infinity nor NaN
+/// \retval false else
+// template<typename T> typename enable<bool,T>::type isfinite(T arg) { return functions::isfinite(arg); }
+inline bool isfinite(half arg) { return functions::isfinite(arg); }
+inline bool isfinite(expr arg) { return functions::isfinite(arg); }
+
+/// Check for infinity.
+/// \param arg number to check
+/// \retval true for positive or negative infinity
+/// \retval false else
+// template<typename T> typename enable<bool,T>::type isinf(T arg) { return functions::isinf(arg); }
+inline bool isinf(half arg) { return functions::isinf(arg); }
+inline bool isinf(expr arg) { return functions::isinf(arg); }
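Classification mirrors `<cmath>`. A sketch, using the `numeric_limits` specialization defined later in this header:

    using half_float::half;
    half inf = std::numeric_limits<half>::infinity();
    bool a = half_float::isinf(inf);              // true
    bool b = half_float::isnan(inf - inf);        // true: inf - inf is NaN
    int c = half_float::fpclassify(half(0.0f));   // FP_ZERO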
+/// Check for NaN.
+/// \param arg number to check
+/// \retval true for NaNs
+/// \retval false else
+// template<typename T> typename enable<bool,T>::type isnan(T arg) { return functions::isnan(arg); }
+inline bool isnan(half arg) { return functions::isnan(arg); }
+inline bool isnan(expr arg) { return functions::isnan(arg); }
+
+/// Check if normal number.
+/// \param arg number to check
+/// \retval true if normal number
+/// \retval false if either subnormal, zero, infinity or NaN
+// template<typename T> typename enable<bool,T>::type isnormal(T arg) { return functions::isnormal(arg); }
+inline bool isnormal(half arg) { return functions::isnormal(arg); }
+inline bool isnormal(expr arg) { return functions::isnormal(arg); }
+
+/// Check sign.
+/// \param arg number to check
+/// \retval true for negative number
+/// \retval false for positive number
+// template<typename T> typename enable<bool,T>::type signbit(T arg) { return functions::signbit(arg); }
+inline bool signbit(half arg) { return functions::signbit(arg); }
+inline bool signbit(expr arg) { return functions::signbit(arg); }
+
+/// \}
+/// \name Comparison
+/// \{
+
+/// Comparison for greater than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater than \a y
+/// \retval false else
+// template<typename T,typename U> typename enable<bool,T,U>::type isgreater(T x, U y) { return functions::isgreater(x, y); }
+inline bool isgreater(half x, half y) { return functions::isgreater(x, y); }
+inline bool isgreater(half x, expr y) { return functions::isgreater(x, y); }
+inline bool isgreater(expr x, half y) { return functions::isgreater(x, y); }
+inline bool isgreater(expr x, expr y) { return functions::isgreater(x, y); }
+
+/// Comparison for greater equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater equal \a y
+/// \retval false else
+// template<typename T,typename U> typename enable<bool,T,U>::type isgreaterequal(T x, U y) { return functions::isgreaterequal(x, y); }
+inline bool isgreaterequal(half x, half y) {
+  return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(half x, expr y) {
+  return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(expr x, half y) {
+  return functions::isgreaterequal(x, y);
+}
+inline bool isgreaterequal(expr x, expr y) {
+  return functions::isgreaterequal(x, y);
+}
+
+/// Comparison for less than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less than \a y
+/// \retval false else
+// template<typename T,typename U> typename enable<bool,T,U>::type isless(T x, U y) { return functions::isless(x, y); }
+inline bool isless(half x, half y) { return functions::isless(x, y); }
+inline bool isless(half x, expr y) { return functions::isless(x, y); }
+inline bool isless(expr x, half y) { return functions::isless(x, y); }
+inline bool isless(expr x, expr y) { return functions::isless(x, y); }
+
+/// Comparison for less equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less equal \a y
+/// \retval false else
+// template<typename T,typename U> typename enable<bool,T,U>::type islessequal(T x, U y) { return functions::islessequal(x, y); }
+inline bool islessequal(half x, half y) { return functions::islessequal(x, y); }
+inline bool islessequal(half x, expr y) { return functions::islessequal(x, y); }
+inline bool islessequal(expr x, half y) { return functions::islessequal(x, y); }
+inline bool islessequal(expr x, expr y) { return functions::islessequal(x, y); }
+
+/// Comparison for less or greater.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if either less or greater
+/// \retval false else
+// template<typename T,typename U> typename enable<bool,T,U>::type islessgreater(T x, U y) { return functions::islessgreater(x, y); }
+inline bool islessgreater(half x, half y) {
+  return functions::islessgreater(x, y);
+}
+inline bool islessgreater(half x, expr y) {
+  return functions::islessgreater(x, y);
+}
+inline bool islessgreater(expr x, half y) {
+  return functions::islessgreater(x, y);
+}
+inline bool islessgreater(expr x, expr y) {
+  return functions::islessgreater(x, y);
+}
+
+/// Check if unordered.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if unordered (one or two NaN operands)
+/// \retval false else
+// template<typename T,typename U> typename enable<bool,T,U>::type isunordered(T x, U y) { return functions::isunordered(x, y); }
+inline bool isunordered(half x, half y) { return functions::isunordered(x, y); }
+inline bool isunordered(half x, expr y) { return functions::isunordered(x, y); }
+inline bool isunordered(expr x, half y) { return functions::isunordered(x, y); }
+inline bool isunordered(expr x, expr y) { return functions::isunordered(x, y); }
+
+/// \}
+/// \name Casting
+/// \{
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in
+/// arithmetic type. The values are converted directly, without the roundtrip
+/// over `float` that a `static_cast` would otherwise do, using the default
+/// rounding mode.
+///
+/// Using this cast with neither of the two types being a [half](\ref
+/// half_float::half), or with any of the two types not being a built-in
+/// arithmetic type (apart from [half](\ref half_float::half), of course),
+/// results in a compiler error; casting between [half](\ref
+/// half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, typename U> T half_cast(U arg) {
+  return half_caster<T, U>::cast(arg);
+}
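A sketch of the cast in use (the second overload, taking an explicit rounding mode, follows below):

    using half_float::half;
    half h = half_float::half_cast<half>(0.1);  // double -> half, no float roundtrip
    int i = half_float::half_cast<int, std::round_to_nearest>(h);  // explicit mode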
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in
+/// arithmetic type. The values are converted directly using the given
+/// rounding mode, without the roundtrip over `float` that a `static_cast`
+/// would otherwise do.
+///
+/// Using this cast with neither of the two types being a [half](\ref
+/// half_float::half), or with any of the two types not being a built-in
+/// arithmetic type (apart from [half](\ref half_float::half), of course),
+/// results in a compiler error; casting between [half](\ref
+/// half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam R rounding mode to use
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, std::float_round_style R, typename U>
+T half_cast(U arg) {
+  return half_caster<T, U, R>::cast(arg);
+}
+/// \}
+} // namespace detail
+
+using detail::operator==;
+using detail::operator!=;
+using detail::operator<;
+using detail::operator>;
+using detail::operator<=;
+using detail::operator>=;
+using detail::operator+;
+using detail::operator-;
+using detail::operator*;
+using detail::operator/;
+using detail::operator<<;
+using detail::operator>>;
+
+using detail::abs;
+using detail::acos;
+using detail::acosh;
+using detail::asin;
+using detail::asinh;
+using detail::atan;
+using detail::atan2;
+using detail::atanh;
+using detail::cbrt;
+using detail::ceil;
+using detail::cos;
+using detail::cosh;
+using detail::erf;
+using detail::erfc;
+using detail::exp;
+using detail::exp2;
+using detail::expm1;
+using detail::fabs;
+using detail::fdim;
+using detail::floor;
+using detail::fma;
+using detail::fmax;
+using detail::fmin;
+using detail::fmod;
+using detail::hypot;
+using detail::lgamma;
+using detail::log;
+using detail::log10;
+using detail::log1p;
+using detail::log2;
+using detail::lrint;
+using detail::lround;
+using detail::nanh;
+using detail::nearbyint;
+using detail::pow;
+using detail::remainder;
+using detail::remquo;
+using detail::rint;
+using detail::round;
+using detail::sin;
+using detail::sinh;
+using detail::sqrt;
+using detail::tan;
+using detail::tanh;
+using detail::tgamma;
+using detail::trunc;
+#if HALF_ENABLE_CPP11_LONG_LONG
+using detail::llrint;
+using detail::llround;
+#endif
+using detail::copysign;
+using detail::fpclassify;
+using detail::frexp;
+using detail::ilogb;
+using detail::isfinite;
+using detail::isgreater;
+using detail::isgreaterequal;
+using detail::isinf;
+using detail::isless;
+using detail::islessequal;
+using detail::islessgreater;
+using detail::isnan;
+using detail::isnormal;
+using detail::isunordered;
+using detail::ldexp;
+using detail::logb;
+using detail::modf;
+using detail::nextafter;
+using detail::nexttoward;
+using detail::scalbln;
+using detail::scalbn;
+using detail::signbit;
+
+using detail::half_cast;
+} // namespace half_float
+
+/// Extensions to the C++ standard library.
+namespace std {
+/// Numeric limits for half-precision floats.
+/// Because of the underlying single-precision implementation of many
+/// operations, it inherits some properties from
+/// `std::numeric_limits<float>`.
+template <>
+class numeric_limits<half_float::half> : public numeric_limits<float> {
+ public:
+  /// Supports signed values.
+  static HALF_CONSTEXPR_CONST bool is_signed = true;
+
+  /// Is not exact.
+  static HALF_CONSTEXPR_CONST bool is_exact = false;
+
+  /// Doesn't provide modulo arithmetic.
+  static HALF_CONSTEXPR_CONST bool is_modulo = false;
+
+  /// IEEE conformant.
+  static HALF_CONSTEXPR_CONST bool is_iec559 = true;
+
+  /// Supports infinity.
+  static HALF_CONSTEXPR_CONST bool has_infinity = true;
+
+  /// Supports quiet NaNs.
+  static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
+
+  /// Supports subnormal values.
+  static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
+
+  /// Rounding mode.
+  /// Due to the mix of internal single-precision computations (using the
+  /// rounding mode of the underlying single-precision implementation) with
+  /// the rounding mode of the single-to-half conversions, the actual rounding
+  /// mode might be `std::round_indeterminate` if the default half-precision
+  /// rounding mode doesn't match the single-precision rounding mode.
+  static HALF_CONSTEXPR_CONST float_round_style round_style =
+      (std::numeric_limits<float>::round_style ==
+       half_float::half::round_style)
+          ? half_float::half::round_style
+          : round_indeterminate;
+
+  /// Significant digits.
+  static HALF_CONSTEXPR_CONST int digits = 11;
+
+  /// Significant decimal digits.
+  static HALF_CONSTEXPR_CONST int digits10 = 3;
+
+  /// Required decimal digits to represent all possible values.
+  static HALF_CONSTEXPR_CONST int max_digits10 = 5;
+
+  /// Number base.
+  static HALF_CONSTEXPR_CONST int radix = 2;
+
+  /// One more than smallest exponent.
+  static HALF_CONSTEXPR_CONST int min_exponent = -13;
+
+  /// Smallest normalized representable power of 10.
+  static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
+
+  /// One more than largest exponent.
+  static HALF_CONSTEXPR_CONST int max_exponent = 16;
+
+  /// Largest finitely representable power of 10.
+  static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
+
+  /// Smallest positive normal value.
+  static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW {
+    return half_float::half(half_float::detail::binary, 0x0400);
+  }
+
+  /// Smallest finite value.
+  static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW {
+    return half_float::half(half_float::detail::binary, 0xFBFF);
+  }
+
+  /// Largest finite value.
+  static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW {
+    return half_float::half(half_float::detail::binary, 0x7BFF);
+  }
+
+  /// Difference between one and next representable value.
+  static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW {
+    return half_float::half(half_float::detail::binary, 0x1400);
+  }
+
+  /// Maximum rounding error.
+  static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW {
+    return half_float::half(half_float::detail::binary,
+                            (round_style == std::round_to_nearest) ? 0x3800
+                                                                   : 0x3C00);
+  }
+
+  /// Positive infinity.
+  static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW {
+    return half_float::half(half_float::detail::binary, 0x7C00);
+  }
+
+  /// Quiet NaN.
+  static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW {
+    return half_float::half(half_float::detail::binary, 0x7FFF);
+  }
+
+  /// Signalling NaN.
+  static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW {
+    return half_float::half(half_float::detail::binary, 0x7DFF);
+  }
+
+  /// Smallest positive subnormal value.
+  static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW {
+    return half_float::half(half_float::detail::binary, 0x0001);
+  }
+};
+
+#if HALF_ENABLE_CPP11_HASH
+/// Hash function for half-precision floats.
+/// This is only defined if C++11 `std::hash` is supported and enabled.
+template <>
+struct hash<half_float::half> //: unary_function<half_float::half, size_t>
+{
+  /// Type of function argument.
+  typedef half_float::half argument_type;
+
+  /// Function return type.
+  typedef size_t result_type;
+
+  /// Compute hash function.
+  /// \param arg half to hash
+  /// \return hash value
+  result_type operator()(argument_type arg) const {
+    return hash<half_float::detail::uint16>()(
+        static_cast<unsigned int>(arg.data_) & -(arg.data_ != 0x8000));
+  }
+};
+#endif
+} // namespace std
+
+#undef HALF_CONSTEXPR
+#undef HALF_CONSTEXPR_CONST
+#undef HALF_NOEXCEPT
+#undef HALF_NOTHROW
+#ifdef HALF_POP_WARNINGS
+#pragma warning(pop)
+#undef HALF_POP_WARNINGS
+#endif
+
+#endif
diff --git a/csrc/fastdeploy/backends/tensorrt/common/logger.cpp b/csrc/fastdeploy/backends/tensorrt/common/logger.cpp
new file mode 100644
index 000000000..1e1671558
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/logger.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "logger.h"
+#include "ErrorRecorder.h"
+#include "logging.h"
+
+SampleErrorRecorder gRecorder;
+namespace sample {
+Logger gLogger{Logger::Severity::kINFO};
+LogStreamConsumer gLogVerbose{LOG_VERBOSE(gLogger)};
+LogStreamConsumer gLogInfo{LOG_INFO(gLogger)};
+LogStreamConsumer gLogWarning{LOG_WARN(gLogger)};
+LogStreamConsumer gLogError{LOG_ERROR(gLogger)};
+LogStreamConsumer gLogFatal{LOG_FATAL(gLogger)};
+
+void setReportableSeverity(Logger::Severity severity) {
+  gLogger.setReportableSeverity(severity);
+  gLogVerbose.setReportableSeverity(severity);
+  gLogInfo.setReportableSeverity(severity);
+  gLogWarning.setReportableSeverity(severity);
+  gLogError.setReportableSeverity(severity);
+  gLogFatal.setReportableSeverity(severity);
+}
+} // namespace sample
diff --git a/csrc/fastdeploy/backends/tensorrt/common/logger.h b/csrc/fastdeploy/backends/tensorrt/common/logger.h
new file mode 100644
index 000000000..ab642744e
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/logger.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LOGGER_H
+#define LOGGER_H
+
+#include "logging.h"
+
+class SampleErrorRecorder;
+extern SampleErrorRecorder gRecorder;
+namespace sample {
+extern Logger gLogger;
+extern LogStreamConsumer gLogVerbose;
+extern LogStreamConsumer gLogInfo;
+extern LogStreamConsumer gLogWarning;
+extern LogStreamConsumer gLogError;
+extern LogStreamConsumer gLogFatal;
+
+void setReportableSeverity(Logger::Severity severity);
+} // namespace sample
+
+#endif // LOGGER_H
diff --git a/csrc/fastdeploy/backends/tensorrt/common/logging.h b/csrc/fastdeploy/backends/tensorrt/common/logging.h
new file mode 100644
index 000000000..abcb6b406
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/logging.h
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_LOGGING_H
+#define TENSORRT_LOGGING_H
+
+#include "NvInferRuntimeCommon.h"
+#include "sampleOptions.h"
+#include <cassert>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <mutex>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+namespace sample {
+
+using Severity = nvinfer1::ILogger::Severity;
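The globals declared in logger.h above are the intended entry points for the rest of the backend; a usage sketch (assuming TensorRT and these headers are on the include path):

    sample::setReportableSeverity(sample::Logger::Severity::kVERBOSE);
    sample::gLogInfo << "building engine" << std::endl;   // routed to std::cout
    sample::gLogError << "parse failed" << std::endl;     // routed to std::cerr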
+
+class LogStreamConsumerBuffer : public std::stringbuf {
+ public:
+  LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix,
+                          bool shouldLog)
+      : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}
+
+  LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
+      : mOutput(other.mOutput), mPrefix(other.mPrefix),
+        mShouldLog(other.mShouldLog) {}
+  LogStreamConsumerBuffer(const LogStreamConsumerBuffer& other) = delete;
+  LogStreamConsumerBuffer() = delete;
+  LogStreamConsumerBuffer& operator=(const LogStreamConsumerBuffer&) = delete;
+  LogStreamConsumerBuffer& operator=(LogStreamConsumerBuffer&&) = delete;
+
+  ~LogStreamConsumerBuffer() override {
+    // std::streambuf::pbase() points to the beginning of the buffered part of
+    // the output sequence and std::streambuf::pptr() to its current position;
+    // if the two differ, buffered output remains, so call putOutput() to log
+    // it to the stream
+    if (pbase() != pptr()) {
+      putOutput();
+    }
+  }
+
+  //!
+  //! synchronizes the stream buffer and returns 0 on success;
+  //! synchronizing the stream buffer consists of inserting the buffer contents
+  //! into the stream, resetting the buffer and flushing the stream
+  //!
+  int32_t sync() override {
+    putOutput();
+    return 0;
+  }
+
+  void putOutput() {
+    if (mShouldLog) {
+      // prepend timestamp
+      std::time_t timestamp = std::time(nullptr);
+      tm* tm_local = std::localtime(&timestamp);
+      mOutput << "[";
+      mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon
+              << "/";
+      mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
+      mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year
+              << "-";
+      mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
+      mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
+      mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
+      // std::stringbuf::str() gets the string contents of the buffer;
+      // insert the buffer contents, prefixed by the appropriate severity tag,
+      // into the stream
+      mOutput << mPrefix << str();
+    }
+    // set the buffer to empty
+    str("");
+    // flush the stream
+    mOutput.flush();
+  }
+
+  void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }
+
+ private:
+  std::ostream& mOutput;
+  std::string mPrefix;
+  bool mShouldLog{};
+}; // class LogStreamConsumerBuffer
+
+//!
+//! \class LogStreamConsumerBase
+//! \brief Convenience object used to initialize LogStreamConsumerBuffer before
+//! std::ostream in LogStreamConsumer
+//!
+class LogStreamConsumerBase {
+ public:
+  LogStreamConsumerBase(std::ostream& stream, const std::string& prefix,
+                        bool shouldLog)
+      : mBuffer(stream, prefix, shouldLog) {}
+
+ protected:
+  std::mutex mLogMutex;
+  LogStreamConsumerBuffer mBuffer;
+}; // class LogStreamConsumerBase
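A consumer (the class defined next) only forwards text when its own severity passes the reportable threshold it was constructed with; a sketch:

    // kVERBOSE is below the kINFO threshold, so nothing is printed here.
    sample::LogStreamConsumer quiet(sample::Severity::kINFO,
                                    sample::Severity::kVERBOSE);
    quiet << "dropped" << std::endl;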
+
+//!
+//! \class LogStreamConsumer
+//! \brief Convenience object used to facilitate use of C++ stream syntax when
+//! logging messages.
+//! The order of base classes is LogStreamConsumerBase and then std::ostream:
+//! LogStreamConsumerBase initializes the LogStreamConsumerBuffer member field
+//! before its address is passed to std::ostream, which prevents the address
+//! of an uninitialized buffer from being passed to std::ostream.
+//! Please do not change the order of the parent classes.
+//!
+class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
+ public:
+  //!
+  //! \brief Creates a LogStreamConsumer which logs messages with level
+  //! severity. The reportable severity determines whether messages are
+  //! severe enough to be logged.
+  //!
+  LogStreamConsumer(nvinfer1::ILogger::Severity reportableSeverity,
+                    nvinfer1::ILogger::Severity severity)
+      : LogStreamConsumerBase(severityOstream(severity),
+                              severityPrefix(severity),
+                              severity <= reportableSeverity),
+        std::ostream(&mBuffer), // links the stream buffer with the stream
+        mShouldLog(severity <= reportableSeverity), mSeverity(severity) {}
+
+  LogStreamConsumer(LogStreamConsumer&& other) noexcept
+      : LogStreamConsumerBase(severityOstream(other.mSeverity),
+                              severityPrefix(other.mSeverity),
+                              other.mShouldLog),
+        std::ostream(&mBuffer), // links the stream buffer with the stream
+        mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {}
+  LogStreamConsumer(const LogStreamConsumer& other) = delete;
+  LogStreamConsumer() = delete;
+  ~LogStreamConsumer() = default;
+  LogStreamConsumer& operator=(const LogStreamConsumer&) = delete;
+  LogStreamConsumer& operator=(LogStreamConsumer&&) = delete;
+
+  void setReportableSeverity(Severity reportableSeverity) {
+    mShouldLog = mSeverity <= reportableSeverity;
+    mBuffer.setShouldLog(mShouldLog);
+  }
+
+  std::mutex& getMutex() { return mLogMutex; }
+
+  bool getShouldLog() const { return mShouldLog; }
+
+ private:
+  static std::ostream& severityOstream(Severity severity) {
+    return severity >= Severity::kINFO ? std::cout : std::cerr;
+  }
+
+  static std::string severityPrefix(Severity severity) {
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        return "[F] ";
+      case Severity::kERROR:
+        return "[E] ";
+      case Severity::kWARNING:
+        return "[W] ";
+      case Severity::kINFO:
+        return "[I] ";
+      case Severity::kVERBOSE:
+        return "[V] ";
+      default:
+        assert(0);
+        return "";
+    }
+  }
+
+  bool mShouldLog;
+  Severity mSeverity;
+}; // class LogStreamConsumer
+
+template <typename T>
+LogStreamConsumer& operator<<(LogStreamConsumer& logger, const T& obj) {
+  if (logger.getShouldLog()) {
+    std::lock_guard<std::mutex> guard(logger.getMutex());
+    auto& os = static_cast<std::ostream&>(logger);
+    os << obj;
+  }
+  return logger;
+}
+
+//!
+//! Special handling for std::endl
+//!
+inline LogStreamConsumer& operator<<(LogStreamConsumer& logger,
+                                     std::ostream& (*f)(std::ostream&)) {
+  if (logger.getShouldLog()) {
+    std::lock_guard<std::mutex> guard(logger.getMutex());
+    auto& os = static_cast<std::ostream&>(logger);
+    os << f;
+  }
+  return logger;
+}
+
+inline LogStreamConsumer& operator<<(LogStreamConsumer& logger,
+                                     const nvinfer1::Dims& dims) {
+  if (logger.getShouldLog()) {
+    std::lock_guard<std::mutex> guard(logger.getMutex());
+    auto& os = static_cast<std::ostream&>(logger);
+    for (int32_t i = 0; i < dims.nbDims; ++i) {
+      os << (i ? "x" : "") << dims.d[i];
+    }
+  }
+  return logger;
+}
+
+//!
+//! \class Logger
+//!
+//! \brief Class which manages logging of TensorRT tools and samples
+//!
+//! \details This class provides a common interface for TensorRT tools and
+//! samples to log information to the console, and supports logging two types
+//! of messages:
+//!
+//! - Debugging messages with an associated severity (info, warning, error, or
+//!   internal error/fatal)
+//! - Test pass/fail messages
+//!
+//! The advantage of having all samples use this class for logging as opposed
+//! to emitting directly to stdout/stderr is that the logic for controlling the
+//! verbosity and formatting of sample output is centralized in one location.
+//!
+//! In the future, this class could be extended to support dumping test results
+//! to a file in some standard format (for example, JUnit XML), and providing
+//! additional metadata (e.g. timing the duration of a test run).
+//!
+//! TODO: For backwards compatibility with existing samples, this class
+//! inherits directly from the nvinfer1::ILogger interface, which is
+//! problematic since there isn't a clean separation between messages coming
+//! from the TensorRT library and messages coming from the sample.
+//!
+//! In the future (once all samples are updated to use Logger::getTRTLogger()
+//! to access the ILogger) we can refactor the class to eliminate the
+//! inheritance and instead make the nvinfer1::ILogger implementation a member
+//! of the Logger object.
+//!
+class Logger : public nvinfer1::ILogger {
+ public:
+  explicit Logger(Severity severity = Severity::kWARNING)
+      : mReportableSeverity(severity) {}
+
+  //!
+  //! \enum TestResult
+  //! \brief Represents the state of a given test
+  //!
+  enum class TestResult {
+    kRUNNING, //!< The test is running
+    kPASSED,  //!< The test passed
+    kFAILED,  //!< The test failed
+    kWAIVED   //!< The test was waived
+  };
+
+  //!
+  //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger
+  //! associated with this Logger
+  //! \return The nvinfer1::ILogger associated with this Logger
+  //!
+  //! TODO: Once all samples are updated to use this method to register the
+  //! logger with TensorRT, we can eliminate the inheritance of Logger from
+  //! ILogger
+  //!
+  nvinfer1::ILogger& getTRTLogger() noexcept { return *this; }
+
+  //!
+  //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
+  //!
+  //! Note: samples should not call this function directly; it will eventually
+  //! go away once we eliminate the inheritance from nvinfer1::ILogger
+  //!
+  void log(Severity severity, const char* msg) noexcept override {
+    LogStreamConsumer(mReportableSeverity, severity)
+        << "[TRT] " << std::string(msg) << std::endl;
+  }
+
+  //!
+  //! \brief Method for controlling the verbosity of logging output
+  //!
+  //! \param severity The logger will only emit messages that have severity of
+  //! this level or higher.
+  //!
+  void setReportableSeverity(Severity severity) noexcept {
+    mReportableSeverity = severity;
+  }
+
+  //!
+  //! \brief Opaque handle that holds logging information for a particular test
+  //!
+  //! This object is an opaque handle to information used by the Logger to
+  //! print test results. The sample must call Logger::defineTest() in order
+  //! to obtain a TestAtom that can be used with
+  //! Logger::reportTest{Start,End}().
+  //!
+  class TestAtom {
+   public:
+    TestAtom(TestAtom&&) = default;
+
+   private:
+    friend class Logger;
+
+    TestAtom(bool started, const std::string& name, const std::string& cmdline)
+        : mStarted(started), mName(name), mCmdline(cmdline) {}
+
+    bool mStarted;
+    std::string mName;
+    std::string mCmdline;
+  };
+
+  //!
+  //! \brief Define a test for logging
+  //!
+  //! \param[in] name The name of the test. This should be a string starting
+  //! with "TensorRT" and containing dot-separated strings containing the
+  //! characters [A-Za-z0-9_]. For example, "TensorRT.sample_googlenet"
+  //! \param[in] cmdline The command line used to reproduce the test
+  //!
+  //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
+  //!
+  static TestAtom defineTest(const std::string& name,
+                             const std::string& cmdline) {
+    return TestAtom(false, name, cmdline);
+  }
+
+  //!
+  //! \brief A convenience overloaded version of defineTest() that accepts an
+  //! array of command-line arguments as input
+  //!
+  //! \param[in] name The name of the test
+  //! \param[in] argc The number of command-line arguments
+  //! \param[in] argv The array of command-line arguments (given as C strings)
+  //!
+  //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
+  //!
+  static TestAtom defineTest(const std::string& name, int32_t argc,
+                             char const* const* argv) {
+    // Append TensorRT version as info
+    const std::string vname =
+        name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]";
+    auto cmdline = genCmdlineString(argc, argv);
+    return defineTest(vname, cmdline);
+  }
+
+  //!
+  //! \brief Report that a test has started.
+  //!
+  //! \pre reportTestStart() has not been called yet for the given testAtom
+  //!
+  //! \param[in] testAtom The handle to the test that has started
+  //!
+  static void reportTestStart(TestAtom& testAtom) {
+    reportTestResult(testAtom, TestResult::kRUNNING);
+    assert(!testAtom.mStarted);
+    testAtom.mStarted = true;
+  }
+
+  //!
+  //! \brief Report that a test has ended.
+  //!
+  //! \pre reportTestStart() has been called for the given testAtom
+  //!
+  //! \param[in] testAtom The handle to the test that has ended
+  //! \param[in] result The result of the test. Should be one of
+  //! TestResult::kPASSED, TestResult::kFAILED, TestResult::kWAIVED
+  //!
+  static void reportTestEnd(TestAtom const& testAtom, TestResult result) {
+    assert(result != TestResult::kRUNNING);
+    assert(testAtom.mStarted);
+    reportTestResult(testAtom, result);
+  }
+
+  static int32_t reportPass(TestAtom const& testAtom) {
+    reportTestEnd(testAtom, TestResult::kPASSED);
+    return EXIT_SUCCESS;
+  }
+
+  static int32_t reportFail(TestAtom const& testAtom) {
+    reportTestEnd(testAtom, TestResult::kFAILED);
+    return EXIT_FAILURE;
+  }
+
+  static int32_t reportWaive(TestAtom const& testAtom) {
+    reportTestEnd(testAtom, TestResult::kWAIVED);
+    return EXIT_SUCCESS;
+  }
+
+  static int32_t reportTest(TestAtom const& testAtom, bool pass) {
+    return pass ? reportPass(testAtom) : reportFail(testAtom);
+  }
+
+  Severity getReportableSeverity() const { return mReportableSeverity; }
+
+ private:
+  //!
+  //! \brief returns an appropriate string for prefixing a log message with the
+  //! given severity
+  //!
+  static const char* severityPrefix(Severity severity) {
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        return "[F] ";
+      case Severity::kERROR:
+        return "[E] ";
+      case Severity::kWARNING:
+        return "[W] ";
+      case Severity::kINFO:
+        return "[I] ";
+      case Severity::kVERBOSE:
+        return "[V] ";
+      default:
+        assert(0);
+        return "";
+    }
+  }
+
+  //!
+  //! \brief returns an appropriate string for prefixing a test result message
+  //! with the given result
+  //!
+  static const char* testResultString(TestResult result) {
+    switch (result) {
+      case TestResult::kRUNNING:
+        return "RUNNING";
+      case TestResult::kPASSED:
+        return "PASSED";
+      case TestResult::kFAILED:
+        return "FAILED";
+      case TestResult::kWAIVED:
+        return "WAIVED";
+      default:
+        assert(0);
+        return "";
+    }
+  }
+
+  //!
+  //! \brief returns an appropriate output stream (cout or cerr) to use with
+  //! the given severity
+  //!
+  static std::ostream& severityOstream(Severity severity) {
+    return severity >= Severity::kINFO ? std::cout : std::cerr;
+  }
+
+  //!
+  //! \brief method that implements logging test results
+  //!
+  static void reportTestResult(TestAtom const& testAtom, TestResult result) {
+    severityOstream(Severity::kINFO)
+        << "&&&& " << testResultString(result) << " " << testAtom.mName
+        << " # " << testAtom.mCmdline << std::endl;
+  }
+
+  //!
+  //! \brief generate a command line string from the given (argc, argv) values
+  //!
+  static std::string genCmdlineString(int32_t argc, char const* const* argv) {
+    std::stringstream ss;
+    for (int32_t i = 0; i < argc; i++) {
+      if (i > 0) {
+        ss << " ";
+      }
+      ss << argv[i];
+    }
+    return ss.str();
+  }
+
+  Severity mReportableSeverity;
+}; // class Logger
+
+namespace {
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kVERBOSE
+//!
+//! Example usage:
+//!
+//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kINFO
+//!
+//! Example usage:
+//!
+//!     LOG_INFO(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_INFO(const Logger& logger) {
+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kWARNING
+//!
+//! Example usage:
+//!
+//!     LOG_WARN(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_WARN(const Logger& logger) {
+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kERROR
+//!
+//! Example usage:
+//!
+//!     LOG_ERROR(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kINTERNAL_ERROR ("fatal" severity)
+//!
+//! Example usage:
+//!
+//!     LOG_FATAL(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
+  return LogStreamConsumer(logger.getReportableSeverity(),
+                           Severity::kINTERNAL_ERROR);
+}
+} // anonymous namespace
+} // namespace sample
+#endif // TENSORRT_LOGGING_H
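The Logger's test-reporting workflow ties these pieces together; a sketch of a sample's main() (names are illustrative, not part of the patch):

    int main(int argc, char** argv) {
      auto atom = sample::Logger::defineTest("TensorRT.sample_demo", argc, argv);
      sample::Logger::reportTestStart(atom);
      bool pass = true;  // ... run the sample ...
      return sample::Logger::reportTest(atom, pass);  // EXIT_SUCCESS on pass
    }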
diff --git a/csrc/fastdeploy/backends/tensorrt/common/parserOnnxConfig.h b/csrc/fastdeploy/backends/tensorrt/common/parserOnnxConfig.h
new file mode 100644
index 000000000..8569ca01c
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/parserOnnxConfig.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PARSER_ONNX_CONFIG_H
+#define PARSER_ONNX_CONFIG_H
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+#include "NvOnnxParser.h"
+
+#define ONNX_DEBUG 1
+
+/**
+ * \class ParserOnnxConfig
+ * \brief Configuration Manager Class Concrete Implementation
+ *
+ * \note:
+ *
+ */
+
+using namespace std;
+
+class ParserOnnxConfig : public nvonnxparser::IOnnxConfig {
+ protected:
+  string mModelFilename{};
+  string mTextFilename{};
+  string mFullTextFilename{};
+  nvinfer1::DataType mModelDtype;
+  nvonnxparser::IOnnxConfig::Verbosity mVerbosity;
+  bool mPrintLayercInfo;
+
+ public:
+  ParserOnnxConfig()
+      : mModelDtype(nvinfer1::DataType::kFLOAT),
+        mVerbosity(static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)),
+        mPrintLayercInfo(false) {
+#ifdef ONNX_DEBUG
+    if (isDebug()) {
+      std::cout << " ParserOnnxConfig::ctor(): " << this << "\t" << std::endl;
+    }
+#endif
+  }
+
+ protected:
+  ~ParserOnnxConfig() {
+#ifdef ONNX_DEBUG
+    if (isDebug()) {
+      std::cout << "ParserOnnxConfig::dtor(): " << this << std::endl;
+    }
+#endif
+  }
+
+ public:
+  virtual void setModelDtype(const nvinfer1::DataType modelDtype) noexcept {
+    mModelDtype = modelDtype;
+  }
+
+  virtual nvinfer1::DataType getModelDtype() const noexcept {
+    return mModelDtype;
+  }
+
+  virtual const char* getModelFileName() const noexcept {
+    return mModelFilename.c_str();
+  }
+  virtual void setModelFileName(const char* onnxFilename) noexcept {
+    mModelFilename = string(onnxFilename);
+  }
+  virtual nvonnxparser::IOnnxConfig::Verbosity
+  getVerbosityLevel() const noexcept {
+    return mVerbosity;
+  }
+  virtual void addVerbosity() noexcept { ++mVerbosity; }
+  virtual void reduceVerbosity() noexcept { --mVerbosity; }
+  virtual void
+  setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept {
+    mVerbosity = verbosity;
+  }
+
+  virtual const char* getTextFileName() const noexcept {
+    return mTextFilename.c_str();
+  }
+  virtual void setTextFileName(const char* textFilename) noexcept {
+    mTextFilename = string(textFilename);
+  }
+  virtual const char* getFullTextFileName() const noexcept {
+    return mFullTextFilename.c_str();
+  }
+  virtual void setFullTextFileName(const char* fullTextFilename) noexcept {
+    mFullTextFilename = string(fullTextFilename);
+  }
+  virtual bool getPrintLayerInfo() const noexcept { return mPrintLayercInfo; }
+  virtual void setPrintLayerInfo(bool src) noexcept {
+    mPrintLayercInfo = src;
+  } //!< set the boolean variable controlling layer info printing, see
+    //! getPrintLayerInfo()
+
+  virtual bool isDebug() const noexcept {
+#if ONNX_DEBUG
+    return (std::getenv("ONNX_DEBUG") ? true : false);
+#else
+    return false;
+#endif
+  }
+
+  virtual void destroy() noexcept { delete this; }
+
+}; // class ParserOnnxConfig
+
+#endif
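A sketch of driving this config (the file name is hypothetical; note that `destroy()` is required instead of `delete`, since the destructor is protected):

    auto* config = new ParserOnnxConfig();
    config->setModelFileName("model.onnx");
    config->setModelDtype(nvinfer1::DataType::kHALF);
    // ... hand the config to an ONNX parser ...
    config->destroy();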
diff --git a/csrc/fastdeploy/backends/tensorrt/common/safeCommon.h b/csrc/fastdeploy/backends/tensorrt/common/safeCommon.h
new file mode 100644
index 000000000..1aa92ad22
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/safeCommon.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_SAFE_COMMON_H
+#define TENSORRT_SAFE_COMMON_H
+
+#include "NvInferRuntimeCommon.h"
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+
+#define CHECK(status)                                                          \
+  do {                                                                         \
+    auto ret = (status);                                                       \
+    if (ret != 0) {                                                            \
+      std::cerr << "Cuda failure: " << ret << std::endl;                       \
+      abort();                                                                 \
+    }                                                                          \
+  } while (0)
+
+namespace samplesCommon {
+template <typename T> inline std::shared_ptr<T> infer_object(T* obj) {
+  if (!obj) {
+    throw std::runtime_error("Failed to create object");
+  }
+  return std::shared_ptr<T>(obj);
+}
+
+inline uint32_t elementSize(nvinfer1::DataType t) {
+  switch (t) {
+    case nvinfer1::DataType::kINT32:
+    case nvinfer1::DataType::kFLOAT:
+      return 4;
+    case nvinfer1::DataType::kHALF:
+      return 2;
+    case nvinfer1::DataType::kINT8:
+      return 1;
+    case nvinfer1::DataType::kBOOL:
+      return 1;
+  }
+  return 0;
+}
+
+template <typename A, typename B> inline A divUp(A x, B n) {
+  return (x + n - 1) / n;
+}
+
+} // namespace samplesCommon
+
+#endif // TENSORRT_SAFE_COMMON_H
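A short sketch of the helpers above in use:

    float* devPtr = nullptr;
    CHECK(cudaMalloc(reinterpret_cast<void**>(&devPtr), 1024));  // aborts on error
    int blocks = samplesCommon::divUp(1000, 256);                // == 4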
diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleConfig.h b/csrc/fastdeploy/backends/tensorrt/common/sampleConfig.h
new file mode 100644
index 000000000..a097f4dbe
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/sampleConfig.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SampleConfig_H
+#define SampleConfig_H
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+class SampleConfig : public nvonnxparser::IOnnxConfig {
+ public:
+  enum class InputDataFormat : int { kASCII = 0, kPPM = 1 };
+
+ private:
+  std::string mModelFilename;
+  std::string mEngineFilename;
+  std::string mTextFilename;
+  std::string mFullTextFilename;
+  std::string mImageFilename;
+  std::string mReferenceFilename;
+  std::string mOutputFilename;
+  std::string mCalibrationFilename;
+  std::string mTimingCacheFilename;
+  int64_t mLabel{-1};
+  int64_t mMaxBatchSize{32};
+  int64_t mCalibBatchSize{0};
+  int64_t mMaxNCalibBatch{0};
+  int64_t mFirstCalibBatch{0};
+  int64_t mUseDLACore{-1};
+  nvinfer1::DataType mModelDtype{nvinfer1::DataType::kFLOAT};
+  bool mTF32{true};
+  Verbosity mVerbosity{
+      static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)};
+  bool mPrintLayercInfo{false};
+  bool mDebugBuilder{false};
+  InputDataFormat mInputDataFormat{InputDataFormat::kASCII};
+  uint64_t mTopK{0};
+  float mFailurePercentage{-1.0f};
+  float mTolerance{0.0f};
+  float mAbsTolerance{1e-5f};
+
+ public:
+  SampleConfig() {
+#ifdef ONNX_DEBUG
+    if (isDebug()) {
+      std::cout << " SampleConfig::ctor(): " << this << "\t" << std::endl;
+    }
+#endif
+  }
+
+ protected:
+  ~SampleConfig() {
+#ifdef ONNX_DEBUG
+    if (isDebug()) {
+      std::cout << "SampleConfig::dtor(): " << this << std::endl;
+    }
+#endif
+  }
+
+ public:
+  void setModelDtype(const nvinfer1::DataType mdt) noexcept {
+    mModelDtype = mdt;
+  }
+
+  nvinfer1::DataType getModelDtype() const noexcept { return mModelDtype; }
+
+  bool getTF32() const noexcept { return mTF32; }
+
+  void setTF32(bool enabled) noexcept { mTF32 = enabled; }
+
+  const char* getModelFileName() const noexcept {
+    return mModelFilename.c_str();
+  }
+
+  void setModelFileName(const char* onnxFilename) noexcept {
+    mModelFilename = std::string(onnxFilename);
+  }
+  Verbosity getVerbosityLevel() const noexcept { return mVerbosity; }
+  void addVerbosity() noexcept { ++mVerbosity; }
+  void reduceVerbosity() noexcept { --mVerbosity; }
+  virtual void setVerbosityLevel(Verbosity v) noexcept { mVerbosity = v; }
+  const char* getEngineFileName() const noexcept {
+    return mEngineFilename.c_str();
+  }
+  void setEngineFileName(const char* engineFilename) noexcept {
+    mEngineFilename = std::string(engineFilename);
+  }
+  const char* getTextFileName() const noexcept { return mTextFilename.c_str(); }
+  void setTextFileName(const char* textFilename) noexcept {
+    mTextFilename = std::string(textFilename);
+  }
+  const char* getFullTextFileName() const noexcept {
+    return mFullTextFilename.c_str();
+  }
+  void setFullTextFileName(const char* fullTextFilename) noexcept {
+    mFullTextFilename = std::string(fullTextFilename);
+  }
+  void setLabel(int64_t label) noexcept { mLabel = label; } //!< set the Label
+
+  int64_t getLabel() const noexcept { return mLabel; } //!< get the Label
+
+  bool getPrintLayerInfo() const noexcept { return mPrintLayercInfo; }
+
+  void setPrintLayerInfo(bool b) noexcept {
+    mPrintLayercInfo = b;
+  } //!< set the boolean variable controlling layer info printing, see
+    //! getPrintLayerInfo()
+    //! getPrintLayerInfo()
+
+  void setMaxBatchSize(int64_t maxBatchSize) noexcept {
+    mMaxBatchSize = maxBatchSize;
+  } //!< set the Max Batch Size
+  int64_t getMaxBatchSize() const noexcept {
+    return mMaxBatchSize;
+  } //!< get the Max Batch Size
+
+  void setCalibBatchSize(int64_t CalibBatchSize) noexcept {
+    mCalibBatchSize = CalibBatchSize;
+  } //!< set the calibration batch size
+  int64_t getCalibBatchSize() const noexcept {
+    return mCalibBatchSize;
+  } //!< get the calibration batch size
+
+  void setMaxNCalibBatch(int64_t MaxNCalibBatch) noexcept {
+    mMaxNCalibBatch = MaxNCalibBatch;
+  } //!< set the Max Number of Calibration Batches
+  int64_t getMaxNCalibBatch() const noexcept {
+    return mMaxNCalibBatch;
+  } //!< get the Max Number of Calibration Batches
+
+  void setFirstCalibBatch(int64_t FirstCalibBatch) noexcept {
+    mFirstCalibBatch = FirstCalibBatch;
+  } //!< set the first calibration batch
+  int64_t getFirstCalibBatch() const noexcept {
+    return mFirstCalibBatch;
+  } //!< get the first calibration batch
+
+  void setUseDLACore(int64_t UseDLACore) noexcept {
+    mUseDLACore = UseDLACore;
+  } //!< set the DLA core to use
+  int64_t getUseDLACore() const noexcept {
+    return mUseDLACore;
+  } //!< get the DLA core to use
+
+  void setDebugBuilder() noexcept {
+    mDebugBuilder = true;
+  } //!< enable debug info while building the engine
+  bool getDebugBuilder() const noexcept {
+    return mDebugBuilder;
+  } //!< get the boolean variable corresponding to the debug builder
+
+  const char*
+  getImageFileName() const noexcept //!< get the image file name (PPM or ASCII)
+  {
+    return mImageFilename.c_str();
+  }
+  void setImageFileName(
+      const char* imageFilename) noexcept //!< set the image file name
+  {
+    mImageFilename = std::string(imageFilename);
+  }
+  const char* getReferenceFileName() const noexcept {
+    return mReferenceFilename.c_str();
+  }
+  void setReferenceFileName(
+      const char* referenceFilename) noexcept //!< set the reference file name
+  {
+    mReferenceFilename = std::string(referenceFilename);
+  }
+
+  void setInputDataFormat(InputDataFormat idt) noexcept {
+    mInputDataFormat = idt;
+  } //!< specifies the expected data format of the image file (PPM or ASCII)
+  InputDataFormat getInputDataFormat() const noexcept {
+    return mInputDataFormat;
+  } //!< returns the expected data format of the image file.
+
+  const char* getOutputFileName()
+      const noexcept //!< get the file name used to save the results
+  {
+    return mOutputFilename.c_str();
+  }
+  void setOutputFileName(
+      const char* outputFilename) noexcept //!< set the output file name
+  {
+    mOutputFilename = std::string(outputFilename);
+  }
+
+  const char* getCalibrationFileName() const noexcept {
+    return mCalibrationFilename.c_str();
+  } //!< get the file containing the list of image files for int8
+    //! calibration
+  void setCalibrationFileName(
+      const char* calibrationFilename) noexcept //!< set the int8 calibration
+                                                //! list file name
+  {
+    mCalibrationFilename = std::string(calibrationFilename);
+  }
+
+  uint64_t getTopK() const noexcept { return mTopK; }
+  void setTopK(uint64_t topK) noexcept {
+    mTopK = topK;
+  } //!< If this option is specified, return the K top probabilities.
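+
+  // Example usage (an illustrative sketch; "model.onnx" is a placeholder
+  // file name). The destructor is protected, so instances are released
+  // through destroy():
+  //
+  //   SampleConfig* config = new SampleConfig();
+  //   config->setModelFileName("model.onnx");
+  //   config->setMaxBatchSize(8);
+  //   config->setModelDtype(nvinfer1::DataType::kHALF);
+  //   // ... hand the config to the ONNX parser ...
+  //   config->destroy();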
+
+  float getFailurePercentage() const noexcept { return mFailurePercentage; }
+
+  void setFailurePercentage(float f) noexcept { mFailurePercentage = f; }
+
+  float getAbsoluteTolerance() const noexcept { return mAbsTolerance; }
+
+  void setAbsoluteTolerance(float a) noexcept { mAbsTolerance = a; }
+
+  float getTolerance() const noexcept { return mTolerance; }
+
+  void setTolerance(float t) noexcept { mTolerance = t; }
+
+  const char* getTimingCacheFilename() const noexcept {
+    return mTimingCacheFilename.c_str();
+  }
+
+  void setTimingCacheFileName(const char* timingCacheFilename) noexcept {
+    mTimingCacheFilename = std::string(timingCacheFilename);
+  }
+
+  bool isDebug() const noexcept {
+#if ONNX_DEBUG
+    return (std::getenv("ONNX_DEBUG") ? true : false);
+#else
+    return false;
+#endif
+  }
+
+  void destroy() noexcept { delete this; }
+
+}; // class SampleConfig
+
+#endif
diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleDevice.h b/csrc/fastdeploy/backends/tensorrt/common/sampleDevice.h
new file mode 100644
index 000000000..cdbb08019
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/sampleDevice.h
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_DEVICE_H
+#define TRT_SAMPLE_DEVICE_H
+
+#include <cassert>
+#include <chrono>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <thread>
+
+namespace sample {
+
+inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) {
+  if (ret != cudaSuccess) {
+    err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl;
+    abort();
+  }
+}
+
+class TrtCudaEvent;
+
+namespace {
+
+void cudaSleep(void* sleep) {
+  std::this_thread::sleep_for(std::chrono::duration<float, std::milli>(
+      *static_cast<float*>(sleep)));
+}
+
+} // namespace
+
+//!
+//! \class TrtCudaStream
+//! \brief Managed CUDA stream
+//!
+class TrtCudaStream {
+ public:
+  TrtCudaStream() { cudaCheck(cudaStreamCreate(&mStream)); }
+
+  TrtCudaStream(const TrtCudaStream&) = delete;
+
+  TrtCudaStream& operator=(const TrtCudaStream&) = delete;
+
+  TrtCudaStream(TrtCudaStream&&) = delete;
+
+  TrtCudaStream& operator=(TrtCudaStream&&) = delete;
+
+  ~TrtCudaStream() { cudaCheck(cudaStreamDestroy(mStream)); }
+
+  cudaStream_t get() const { return mStream; }
+
+  void synchronize() { cudaCheck(cudaStreamSynchronize(mStream)); }
+
+  void wait(TrtCudaEvent& event);
+
+  void sleep(float* ms) {
+    cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms));
+  }
+
+ private:
+  cudaStream_t mStream{};
+};
+
+//!
+//! \class TrtCudaEvent
+//! \brief Managed CUDA event
+//!
+class TrtCudaEvent {
+ public:
+  explicit TrtCudaEvent(bool blocking = true) {
+    const uint32_t flags = blocking ?
cudaEventBlockingSync : cudaEventDefault; + cudaCheck(cudaEventCreateWithFlags(&mEvent, flags)); + } + + TrtCudaEvent(const TrtCudaEvent&) = delete; + + TrtCudaEvent& operator=(const TrtCudaEvent&) = delete; + + TrtCudaEvent(TrtCudaEvent&&) = delete; + + TrtCudaEvent& operator=(TrtCudaEvent&&) = delete; + + ~TrtCudaEvent() { cudaCheck(cudaEventDestroy(mEvent)); } + + cudaEvent_t get() const { return mEvent; } + + void record(const TrtCudaStream& stream) { + cudaCheck(cudaEventRecord(mEvent, stream.get())); + } + + void synchronize() { cudaCheck(cudaEventSynchronize(mEvent)); } + + // Returns time elapsed time in milliseconds + float operator-(const TrtCudaEvent& e) const { + float time{0}; + cudaCheck(cudaEventElapsedTime(&time, e.get(), get())); + return time; + } + + private: + cudaEvent_t mEvent{}; +}; + +inline void TrtCudaStream::wait(TrtCudaEvent& event) { + cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0)); +} + +//! +//! \class TrtCudaGraph +//! \brief Managed CUDA graph +//! +class TrtCudaGraph { + public: + explicit TrtCudaGraph() = default; + + TrtCudaGraph(const TrtCudaGraph&) = delete; + + TrtCudaGraph& operator=(const TrtCudaGraph&) = delete; + + TrtCudaGraph(TrtCudaGraph&&) = delete; + + TrtCudaGraph& operator=(TrtCudaGraph&&) = delete; + + ~TrtCudaGraph() { + if (mGraphExec) { + cudaGraphExecDestroy(mGraphExec); + } + } + + void beginCapture(TrtCudaStream& stream) { + cudaCheck( + cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); + } + + bool launch(TrtCudaStream& stream) { + return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess; + } + + void endCapture(TrtCudaStream& stream) { + cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph)); + cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); + cudaCheck(cudaGraphDestroy(mGraph)); + } + + void endCaptureOnError(TrtCudaStream& stream) { + // There are two possibilities why stream capture would fail: + // (1) stream is in cudaErrorStreamCaptureInvalidated state. + // (2) TRT reports a failure. + // In case (1), the returning mGraph should be nullptr. + // In case (2), the returning mGraph is not nullptr, but it should not be + // used. + const auto ret = cudaStreamEndCapture(stream.get(), &mGraph); + if (ret == cudaErrorStreamCaptureInvalidated) { + assert(mGraph == nullptr); + } else { + assert(ret == cudaSuccess); + assert(mGraph != nullptr); + cudaCheck(cudaGraphDestroy(mGraph)); + mGraph = nullptr; + } + // Clean up any CUDA error. + cudaGetLastError(); + sample::gLogWarning << "The CUDA graph capture on the stream has failed." + << std::endl; + } + + private: + cudaGraph_t mGraph{}; + cudaGraphExec_t mGraphExec{}; +}; + +//! +//! \class TrtCudaBuffer +//! \brief Managed buffer for host and device +//! 
+template <typename A, typename D>
+class TrtCudaBuffer {
+ public:
+  TrtCudaBuffer() = default;
+
+  TrtCudaBuffer(const TrtCudaBuffer&) = delete;
+
+  TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete;
+
+  TrtCudaBuffer(TrtCudaBuffer&& rhs) {
+    reset(rhs.mPtr);
+    rhs.mPtr = nullptr;
+  }
+
+  TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs) {
+    if (this != &rhs) {
+      reset(rhs.mPtr);
+      rhs.mPtr = nullptr;
+    }
+    return *this;
+  }
+
+  ~TrtCudaBuffer() { reset(); }
+
+  TrtCudaBuffer(size_t size) { A()(&mPtr, size); }
+
+  void allocate(size_t size) {
+    reset();
+    A()(&mPtr, size);
+  }
+
+  void reset(void* ptr = nullptr) {
+    if (mPtr) {
+      D()(mPtr);
+    }
+    mPtr = ptr;
+  }
+
+  void* get() const { return mPtr; }
+
+ private:
+  void* mPtr{nullptr};
+};
+
+struct DeviceAllocator {
+  void operator()(void** ptr, size_t size) { cudaCheck(cudaMalloc(ptr, size)); }
+};
+
+struct DeviceDeallocator {
+  void operator()(void* ptr) { cudaCheck(cudaFree(ptr)); }
+};
+
+struct ManagedAllocator {
+  void operator()(void** ptr, size_t size) {
+    cudaCheck(cudaMallocManaged(ptr, size));
+  }
+};
+
+struct HostAllocator {
+  void operator()(void** ptr, size_t size) {
+    cudaCheck(cudaMallocHost(ptr, size));
+  }
+};
+
+struct HostDeallocator {
+  void operator()(void* ptr) { cudaCheck(cudaFreeHost(ptr)); }
+};
+
+using TrtDeviceBuffer = TrtCudaBuffer<DeviceAllocator, DeviceDeallocator>;
+using TrtManagedBuffer = TrtCudaBuffer<ManagedAllocator, DeviceDeallocator>;
+
+using TrtHostBuffer = TrtCudaBuffer<HostAllocator, HostDeallocator>;
+
+//!
+//! \class IMirroredBuffer
+//! \brief Coupled host and device buffers
+//!
+class IMirroredBuffer {
+ public:
+  //!
+  //! Allocate memory for the mirrored buffer given the size
+  //! of the allocation.
+  //!
+  virtual void allocate(size_t size) = 0;
+
+  //!
+  //! Get the pointer to the device side buffer.
+  //!
+  //! \return pointer to device memory or nullptr if uninitialized.
+  //!
+  virtual void* getDeviceBuffer() const = 0;
+
+  //!
+  //! Get the pointer to the host side buffer.
+  //!
+  //! \return pointer to host memory or nullptr if uninitialized.
+  //!
+  virtual void* getHostBuffer() const = 0;
+
+  //!
+  //! Copy the memory from host to device.
+  //!
+  virtual void hostToDevice(TrtCudaStream& stream) = 0;
+
+  //!
+  //! Copy the memory from device to host.
+  //!
+  virtual void deviceToHost(TrtCudaStream& stream) = 0;
+
+  //!
+  //! Interface to get the size of the memory
+  //!
+  //! \return the size of memory allocated.
+  //!
+  virtual size_t getSize() const = 0;
+
+  //!
+  //! Virtual destructor declaration
+  //!
+  virtual ~IMirroredBuffer() = default;
+
+}; // class IMirroredBuffer
+
+//!
+//! Class to have a separate memory buffer for discrete device and host
+//! allocations.
+//!
+class DiscreteMirroredBuffer : public IMirroredBuffer {
+ public:
+  void allocate(size_t size) {
+    mSize = size;
+    mHostBuffer.allocate(size);
+    mDeviceBuffer.allocate(size);
+  }
+
+  void* getDeviceBuffer() const { return mDeviceBuffer.get(); }
+
+  void* getHostBuffer() const { return mHostBuffer.get(); }
+
+  void hostToDevice(TrtCudaStream& stream) {
+    cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize,
+                              cudaMemcpyHostToDevice, stream.get()));
+  }
+
+  void deviceToHost(TrtCudaStream& stream) {
+    cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize,
+                              cudaMemcpyDeviceToHost, stream.get()));
+  }
+
+  size_t getSize() const { return mSize; }
+
+ private:
+  size_t mSize{0};
+  TrtHostBuffer mHostBuffer;
+  TrtDeviceBuffer mDeviceBuffer;
+}; // class DiscreteMirroredBuffer
+
+//!
+//! Class to have a unified memory buffer for embedded devices.
+//!
+class UnifiedMirroredBuffer : public IMirroredBuffer { + public: + void allocate(size_t size) { + mSize = size; + mBuffer.allocate(size); + } + + void* getDeviceBuffer() const { return mBuffer.get(); } + + void* getHostBuffer() const { return mBuffer.get(); } + + void hostToDevice(TrtCudaStream& stream) { + // Does nothing since we are using unified memory. + } + + void deviceToHost(TrtCudaStream& stream) { + // Does nothing since we are using unified memory. + } + + size_t getSize() const { return mSize; } + + private: + size_t mSize{0}; + TrtManagedBuffer mBuffer; +}; // class UnifiedMirroredBuffer + +inline void setCudaDevice(int device, std::ostream& os) { + cudaCheck(cudaSetDevice(device)); + + cudaDeviceProp properties; + cudaCheck(cudaGetDeviceProperties(&properties, device)); + + // clang-format off + os << "=== Device Information ===" << std::endl; + os << "Selected Device: " << properties.name << std::endl; + os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl; + os << "SMs: " << properties.multiProcessorCount << std::endl; + os << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; + os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; + os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; + os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" + << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl; + os << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; + // clang-format on +} + +} // namespace sample + +#endif // TRT_SAMPLE_DEVICE_H diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleEngines.cpp b/csrc/fastdeploy/backends/tensorrt/common/sampleEngines.cpp new file mode 100644 index 000000000..6c1ab35b1 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/sampleEngines.cpp @@ -0,0 +1,1710 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#include "NvCaffeParser.h" +#include "NvInfer.h" +#include "NvOnnxParser.h" + +#include "ErrorRecorder.h" +#include "common.h" +#include "half.h" +#include "logger.h" +#include "sampleEngines.h" +#include "sampleOptions.h" +#include "sampleUtils.h" + +#if !defined(_WIN32) +#include +#endif + +using namespace nvinfer1; + +namespace sample { + +namespace { + +//struct CaffeBufferShutter { +// ~CaffeBufferShutter() { nvcaffeparser1::shutdownProtobufLibrary(); } +//}; + +std::map +readScalesFromCalibrationCache(const std::string& calibrationFile) { + std::map tensorScales; + std::ifstream cache{calibrationFile}; + if (!cache.is_open()) { + sample::gLogError << "[TRT] Can not open provided calibration cache file" + << std::endl; + return tensorScales; + } + std::string line; + while (std::getline(cache, line)) { + auto colonPos = line.find_last_of(':'); + if (colonPos != std::string::npos) { + // Scales should be stored in calibration cache as 32-bit floating numbers + // encoded as 32-bit integers + int32_t scalesAsInt = + std::stoi(line.substr(colonPos + 2, 8), nullptr, 16); + const auto tensorName = line.substr(0, colonPos); + tensorScales[tensorName] = *reinterpret_cast(&scalesAsInt); + } + } + cache.close(); + return tensorScales; +} +} // namespace + +void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, + const std::vector& inputFormats, + const std::vector& outputFormats, + const std::string& calibrationFile) { + const auto tensorScales = readScalesFromCalibrationCache(calibrationFile); + const bool broadcastInputFormats = + broadcastIOFormats(inputFormats, network.getNbInputs()); + for (int32_t i = 0, n = network.getNbInputs(); i < n; ++i) { + int32_t formatIdx = broadcastInputFormats ? 0 : i; + if (!inputFormats.empty() && + inputFormats[formatIdx].first == DataType::kINT8) { + auto* input = network.getInput(i); + const auto calibScale = tensorScales.at(input->getName()); + input->setDynamicRange(-127 * calibScale, 127 * calibScale); + } + } + const bool broadcastOutputFormats = + broadcastIOFormats(outputFormats, network.getNbInputs()); + for (int32_t i = 0, n = network.getNbOutputs(); i < n; ++i) { + int32_t formatIdx = broadcastOutputFormats ? 0 : i; + if (!outputFormats.empty() && + outputFormats[formatIdx].first == DataType::kINT8) { + auto* output = network.getOutput(i); + const auto calibScale = tensorScales.at(output->getName()); + output->setDynamicRange(-127 * calibScale, 127 * calibScale); + } + } +} + +#define SMP_RETVAL_IF_FALSE(condition, msg, retval, err) \ + { \ + if ((condition) == false) { \ + (err) << (msg) << std::endl; \ + return retval; \ + } \ + } + +Parser modelToNetwork(const ModelOptions& model, + nvinfer1::INetworkDefinition& network, + std::ostream& err) { + sample::gLogInfo << "Start parsing network model" << std::endl; + Parser parser; + const std::string& modelName = model.baseModel.model; + switch (model.baseModel.format) { +/* + case ModelFormat::kCAFFE: { + using namespace nvcaffeparser1; + parser.caffeParser.reset(createCaffeParser()); + CaffeBufferShutter bufferShutter; + const auto* const blobNameToTensor = parser.caffeParser->parse( + model.prototxt.c_str(), modelName.empty() ? 
nullptr : modelName.c_str(), + network, DataType::kFLOAT); + if (!blobNameToTensor) { + err << "Failed to parse caffe model or prototxt, tensors blob not found" + << std::endl; + parser.caffeParser.reset(); + break; + } + + for (const auto& s : model.outputs) { + if (blobNameToTensor->find(s.c_str()) == nullptr) { + err << "Could not find output blob " << s << std::endl; + parser.caffeParser.reset(); + break; + } + network.markOutput(*blobNameToTensor->find(s.c_str())); + } + break; + } +*/ + case ModelFormat::kONNX: { + using namespace nvonnxparser; + parser.onnxParser.reset( + createParser(network, sample::gLogger.getTRTLogger())); + if (!parser.onnxParser->parseFromFile( + model.baseModel.model.c_str(), + static_cast(sample::gLogger.getReportableSeverity()))) { + err << "Failed to parse onnx file" << std::endl; + parser.onnxParser.reset(); + } + break; + } + case ModelFormat::kANY: + break; + } + + sample::gLogInfo << "Finish parsing network model" << std::endl; + return parser; +} + +namespace { + +class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { + public: + RndInt8Calibrator(int batches, std::vector& elemCount, + const std::string& cacheFile, + const nvinfer1::INetworkDefinition& network, + std::ostream& err); + + ~RndInt8Calibrator() { + for (auto& elem : mInputDeviceBuffers) { + cudaCheck(cudaFree(elem.second), mErr); + } + } + + bool getBatch(void* bindings[], const char* names[], + int nbBindings) noexcept override; + + int getBatchSize() const noexcept override { return 1; } + + const void* readCalibrationCache(size_t& length) noexcept override; + + virtual void writeCalibrationCache(const void*, size_t) noexcept override {} + + private: + int mBatches{}; + int mCurrentBatch{}; + std::string mCacheFile; + std::map mInputDeviceBuffers; + std::vector mCalibrationCache; + std::ostream& mErr; +}; + +RndInt8Calibrator::RndInt8Calibrator(int batches, + std::vector& elemCount, + const std::string& cacheFile, + const INetworkDefinition& network, + std::ostream& err) + : mBatches(batches), mCurrentBatch(0), mCacheFile(cacheFile), mErr(err) { + std::ifstream tryCache(cacheFile, std::ios::binary); + if (tryCache.good()) { + return; + } + + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0F, 1.0F); + auto gen = [&generator, &distribution]() { return distribution(generator); }; + + for (int i = 0; i < network.getNbInputs(); i++) { + auto* input = network.getInput(i); + std::vector rnd_data(elemCount[i]); + std::generate_n(rnd_data.begin(), elemCount[i], gen); + + void* data; + cudaCheck(cudaMalloc(&data, elemCount[i] * sizeof(float)), mErr); + cudaCheck(cudaMemcpy(data, rnd_data.data(), elemCount[i] * sizeof(float), + cudaMemcpyHostToDevice), + mErr); + + mInputDeviceBuffers.insert(std::make_pair(input->getName(), data)); + } +} + +bool RndInt8Calibrator::getBatch(void* bindings[], const char* names[], + int nbBindings) noexcept { + if (mCurrentBatch >= mBatches) { + return false; + } + + for (int i = 0; i < nbBindings; ++i) { + bindings[i] = mInputDeviceBuffers[names[i]]; + } + + ++mCurrentBatch; + + return true; +} + +const void* RndInt8Calibrator::readCalibrationCache(size_t& length) noexcept { + mCalibrationCache.clear(); + std::ifstream input(mCacheFile, std::ios::binary); + input >> std::noskipws; + if (input.good()) { + std::copy(std::istream_iterator(input), std::istream_iterator(), + std::back_inserter(mCalibrationCache)); + } + + length = mCalibrationCache.size(); + return !mCalibrationCache.empty() ? 
mCalibrationCache.data() : nullptr; +} + +bool setTensorDynamicRange(const INetworkDefinition& network, + float inRange = 2.0F, float outRange = 4.0F) { + // Ensure that all layer inputs have a dynamic range. + for (int l = 0; l < network.getNbLayers(); l++) { + auto* layer = network.getLayer(l); + for (int i = 0; i < layer->getNbInputs(); i++) { + ITensor* input{layer->getInput(i)}; + // Optional inputs are nullptr here and are from RNN layers. + if (input && !input->dynamicRangeIsSet()) { + // Concat should propagate dynamic range from outputs to inputs to avoid + // Re-quantization during the concatenation + auto dynRange = (layer->getType() == LayerType::kCONCATENATION) + ? outRange + : inRange; + if (!input->setDynamicRange(-dynRange, dynRange)) { + return false; + } + } + } + for (int o = 0; o < layer->getNbOutputs(); o++) { + ITensor* output{layer->getOutput(o)}; + // Optional outputs are nullptr here and are from RNN layers. + if (output && !output->dynamicRangeIsSet()) { + // Pooling must have the same input and output dynamic range. + if (layer->getType() == LayerType::kPOOLING) { + if (!output->setDynamicRange(-inRange, inRange)) { + return false; + } + } else { + if (!output->setDynamicRange(-outRange, outRange)) { + return false; + } + } + } + } + } + return true; +} + +// Walk the weights elements and overwrite (at most) 2 out of 4 elements to 0. +template +void sparsify(const T* values, int64_t count, int32_t k, int32_t rs, + std::vector& sparseWeights) { + const auto c = count / (k * rs); + sparseWeights.resize(count * sizeof(T)); + auto* sparseValues = reinterpret_cast(sparseWeights.data()); + + constexpr int32_t window = 4; + constexpr int32_t nonzeros = 2; + + const int32_t crs = c * rs; + const auto getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { + return ki * crs + ci * rs + rsi; + }; + + for (int64_t ki = 0; ki < k; ++ki) { + for (int64_t rsi = 0; rsi < rs; ++rsi) { + int32_t w = 0; + int32_t nz = 0; + for (int64_t ci = 0; ci < c; ++ci) { + const auto index = getIndex(ki, ci, rsi); + if (nz < nonzeros) { + sparseValues[index] = values[index]; + ++nz; + } else { + sparseValues[index] = 0; + } + if (++w == window) { + w = 0; + nz = 0; + } + } + } + } +} + +void sparsify(const Weights& weights, int32_t k, int32_t rs, + std::vector& sparseWeights) { + switch (weights.type) { + case DataType::kFLOAT: + sparsify(static_cast(weights.values), weights.count, k, rs, + sparseWeights); + break; + case DataType::kHALF: + sparsify(static_cast(weights.values), + weights.count, k, rs, sparseWeights); + break; + case DataType::kINT8: + case DataType::kINT32: + case DataType::kBOOL: + break; + } +} + +template +void setSparseWeights(L& l, int32_t k, int32_t rs, + std::vector& sparseWeights) { + auto weights = l.getKernelWeights(); + sparsify(weights, k, rs, sparseWeights); + weights.values = sparseWeights.data(); + l.setKernelWeights(weights); +} + +template +void transpose2DWeights(void* dst, void const* src, int32_t const m, + int32_t const n) { + ASSERT(dst != src); + T* tdst = reinterpret_cast(dst); + T const* tsrc = reinterpret_cast(src); + for (int32_t mi = 0; mi < m; ++mi) { + for (int32_t ni = 0; ni < n; ++ni) { + int32_t const isrc = mi * n + ni; + int32_t const idst = ni * m + mi; + tdst[idst] = tsrc[isrc]; + } + } +} + +// Sparsify the weights of Constant layers that are fed to MatMul via Shuffle +// layers. +// Forward analysis on the API graph to determine which weights to sparsify. 
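+//
+// For reference, the 2:4 pattern produced by sparsify() above keeps the first
+// two elements of every window of four along the channel dimension and zeroes
+// the rest; for one (ki, rsi) slice the transformation looks like (an
+// illustrative sketch, not data from the sample):
+//
+//   values:       [ a b c d | e f g h ]
+//   sparseValues: [ a b 0 0 | e f 0 0 ]
+//
+// This is the 2:4 structured-sparsity layout that sparse tensor cores can
+// accelerate.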
+void sparsifyMatMulKernelWeights( + INetworkDefinition& network, + std::vector>& sparseWeights) { + using TensorToLayer = std::unordered_map; + using LayerToTensor = std::unordered_map; + + // 1. Collect layers and tensors information from the network. + TensorToLayer matmulI2L; + TensorToLayer constO2L; + TensorToLayer shuffleI2L; + LayerToTensor shuffleL2O; + auto collectMappingInfo = [&](int32_t const idx) { + ILayer* l = network.getLayer(idx); + switch (l->getType()) { + case LayerType::kMATRIX_MULTIPLY: { + // assume weights on the second input. + matmulI2L.insert({l->getInput(1), l}); + break; + } + case LayerType::kCONSTANT: { + DataType const dtype = static_cast(l)->getWeights().type; + if (dtype == DataType::kFLOAT || dtype == DataType::kHALF) { + // Sparsify float only. + constO2L.insert({l->getOutput(0), l}); + } + break; + } + case LayerType::kSHUFFLE: { + shuffleI2L.insert({l->getInput(0), l}); + shuffleL2O.insert({l, l->getOutput(0)}); + break; + } + default: + break; + } + }; + int32_t const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; ++i) { + collectMappingInfo(i); + } + if (matmulI2L.size() == 0 || constO2L.size() == 0) { + // No MatrixMultiply or Constant layer found, no weights to sparsify. + return; + } + + // Helper for analysis + auto isTranspose = [](Permutation const& perm) -> bool { + return (perm.order[0] == 1 && perm.order[1] == 0); + }; + auto is2D = [](Dims const& dims) -> bool { return dims.nbDims == 2; }; + auto isIdenticalReshape = [](Dims const& dims) -> bool { + for (int32_t i = 0; i < dims.nbDims; ++i) { + if (dims.d[i] != i || dims.d[i] != -1) { + return false; + } + } + return true; + }; + auto tensorReachedViaTranspose = [&](ITensor* t, + bool& needTranspose) -> ITensor* { + while (shuffleI2L.find(t) != shuffleI2L.end()) { + IShuffleLayer* s = static_cast(shuffleI2L.at(t)); + if (!is2D(s->getInput(0)->getDimensions()) || + !is2D(s->getReshapeDimensions()) || + !isIdenticalReshape(s->getReshapeDimensions())) { + break; + } + + if (isTranspose(s->getFirstTranspose())) { + needTranspose = !needTranspose; + } + if (isTranspose(s->getSecondTranspose())) { + needTranspose = !needTranspose; + } + + t = shuffleL2O.at(s); + } + return t; + }; + + // 2. Forward analysis to collect the Constant layers connected to MatMul via + // Transpose + std::unordered_map constantLayerToSparse; + for (auto& o2l : constO2L) { + // If need to transpose the weights of the Constant layer. + // Need to transpose by default due to semantic difference. + bool needTranspose{true}; + ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose); + if (matmulI2L.find(t) == matmulI2L.end()) { + continue; + } + + // check MatMul params... + IMatrixMultiplyLayer* mm = + static_cast(matmulI2L.at(t)); + bool const twoInputs = mm->getNbInputs() == 2; + bool const all2D = is2D(mm->getInput(0)->getDimensions()) && + is2D(mm->getInput(1)->getDimensions()); + bool const isSimple = mm->getOperation(0) == MatrixOperation::kNONE && + mm->getOperation(1) != MatrixOperation::kVECTOR; + if (!(twoInputs && all2D && isSimple)) { + continue; + } + if (mm->getOperation(1) == MatrixOperation::kTRANSPOSE) { + needTranspose = !needTranspose; + } + + constantLayerToSparse.insert( + {static_cast(o2l.second), needTranspose}); + } + + // 3. 
Finally, sparsify the weights + auto sparsifyConstantWeights = [&sparseWeights](IConstantLayer* layer, + bool const needTranspose) { + Dims dims = layer->getOutput(0)->getDimensions(); + ASSERT(dims.nbDims == 2); + int32_t const idxN = needTranspose ? 1 : 0; + int32_t const n = dims.d[idxN]; + int32_t const k = dims.d[1 - idxN]; + sparseWeights.emplace_back(); + std::vector& spw = sparseWeights.back(); + Weights w = layer->getWeights(); + DataType const dtype = w.type; + ASSERT(dtype == DataType::kFLOAT || + dtype == + DataType::kHALF); // non-float weights should have been ignored. + + if (needTranspose) { + if (dtype == DataType::kFLOAT) { + spw.resize(w.count * sizeof(float)); + transpose2DWeights(spw.data(), w.values, k, n); + } else if (dtype == DataType::kHALF) { + spw.resize(w.count * sizeof(half_float::half)); + transpose2DWeights(spw.data(), w.values, k, n); + } + + w.values = spw.data(); + std::vector tmpW; + sparsify(w, n, 1, tmpW); + + if (dtype == DataType::kFLOAT) { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } else if (dtype == DataType::kHALF) { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + } else { + sparsify(w, n, 1, spw); + } + + w.values = spw.data(); + layer->setWeights(w); + }; + for (auto& l : constantLayerToSparse) { + sparsifyConstantWeights(l.first, l.second); + } +} + +void sparsify(INetworkDefinition& network, + std::vector>& sparseWeights) { + for (int32_t l = 0; l < network.getNbLayers(); ++l) { + auto* layer = network.getLayer(l); + const auto t = layer->getType(); + if (t == LayerType::kCONVOLUTION) { + auto& conv = *static_cast(layer); + const auto& dims = conv.getKernelSizeNd(); + if (dims.nbDims > 2) { + continue; + } + const auto k = conv.getNbOutputMaps(); + const auto rs = dims.d[0] * dims.d[1]; + sparseWeights.emplace_back(); + setSparseWeights(conv, k, rs, sparseWeights.back()); + } else if (t == LayerType::kFULLY_CONNECTED) { + auto& fc = *static_cast(layer); + const auto k = fc.getNbOutputChannels(); + sparseWeights.emplace_back(); + setSparseWeights(fc, k, 1, sparseWeights.back()); + } + } + + sparsifyMatMulKernelWeights(network, sparseWeights); +} + +void setLayerPrecisions(INetworkDefinition& network, + LayerPrecisions const& layerPrecisions) { + bool const hasGlobalPrecision{layerPrecisions.find("*") != + layerPrecisions.end()}; + auto const globalPrecision = + hasGlobalPrecision ? layerPrecisions.at("*") : nvinfer1::DataType::kFLOAT; + bool hasLayerPrecisionSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + if (layerPrecisions.find(layer->getName()) != layerPrecisions.end()) { + layer->setPrecision(layerPrecisions.at(layer->getName())); + } else if (hasGlobalPrecision) { + // We should not set the layer precision if its default precision is INT32 + // or Bool. + if (layer->getPrecision() == nvinfer1::DataType::kINT32 || + layer->getPrecision() == nvinfer1::DataType::kBOOL) { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " + << layerName << " because the " + << " default layer precision is INT32 or Bool." + << std::endl; + continue; + } + // We should not set the constant layer precision if its weights are in + // INT32. 
+ if (layer->getType() == nvinfer1::LayerType::kCONSTANT && + static_cast(layer)->getWeights().type == + nvinfer1::DataType::kINT32) { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " + << layerName << " because this " + << "constant layer has INT32 weights." << std::endl; + continue; + } + // We should not set the layer precision if the layer operates on a shape + // tensor. + if (layer->getNbInputs() >= 1 && layer->getInput(0)->isShapeTensor()) { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " + << layerName << " because this layer " + << "operates on a shape tensor." << std::endl; + continue; + } + if ((layer->getType() == nvinfer1::LayerType::kIDENTITY || + layer->getType() == nvinfer1::LayerType::kSHUFFLE) && + layer->getNbInputs() >= 1 && + layer->getInput(0)->getType() == nvinfer1::DataType::kINT32 && + layer->getNbOutputs() >= 1 && + layer->getOutput(0)->getType() == nvinfer1::DataType::kINT32) { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " + << layerName << " because this " + << "layer has INT32 input and output." << std::endl; + continue; + } + // All heuristics passed. Set the layer precision. + layer->setPrecision(globalPrecision); + } + } + + if (hasLayerPrecisionSkipped) { + sample::gLogInfo << "Skipped setting precisions for some layers. Check " + "verbose logs for more details." + << std::endl; + } +} + +void setLayerOutputTypes(INetworkDefinition& network, + LayerOutputTypes const& layerOutputTypes) { + bool const hasGlobalOutputType{layerOutputTypes.find("*") != + layerOutputTypes.end()}; + auto const globalOutputType = hasGlobalOutputType + ? layerOutputTypes.at("*").at(0) + : nvinfer1::DataType::kFLOAT; + bool hasLayerOutputTypeSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto const nbOutputs = layer->getNbOutputs(); + if (layerOutputTypes.find(layer->getName()) != layerOutputTypes.end()) { + auto const& outputTypes = layerOutputTypes.at(layer->getName()); + bool const isBroadcast = (outputTypes.size() == 1); + if (!isBroadcast && + static_cast(outputTypes.size()) != nbOutputs) { + sample::gLogError + << "Layer " << layerName << " has " << nbOutputs << " outputs but " + << outputTypes.size() + << " output types are given in --layerOutputTypes flag." + << std::endl; + throw std::invalid_argument("Invalid --layerOutputTypes flag."); + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) { + layer->setOutputType(outputIdx, + outputTypes.at(isBroadcast ? 0 : outputIdx)); + } + } else if (hasGlobalOutputType) { + // We should not set the layer output types if its default precision is + // INT32 or Bool. + if (layer->getPrecision() == nvinfer1::DataType::kINT32 || + layer->getPrecision() == nvinfer1::DataType::kBOOL) { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " + << layerName << " because the " + << " default layer precision is INT32 or Bool." + << std::endl; + continue; + } + // We should not set the constant layer output types if its weights are in + // INT32. 
+ if (layer->getType() == nvinfer1::LayerType::kCONSTANT && + static_cast(layer)->getWeights().type == + nvinfer1::DataType::kINT32) { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " + << layerName << " because this " + << "constant layer has INT32 weights." << std::endl; + continue; + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) { + // We should not set the output type if the output is a shape tensor. + if (layer->getOutput(0)->isShapeTensor()) { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output type for output " + << outputIdx << " of layer " << layerName + << " because it is a shape tensor." << std::endl; + continue; + } + layer->setOutputType(outputIdx, globalOutputType); + } + } + } + + if (hasLayerOutputTypeSkipped) { + sample::gLogInfo << "Skipped setting output types for some layers. Check " + "verbose logs for more details." + << std::endl; + } +} + +void setMemoryPoolLimits(IBuilderConfig& config, BuildOptions const& build) { + auto const roundToBytes = [](double const sizeInMB) { + return static_cast(sizeInMB * (1 << 20)); + }; + if (build.workspace >= 0) { + config.setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, + roundToBytes(build.workspace)); + } + if (build.dlaSRAM >= 0) { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_MANAGED_SRAM, + roundToBytes(build.dlaSRAM)); + } + if (build.dlaLocalDRAM >= 0) { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_LOCAL_DRAM, + roundToBytes(build.dlaLocalDRAM)); + } + if (build.dlaGlobalDRAM >= 0) { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_GLOBAL_DRAM, + roundToBytes(build.dlaGlobalDRAM)); + } +} + +} // namespace + +bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, + IBuilder& builder, INetworkDefinition& network, + IBuilderConfig& config, std::ostream& err, + std::vector>& sparseWeights) { + IOptimizationProfile* profile{nullptr}; + if (build.maxBatch) { + builder.setMaxBatchSize(build.maxBatch); + } else { + profile = builder.createOptimizationProfile(); + } + + bool hasDynamicShapes{false}; + + bool broadcastInputFormats = + broadcastIOFormats(build.inputFormats, network.getNbInputs()); + + if (profile) { + // Check if the provided input tensor names match the input tensors of the + // engine. + // Throw an error if the provided input tensor names cannot be found because + // it implies a potential typo. + for (const auto& shape : build.shapes) { + bool tensorNameFound{false}; + for (int32_t i = 0; i < network.getNbInputs(); ++i) { + if (network.getInput(i)->getName() == shape.first) { + tensorNameFound = true; + break; + } + } + if (!tensorNameFound) { + sample::gLogError + << "Cannot find input tensor with name \"" << shape.first + << "\" in the network " + << "inputs! Please make sure the input tensor names are correct." + << std::endl; + return false; + } + } + } + + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) { + // Set formats and data types of inputs + auto* input = network.getInput(i); + if (!build.inputFormats.empty()) { + int inputFormatIndex = broadcastInputFormats ? 0 : i; + input->setType(build.inputFormats[inputFormatIndex].first); + input->setAllowedFormats(build.inputFormats[inputFormatIndex].second); + } else { + switch (input->getType()) { + case DataType::kINT32: + case DataType::kBOOL: + case DataType::kHALF: + // Leave these as is. + break; + case DataType::kFLOAT: + case DataType::kINT8: + // User did not specify a floating-point format. 
Default to kFLOAT. + input->setType(DataType::kFLOAT); + break; + } + input->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + + if (profile) { + auto const dims = input->getDimensions(); + auto const isScalar = dims.nbDims == 0; + auto const isDynamicInput = + std::any_of(dims.d, dims.d + dims.nbDims, + [](int32_t dim) { return dim == -1; }) || + input->isShapeTensor(); + if (isDynamicInput) { + hasDynamicShapes = true; + auto shape = build.shapes.find(input->getName()); + ShapeRange shapes{}; + + // If no shape is provided, set dynamic dimensions to 1. + if (shape == build.shapes.end()) { + constexpr int DEFAULT_DIMENSION = 1; + std::vector staticDims; + if (input->isShapeTensor()) { + if (isScalar) { + staticDims.push_back(1); + } else { + staticDims.resize(dims.d[0]); + std::fill(staticDims.begin(), staticDims.end(), + DEFAULT_DIMENSION); + } + } else { + staticDims.resize(dims.nbDims); + std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(), + [&](int dimension) { + return dimension > 0 ? dimension + : DEFAULT_DIMENSION; + }); + } + sample::gLogWarning + << "Dynamic dimensions required for input: " << input->getName() + << ", but no shapes were provided. Automatically overriding " + "shape to: " + << staticDims << std::endl; + std::fill(shapes.begin(), shapes.end(), staticDims); + } else { + shapes = shape->second; + } + + std::vector profileDims{}; + if (input->isShapeTensor()) { + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues( + input->getName(), OptProfileSelector::kMIN, + profileDims.data(), + static_cast(profileDims.size())), + "Error in set shape values MIN", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues( + input->getName(), OptProfileSelector::kOPT, + profileDims.data(), + static_cast(profileDims.size())), + "Error in set shape values OPT", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues( + input->getName(), OptProfileSelector::kMAX, + profileDims.data(), + static_cast(profileDims.size())), + "Error in set shape values MAX", false, err); + } else { + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; + SMP_RETVAL_IF_FALSE( + profile->setDimensions(input->getName(), OptProfileSelector::kMIN, + toDims(profileDims)), + "Error in set dimensions to profile MIN", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; + SMP_RETVAL_IF_FALSE( + profile->setDimensions(input->getName(), OptProfileSelector::kOPT, + toDims(profileDims)), + "Error in set dimensions to profile OPT", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; + SMP_RETVAL_IF_FALSE( + profile->setDimensions(input->getName(), OptProfileSelector::kMAX, + toDims(profileDims)), + "Error in set dimensions to profile MAX", false, err); + } + } + } + } + + if (!hasDynamicShapes && !build.shapes.empty()) { + sample::gLogError << "Static model does not take explicit shapes since the " + "shape of inference tensors will be " + "determined by the model itself" + << std::endl; + return false; + } + + if (profile && hasDynamicShapes) { + SMP_RETVAL_IF_FALSE(profile->isValid(), + "Required optimization profile is invalid", false, err); + SMP_RETVAL_IF_FALSE(config.addOptimizationProfile(profile) != -1, + "Error in add optimization profile", false, err); + } + + bool broadcastOutputFormats = + broadcastIOFormats(build.outputFormats, 
network.getNbOutputs(), false); + + for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) { + // Set formats and data types of outputs + auto* output = network.getOutput(i); + if (!build.outputFormats.empty()) { + int outputFormatIndex = broadcastOutputFormats ? 0 : i; + output->setType(build.outputFormats[outputFormatIndex].first); + output->setAllowedFormats(build.outputFormats[outputFormatIndex].second); + } else { + output->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + } + + setMemoryPoolLimits(config, build); + + if (build.timingCacheMode == TimingCacheMode::kDISABLE) { + config.setFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + } + + if (!build.tf32) { + config.clearFlag(BuilderFlag::kTF32); + } + + if (build.refittable) { + config.setFlag(BuilderFlag::kREFIT); + } + + if (build.sparsity != SparsityFlag::kDISABLE) { + config.setFlag(BuilderFlag::kSPARSE_WEIGHTS); + if (build.sparsity == SparsityFlag::kFORCE) { + sparsify(network, sparseWeights); + } + } + + config.setProfilingVerbosity(build.profilingVerbosity); + config.setMinTimingIterations(build.minTiming); + config.setAvgTimingIterations(build.avgTiming); + + if (build.fp16) { + config.setFlag(BuilderFlag::kFP16); + } + + if (build.int8) { + config.setFlag(BuilderFlag::kINT8); + } + + if (build.int8 && !build.fp16) { + sample::gLogInfo << "FP32 and INT8 precisions have been specified - more " + "performance might be enabled by additionally " + "specifying --fp16 or --best" + << std::endl; + } + + auto isInt8 = [](const IOFormat& format) { + return format.first == DataType::kINT8; + }; + auto int8IO = std::count_if(build.inputFormats.begin(), + build.inputFormats.end(), isInt8) + + std::count_if(build.outputFormats.begin(), + build.outputFormats.end(), isInt8); + + auto hasQDQLayers = [](INetworkDefinition& network) { + // Determine if our network has QDQ layers. + const auto nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; i++) { + const auto& layer = network.getLayer(i); + if (layer->getType() == LayerType::kQUANTIZE || + layer->getType() == LayerType::kDEQUANTIZE) { + return true; + } + } + return false; + }; + + if (!hasQDQLayers(network) && (build.int8 || int8IO) && + build.calibration.empty()) { + // Explicitly set int8 scales if no calibrator is provided and if I/O + // tensors use int8, + // because auto calibration does not support this case. 
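+    // setTensorDynamicRange() (defined earlier in this file) assigns a
+    // symmetric range of [-2, 2] to layer inputs and [-4, 4] to layer
+    // outputs; TensorRT derives the int8 scale from a dynamic range as
+    // scale = max(|min|, |max|) / 127. Engines built with such fabricated
+    // ranges are useful for performance measurement only, not for accuracy.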
+ SMP_RETVAL_IF_FALSE(setTensorDynamicRange(network), + "Error in set tensor dynamic range.", false, err); + } else if (build.int8) { + if (!hasQDQLayers(network) && int8IO) { + try { + // Set dynamic ranges of int8 inputs / outputs to match scales loaded + // from calibration cache + // TODO http://nvbugs/3262234 Change the network validation so that this + // workaround can be removed + setTensorScalesFromCalibration(network, build.inputFormats, + build.outputFormats, build.calibration); + } catch (std::exception&) { + sample::gLogError << "Int8IO was specified but impossible to read " + "tensor scales from provided calibration cache " + "file" + << std::endl; + return false; + } + } + IOptimizationProfile* profileCalib{nullptr}; + if (!build.shapesCalib.empty()) { + profileCalib = builder.createOptimizationProfile(); + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) { + auto* input = network.getInput(i); + Dims profileDims{}; + auto shape = build.shapesCalib.find(input->getName()); + ShapeRange shapesCalib{}; + shapesCalib = shape->second; + + profileDims = + toDims(shapesCalib[static_cast(OptProfileSelector::kOPT)]); + // Here we check only kMIN as all profileDims are the same. + SMP_RETVAL_IF_FALSE( + profileCalib->setDimensions(input->getName(), + OptProfileSelector::kMIN, profileDims), + "Error in set dimensions to calibration profile OPT", false, err); + profileCalib->setDimensions(input->getName(), OptProfileSelector::kOPT, + profileDims); + profileCalib->setDimensions(input->getName(), OptProfileSelector::kMAX, + profileDims); + } + SMP_RETVAL_IF_FALSE(profileCalib->isValid(), + "Calibration profile is invalid", false, err); + SMP_RETVAL_IF_FALSE(config.setCalibrationProfile(profileCalib), + "Error in set calibration profile", false, err); + } + + std::vector elemCount{}; + for (int i = 0; i < network.getNbInputs(); i++) { + auto* input = network.getInput(i); + auto const dims = input->getDimensions(); + auto const isDynamicInput = std::any_of( + dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + + if (profileCalib) { + elemCount.push_back(volume(profileCalib->getDimensions( + input->getName(), OptProfileSelector::kOPT))); + } else if (profile && isDynamicInput) { + elemCount.push_back(volume(profile->getDimensions( + input->getName(), OptProfileSelector::kOPT))); + } else { + elemCount.push_back(volume(input->getDimensions())); + } + } + + config.setInt8Calibrator( + new RndInt8Calibrator(1, elemCount, build.calibration, network, err)); + } + + if (build.directIO) { + config.setFlag(BuilderFlag::kDIRECT_IO); + } + + switch (build.precisionConstraints) { + case PrecisionConstraints::kNONE: + // It's the default for TensorRT. + break; + case PrecisionConstraints::kOBEY: + config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); + break; + case PrecisionConstraints::kPREFER: + config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + break; + } + + if (!build.layerPrecisions.empty() && + build.precisionConstraints != PrecisionConstraints::kNONE) { + setLayerPrecisions(network, build.layerPrecisions); + } + + if (!build.layerOutputTypes.empty() && + build.precisionConstraints != PrecisionConstraints::kNONE) { + setLayerOutputTypes(network, build.layerOutputTypes); + } + + if (build.safe) { + config.setEngineCapability(sys.DLACore != -1 + ? 
EngineCapability::kDLA_STANDALONE + : EngineCapability::kSAFETY); + } + + if (build.restricted) { + config.setFlag(BuilderFlag::kSAFETY_SCOPE); + } + + if (sys.DLACore != -1) { + if (sys.DLACore < builder.getNbDLACores()) { + config.setDefaultDeviceType(DeviceType::kDLA); + config.setDLACore(sys.DLACore); + config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + + if (sys.fallback) { + config.setFlag(BuilderFlag::kGPU_FALLBACK); + } else { + // Reformatting runs on GPU, so avoid I/O reformatting. + config.setFlag(BuilderFlag::kDIRECT_IO); + } + if (!build.int8) { + config.setFlag(BuilderFlag::kFP16); + } + } else { + err << "Cannot create DLA engine, " << sys.DLACore << " not available" + << std::endl; + return false; + } + } + + if (build.enabledTactics || build.disabledTactics) { + TacticSources tacticSources = config.getTacticSources(); + tacticSources |= build.enabledTactics; + tacticSources &= ~build.disabledTactics; + config.setTacticSources(tacticSources); + } + + return true; +} + +//! +//! \brief Create an engine for a network defintion +//! +//! \return Pointer to the engine created or nullptr if the creation failed +//! +bool networkToEngine(const BuildOptions& build, const SystemOptions& sys, + IBuilder& builder, BuildEnvironment& env, + std::ostream& err) { + TrtUniquePtr config{builder.createBuilderConfig()}; + std::vector> sparseWeights; + SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", false, err); + SMP_RETVAL_IF_FALSE(setupNetworkAndConfig(build, sys, builder, *env.network, + *config, err, sparseWeights), + "Network And Config setup failed", false, err); + + std::unique_ptr timingCache{nullptr}; + // Try to load cache from file. Create a fresh cache if the file doesn't exist + if (build.timingCacheMode == TimingCacheMode::kGLOBAL) { + std::vector loadedCache = loadTimingCacheFile(build.timingCacheFile); + timingCache.reset(config->createTimingCache( + static_cast(loadedCache.data()), loadedCache.size())); + SMP_RETVAL_IF_FALSE(timingCache != nullptr, "TimingCache creation failed", + false, err); + config->setTimingCache(*timingCache, false); + } + + // CUDA stream used for profiling by the builder. 
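+  // A dedicated stream keeps the builder's tactic-timing kernels separate
+  // from any work the application may already have queued on the default
+  // stream, which could otherwise skew the measurements.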
+ auto profileStream = samplesCommon::makeCudaStream(); + SMP_RETVAL_IF_FALSE(profileStream != nullptr, "Cuda stream creation failed", + false, err); + config->setProfileStream(*profileStream); + + TrtUniquePtr serializedEngine{ + builder.buildSerializedNetwork(*env.network, *config)}; + SMP_RETVAL_IF_FALSE(serializedEngine != nullptr, + "Engine could not be created from network", false, err); + + env.engineBlob.resize(serializedEngine->size()); + std::memcpy(env.engineBlob.data(), serializedEngine->data(), + serializedEngine->size()); + + if (build.safe) { + ASSERT(sample::hasSafeRuntime()); + std::unique_ptr safeRuntime{ + sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; + SMP_RETVAL_IF_FALSE(safeRuntime != nullptr, "SafeRuntime creation failed", + false, err); + safeRuntime->setErrorRecorder(&gRecorder); + env.safeEngine.reset(safeRuntime->deserializeCudaEngine( + serializedEngine->data(), serializedEngine->size())); + if (build.consistency) { + checkSafeEngine(serializedEngine->data(), serializedEngine->size()); + } + SMP_RETVAL_IF_FALSE(env.safeEngine != nullptr, + "SafeEngine deserialization failed", false, err); + } else { + TrtUniquePtr runtime{ + createInferRuntime(sample::gLogger.getTRTLogger())}; + SMP_RETVAL_IF_FALSE(runtime != nullptr, "Runtime creation failed", false, + err); + runtime->setErrorRecorder(&gRecorder); + env.engine.reset(runtime->deserializeCudaEngine(serializedEngine->data(), + serializedEngine->size())); + SMP_RETVAL_IF_FALSE(env.engine != nullptr, "Engine deserialization failed", + false, err); + if (build.timingCacheMode == TimingCacheMode::kGLOBAL) { + auto const& timingCache = config->getTimingCache(); + std::unique_ptr timingCacheHostData{ + timingCache->serialize()}; + SMP_RETVAL_IF_FALSE(timingCacheHostData != nullptr, + "Timing Cache serialization failed", false, err); + saveTimingCacheFile(build.timingCacheFile, timingCacheHostData.get()); + } + if (config->getInt8Calibrator()) { + delete config->getInt8Calibrator(); + } + } + return true; +} + +//! +//! \brief Parse a given model, create a network and an engine. +//! +bool modelToBuildEnv(const ModelOptions& model, const BuildOptions& build, + const SystemOptions& sys, BuildEnvironment& env, + std::ostream& err) { + TrtUniquePtr builder{ + createInferBuilder(sample::gLogger.getTRTLogger())}; + SMP_RETVAL_IF_FALSE(builder != nullptr, "Builder creation failed", false, + err); + builder->setErrorRecorder(&gRecorder); + auto networkFlags = + (build.maxBatch) + ? 0U + : 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + + env.network.reset(builder->createNetworkV2(networkFlags)); + SMP_RETVAL_IF_FALSE(env.network != nullptr, "Network creation failed", false, + err); + env.parser = modelToNetwork(model, *env.network, err); + SMP_RETVAL_IF_FALSE(env.parser.operator bool(), "Parsing model failed", false, + err); + SMP_RETVAL_IF_FALSE(networkToEngine(build, sys, *builder, env, err), + "Building engine failed", false, err); + return true; +} + +namespace { +std::pair, std::vector> +getLayerWeightsRolePair(IRefitter& refitter) { + // Get number of refittable items. + auto const nbAll = refitter.getAll(0, nullptr, nullptr); + std::vector layerNames(nbAll); + // Allocate buffers for the items and get them. 
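+  // getAll() follows TensorRT's two-pass query idiom: called with
+  // (0, nullptr, nullptr) it only returns the number of refittable weights;
+  // called again with buffers of that size it fills in the layer names and
+  // their WeightsRole values.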
+ std::vector weightsRoles(nbAll); + refitter.getAll(nbAll, layerNames.data(), weightsRoles.data()); + std::vector layerNameStrs(nbAll); + std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), + [](char const* name) { + if (name == nullptr) { + return std::string{}; + } + return std::string{name}; + }); + return {layerNameStrs, weightsRoles}; +} + +std::pair, std::vector> +getMissingLayerWeightsRolePair(IRefitter& refitter) { + // Get number of refittable items. + auto const nbMissing = refitter.getMissing(0, nullptr, nullptr); + std::vector layerNames(nbMissing); + // Allocate buffers for the items and get them. + std::vector weightsRoles(nbMissing); + refitter.getMissing(nbMissing, layerNames.data(), weightsRoles.data()); + std::vector layerNameStrs(nbMissing); + std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), + [](char const* name) { + if (name == nullptr) { + return std::string{}; + } + return std::string{name}; + }); + return {layerNameStrs, weightsRoles}; +} + +bool loadEngineToEnv(const std::string& engine, int DLACore, bool safe, + bool enableConsistency, BuildEnvironment& env, + std::ostream& err) { + std::ifstream engineFile(engine, std::ios::binary); + SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, + err << "Error opening engine file: " << engine); + engineFile.seekg(0, std::ifstream::end); + int64_t fsize = engineFile.tellg(); + engineFile.seekg(0, std::ifstream::beg); + + env.engineBlob.resize(fsize); + engineFile.read(reinterpret_cast(env.engineBlob.data()), fsize); + SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, + err << "Error loading engine file: " << engine); + + if (safe) { + ASSERT(sample::hasSafeRuntime()); + std::unique_ptr safeRuntime{ + sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; + safeRuntime->setErrorRecorder(&gRecorder); + env.safeEngine.reset( + safeRuntime->deserializeCudaEngine(env.engineBlob.data(), fsize)); + bool result = env.safeEngine != nullptr; + if (result && enableConsistency) { + checkSafeEngine(env.engineBlob.data(), fsize); + } + return result; + } + + TrtUniquePtr runtime{ + createInferRuntime(sample::gLogger.getTRTLogger())}; + if (DLACore != -1) { + runtime->setDLACore(DLACore); + } + runtime->setErrorRecorder(&gRecorder); + env.engine.reset( + runtime->deserializeCudaEngine(env.engineBlob.data(), fsize, nullptr)); + return env.engine != nullptr; +} +} // namespace + +void dumpRefittable(nvinfer1::ICudaEngine& engine) { + TrtUniquePtr refitter{ + createInferRefitter(engine, sample::gLogger.getTRTLogger())}; + if (refitter == nullptr) { + sample::gLogError << "Failed to create a refitter." << std::endl; + return; + } + + auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); + auto const& layerNames = layerWeightsRolePair.first; + auto const& weightsRoles = layerWeightsRolePair.second; + auto const nbAll = layerWeightsRolePair.first.size(); + for (size_t i = 0; i < nbAll; ++i) { + sample::gLogInfo << layerNames[i] << " " << weightsRoles[i] << std::endl; + } +} + +ICudaEngine* loadEngine(const std::string& engine, int DLACore, + std::ostream& err) { + BuildEnvironment env; + return loadEngineToEnv(engine, DLACore, false, false, env, err) + ? 
env.engine.release()
+             : nullptr;
+}
+
+bool saveEngine(const ICudaEngine& engine, const std::string& fileName,
+                std::ostream& err) {
+  std::ofstream engineFile(fileName, std::ios::binary);
+  if (!engineFile) {
+    err << "Cannot open engine file: " << fileName << std::endl;
+    return false;
+  }
+
+  TrtUniquePtr<IHostMemory> serializedEngine{engine.serialize()};
+  if (serializedEngine == nullptr) {
+    err << "Engine serialization failed" << std::endl;
+    return false;
+  }
+
+  engineFile.write(static_cast<char const*>(serializedEngine->data()),
+                   serializedEngine->size());
+  return !engineFile.fail();
+}
+
+bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build,
+                       const SystemOptions& sys, BuildEnvironment& env,
+                       std::ostream& err) {
+  TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+  TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+  Parser parser;
+
+  bool createEngineSuccess{false};
+
+  if (build.load) {
+    createEngineSuccess = loadEngineToEnv(build.engine, sys.DLACore, build.safe,
+                                          build.consistency, env, err);
+  } else {
+    createEngineSuccess = modelToBuildEnv(model, build, sys, env, err);
+  }
+
+  SMP_RETVAL_IF_FALSE(createEngineSuccess,
+                      "Failed to create engine from model.", false, err);
+
+  if (build.save) {
+    std::ofstream engineFile(build.engine, std::ios::binary);
+    engineFile.write(reinterpret_cast<char const*>(env.engineBlob.data()),
+                     env.engineBlob.size());
+    SMP_RETVAL_IF_FALSE(!engineFile.fail(), "Saving engine to file failed.",
+                        false, err);
+  }
+  return true;
+}
+
+IHostMemory* networkToSerialized(const BuildOptions& build,
+                                 const SystemOptions& sys, IBuilder& builder,
+                                 INetworkDefinition& network,
+                                 std::ostream& err) {
+  TrtUniquePtr<IBuilderConfig> config{builder.createBuilderConfig()};
+  std::vector<std::vector<char>> sparseWeights;
+  SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", nullptr,
+                      err);
+  SMP_RETVAL_IF_FALSE(setupNetworkAndConfig(build, sys, builder, network,
+                                            *config, err, sparseWeights),
+                      "Network And Config setup failed", nullptr, err);
+  return builder.buildSerializedNetwork(network, *config);
+}
+
+IHostMemory* modelToSerialized(const ModelOptions& model,
+                               const BuildOptions& build,
+                               const SystemOptions& sys, std::ostream& err) {
+  TrtUniquePtr<IBuilder> builder{
+      createInferBuilder(sample::gLogger.getTRTLogger())};
+  SMP_RETVAL_IF_FALSE(builder != nullptr, "Builder creation failed", nullptr,
+                      err);
+  builder->setErrorRecorder(&gRecorder);
+
+  auto networkFlags =
+      (build.maxBatch)
+          ? 0U
+          : 1U << static_cast<uint32_t>(
+                nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+
+  TrtUniquePtr<INetworkDefinition> network{
+      builder->createNetworkV2(networkFlags)};
+  SMP_RETVAL_IF_FALSE(network != nullptr, "Network creation failed", nullptr,
+                      err);
+
+  Parser parser = modelToNetwork(model, *network, err);
+  SMP_RETVAL_IF_FALSE(parser.operator bool(), "Parsing model failed", nullptr,
+                      err);
+
+  return networkToSerialized(build, sys, *builder, *network, err);
+}
+
+bool serializeAndSave(const ModelOptions& model, const BuildOptions& build,
+                      const SystemOptions& sys, std::ostream& err) {
+  TrtUniquePtr<IHostMemory> serialized{
+      modelToSerialized(model, build, sys, err)};
+  SMP_RETVAL_IF_FALSE(serialized != nullptr, "Network serialization failed",
+                      false, err);
+
+  std::ofstream engineFile(build.engine, std::ios::binary);
+  SMP_RETVAL_IF_FALSE(!!engineFile,
+                      "Cannot open a file to save the serialized network",
+                      false, err);
+  engineFile.write(static_cast<char const*>(serialized->data()),
+                   serialized->size());
+  return !engineFile.fail();
+}
+
+// There is no getWeightsName API, so we need to use WeightsRole instead.
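+// Each refittable layer type exposes its weights through layer-specific
+// getters, so the function below maps a layer to the (WeightsRole, Weights)
+// pairs that IRefitter::setWeights() expects. A minimal use looks like the
+// loop in timeRefit() further down (sketch):
+//
+//   auto const* layer = network.getLayer(i);
+//   for (auto const& roleWeights : getAllRefitWeightsForLayer(*layer)) {
+//     refitter->setWeights(layer->getName(), roleWeights.first,
+//                          roleWeights.second);
+//   }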
+std::vector> +getAllRefitWeightsForLayer(const ILayer& l) { + switch (l.getType()) { + case LayerType::kCONSTANT: { + const auto& layer = static_cast(l); + return {std::make_pair(WeightsRole::kCONSTANT, layer.getWeights())}; + } + case LayerType::kCONVOLUTION: { + const auto& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kDECONVOLUTION: { + const auto& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kFULLY_CONNECTED: { + const auto& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kSCALE: { + const auto& layer = static_cast(l); + return {std::make_pair(WeightsRole::kSCALE, layer.getScale()), + std::make_pair(WeightsRole::kSHIFT, layer.getShift())}; + } + case LayerType::kRNN_V2: + case LayerType::kACTIVATION: + case LayerType::kPOOLING: + case LayerType::kLRN: + case LayerType::kSOFTMAX: + case LayerType::kSHUFFLE: + case LayerType::kCONCATENATION: + case LayerType::kELEMENTWISE: + case LayerType::kPLUGIN: + case LayerType::kUNARY: + case LayerType::kPADDING: + case LayerType::kREDUCE: + case LayerType::kTOPK: + case LayerType::kGATHER: + case LayerType::kMATRIX_MULTIPLY: + case LayerType::kRAGGED_SOFTMAX: + case LayerType::kIDENTITY: + case LayerType::kPLUGIN_V2: + case LayerType::kSLICE: + case LayerType::kFILL: + case LayerType::kSHAPE: + case LayerType::kPARAMETRIC_RELU: + case LayerType::kRESIZE: + case LayerType::kTRIP_LIMIT: + case LayerType::kRECURRENCE: + case LayerType::kITERATOR: + case LayerType::kLOOP_OUTPUT: + case LayerType::kSELECT: + case LayerType::kQUANTIZE: + case LayerType::kDEQUANTIZE: + case LayerType::kCONDITION: + case LayerType::kCONDITIONAL_INPUT: + case LayerType::kCONDITIONAL_OUTPUT: + case LayerType::kSCATTER: + case LayerType::kEINSUM: + case LayerType::kASSERTION: + return {}; + } + return {}; +} + +bool timeRefit(INetworkDefinition const& network, nvinfer1::ICudaEngine& engine, + bool multiThreading) { + using time_point = std::chrono::time_point; + using durationMs = std::chrono::duration; + + auto const nbLayers = network.getNbLayers(); + TrtUniquePtr refitter{ + createInferRefitter(engine, sample::gLogger.getTRTLogger())}; + // Set max threads that can be used by refitter. + if (multiThreading && !refitter->setMaxThreads(10)) { + sample::gLogError << "Failed to set max threads to refitter." << std::endl; + return false; + } + auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); + // We use std::string instead of const char* since we can have copies of layer + // names. 
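+  // Collect the (layer name, role) pairs into a sorted set once, so the
+  // per-weight refittability checks inside the timing loop below are
+  // logarithmic lookups rather than repeated linear scans over the
+  // refitter's full list.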
+ std::set> layerRoleSet; + + auto const& layerNames = layerWeightsRolePair.first; + auto const& weightsRoles = layerWeightsRolePair.second; + + std::transform(layerNames.begin(), layerNames.end(), weightsRoles.begin(), + std::inserter(layerRoleSet, layerRoleSet.begin()), + [](std::string const& layerName, WeightsRole const role) { + return std::make_pair(layerName, role); + }); + + auto const isRefittable = [&layerRoleSet](char const* layerName, + WeightsRole const role) { + return layerRoleSet.find(std::make_pair(layerName, role)) != + layerRoleSet.end(); + }; + + auto const setWeights = [&] { + for (int32_t i = 0; i < nbLayers; i++) { + auto const layer = network.getLayer(i); + auto const roleWeightsVec = getAllRefitWeightsForLayer(*layer); + for (auto const& roleWeights : roleWeightsVec) { + if (isRefittable(layer->getName(), roleWeights.first)) { + bool const success = refitter->setWeights( + layer->getName(), roleWeights.first, roleWeights.second); + if (!success) { + return false; + } + } + } + } + return true; + }; + + auto const reportMissingWeights = [&] { + auto const& missingPair = getMissingLayerWeightsRolePair(*refitter); + auto const& layerNames = missingPair.first; + auto const& weightsRoles = missingPair.second; + for (size_t i = 0; i < layerNames.size(); ++i) { + sample::gLogError << "Missing (" << layerNames[i] << ", " + << weightsRoles[i] << ") for refitting." << std::endl; + } + return layerNames.empty(); + }; + + // Warm up and report missing weights + bool const success = + setWeights() && reportMissingWeights() && refitter->refitCudaEngine(); + if (!success) { + return false; + } + + constexpr int32_t loop = 10; + time_point const refitStartTime{std::chrono::steady_clock::now()}; + { + for (int32_t l = 0; l < loop; l++) { + bool const success = setWeights() && refitter->refitCudaEngine(); + if (!success) { + return false; + } + } + } + time_point const refitEndTime{std::chrono::steady_clock::now()}; + + sample::gLogInfo << "Engine refitted" + << " in " + << durationMs(refitEndTime - refitStartTime).count() / loop + << " ms." << std::endl; + return true; +} + +namespace { +void* initSafeRuntime() { + void* handle{nullptr}; +#if !defined(_WIN32) + std::string const dllName{samplesCommon::isDebug() + ? "libnvinfer_safe_debug.so.8" + : "libnvinfer_safe.so.8"}; +#if SANITIZER_BUILD + handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); +#else + handle = dlopen(dllName.c_str(), RTLD_LAZY); +#endif +#endif + return handle; +} + +void* initConsistencyCheckerLibrary() { + void* handle{nullptr}; +#if !defined(_WIN32) + std::string const dllName{samplesCommon::isDebug() + ? 
"libnvinfer_checker_debug.so.8" + : "libnvinfer_checker.so.8"}; +#if SANITIZER_BUILD + handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); +#else + handle = dlopen(dllName.c_str(), RTLD_LAZY); +#endif +#endif + return handle; +} + +#if !defined(_WIN32) +struct DllDeleter { + void operator()(void* handle) { + if (handle != nullptr) { + dlclose(handle); + } + } +}; +const std::unique_ptr safeRuntimeLibrary{initSafeRuntime()}; +const std::unique_ptr consistencyCheckerLibrary{ + initConsistencyCheckerLibrary()}; +#endif +} // namespace + +bool hasSafeRuntime() { + bool ret{false}; +#if !defined(_WIN32) + ret = (safeRuntimeLibrary != nullptr); +#endif + return ret; +} + +nvinfer1::safe::IRuntime* +createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept { + nvinfer1::safe::IRuntime* runtime{nullptr}; +#if !defined(_WIN32) + constexpr char symbolName[] = + "_ZN8nvinfer14safe18createInferRuntimeERNS_7ILoggerE"; + typedef nvinfer1::safe::IRuntime* (*CreateInferRuntimeFn)(nvinfer1::ILogger & + logger); + if (hasSafeRuntime()) { + auto createFn = reinterpret_cast( + dlsym(safeRuntimeLibrary.get(), symbolName)); + if (createFn != nullptr) { + runtime = createFn(logger); + } + } +#endif + return runtime; +} + +bool hasConsistencyChecker() { + bool ret{false}; +#if !defined(_WIN32) + ret = (consistencyCheckerLibrary != nullptr); +#endif + return ret; +} + +nvinfer1::consistency::IConsistencyChecker* +createConsistencyChecker(nvinfer1::ILogger& logger, + void const* serializedEngine, + int32_t const engineSize) noexcept { + nvinfer1::consistency::IConsistencyChecker* checker{nullptr}; + + if (serializedEngine == nullptr || engineSize == 0) { + return checker; + } + +#if !defined(_WIN32) + constexpr char symbolName[] = "createConsistencyChecker_INTERNAL"; + typedef nvinfer1::consistency::IConsistencyChecker* (*CreateCheckerFn)( + nvinfer1::ILogger * logger, void const* data, size_t size, + uint32_t version); + if (hasSafeRuntime()) { + auto createFn = reinterpret_cast( + dlsym(consistencyCheckerLibrary.get(), symbolName)); + if (createFn != nullptr) { + checker = + createFn(&logger, serializedEngine, engineSize, NV_TENSORRT_VERSION); + } + } +#endif + return checker; +} + +bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize) { + if (!hasConsistencyChecker()) { + sample::gLogError << "Cannot perform consistency check because the checker " + "is not loaded.." + << std::endl; + return false; + } + auto checker = std::unique_ptr( + createConsistencyChecker(sample::gLogger.getTRTLogger(), serializedEngine, + engineSize)); + if (checker.get() == nullptr) { + sample::gLogError << "Failed to create consistency checker." << std::endl; + return false; + } + sample::gLogInfo << "Start consistency checking." << std::endl; + if (!checker->validate()) { + sample::gLogError << "Consistency validation failed." << std::endl; + return false; + } + sample::gLogInfo << "Consistency validation passed." << std::endl; + return true; +} +} // namespace sample diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleEngines.h b/csrc/fastdeploy/backends/tensorrt/common/sampleEngines.h new file mode 100644 index 000000000..1b7b7a000 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/sampleEngines.h @@ -0,0 +1,195 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_ENGINES_H +#define TRT_SAMPLE_ENGINES_H + +#include +#include + +//#include "NvCaffeParser.h" +#include "NvInfer.h" +#include "NvInferConsistency.h" +#include "NvInferSafeRuntime.h" +#include "NvOnnxParser.h" +#include "sampleOptions.h" +#include "sampleUtils.h" + +namespace sample { + +struct Parser { +// TrtUniquePtr caffeParser; + TrtUniquePtr onnxParser; + + operator bool() const { return false || onnxParser; } +}; + +struct BuildEnvironment { + TrtUniquePtr network; + //! Parser that creates the network. Must be declared *after* network, so that + //! when + //! ~BuildEnvironment() executes, the parser is destroyed before the network + //! is destroyed. + Parser parser; + TrtUniquePtr engine; + std::unique_ptr safeEngine; + std::vector engineBlob; +}; + +//! +//! \brief Generate a network definition for a given model +//! +//! \return Parser The parser used to initialize the network and that holds the +//! weights for the network, or an invalid +//! parser (the returned parser converts to false if tested) +//! +//! Constant input dimensions in the model must not be changed in the +//! corresponding +//! network definition, because its correctness may rely on the constants. +//! +//! \see Parser::operator bool() +//! +Parser modelToNetwork(const ModelOptions& model, + nvinfer1::INetworkDefinition& network, std::ostream& err); + +//! +//! \brief Set up network and config +//! +//! \return boolean Return true if network and config were successfully set +//! +bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, + IBuilder& builder, INetworkDefinition& network, + IBuilderConfig& config, std::ostream& err, + std::vector>& sparseWeights); + +//! +//! \brief Log refittable layers and weights of a refittable engine +//! +void dumpRefittable(nvinfer1::ICudaEngine& engine); + +//! +//! \brief Load a serialized engine +//! +//! \return Pointer to the engine loaded or nullptr if the operation failed +//! +nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, + std::ostream& err); + +//! +//! \brief Save an engine into a file +//! +//! \return boolean Return true if the engine was successfully saved +//! +bool saveEngine(const nvinfer1::ICudaEngine& engine, + const std::string& fileName, std::ostream& err); + +//! +//! \brief Create an engine from model or serialized file, and optionally save +//! engine +//! +//! \return Pointer to the engine created or nullptr if the creation failed +//! +bool getEngineBuildEnv(const ModelOptions& model, const BuildOptions& build, + const SystemOptions& sys, BuildEnvironment& env, + std::ostream& err); + +//! +//! \brief Create an engine from model or serialized file, and optionally save +//! engine +//! +//! \return Pointer to the engine created or nullptr if the creation failed +//! 
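+//! A typical call site (sketch; the error stream is illustrative):
+//!
+//!   auto engine = getEngine(model, build, sys, std::cerr);
+//!   if (!engine) { /* build or load failed */ }
+//!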
+inline TrtUniquePtr<nvinfer1::ICudaEngine> getEngine(const ModelOptions& model,
+                                                     const BuildOptions& build,
+                                                     const SystemOptions& sys,
+                                                     std::ostream& err) {
+  BuildEnvironment env;
+  TrtUniquePtr<nvinfer1::ICudaEngine> engine;
+  if (getEngineBuildEnv(model, build, sys, env, err)) {
+    engine.swap(env.engine);
+  }
+  return engine;
+}
+
+//!
+//! \brief Create a serialized network
+//!
+//! \return Pointer to a host memory for a serialized network
+//!
+IHostMemory* networkToSerialized(const BuildOptions& build,
+                                 const SystemOptions& sys, IBuilder& builder,
+                                 INetworkDefinition& network,
+                                 std::ostream& err);
+
+//!
+//! \brief Transfer a model to a serialized network
+//!
+//! \return Pointer to a host memory for a serialized network
+//!
+IHostMemory* modelToSerialized(const ModelOptions& model,
+                               const BuildOptions& build,
+                               const SystemOptions& sys, std::ostream& err);
+
+//!
+//! \brief Serialize a network and save it into a file
+//!
+//! \return boolean Return true if the network was successfully serialized and
+//! saved
+//!
+bool serializeAndSave(const ModelOptions& model, const BuildOptions& build,
+                      const SystemOptions& sys, std::ostream& err);
+
+bool timeRefit(const INetworkDefinition& network, nvinfer1::ICudaEngine& engine,
+               bool multiThreading);
+
+//!
+//! \brief Set tensor scales from a calibration table
+//!
+void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network,
+                                    const std::vector<IOFormat>& inputFormats,
+                                    const std::vector<IOFormat>& outputFormats,
+                                    const std::string& calibrationFile);
+
+//!
+//! \brief Check if safe runtime is loaded.
+//!
+bool hasSafeRuntime();
+
+//!
+//! \brief Create a safe runtime object if the dynamic library is loaded.
+//!
+nvinfer1::safe::IRuntime*
+createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept;
+
+//!
+//! \brief Check if consistency checker is loaded.
+//!
+bool hasConsistencyChecker();
+
+//!
+//! \brief Create a consistency checker object if the dynamic library is loaded.
+//!
+nvinfer1::consistency::IConsistencyChecker*
+createConsistencyChecker(nvinfer1::ILogger& logger,
+                         void const* serializedEngine,
+                         int32_t const engineSize) noexcept;
+
+//!
+//! \brief Run consistency check on serialized engine.
+//!
+bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize);
+} // namespace sample
+
+#endif // TRT_SAMPLE_ENGINES_H
diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleInference.cpp b/csrc/fastdeploy/backends/tensorrt/common/sampleInference.cpp
new file mode 100644
index 000000000..fd7e9f82f
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/sampleInference.cpp
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__QNX__) +#include +#include +#endif + +#include "NvInfer.h" + +#include "ErrorRecorder.h" +#include "logger.h" +#include "sampleDevice.h" +#include "sampleEngines.h" +#include "sampleInference.h" +#include "sampleOptions.h" +#include "sampleReporting.h" +#include "sampleUtils.h" + +namespace sample { + +template +bool validateTensorNames(const MapType& map, const EngineType* engine, + const int32_t endBindingIndex) { + // Check if the provided input tensor names match the input tensors of the + // engine. + // Throw an error if the provided input tensor names cannot be found because + // it implies a potential typo. + for (const auto& item : map) { + bool tensorNameFound{false}; + for (int32_t b = 0; b < endBindingIndex; ++b) { + if (engine->bindingIsInput(b) && + engine->getBindingName(b) == item.first) { + tensorNameFound = true; + break; + } + } + if (!tensorNameFound) { + sample::gLogError + << "Cannot find input tensor with name \"" << item.first + << "\" in the engine bindings! " + << "Please make sure the input tensor names are correct." + << std::endl; + return false; + } + } + return true; +} + +template class FillBindingClosure { + private: + using InputsMap = std::unordered_map; + using BindingsVector = std::vector>; + + EngineType const* engine; + ContextType const* context; + InputsMap const& inputs; + BindingsVector& bindings; + int32_t batch; + int32_t endBindingIndex; + + void fillOneBinding(int32_t bindingIndex, int64_t vol) { + auto const dims = getDims(bindingIndex); + auto const name = engine->getBindingName(bindingIndex); + auto const isInput = engine->bindingIsInput(bindingIndex); + auto const dataType = engine->getBindingDataType(bindingIndex); + auto const* bindingInOutStr = isInput ? "input" : "output"; + for (auto& binding : bindings) { + const auto input = inputs.find(name); + if (isInput && input != inputs.end()) { + sample::gLogInfo << "Using values loaded from " << input->second + << " for input " << name << std::endl; + binding->addBinding(bindingIndex, name, isInput, vol, dataType, + input->second); + } else { + sample::gLogInfo << "Using random values for " << bindingInOutStr << " " + << name << std::endl; + binding->addBinding(bindingIndex, name, isInput, vol, dataType); + } + sample::gLogInfo << "Created " << bindingInOutStr << " binding for " + << name << " with dimensions " << dims << std::endl; + } + } + + bool fillAllBindings(int32_t batch, int32_t endBindingIndex) { + if (!validateTensorNames(inputs, engine, endBindingIndex)) { + sample::gLogError << "Invalid tensor names found in --loadInputs flag." 
+ << std::endl; + return false; + } + + for (int32_t b = 0; b < endBindingIndex; b++) { + auto const dims = getDims(b); + auto const comps = engine->getBindingComponentsPerElement(b); + auto const strides = context->getStrides(b); + int32_t const vectorDimIndex = engine->getBindingVectorizedDim(b); + auto const vol = volume(dims, strides, vectorDimIndex, comps, batch); + fillOneBinding(b, vol); + } + return true; + } + + Dims getDims(int32_t bindingIndex); + + public: + FillBindingClosure(EngineType const* _engine, ContextType const* _context, + InputsMap const& _inputs, BindingsVector& _bindings, + int32_t _batch, int32_t _endBindingIndex) + : engine(_engine), context(_context), inputs(_inputs), + bindings(_bindings), batch(_batch), endBindingIndex(_endBindingIndex) {} + + bool operator()() { return fillAllBindings(batch, endBindingIndex); } +}; + +template <> +Dims FillBindingClosure:: + getDims(int32_t bindingIndex) { + return context->getBindingDimensions(bindingIndex); +} + +template <> +Dims FillBindingClosure< + nvinfer1::safe::ICudaEngine, + nvinfer1::safe::IExecutionContext>::getDims(int32_t bindingIndex) { + return engine->getBindingDimensions(bindingIndex); +} + +bool setUpInference(InferenceEnvironment& iEnv, + const InferenceOptions& inference) { + int32_t device{}; + cudaCheck(cudaGetDevice(&device)); + + cudaDeviceProp properties; + cudaCheck(cudaGetDeviceProperties(&properties, device)); + // Use managed memory on integrated devices when transfers are skipped + // and when it is explicitly requested on the commandline. + bool useManagedMemory{(inference.skipTransfers && properties.integrated) || + inference.useManaged}; + using FillSafeBindings = + FillBindingClosure; + if (iEnv.safe) { + ASSERT(sample::hasSafeRuntime()); + auto* safeEngine = iEnv.safeEngine.get(); + for (int32_t s = 0; s < inference.streams; ++s) { + iEnv.safeContext.emplace_back(safeEngine->createExecutionContext()); + iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); + } + const int32_t nBindings = safeEngine->getNbBindings(); + auto const* safeContext = iEnv.safeContext.front().get(); + // batch is set to 1 because safety only support explicit batch. + return FillSafeBindings(iEnv.safeEngine.get(), safeContext, + inference.inputs, iEnv.bindings, 1, nBindings)(); + } + + using FillStdBindings = + FillBindingClosure; + + for (int32_t s = 0; s < inference.streams; ++s) { + auto ec = iEnv.engine->createExecutionContext(); + if (ec == nullptr) { + sample::gLogError << "Unable to create execution context for stream " << s + << "." << std::endl; + return false; + } + iEnv.context.emplace_back(ec); + iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); + } + if (iEnv.profiler) { + iEnv.context.front()->setProfiler(iEnv.profiler.get()); + // Always run reportToProfiler() after enqueue launch + iEnv.context.front()->setEnqueueEmitsProfile(false); + } + + const int32_t nOptProfiles = iEnv.engine->getNbOptimizationProfiles(); + const int32_t nBindings = iEnv.engine->getNbBindings(); + const int32_t bindingsInProfile = + nOptProfiles > 0 ? nBindings / nOptProfiles : 0; + const int32_t endBindingIndex = + bindingsInProfile ? bindingsInProfile : iEnv.engine->getNbBindings(); + + if (nOptProfiles > 1) { + sample::gLogWarning << "Multiple profiles are currently not supported. " + "Running with one profile." + << std::endl; + } + + // Make sure that the tensor names provided in command-line args actually + // exist in any of the engine bindings + // to avoid silent typos. 
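+  // For example, "--shapes=input0:1x3x224x224" must name a real engine
+  // binding ("input0" is illustrative); otherwise we fail early here rather
+  // than silently running with default dimensions.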
+ if (!validateTensorNames(inference.shapes, iEnv.engine.get(), + endBindingIndex)) { + sample::gLogError << "Invalid tensor names found in --shapes flag." + << std::endl; + return false; + } + + // Set all input dimensions before all bindings can be allocated + for (int32_t b = 0; b < endBindingIndex; ++b) { + if (iEnv.engine->bindingIsInput(b)) { + auto dims = iEnv.context.front()->getBindingDimensions(b); + const bool isScalar = dims.nbDims == 0; + const bool isDynamicInput = + std::any_of(dims.d, dims.d + dims.nbDims, + [](int32_t dim) { return dim == -1; }) || + iEnv.engine->isShapeBinding(b); + if (isDynamicInput) { + auto shape = inference.shapes.find(iEnv.engine->getBindingName(b)); + + std::vector staticDims; + if (shape == inference.shapes.end()) { + // If no shape is provided, set dynamic dimensions to 1. + constexpr int32_t DEFAULT_DIMENSION = 1; + if (iEnv.engine->isShapeBinding(b)) { + if (isScalar) { + staticDims.push_back(1); + } else { + staticDims.resize(dims.d[0]); + std::fill(staticDims.begin(), staticDims.end(), + DEFAULT_DIMENSION); + } + } else { + staticDims.resize(dims.nbDims); + std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(), + [&](int32_t dimension) { + return dimension >= 0 ? dimension + : DEFAULT_DIMENSION; + }); + } + sample::gLogWarning << "Dynamic dimensions required for input: " + << iEnv.engine->getBindingName(b) + << ", but no shapes were provided. Automatically " + "overriding shape to: " + << staticDims << std::endl; + } else if (inference.inputs.count(shape->first) && + iEnv.engine->isShapeBinding(b)) { + if (isScalar || dims.nbDims == 1) { + // Load shape tensor from file. + size_t const size = isScalar ? 1 : dims.d[0]; + staticDims.resize(size); + auto const& filename = inference.inputs.at(shape->first); + auto dst = reinterpret_cast(staticDims.data()); + loadFromFile(filename, dst, + size * sizeof(decltype(staticDims)::value_type)); + } else { + sample::gLogWarning << "Cannot load shape tensor " << shape->first + << " from file, " + << "ND-Shape isn't supported yet" << std::endl; + // Fallback + staticDims = shape->second; + } + } else { + staticDims = shape->second; + } + + for (auto& c : iEnv.context) { + if (iEnv.engine->isShapeBinding(b)) { + if (!c->setInputShapeBinding(b, staticDims.data())) { + return false; + } + } else { + if (!c->setBindingDimensions(b, toDims(staticDims))) { + return false; + } + } + } + } + } + } + + auto* engine = iEnv.engine.get(); + auto const* context = iEnv.context.front().get(); + int32_t const batch = + engine->hasImplicitBatchDimension() ? inference.batch : 1; + return FillStdBindings(engine, context, inference.inputs, iEnv.bindings, + batch, endBindingIndex)(); +} + +namespace { + +#if defined(__QNX__) +using TimePoint = double; +#else +using TimePoint = std::chrono::time_point; +#endif + +TimePoint getCurrentTime() { +#if defined(__QNX__) + uint64_t const currentCycles = ClockCycles(); + uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec; + // Return current timestamp in ms. + return static_cast(currentCycles) * 1000. / cyclesPerSecond; +#else + return std::chrono::high_resolution_clock::now(); +#endif +} + +//! +//! \struct SyncStruct +//! \brief Threads synchronization structure +//! 
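+//!
+//! Shared by all inference worker threads: 'mutex' guards the shared trace
+//! vector, 'mainStream' and 'gpuStart' give every stream a common GPU time
+//! origin, and 'cpuStart' is the matching host-side origin.
+//!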
+struct SyncStruct {
+  std::mutex mutex;
+  TrtCudaStream mainStream;
+  TrtCudaEvent gpuStart{cudaEventBlockingSync};
+  TimePoint cpuStart{};
+  float sleep{};
+};
+
+struct Enqueue {
+  explicit Enqueue(nvinfer1::IExecutionContext& context, void** buffers)
+      : mContext(context), mBuffers(buffers) {}
+
+  nvinfer1::IExecutionContext& mContext;
+  void** mBuffers{};
+};
+
+//!
+//! \class EnqueueImplicit
+//! \brief Functor to enqueue inference with implicit batch
+//!
+class EnqueueImplicit : private Enqueue {
+ public:
+  explicit EnqueueImplicit(nvinfer1::IExecutionContext& context, void** buffers,
+                           int32_t batch)
+      : Enqueue(context, buffers), mBatch(batch) {}
+
+  bool operator()(TrtCudaStream& stream) const {
+    if (mContext.enqueue(mBatch, mBuffers, stream.get(), nullptr)) {
+      // Collecting layer timing info from current profile index of execution
+      // context
+      if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() &&
+          !mContext.reportToProfiler()) {
+        gLogWarning
+            << "Failed to collect layer timing info from previous enqueue()"
+            << std::endl;
+      }
+      return true;
+    }
+    return false;
+  }
+
+ private:
+  int32_t mBatch;
+};
+
+//!
+//! \class EnqueueExplicit
+//! \brief Functor to enqueue inference with explicit batch
+//!
+class EnqueueExplicit : private Enqueue {
+ public:
+  explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, void** buffers)
+      : Enqueue(context, buffers) {}
+
+  bool operator()(TrtCudaStream& stream) const {
+    if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) {
+      // Collecting layer timing info from current profile index of execution
+      // context
+      if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() &&
+          !mContext.reportToProfiler()) {
+        gLogWarning
+            << "Failed to collect layer timing info from previous enqueueV2()"
+            << std::endl;
+      }
+      return true;
+    }
+    return false;
+  }
+};
+
+//!
+//! \class EnqueueGraph
+//! \brief Functor to enqueue inference from CUDA Graph
+//!
+class EnqueueGraph {
+ public:
+  explicit EnqueueGraph(nvinfer1::IExecutionContext& context,
+                        TrtCudaGraph& graph)
+      : mGraph(graph), mContext(context) {}
+
+  bool operator()(TrtCudaStream& stream) const {
+    if (mGraph.launch(stream)) {
+      // Collecting layer timing info from current profile index of execution
+      // context
+      if (mContext.getProfiler() && !mContext.reportToProfiler()) {
+        gLogWarning << "Failed to collect layer timing info from previous CUDA "
+                       "graph launch"
+                    << std::endl;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  TrtCudaGraph& mGraph;
+  nvinfer1::IExecutionContext& mContext;
+};
+
+//!
+//! \class EnqueueSafe
+//! \brief Functor to enqueue safe execution context
+//!
+class EnqueueSafe {
+ public:
+  explicit EnqueueSafe(nvinfer1::safe::IExecutionContext& context,
+                       void** buffers)
+      : mContext(context), mBuffers(buffers) {}
+
+  bool operator()(TrtCudaStream& stream) const {
+    if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) {
+      return true;
+    }
+    return false;
+  }
+
+  nvinfer1::safe::IExecutionContext& mContext;
+  void** mBuffers{};
+};
+
+using EnqueueFunction = std::function<bool(TrtCudaStream&)>;
+
+enum class StreamType : int32_t {
+  kINPUT = 0,
+  kCOMPUTE = 1,
+  kOUTPUT = 2,
+  kNUM = 3
+};
+
+enum class EventType : int32_t {
+  kINPUT_S = 0,
+  kINPUT_E = 1,
+  kCOMPUTE_S = 2,
+  kCOMPUTE_E = 3,
+  kOUTPUT_S = 4,
+  kOUTPUT_E = 5,
+  kNUM = 6
+};
+
+using MultiStream =
+    std::array<TrtCudaStream, static_cast<int32_t>(StreamType::kNUM)>;
+
+using MultiEvent = std::array<std::unique_ptr<TrtCudaEvent>,
+                              static_cast<int32_t>(EventType::kNUM)>;
+
+using EnqueueTimes = std::array<TimePoint, 2>;
+
+//!
+//! \class Iteration
+//! 
\brief Inference iteration and streams management +//! +template class Iteration { + public: + Iteration(int32_t id, const InferenceOptions& inference, ContextType& context, + Bindings& bindings) + : mBindings(bindings), mStreamId(id), mDepth(1 + inference.overlap), + mActive(mDepth), mEvents(mDepth), mEnqueueTimes(mDepth), + mContext(&context) { + for (int32_t d = 0; d < mDepth; ++d) { + for (int32_t e = 0; e < static_cast(EventType::kNUM); ++e) { + mEvents[d][e].reset(new TrtCudaEvent(!inference.spin)); + } + } + createEnqueueFunction(inference, context, bindings); + } + + bool query(bool skipTransfers) { + if (mActive[mNext]) { + return true; + } + + if (!skipTransfers) { + record(EventType::kINPUT_S, StreamType::kINPUT); + mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); + record(EventType::kINPUT_E, StreamType::kINPUT); + wait(EventType::kINPUT_E, + StreamType::kCOMPUTE); // Wait for input DMA before compute + } + + record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE); + recordEnqueueTime(); + if (!mEnqueue(getStream(StreamType::kCOMPUTE))) { + return false; + } + recordEnqueueTime(); + record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE); + + if (!skipTransfers) { + wait(EventType::kCOMPUTE_E, + StreamType::kOUTPUT); // Wait for compute before output DMA + record(EventType::kOUTPUT_S, StreamType::kOUTPUT); + mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); + record(EventType::kOUTPUT_E, StreamType::kOUTPUT); + } + + mActive[mNext] = true; + moveNext(); + return true; + } + + float sync(const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, + std::vector& trace, bool skipTransfers) { + if (mActive[mNext]) { + if (skipTransfers) { + getEvent(EventType::kCOMPUTE_E).synchronize(); + } else { + getEvent(EventType::kOUTPUT_E).synchronize(); + } + trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers)); + mActive[mNext] = false; + return getEvent(EventType::kCOMPUTE_S) - gpuStart; + } + return 0; + } + + void syncAll(const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, + std::vector& trace, bool skipTransfers) { + for (int32_t d = 0; d < mDepth; ++d) { + sync(cpuStart, gpuStart, trace, skipTransfers); + moveNext(); + } + } + + void wait(TrtCudaEvent& gpuStart) { + getStream(StreamType::kINPUT).wait(gpuStart); + } + + void setInputData() { + mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); + } + + void fetchOutputData() { + mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); + } + + private: + void moveNext() { mNext = mDepth - 1 - mNext; } + + TrtCudaStream& getStream(StreamType t) { + return mStream[static_cast(t)]; + } + + TrtCudaEvent& getEvent(EventType t) { + return *mEvents[mNext][static_cast(t)]; + } + + void record(EventType e, StreamType s) { getEvent(e).record(getStream(s)); } + + void recordEnqueueTime() { + mEnqueueTimes[mNext][enqueueStart] = getCurrentTime(); + enqueueStart = 1 - enqueueStart; + } + + TimePoint getEnqueueTime(bool start) { + return mEnqueueTimes[mNext][start ? 0 : 1]; + } + + void wait(EventType e, StreamType s) { getStream(s).wait(getEvent(e)); } + + InferenceTrace getTrace(const TimePoint& cpuStart, + const TrtCudaEvent& gpuStart, bool skipTransfers) { + float is = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart + : getEvent(EventType::kINPUT_S) - gpuStart; + float ie = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart + : getEvent(EventType::kINPUT_E) - gpuStart; + float os = skipTransfers ? 
getEvent(EventType::kCOMPUTE_E) - gpuStart + : getEvent(EventType::kOUTPUT_S) - gpuStart; + float oe = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart + : getEvent(EventType::kOUTPUT_E) - gpuStart; + + return InferenceTrace(mStreamId, + std::chrono::duration( + getEnqueueTime(true) - cpuStart) + .count(), + std::chrono::duration( + getEnqueueTime(false) - cpuStart) + .count(), + is, ie, getEvent(EventType::kCOMPUTE_S) - gpuStart, + getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe); + } + + void createEnqueueFunction(const InferenceOptions& inference, + nvinfer1::IExecutionContext& context, + Bindings& bindings) { + if (inference.batch) { + mEnqueue = EnqueueFunction(EnqueueImplicit( + context, mBindings.getDeviceBuffers(), inference.batch)); + } else { + mEnqueue = EnqueueFunction( + EnqueueExplicit(context, mBindings.getDeviceBuffers())); + } + if (inference.graph) { + TrtCudaStream& stream = getStream(StreamType::kCOMPUTE); + // Avoid capturing initialization calls by executing the enqueue function + // at least + // once before starting CUDA graph capture. + const auto ret = mEnqueue(stream); + assert(ret); + stream.synchronize(); + + mGraph.beginCapture(stream); + // The built TRT engine may contain operations that are not permitted + // under CUDA graph capture mode. + // When the stream is capturing, the enqueue call may return false if the + // current CUDA graph capture fails. + if (mEnqueue(stream)) { + mGraph.endCapture(stream); + mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph)); + } else { + mGraph.endCaptureOnError(stream); + // Ensure any CUDA error has been cleaned up. + cudaCheck(cudaGetLastError()); + sample::gLogWarning << "The built TensorRT engine contains operations " + "that are not permitted under " + "CUDA graph capture mode." + << std::endl; + sample::gLogWarning << "The specified --useCudaGraph flag has been " + "ignored. The inference will be " + "launched without using CUDA graph launch." 
+ << std::endl; + } + } + } + + void createEnqueueFunction(const InferenceOptions&, + nvinfer1::safe::IExecutionContext& context, + Bindings&) { + mEnqueue = + EnqueueFunction(EnqueueSafe(context, mBindings.getDeviceBuffers())); + } + + Bindings& mBindings; + + TrtCudaGraph mGraph; + EnqueueFunction mEnqueue; + + int32_t mStreamId{0}; + int32_t mNext{0}; + int32_t mDepth{2}; // default to double buffer to hide DMA transfers + + std::vector mActive; + MultiStream mStream; + std::vector mEvents; + + int32_t enqueueStart{0}; + std::vector mEnqueueTimes; + ContextType* mContext{nullptr}; +}; + +template +bool inferenceLoop( + std::vector>>& iStreams, + const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, int iterations, + float maxDurationMs, float warmupMs, std::vector& trace, + bool skipTransfers, float idleMs) { + float durationMs = 0; + int32_t skip = 0; + + for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs; + ++i) { + for (auto& s : iStreams) { + if (!s->query(skipTransfers)) { + return false; + } + } + for (auto& s : iStreams) { + durationMs = std::max(durationMs, + s->sync(cpuStart, gpuStart, trace, skipTransfers)); + } + if (durationMs < warmupMs) // Warming up + { + if (durationMs) // Skip complete iterations + { + ++skip; + } + continue; + } + if (idleMs != 0.F) { + std::this_thread::sleep_for( + std::chrono::duration(idleMs)); + } + } + for (auto& s : iStreams) { + s->syncAll(cpuStart, gpuStart, trace, skipTransfers); + } + return true; +} + +template +void inferenceExecution(const InferenceOptions& inference, + InferenceEnvironment& iEnv, SyncStruct& sync, + const int32_t threadIdx, const int32_t streamsPerThread, + int32_t device, std::vector& trace) { + float warmupMs = inference.warmup; + float durationMs = inference.duration * 1000.F + warmupMs; + + cudaCheck(cudaSetDevice(device)); + + std::vector>> iStreams; + + for (int32_t s = 0; s < streamsPerThread; ++s) { + const int32_t streamId{threadIdx * streamsPerThread + s}; + auto* iteration = new Iteration( + streamId, inference, *iEnv.template getContext(streamId), + *iEnv.bindings[streamId]); + if (inference.skipTransfers) { + iteration->setInputData(); + } + iStreams.emplace_back(iteration); + } + + for (auto& s : iStreams) { + s->wait(sync.gpuStart); + } + + std::vector localTrace; + if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, + inference.iterations, durationMs, warmupMs, localTrace, + inference.skipTransfers, inference.idle)) { + iEnv.error = true; + } + + if (inference.skipTransfers) { + for (auto& s : iStreams) { + s->fetchOutputData(); + } + } + + sync.mutex.lock(); + trace.insert(trace.end(), localTrace.begin(), localTrace.end()); + sync.mutex.unlock(); +} + +inline std::thread makeThread(const InferenceOptions& inference, + InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t threadIdx, int32_t streamsPerThread, + int32_t device, + std::vector& trace) { + if (iEnv.safe) { + ASSERT(sample::hasSafeRuntime()); + return std::thread(inferenceExecution, + std::cref(inference), std::ref(iEnv), std::ref(sync), + threadIdx, streamsPerThread, device, std::ref(trace)); + } + + return std::thread(inferenceExecution, + std::cref(inference), std::ref(iEnv), std::ref(sync), + threadIdx, streamsPerThread, device, std::ref(trace)); +} + +} // namespace + +bool runInference(const InferenceOptions& inference, InferenceEnvironment& iEnv, + int32_t device, std::vector& trace) { + cudaCheck(cudaProfilerStart()); + + trace.resize(0); + + SyncStruct sync; + sync.sleep = inference.sleep; + 
sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + // When multiple streams are used, trtexec can run inference in two modes: + // (1) if inference.threads is true, then run each stream on each thread. + // (2) if inference.threads is false, then run all streams on the same thread. + const int32_t numThreads = inference.threads ? inference.streams : 1; + const int32_t streamsPerThread = inference.threads ? 1 : inference.streams; + + std::vector threads; + for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx) { + threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx, + streamsPerThread, device, trace)); + } + for (auto& th : threads) { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](const InferenceTrace& a, const InferenceTrace& b) { + return a.h2dStart < b.h2dStart; + }; + std::sort(trace.begin(), trace.end(), cmpTrace); + + return !iEnv.error; +} + +namespace { +size_t reportGpuMemory() { + static size_t prevFree{0}; + size_t free{0}; + size_t total{0}; + size_t newlyAllocated{0}; + cudaCheck(cudaMemGetInfo(&free, &total)); + sample::gLogInfo << "Free GPU memory = " << free / 1024.0_MiB << " GiB"; + if (prevFree != 0) { + newlyAllocated = (prevFree - free); + sample::gLogInfo << ", newly allocated GPU memory = " + << newlyAllocated / 1024.0_MiB << " GiB"; + } + sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB" + << std::endl; + prevFree = free; + return newlyAllocated; +} +} // namespace + +//! Returns true if deserialization is slower than expected or fails. +bool timeDeserialize(InferenceEnvironment& iEnv) { + constexpr int32_t kNB_ITERS{20}; + std::unique_ptr rt{ + createInferRuntime(sample::gLogger.getTRTLogger())}; + std::unique_ptr engine; + + std::unique_ptr safeRT{ + sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; + std::unique_ptr safeEngine; + + if (iEnv.safe) { + ASSERT(sample::hasSafeRuntime() && safeRT != nullptr); + safeRT->setErrorRecorder(&gRecorder); + } + + auto timeDeserializeFn = [&]() -> float { + bool deserializeOK{false}; + engine.reset(nullptr); + safeEngine.reset(nullptr); + auto startClock = std::chrono::high_resolution_clock::now(); + if (iEnv.safe) { + safeEngine.reset(safeRT->deserializeCudaEngine(iEnv.engineBlob.data(), + iEnv.engineBlob.size())); + deserializeOK = (safeEngine != nullptr); + } else { + engine.reset(rt->deserializeCudaEngine(iEnv.engineBlob.data(), + iEnv.engineBlob.size(), nullptr)); + deserializeOK = (engine != nullptr); + } + auto endClock = std::chrono::high_resolution_clock::now(); + // return NAN if deserialization failed. + return deserializeOK + ? std::chrono::duration(endClock - startClock) + .count() + : NAN; + }; + + // Warmup the caches to make sure that cache thrashing isn't throwing off the + // results + { + sample::gLogInfo << "Begin deserialization warmup..." << std::endl; + for (int32_t i = 0, e = 2; i < e; ++i) { + timeDeserializeFn(); + } + } + sample::gLogInfo << "Begin deserialization engine timing..." << std::endl; + float const first = timeDeserializeFn(); + + // Check if first deserialization suceeded. + if (std::isnan(first)) { + sample::gLogError << "Engine deserialization failed." << std::endl; + return true; + } + + sample::gLogInfo << "First deserialization time = " << first + << " milliseconds" << std::endl; + + // Record initial gpu memory state. 
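+  // reportGpuMemory() keeps the previously observed free-memory value in a
+  // static, so each call prints the delta since the last call; sampling here
+  // and again after the timed loop attributes the difference to the
+  // deserialized engines.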
+ reportGpuMemory(); + + float totalTime{0.F}; + for (int32_t i = 0; i < kNB_ITERS; ++i) { + totalTime += timeDeserializeFn(); + } + const auto averageTime = totalTime / kNB_ITERS; + // reportGpuMemory sometimes reports zero after a single deserialization of a + // small engine, + // so use the size of memory for all the iterations. + const auto totalEngineSizeGpu = reportGpuMemory(); + sample::gLogInfo << "Total deserialization time = " << totalTime + << " milliseconds in " << kNB_ITERS + << " iterations, average time = " << averageTime + << " milliseconds, first time = " << first + << " milliseconds." << std::endl; + sample::gLogInfo << "Deserialization Bandwidth = " + << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s" + << std::endl; + + // If the first deserialization is more than tolerance slower than + // the average deserialization, return true, which means an error occurred. + // The tolerance is set to 2x since the deserialization time is quick and + // susceptible + // to caching issues causing problems in the first timing. + const auto tolerance = 2.0F; + const bool isSlowerThanExpected = first > averageTime * tolerance; + if (isSlowerThanExpected) { + sample::gLogInfo << "First deserialization time divided by average time is " + << (first / averageTime) << ". Exceeds tolerance of " + << tolerance << "x." << std::endl; + } + return isSlowerThanExpected; +} + +std::string getLayerInformation(const InferenceEnvironment& iEnv, + nvinfer1::LayerInformationFormat format) { + auto runtime = std::unique_ptr( + createInferRuntime(sample::gLogger.getTRTLogger())); + auto inspector = + std::unique_ptr(iEnv.engine->createEngineInspector()); + if (!iEnv.context.empty()) { + inspector->setExecutionContext(iEnv.context.front().get()); + } + std::string result = inspector->getEngineInformation(format); + return result; +} + +} // namespace sample diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleInference.h b/csrc/fastdeploy/backends/tensorrt/common/sampleInference.h new file mode 100644 index 000000000..700dc8bef --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/sampleInference.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TRT_SAMPLE_INFERENCE_H +#define TRT_SAMPLE_INFERENCE_H + +#include "sampleReporting.h" +#include "sampleUtils.h" + +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvInferSafeRuntime.h" + +namespace sample { + +struct InferenceEnvironment { + TrtUniquePtr engine; + std::unique_ptr profiler; + std::vector> context; + std::vector> bindings; + bool error{false}; + + std::vector engineBlob; + + bool safe{false}; + std::unique_ptr safeEngine; + std::vector> safeContext; + + template + inline ContextType* getContext(int32_t streamIdx); +}; + +template <> +inline nvinfer1::IExecutionContext* +InferenceEnvironment::getContext(int32_t streamIdx) { + return context[streamIdx].get(); +} + +template <> +inline nvinfer1::safe::IExecutionContext* +InferenceEnvironment::getContext(int32_t streamIdx) { + return safeContext[streamIdx].get(); +} + +//! +//! \brief Set up contexts and bindings for inference +//! +bool setUpInference(InferenceEnvironment& iEnv, + const InferenceOptions& inference); + +//! +//! \brief Deserialize the engine and time how long it takes. +//! +bool timeDeserialize(InferenceEnvironment& iEnv); + +//! +//! \brief Run inference and collect timing, return false if any error hit +//! during inference +//! +bool runInference(const InferenceOptions& inference, InferenceEnvironment& iEnv, + int32_t device, std::vector& trace); + +//! +//! \brief Get layer information of the engine. +//! +std::string getLayerInformation(const InferenceEnvironment& iEnv, + nvinfer1::LayerInformationFormat format); + +} // namespace sample + +#endif // TRT_SAMPLE_INFERENCE_H diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleOptions.cpp b/csrc/fastdeploy/backends/tensorrt/common/sampleOptions.cpp new file mode 100644 index 000000000..a01b4dfde --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/sampleOptions.cpp @@ -0,0 +1,1634 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" + +#include "logger.h" +#include "sampleOptions.h" + +namespace sample { + +namespace { + +std::vector splitToStringVec(const std::string& option, + char separator) { + std::vector options; + + for (size_t start = 0; start < option.length();) { + size_t separatorIndex = option.find(separator, start); + if (separatorIndex == std::string::npos) { + separatorIndex = option.length(); + } + options.emplace_back(option.substr(start, separatorIndex - start)); + start = separatorIndex + 1; + } + + return options; +} + +template T stringToValue(const std::string& option) { + return T{option}; +} + +template <> int32_t stringToValue(const std::string& option) { + return std::stoi(option); +} + +template <> float stringToValue(const std::string& option) { + return std::stof(option); +} + +template <> double stringToValue(const std::string& option) { + return std::stod(option); +} + +template <> bool stringToValue(const std::string& option) { return true; } + +template <> +std::vector +stringToValue>(const std::string& option) { + std::vector shape; + std::vector dimsStrings = splitToStringVec(option, 'x'); + for (const auto& d : dimsStrings) { + shape.push_back(stringToValue(d)); + } + return shape; +} + +template <> +nvinfer1::DataType +stringToValue(const std::string& option) { + const std::unordered_map strToDT{ + {"fp32", nvinfer1::DataType::kFLOAT}, + {"fp16", nvinfer1::DataType::kHALF}, + {"int8", nvinfer1::DataType::kINT8}, + {"int32", nvinfer1::DataType::kINT32}}; + const auto& dt = strToDT.find(option); + if (dt == strToDT.end()) { + throw std::invalid_argument("Invalid DataType " + option); + } + return dt->second; +} + +template <> +nvinfer1::TensorFormats +stringToValue(const std::string& option) { + std::vector optionStrings = splitToStringVec(option, '+'); + const std::unordered_map strToFmt{ + {"chw", nvinfer1::TensorFormat::kLINEAR}, + {"chw2", nvinfer1::TensorFormat::kCHW2}, + {"chw4", nvinfer1::TensorFormat::kCHW4}, + {"hwc8", nvinfer1::TensorFormat::kHWC8}, + {"chw16", nvinfer1::TensorFormat::kCHW16}, + {"chw32", nvinfer1::TensorFormat::kCHW32}, + {"dhwc8", nvinfer1::TensorFormat::kDHWC8}, + {"hwc", nvinfer1::TensorFormat::kHWC}, + {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR}, + {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}}; + nvinfer1::TensorFormats formats{}; + for (auto f : optionStrings) { + const auto& tf = strToFmt.find(f); + if (tf == strToFmt.end()) { + throw std::invalid_argument(std::string("Invalid TensorFormat ") + f); + } + formats |= 1U << static_cast(tf->second); + } + + return formats; +} + +template <> IOFormat stringToValue(const std::string& option) { + IOFormat ioFormat{}; + const size_t colon = option.find(':'); + + if (colon == std::string::npos) { + throw std::invalid_argument(std::string("Invalid IOFormat ") + option); + } + + ioFormat.first = stringToValue(option.substr(0, colon)); + ioFormat.second = + stringToValue(option.substr(colon + 1)); + + return ioFormat; +} + +template +std::pair splitNameAndValue(const std::string& s) { + std::string tensorName; + std::string valueString; + // Split on the last : + std::vector nameRange{splitToStringVec(s, ':')}; + // Everything before the last : is the name + tensorName = nameRange[0]; + for (size_t i = 1; i < nameRange.size() - 1; i++) { + tensorName += ":" + nameRange[i]; + } + // Value is the string element after the last : + valueString = nameRange[nameRange.size() - 1]; + return 
std::pair(tensorName, stringToValue(valueString)); +} + +template +void splitInsertKeyValue(const std::vector& kvList, T& map) { + for (const auto& kv : kvList) { + map.insert(splitNameAndValue(kv)); + } +} + +const char* boolToEnabled(bool enable) { + return enable ? "Enabled" : "Disabled"; +} + +//! Check if input option exists in input arguments. +//! If it does: return its value, erase the argument and return true. +//! If it does not: return false. +template +bool getAndDelOption(Arguments& arguments, const std::string& option, + T& value) { + const auto match = arguments.find(option); + if (match != arguments.end()) { + value = stringToValue(match->second); + arguments.erase(match); + return true; + } + + return false; +} + +//! Check if input option exists in input arguments. +//! If it does: return false in value, erase the argument and return true. +//! If it does not: return false. +bool getAndDelNegOption(Arguments& arguments, const std::string& option, + bool& value) { + bool dummy; + if (getAndDelOption(arguments, option, dummy)) { + value = false; + return true; + } + return false; +} + +//! Check if input option exists in input arguments. +//! If it does: add all the matched arg values to values vector, erase the +//! argument and return true. +//! If it does not: return false. +template +bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, + std::vector& values) { + const auto match = arguments.equal_range(option); + if (match.first == match.second) { + return false; + } + + auto addToValues = [&values](Arguments::value_type& argValue) { + values.emplace_back(stringToValue(argValue.second)); + }; + std::for_each(match.first, match.second, addToValues); + arguments.erase(match.first, match.second); + + return true; +} + +void insertShapesBuild(std::unordered_map& shapes, + nvinfer1::OptProfileSelector selector, + const std::string& name, + const std::vector& dims) { + shapes[name][static_cast(selector)] = dims; +} + +void insertShapesInference( + std::unordered_map>& shapes, + const std::string& name, const std::vector& dims) { + shapes[name] = dims; +} + +std::string removeSingleQuotationMarks(std::string& str) { + std::vector strList{splitToStringVec(str, '\'')}; + // Remove all the escaped single quotation marks + std::string retVal = ""; + // Do not really care about unterminated sequences + for (size_t i = 0; i < strList.size(); i++) { + retVal += strList[i]; + } + return retVal; +} + +void getLayerPrecisions(Arguments& arguments, char const* argument, + LayerPrecisions& layerPrecisions) { + std::string list; + if (!getAndDelOption(arguments, argument, list)) { + return; + } + + // The layerPrecisions flag contains comma-separated layerName:precision + // pairs. + std::vector precisionList{splitToStringVec(list, ',')}; + for (auto const& s : precisionList) { + auto namePrecisionPair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); + layerPrecisions[layerName] = namePrecisionPair.second; + } +} + +void getLayerOutputTypes(Arguments& arguments, char const* argument, + LayerOutputTypes& layerOutputTypes) { + std::string list; + if (!getAndDelOption(arguments, argument, list)) { + return; + } + + // The layerOutputTypes flag contains comma-separated layerName:types pairs. 
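+  // For example, "--layerOutputTypes=conv1:fp16,concat:fp32+int8" requests an
+  // fp16 output for "conv1" and per-output fp32/int8 for "concat"; '+'
+  // separates the types of a layer's multiple outputs (layer names here are
+  // illustrative).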
+ std::vector precisionList{splitToStringVec(list, ',')}; + for (auto const& s : precisionList) { + auto namePrecisionPair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); + auto const typeStrings = splitToStringVec(namePrecisionPair.second, '+'); + std::vector typeVec(typeStrings.size(), + nvinfer1::DataType::kFLOAT); + std::transform(typeStrings.begin(), typeStrings.end(), typeVec.begin(), + stringToValue); + layerOutputTypes[layerName] = typeVec; + } +} + +bool getShapesBuild(Arguments& arguments, + std::unordered_map& shapes, + char const* argument, + nvinfer1::OptProfileSelector selector) { + std::string list; + bool retVal = getAndDelOption(arguments, argument, list); + std::vector shapeList{splitToStringVec(list, ',')}; + for (const auto& s : shapeList) { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesBuild(shapes, selector, tensorName, dims); + } + return retVal; +} + +bool getShapesInference( + Arguments& arguments, + std::unordered_map>& shapes, + const char* argument) { + std::string list; + bool retVal = getAndDelOption(arguments, argument, list); + std::vector shapeList{splitToStringVec(list, ',')}; + for (const auto& s : shapeList) { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesInference(shapes, tensorName, dims); + } + return retVal; +} + +void processShapes(std::unordered_map& shapes, + bool minShapes, bool optShapes, bool maxShapes, bool calib) { + // Only accept optShapes only or all three of minShapes, optShapes, maxShapes + if (((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes + // only, both minShapes and + // maxShapes + || (minShapes && !maxShapes && optShapes) // both minShapes and optShapes + || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes + { + if (calib) { + throw std::invalid_argument( + "Must specify only --optShapesCalib or all of --minShapesCalib, " + "--optShapesCalib, --maxShapesCalib"); + } else { + throw std::invalid_argument( + "Must specify only --optShapes or all of --minShapes, --optShapes, " + "--maxShapes"); + } + } + + // If optShapes only, expand optShapes to minShapes and maxShapes + if (optShapes && !minShapes && !maxShapes) { + std::unordered_map newShapes; + for (auto& s : shapes) { + insertShapesBuild( + newShapes, nvinfer1::OptProfileSelector::kMIN, s.first, + s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + insertShapesBuild( + newShapes, nvinfer1::OptProfileSelector::kOPT, s.first, + s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + insertShapesBuild( + newShapes, nvinfer1::OptProfileSelector::kMAX, s.first, + s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + } + shapes = newShapes; + } +} + +template +void printShapes(std::ostream& os, const char* phase, const T& shapes) { + if (shapes.empty()) { + os << "Input " << phase << " shapes: model" << std::endl; + } else { + for (const auto& s : shapes) { + os << "Input " << phase << " shape: " << s.first << "=" << s.second + << std::endl; + } + } +} + +std::ostream& printBatch(std::ostream& os, int32_t maxBatch) { + if (maxBatch != maxBatchNotProvided) { + os << maxBatch; + } else { + os << "explicit batch"; + } + return os; +} + +std::ostream& printTacticSources(std::ostream& os, + nvinfer1::TacticSources enabledSources, + 
nvinfer1::TacticSources disabledSources) { + if (!enabledSources && !disabledSources) { + os << "Using default tactic sources"; + } else { + auto const addSource = [&](uint32_t source, std::string const& name) { + if (enabledSources & source) { + os << name << " [ON], "; + } else if (disabledSources & source) { + os << name << " [OFF], "; + } + }; + + addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS), + "cublas"); + addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT), + "cublasLt"); + addSource(1U << static_cast(nvinfer1::TacticSource::kCUDNN), + "cudnn"); + } + return os; +} + +std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) { + os << "FP32"; + if (options.fp16) { + os << "+FP16"; + } + if (options.int8) { + os << "+INT8"; + } + if (options.precisionConstraints == PrecisionConstraints::kOBEY) { + os << " (obey precision constraints)"; + } + if (options.precisionConstraints == PrecisionConstraints::kPREFER) { + os << " (prefer precision constraints)"; + } + return os; +} + +std::ostream& printTimingCache(std::ostream& os, BuildOptions const& options) { + switch (options.timingCacheMode) { + case TimingCacheMode::kGLOBAL: + os << "global"; + break; + case TimingCacheMode::kLOCAL: + os << "local"; + break; + case TimingCacheMode::kDISABLE: + os << "disable"; + break; + } + return os; +} + +std::ostream& printSparsity(std::ostream& os, BuildOptions const& options) { + switch (options.sparsity) { + case SparsityFlag::kDISABLE: + os << "Disabled"; + break; + case SparsityFlag::kENABLE: + os << "Enabled"; + break; + case SparsityFlag::kFORCE: + os << "Forced"; + break; + } + + return os; +} + +std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options) { + auto const printValueOrDefault = [&os](double const val) { + if (val >= 0) { + os << val << " MiB"; + } else { + os << "default"; + } + }; + os << "workspace: "; + printValueOrDefault(options.workspace); + os << ", "; + os << "dlaSRAM: "; + printValueOrDefault(options.dlaSRAM); + os << ", "; + os << "dlaLocalDRAM: "; + printValueOrDefault(options.dlaLocalDRAM); + os << ", "; + os << "dlaGlobalDRAM: "; + printValueOrDefault(options.dlaGlobalDRAM); + return os; +} + +} // namespace + +Arguments argsToArgumentsMap(int32_t argc, char* argv[]) { + Arguments arguments; + for (int32_t i = 1; i < argc; ++i) { + auto valuePtr = strchr(argv[i], '='); + if (valuePtr) { + std::string value{valuePtr + 1}; + arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); + } else { + arguments.emplace(argv[i], ""); + } + } + return arguments; +} + +void BaseModelOptions::parse(Arguments& arguments) { + if (getAndDelOption(arguments, "--onnx", model)) { + format = ModelFormat::kONNX; + } else if (getAndDelOption(arguments, "--uff", model)) { + format = ModelFormat::kUFF; + } else if (getAndDelOption(arguments, "--model", model)) { + format = ModelFormat::kCAFFE; + } +} + +void UffInput::parse(Arguments& arguments) { + getAndDelOption(arguments, "--uffNHWC", NHWC); + std::vector args; + if (getAndDelRepeatedOption(arguments, "--uffInput", args)) { + for (const auto& i : args) { + std::vector values{splitToStringVec(i, ',')}; + if (values.size() == 4) { + nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), + std::stoi(values[3])}; + inputs.emplace_back(values[0], dims); + } else { + throw std::invalid_argument(std::string("Invalid uffInput ") + i); + } + } + } +} + +void ModelOptions::parse(Arguments& arguments) { + baseModel.parse(arguments); + + switch (baseModel.format) { 
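+    // For example (illustrative, not from the sample): "--onnx=model.onnx"
+    // makes BaseModelOptions::parse above set format to ModelFormat::kONNX,
+    // so this switch only gathers the format-specific extras: --deploy for
+    // Caffe and --uffInput for UFF; plain ONNX needs none.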
+ case ModelFormat::kCAFFE: { + getAndDelOption(arguments, "--deploy", prototxt); + break; + } + case ModelFormat::kUFF: { + uffInputs.parse(arguments); + if (uffInputs.inputs.empty()) { + throw std::invalid_argument("Uff models require at least one input"); + } + break; + } + case ModelFormat::kONNX: + break; + case ModelFormat::kANY: { + if (getAndDelOption(arguments, "--deploy", prototxt)) { + baseModel.format = ModelFormat::kCAFFE; + } + break; + } + } + + // The --output flag should only be used with Caffe and UFF. It has no effect + // on ONNX. + std::vector outArgs; + if (getAndDelRepeatedOption(arguments, "--output", outArgs)) { + for (const auto& o : outArgs) { + for (auto& v : splitToStringVec(o, ',')) { + outputs.emplace_back(std::move(v)); + } + } + } + if (baseModel.format == ModelFormat::kCAFFE || + baseModel.format == ModelFormat::kUFF) { + if (outputs.empty()) { + throw std::invalid_argument( + "Caffe and Uff models require at least one output"); + } + } else if (baseModel.format == ModelFormat::kONNX) { + if (!outputs.empty()) { + throw std::invalid_argument( + "The --output flag should not be used with ONNX models."); + } + } +} + +void BuildOptions::parse(Arguments& arguments) { + auto getFormats = [&arguments](std::vector& formatsVector, + const char* argument) { + std::string list; + getAndDelOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) { + formatsVector.push_back(stringToValue(f)); + } + }; + + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + + bool addedExplicitBatchFlag{false}; + getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag); + if (addedExplicitBatchFlag) { + sample::gLogWarning + << "--explicitBatch flag has been deprecated and has no effect!" + << std::endl; + sample::gLogWarning << "Explicit batch dim is automatically enabled if " + "input model is ONNX or if dynamic " + << "shapes are provided when the engine is built." + << std::endl; + } + + bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", + nvinfer1::OptProfileSelector::kMIN); + bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", + nvinfer1::OptProfileSelector::kOPT); + bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", + nvinfer1::OptProfileSelector::kMAX); + processShapes(shapes, minShapes, optShapes, maxShapes, false); + bool minShapesCalib = + getShapesBuild(arguments, shapesCalib, "--minShapesCalib", + nvinfer1::OptProfileSelector::kMIN); + bool optShapesCalib = + getShapesBuild(arguments, shapesCalib, "--optShapesCalib", + nvinfer1::OptProfileSelector::kOPT); + bool maxShapesCalib = + getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", + nvinfer1::OptProfileSelector::kMAX); + processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, + true); + + bool addedExplicitPrecisionFlag{false}; + getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag); + if (addedExplicitPrecisionFlag) { + sample::gLogWarning + << "--explicitPrecision flag has been deprecated and has no effect!" + << std::endl; + } + + if (getAndDelOption(arguments, "--workspace", workspace)) { + sample::gLogWarning + << "--workspace flag has been deprecated by --memPoolSize flag." 
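+        // (Illustrative replacement spec, matching the parsing just below:
+        //  --memPoolSize=workspace:1024,dlaSRAM:8 gives the workspace pool
+        //  1024 MiB and the DLA SRAM pool 8 MiB.)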
+ << std::endl; + } + + std::string memPoolSizes; + getAndDelOption(arguments, "--memPoolSize", memPoolSizes); + std::vector memPoolSpecs{splitToStringVec(memPoolSizes, ',')}; + for (auto const& memPoolSpec : memPoolSpecs) { + std::string memPoolName; + double memPoolSize; + std::tie(memPoolName, memPoolSize) = splitNameAndValue(memPoolSpec); + if (memPoolSize < 0) { + throw std::invalid_argument(std::string("Negative memory pool size: ") + + std::to_string(memPoolSize)); + } + if (memPoolName == "workspace") { + workspace = memPoolSize; + } else if (memPoolName == "dlaSRAM") { + dlaSRAM = memPoolSize; + } else if (memPoolName == "dlaLocalDRAM") { + dlaLocalDRAM = memPoolSize; + } else if (memPoolName == "dlaGlobalDRAM") { + dlaGlobalDRAM = memPoolSize; + } else if (!memPoolName.empty()) { + throw std::invalid_argument(std::string("Unknown memory pool: ") + + memPoolName); + } + } + + getAndDelOption(arguments, "--maxBatch", maxBatch); + getAndDelOption(arguments, "--minTiming", minTiming); + getAndDelOption(arguments, "--avgTiming", avgTiming); + + bool best{false}; + getAndDelOption(arguments, "--best", best); + if (best) { + int8 = true; + fp16 = true; + } + + getAndDelOption(arguments, "--refit", refittable); + getAndDelNegOption(arguments, "--noTF32", tf32); + getAndDelOption(arguments, "--fp16", fp16); + getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--safe", safe); + getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--restricted", restricted); + + getAndDelOption(arguments, "--directIO", directIO); + + std::string precisionConstraintsString; + getAndDelOption(arguments, "--precisionConstraints", + precisionConstraintsString); + if (!precisionConstraintsString.empty()) { + const std::unordered_map + precisionConstraintsMap = {{"obey", PrecisionConstraints::kOBEY}, + {"prefer", PrecisionConstraints::kPREFER}, + {"none", PrecisionConstraints::kNONE}}; + auto it = precisionConstraintsMap.find(precisionConstraintsString); + if (it == precisionConstraintsMap.end()) { + throw std::invalid_argument( + std::string("Unknown precision constraints: ") + + precisionConstraintsString); + } + precisionConstraints = it->second; + } else { + precisionConstraints = PrecisionConstraints::kNONE; + } + + getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions); + getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes); + + if (layerPrecisions.empty() && layerOutputTypes.empty() && + precisionConstraints != PrecisionConstraints::kNONE) { + sample::gLogWarning << "When --precisionConstraints flag is set to " + "\"obey\" or \"prefer\", please add " + << "--layerPrecision/--layerOutputTypes flags to set " + "layer-wise precisions and output " + << "types." << std::endl; + } else if ((!layerPrecisions.empty() || !layerOutputTypes.empty()) && + precisionConstraints == PrecisionConstraints::kNONE) { + sample::gLogWarning << "--layerPrecision/--layerOutputTypes flags have no " + "effect when --precisionConstraints " + << "flag is set to \"none\"." 
<< std::endl; + } + + std::string sparsityString; + getAndDelOption(arguments, "--sparsity", sparsityString); + if (sparsityString == "disable") { + sparsity = SparsityFlag::kDISABLE; + } else if (sparsityString == "enable") { + sparsity = SparsityFlag::kENABLE; + } else if (sparsityString == "force") { + sparsity = SparsityFlag::kFORCE; + } else if (!sparsityString.empty()) { + throw std::invalid_argument(std::string("Unknown sparsity mode: ") + + sparsityString); + } + + bool calibCheck = getAndDelOption(arguments, "--calib", calibration); + if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty()) { + shapesCalib = shapes; + } + + std::string profilingVerbosityString; + if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString)) { + sample::gLogWarning + << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." + << std::endl; + } + + getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString); + if (profilingVerbosityString == "layer_names_only") { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; + } else if (profilingVerbosityString == "none") { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE; + } else if (profilingVerbosityString == "detailed") { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; + } else if (profilingVerbosityString == "default") { + sample::gLogWarning + << "--profilingVerbosity=default has been deprecated by " + "--profilingVerbosity=layer_names_only." + << std::endl; + profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; + } else if (profilingVerbosityString == "verbose") { + sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated " + "by --profilingVerbosity=detailed." + << std::endl; + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; + } else if (!profilingVerbosityString.empty()) { + throw std::invalid_argument(std::string("Unknown profilingVerbosity: ") + + profilingVerbosityString); + } + + if (getAndDelOption(arguments, "--loadEngine", engine)) { + load = true; + } + if (getAndDelOption(arguments, "--saveEngine", engine)) { + save = true; + } + if (load && save) { + throw std::invalid_argument( + "Incompatible load and save engine options selected"); + } + + std::string tacticSourceArgs; + if (getAndDelOption(arguments, "--tacticSources", tacticSourceArgs)) { + std::vector tacticList = + splitToStringVec(tacticSourceArgs, ','); + for (auto& t : tacticList) { + bool enable{false}; + if (t.front() == '+') { + enable = true; + } else if (t.front() != '-') { + throw std::invalid_argument( + "Tactic source must be prefixed with + or -, indicating whether it " + "should be enabled or disabled " + "respectively."); + } + t.erase(0, 1); + + const auto toUpper = [](std::string& sourceName) { + std::transform(sourceName.begin(), sourceName.end(), sourceName.begin(), + [](char c) { return std::toupper(c); }); + return sourceName; + }; + + nvinfer1::TacticSource source{}; + t = toUpper(t); + if (t == "CUBLAS") { + source = nvinfer1::TacticSource::kCUBLAS; + } else if (t == "CUBLASLT" || t == "CUBLAS_LT") { + source = nvinfer1::TacticSource::kCUBLAS_LT; + } else if (t == "CUDNN") { + source = nvinfer1::TacticSource::kCUDNN; + } else { + throw std::invalid_argument(std::string("Unknown tactic source: ") + t); + } + + uint32_t sourceBit = 1U << static_cast(source); + + if (enable) { + enabledTactics |= sourceBit; + } else { + disabledTactics |= sourceBit; + } + + if (enabledTactics & disabledTactics) { + throw 
std::invalid_argument(std::string("Cannot enable and disable ") + + t); + } + } + } + + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + if (noBuilderCache) { + timingCacheMode = TimingCacheMode::kDISABLE; + } else if (!timingCacheFile.empty()) { + timingCacheMode = TimingCacheMode::kGLOBAL; + } else { + timingCacheMode = TimingCacheMode::kLOCAL; + } +} + +void SystemOptions::parse(Arguments& arguments) { + getAndDelOption(arguments, "--device", device); + getAndDelOption(arguments, "--useDLACore", DLACore); + getAndDelOption(arguments, "--allowGPUFallback", fallback); + std::string pluginName; + while (getAndDelOption(arguments, "--plugins", pluginName)) { + plugins.emplace_back(pluginName); + } +} + +void InferenceOptions::parse(Arguments& arguments) { + getAndDelOption(arguments, "--streams", streams); + getAndDelOption(arguments, "--iterations", iterations); + getAndDelOption(arguments, "--duration", duration); + getAndDelOption(arguments, "--warmUp", warmup); + getAndDelOption(arguments, "--sleepTime", sleep); + getAndDelOption(arguments, "--idleTime", idle); + bool exposeDMA{false}; + if (getAndDelOption(arguments, "--exposeDMA", exposeDMA)) { + overlap = !exposeDMA; + } + getAndDelOption(arguments, "--noDataTransfers", skipTransfers); + getAndDelOption(arguments, "--useManagedMemory", useManaged); + getAndDelOption(arguments, "--useSpinWait", spin); + getAndDelOption(arguments, "--threads", threads); + getAndDelOption(arguments, "--useCudaGraph", graph); + getAndDelOption(arguments, "--separateProfileRun", rerun); + getAndDelOption(arguments, "--buildOnly", skip); + getAndDelOption(arguments, "--timeDeserialize", timeDeserialize); + getAndDelOption(arguments, "--timeRefit", timeRefit); + + std::string list; + getAndDelOption(arguments, "--loadInputs", list); + std::vector inputsList{splitToStringVec(list, ',')}; + splitInsertKeyValue(inputsList, inputs); + + getShapesInference(arguments, shapes, "--shapes"); + getAndDelOption(arguments, "--batch", batch); +} + +void ReportingOptions::parse(Arguments& arguments) { + getAndDelOption(arguments, "--percentile", percentile); + getAndDelOption(arguments, "--avgRuns", avgs); + getAndDelOption(arguments, "--verbose", verbose); + getAndDelOption(arguments, "--dumpRefit", refit); + getAndDelOption(arguments, "--dumpOutput", output); + getAndDelOption(arguments, "--dumpProfile", profile); + getAndDelOption(arguments, "--dumpLayerInfo", layerInfo); + getAndDelOption(arguments, "--exportTimes", exportTimes); + getAndDelOption(arguments, "--exportOutput", exportOutput); + getAndDelOption(arguments, "--exportProfile", exportProfile); + getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo); + if (percentile < 0 || percentile > 100) { + throw std::invalid_argument(std::string("Percentile ") + + std::to_string(percentile) + + "is not in [0,100]"); + } +} + +bool parseHelp(Arguments& arguments) { + bool helpLong{false}; + bool helpShort{false}; + getAndDelOption(arguments, "--help", helpLong); + getAndDelOption(arguments, "-h", helpShort); + return helpLong || helpShort; +} + +void AllOptions::parse(Arguments& arguments) { + model.parse(arguments); + build.parse(arguments); + system.parse(arguments); + inference.parse(arguments); + + // Use explicitBatch when input model is ONNX or when dynamic shapes are used. 
+ const bool isOnnx{model.baseModel.format == ModelFormat::kONNX}; + const bool hasDynamicShapes{!build.shapes.empty() || + !inference.shapes.empty()}; + const bool detectedExplicitBatch = isOnnx || hasDynamicShapes; + + // Throw an error if user tries to use --batch or --maxBatch when the engine + // has explicit batch dim. + const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided}; + const bool batchWasSet{inference.batch != batchNotProvided}; + if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet)) { + throw std::invalid_argument( + "The --batch and --maxBatch flags should not be used when the input " + "model is ONNX or when dynamic shapes " + "are provided. Please use --optShapes and --shapes to set input shapes " + "instead."); + } + + // If batch and/or maxBatch is not set and the engine has implicit batch dim, + // set them to default values. + if (!detectedExplicitBatch) { + // If batch is not set, set it to default value. + if (!batchWasSet) { + inference.batch = defaultBatch; + } + // If maxBatch is not set, set it to be equal to batch. + if (!maxBatchWasSet) { + build.maxBatch = inference.batch; + } + // MaxBatch should not be less than batch. + if (build.maxBatch < inference.batch) { + throw std::invalid_argument( + "Build max batch " + std::to_string(build.maxBatch) + + " is less than inference batch " + std::to_string(inference.batch)); + } + } + + if (build.shapes.empty() && !inference.shapes.empty()) { + // If --shapes are provided but --optShapes are not, assume that optShapes + // is the same as shapes. + for (auto& s : inference.shapes) { + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, + s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, + s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, + s.first, s.second); + } + } else if (!build.shapes.empty() && inference.shapes.empty()) { + // If --optShapes are provided but --shapes are not, assume that shapes is + // the same as optShapes. + for (auto& s : build.shapes) { + insertShapesInference( + inference.shapes, s.first, + s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + } + } + + reporting.parse(arguments); + helps = parseHelp(arguments); + + if (!helps) { + if (!build.load && model.baseModel.format == ModelFormat::kANY) { + throw std::invalid_argument("Model missing or format not recognized"); + } + if (build.safe && system.DLACore >= 0) { + auto checkSafeDLAFormats = [](std::vector const& fmt) { + return fmt.empty() + ? 
false + : std::all_of(fmt.begin(), fmt.end(), + [](IOFormat const& pair) { + bool supported{false}; + bool const isLINEAR{ + pair.second == + 1U << static_cast( + nvinfer1::TensorFormat::kLINEAR)}; + bool const isCHW4{ + pair.second == + 1U << static_cast( + nvinfer1::TensorFormat::kCHW4)}; + bool const isCHW32{ + pair.second == + 1U << static_cast( + nvinfer1::TensorFormat::kCHW32)}; + bool const isCHW16{ + pair.second == + 1U << static_cast( + nvinfer1::TensorFormat::kCHW16)}; + supported |= pair.first == + nvinfer1::DataType::kINT8 && + (isLINEAR || isCHW4 || isCHW32); + supported |= pair.first == + nvinfer1::DataType::kHALF && + (isLINEAR || isCHW4 || isCHW16); + return supported; + }); + }; + if (!checkSafeDLAFormats(build.inputFormats) || + !checkSafeDLAFormats(build.outputFormats)) { + throw std::invalid_argument( + "I/O formats for safe DLA capability are restricted to " + "fp16/int8:linear, fp16:chw16 or int8:chw32"); + } + if (system.fallback) { + throw std::invalid_argument( + "GPU fallback (--allowGPUFallback) not allowed for safe DLA " + "capability"); + } + } + } +} + +void SafeBuilderOptions::parse(Arguments& arguments) { + auto getFormats = [&arguments](std::vector& formatsVector, + const char* argument) { + std::string list; + getAndDelOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) { + formatsVector.push_back(stringToValue(f)); + } + }; + + getAndDelOption(arguments, "--serialized", serialized); + getAndDelOption(arguments, "--onnx", onnxModelFile); + getAndDelOption(arguments, "--help", help); + getAndDelOption(arguments, "-h", help); + getAndDelOption(arguments, "--verbose", verbose); + getAndDelOption(arguments, "-v", verbose); + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--calib", calibFile); + getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--std", standard); + std::string pluginName; + while (getAndDelOption(arguments, "--plugins", pluginName)) { + plugins.emplace_back(pluginName); + } +} + +std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) { + os << "=== Model Options ===" << std::endl; + + os << "Format: "; + switch (options.format) { + case ModelFormat::kCAFFE: { + os << "Caffe"; + break; + } + case ModelFormat::kONNX: { + os << "ONNX"; + break; + } + case ModelFormat::kUFF: { + os << "UFF"; + break; + } + case ModelFormat::kANY: + os << "*"; + break; + } + os << std::endl << "Model: " << options.model << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, const UffInput& input) { + os << "Uff Inputs Layout: " << (input.NHWC ? 
"NHWC" : "NCHW") << std::endl; + for (const auto& i : input.inputs) { + os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] + << "," << i.second.d[2] << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ModelOptions& options) { + os << options.baseModel; + switch (options.baseModel.format) { + case ModelFormat::kCAFFE: { + os << "Prototxt: " << options.prototxt << std::endl; + break; + } + case ModelFormat::kUFF: { + os << options.uffInputs; + break; + } + case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or + // the generic case + case ModelFormat::kANY: + break; + } + + os << "Output:"; + for (const auto& o : options.outputs) { + os << " " << o; + } + os << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) { + switch (dtype) { + case nvinfer1::DataType::kFLOAT: { + os << "fp32"; + break; + } + case nvinfer1::DataType::kHALF: { + os << "fp16"; + break; + } + case nvinfer1::DataType::kINT8: { + os << "int8"; + break; + } + case nvinfer1::DataType::kINT32: { + os << "int32"; + break; + } + case nvinfer1::DataType::kBOOL: { + os << "bool"; + break; + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, IOFormat const& format) { + os << format.first << ":"; + + for (int32_t f = 0; f < nvinfer1::EnumMax(); ++f) { + if ((1U << f) & format.second) { + if (f) { + os << "+"; + } + switch (nvinfer1::TensorFormat(f)) { + case nvinfer1::TensorFormat::kLINEAR: { + os << "chw"; + break; + } + case nvinfer1::TensorFormat::kCHW2: { + os << "chw2"; + break; + } + case nvinfer1::TensorFormat::kHWC8: { + os << "hwc8"; + break; + } + case nvinfer1::TensorFormat::kHWC16: { + os << "hwc16"; + break; + } + case nvinfer1::TensorFormat::kCHW4: { + os << "chw4"; + break; + } + case nvinfer1::TensorFormat::kCHW16: { + os << "chw16"; + break; + } + case nvinfer1::TensorFormat::kCHW32: { + os << "chw32"; + break; + } + case nvinfer1::TensorFormat::kDHWC8: { + os << "dhwc8"; + break; + } + case nvinfer1::TensorFormat::kCDHW32: { + os << "cdhw32"; + break; + } + case nvinfer1::TensorFormat::kHWC: { + os << "hwc"; + break; + } + case nvinfer1::TensorFormat::kDLA_LINEAR: { + os << "dla_linear"; + break; + } + case nvinfer1::TensorFormat::kDLA_HWC4: { + os << "dla_hwc4"; + break; + } + } + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) { + int32_t i = 0; + for (const auto& d : dims) { + if (!d.size()) { + break; + } + os << (i ? "+" : "") << d; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, + LayerPrecisions const& layerPrecisions) { + int32_t i = 0; + for (auto const& layerPrecision : layerPrecisions) { + os << (i ? "," : "") << layerPrecision.first << ":" + << layerPrecision.second; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const BuildOptions& options) { + // clang-format off + os << "=== Build Options ===" << std::endl << + + "Max batch: "; printBatch(os, options.maxBatch) << std::endl << + "Memory Pools: "; printMemoryPools(os, options) << std::endl << + "minTiming: " << options.minTiming << std::endl << + "avgTiming: " << options.avgTiming << std::endl << + "Precision: "; printPrecision(os, options) << std::endl << + "LayerPrecisions: " << options.layerPrecisions << std::endl << + "Calibration: " << (options.int8 && options.calibration.empty() ? 
"Dynamic" : options.calibration.c_str()) << std::endl << + "Refit: " << boolToEnabled(options.refittable) << std::endl << + "Sparsity: "; printSparsity(os, options) << std::endl << + "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << + "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << + "Save engine: " << (options.save ? options.engine : "") << std::endl << + "Load engine: " << (options.load ? options.engine : "") << std::endl << + "Profiling verbosity: " << static_cast(options.profilingVerbosity) << std::endl << + "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << + "timingCacheMode: "; printTimingCache(os, options) << std::endl << + "timingCacheFile: " << options.timingCacheFile << std::endl; + // clang-format on + + auto printIOFormats = [](std::ostream& os, const char* direction, + const std::vector formats) { + if (formats.empty()) { + os << direction << "s format: fp32:CHW" << std::endl; + } else { + for (const auto& f : formats) { + os << direction << ": " << f << std::endl; + } + } + }; + + printIOFormats(os, "Input(s)", options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + printShapes(os, "build", options.shapes); + printShapes(os, "calibration", options.shapesCalib); + + return os; +} + +std::ostream& operator<<(std::ostream& os, const SystemOptions& options) { + // clang-format off + os << "=== System Options ===" << std::endl << + + "Device: " << options.device << std::endl << + "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << + (options.DLACore != -1 && options.fallback ? "(With GPU fallback)" : "") << std::endl; + os << "Plugins:"; + + for (const auto& p : options.plugins) + { + os << " " << p; + } + os << std::endl; + + return os; + // clang-format on +} + +std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { + // clang-format off + os << "=== Inference Options ===" << std::endl << + + "Batch: "; + if (options.batch && options.shapes.empty()) + { + os << options.batch << std::endl; + } + else + { + os << "Explicit" << std::endl; + } + printShapes(os, "inference", options.shapes); + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Idle time: " << options.idle << "ms" << std::endl << + "Streams: " << options.streams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << + "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << + "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << + "Skip inference: " << boolToEnabled(options.skip) << std::endl; + + // clang-format on + os << "Inputs:" << std::endl; + for (const auto& input : options.inputs) { + os << input.first << "<-" << input.second << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) { + // clang-format off + os << "=== Reporting Options ===" << 
std::endl << + + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentile: " << options.percentile << std::endl << + "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export output to JSON file: " << options.exportOutput << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; + // clang-format on + + return os; +} + +std::ostream& operator<<(std::ostream& os, const AllOptions& options) { + os << options.model << options.build << options.system << options.inference + << options.reporting << std::endl; + return os; +} + +std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) { + auto printIOFormats = [](std::ostream& os, const char* direction, + const std::vector formats) { + if (formats.empty()) { + os << direction << "s format: fp32:CHW" << std::endl; + } else { + for (const auto& f : formats) { + os << direction << ": " << f << std::endl; + } + } + }; + + os << "=== Build Options ===" << std::endl; + os << "Model ONNX: " << options.onnxModelFile << std::endl; + + os << "Precision: FP16"; + if (options.int8) { + os << " + INT8"; + } + os << std::endl; + os << "Calibration file: " << options.calibFile << std::endl; + os << "Serialized Network: " << options.serialized << std::endl; + + printIOFormats(os, "Input(s)", options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + + os << "Plugins:"; + for (const auto& p : options.plugins) { + os << " " << p; + } + os << std::endl; + return os; +} + +void BaseModelOptions::help(std::ostream& os) { + // clang-format off + os << " --uff= UFF model" << std::endl << + " --onnx= ONNX model" << std::endl << + " --model= Caffe model (default = no model, random weights used)" << std::endl; + // clang-format on +} + +void UffInput::help(std::ostream& os) { + // clang-format off + os << " --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " + "multiple times; at least one is required for UFF models" << std::endl << + " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << + "X,Y,Z=H,W,C order in --uffInput)" << std::endl; + // clang-format on +} + +void ModelOptions::help(std::ostream& os) { + // clang-format off + os << "=== Model Options ===" << std::endl; + BaseModelOptions::help(os); + os << " --deploy= Caffe prototxt file" << std::endl << + " --output=[,]* Output names (it can be specified multiple times); at least one output " + "is required for UFF and Caffe" << std::endl; + UffInput::help(os); + // clang-format on +} + +void BuildOptions::help(std::ostream& os) { + // clang-format off + os << "=== Build Options ===" "\n" + " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" + " This option should not be used when the input model is ONNX or when dynamic shapes are provided." 
"\n" + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" + " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" + " Note: All three of min, opt and max shapes must be supplied." "\n" + " However, if only opt shapes is supplied then it will be expanded so" "\n" + " that min shapes and max shapes are set to the same values as opt shapes." "\n" + " Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." "\n" + " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" + " Each input shape is supplied as a key-value pair where key is the input name and" "\n" + " value is the dimensions (including the batch dimension) to be used for that input." "\n" + " Each key-value pair has the key and value separated using a colon (:)." "\n" + " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" + " See --outputIOFormats help for the grammar of type and format list." "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " inputs following the same order as network inputs ID (even if only one input" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " outputs following the same order as network outputs ID (even if only one output" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + " IO Formats: spec ::= IOfmt[\",\"spec]" "\n" + " IOfmt ::= type:fmt" "\n" + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" + " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" "\n" + " --workspace=N Set workspace size in MiB." "\n" + " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" + " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" + " Pool constraint: poolspec ::= poolfmt[\",\"poolspec]" "\n" + " poolfmt ::= pool:sizeInMiB" "\n" + " pool ::= \"workspace\"|\"dlaSRAM\"|\"dlaLocalDRAM\"|\"dlaGlobalDRAM\"" "\n" + " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" + " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " + << defaultMinTiming << ")" "\n" + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" "\n" + " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" + " and weights within the engine." "\n" + " --sparsity=spec Control sparsity (default = disabled). 
" "\n" + " Sparsity: spec ::= \"disable\", \"enable\", \"force\"" "\n" + " Note: Description about each of these options is as below" "\n" + " disable = do not enable sparse tactics in the builder (this is the default)" "\n" + " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" + " considered if the weights have the right sparsity pattern)" "\n" + " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" + " a sparsity pattern (even if you loaded a model yourself)" "\n" + " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" + " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" + " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" + " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" + " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" + " Precision Constaints: spec ::= \"none\" | \"obey\" | \"prefer\"" "\n" + " none = no constraints" "\n" + " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" + " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" + " otherwise" "\n" + " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" + " \"obey\" or \"prefer\". (default = none)" "\n" + " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" + " layerName to specify the default precision for all the unspecified layers." "\n" + " Per-layer precision spec ::= layerPrecision[\",\"spec]" "\n" + " layerPrecision ::= layerName\":\"precision" "\n" + " precision ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" "\n" + " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" + " \"obey\" or \"prefer\". (default = none)" "\n" + " The specs are read left-to-right, and later ones override earlier ones. \"*\" can be used as a" "\n" + " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" + " one output, then multiple types separated by \"+\" can be provided for this layer." "\n" + " Per-layer output type spec ::= layerOutputTypes[\",\"spec]" "\n" + " layerOutputTypes ::= layerName\":\"type" "\n" + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"[\"+\"type]" "\n" + " --calib= Read INT8 calibration cache file" "\n" + " --safe Enable build safety certified engine" "\n" + " --consistency Perform consistency checking on safety certified engine" "\n" + " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" + " --saveEngine= Save the serialized engine" "\n" + " --loadEngine= Load a serialized engine" "\n" + " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" + " tactic sources (default = all available tactics)." "\n" + " Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics." 
"\n" + " Tactic Sources: tactics ::= [\",\"tactic]" "\n" + " tactic ::= (+|-)lib" "\n" + " lib ::= \"CUBLAS\"|\"CUBLAS_LT\"|\"CUDNN\"" "\n" + " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" + " --timingCacheFile= Save/load the serialized global timing cache" "\n" + ; + // clang-format on + os << std::flush; +} + +void SystemOptions::help(std::ostream& os) { + // clang-format off + os << "=== System Options ===" << std::endl << + " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << + " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " + "(default = disabled)" << std::endl; + os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; + // clang-format on +} + +void InferenceOptions::help(std::ostream& os) { + // clang-format off + os << "=== Inference Options ===" << std::endl << + " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << + " shapes are provided when the engine is built." << std::endl << + " --shapes=spec Set input shapes for dynamic shapes inference inputs." << std::endl << + " Note: Input names can be wrapped with escaped single quotes (ex: \\\'Input:0\\\')." << std::endl << + " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << + " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << + " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << + " Each key-value pair has the key and value separated using a colon (:)." << std::endl << + " Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl << + " --loadInputs=spec Load input values from files (default = generate random inputs). Input names can be " + "wrapped with single quotes (ex: 'Input:0')" << std::endl << + " Input values spec ::= Ival[\",\"spec]" << std::endl << + " Ival ::= name\":\"file" << std::endl << + " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << + " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = " + << defaultWarmUp << ")" << std::endl << + " --duration=N Run performance measurements for at least N seconds wallclock time (default = " + << defaultDuration << ")" << std::endl << + " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute " + "(default = " << defaultSleep << ")" << std::endl << + " --idleTime=N Sleep N milliseconds between two continuous iterations" + "(default = " << defaultIdle << ")" << std::endl << + " --streams=N Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl << + " --exposeDMA Serialize DMA transfers to and from device (default = disabled)." << std::endl << + " --noDataTransfers Disable DMA transfers to and from device (default = enabled)." << std::endl << + " --useManagedMemory Use managed memory instead of seperate host and device allocations (default = disabled)." << std::endl << + " --useSpinWait Actively synchronize on GPU events. 
This option may decrease synchronization time but " + "increase CPU usage and power (default = disabled)" << std::endl << + " --threads Enable multithreading to drive engines with independent threads" + " or speed up refitting (default = disabled) " << std::endl << + " --useCudaGraph Use CUDA graph to capture engine execution and then launch inference (default = disabled)." << std::endl << + " This flag may be ignored if the graph capture fails." << std::endl << + " --timeDeserialize Time the amount of time it takes to deserialize the network and exit." << std::endl << + " --timeRefit Time the amount of time it takes to refit the engine before inference." << std::endl << + " --separateProfileRun Do not attach the profiler in the benchmark run; if profiling is enabled, a second " + "profile run will be executed (default = disabled)" << std::endl << + " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; + // clang-format on +} + +void ReportingOptions::help(std::ostream& os) { + // clang-format off + os << "=== Reporting Options ===" << std::endl << + " --verbose Use verbose logging (default = false)" << std::endl << + " --avgRuns=N Report performance measurements averaged over N consecutive " + "iterations (default = " << defaultAvgRuns << ")" << std::endl << + " --percentile=P Report performance for the P percentage (0<=P<=100, 0 " + "representing max perf, and 100 representing min perf; (default" + " = " << defaultPercentile << "%)" << std::endl << + " --dumpRefit Print the refittable layers and weights from a refittable " + "engine" << std::endl << + " --dumpOutput Print the output tensor(s) of the last inference iteration " + "(default = disabled)" << std::endl << + " --dumpProfile Print profile information per layer (default = disabled)" << std::endl << + " --dumpLayerInfo Print layer information of the engine to console " + "(default = disabled)" << std::endl << + " --exportTimes= Write the timing results in a json file (default = disabled)" << std::endl << + " --exportOutput= Write the output tensors to a json file (default = disabled)" << std::endl << + " --exportProfile= Write the profile information per layer in a json file " + "(default = disabled)" << std::endl << + " --exportLayerInfo= Write the layer information of the engine in a json file " + "(default = disabled)" << std::endl; + // clang-format on +} + +void helpHelp(std::ostream& os) { + // clang-format off + os << "=== Help ===" << std::endl << + " --help, -h Print this message" << std::endl; + // clang-format on +} + +void AllOptions::help(std::ostream& os) { + ModelOptions::help(os); + os << std::endl; + BuildOptions::help(os); + os << std::endl; + InferenceOptions::help(os); + os << std::endl; + // clang-format off + os << "=== Build and Inference Batch Options ===" << std::endl << + " When using implicit batch, the max batch size of the engine, if not given, " << std::endl << + " is set to the inference batch size;" << std::endl << + " when using explicit batch, if shapes are specified only for inference, they " << std::endl << + " will be used also as min/opt/max in the build profile; if shapes are " << std::endl << + " specified only for the build, the opt shapes will be used also for inference;" << std::endl << + " if both are specified, they must be compatible; and if explicit batch is " << std::endl << + " enabled but neither is specified, the model must provide complete static" << std::endl << + " dimensions, including batch size, for all inputs" << std::endl << + " Using ONNX 
models automatically forces explicit batch." << std::endl << + std::endl; + // clang-format on + ReportingOptions::help(os); + os << std::endl; + SystemOptions::help(os); + os << std::endl; + helpHelp(os); +} + +void SafeBuilderOptions::printHelp(std::ostream& os) { + // clang-format off + os << "=== Mandatory ===" << std::endl << + " --onnx= ONNX model" << std::endl << + " " << std::endl << + "=== Optional ===" << std::endl << + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" << std::endl << + " See --outputIOFormats help for the grammar of type and format list." << std::endl << + " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << + " inputs following the same order as network inputs ID (even if only one input" << std::endl << + " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" << std::endl << + " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << + " outputs following the same order as network outputs ID (even if only one output" << std::endl << + " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << + " IO Formats: spec ::= IOfmt[\",\"spec]" << std::endl << + " IOfmt ::= type:fmt" << std::endl << + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl << + " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\"|\"dhwc8\")[\"+\"fmt]" << std::endl << + " --int8 Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl << + " --consistency Enable consistency check for serialized engine, (default = disabled)" << std::endl << + " --std Build standard serialized engine, (default = disabled)" << std::endl << + " --calib= Read INT8 calibration cache file" << std::endl << + " --serialized= Save the serialized network" << std::endl << + " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl << + " --verbose or -v Use verbose logging (default = false)" << std::endl << + " --help or -h Print this message" << std::endl << + " " << std::endl; + // clang-format on +} + +} // namespace sample diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleOptions.h b/csrc/fastdeploy/backends/tensorrt/common/sampleOptions.h new file mode 100644 index 000000000..99293da10 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/sampleOptions.h @@ -0,0 +1,311 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef TRT_SAMPLE_OPTIONS_H
+#define TRT_SAMPLE_OPTIONS_H
+
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "NvInfer.h"
+
+namespace sample {
+
+// Build default params
+constexpr int32_t maxBatchNotProvided{0};
+constexpr int32_t defaultMinTiming{1};
+constexpr int32_t defaultAvgTiming{8};
+
+// System default params
+constexpr int32_t defaultDevice{0};
+
+// Inference default params
+constexpr int32_t defaultBatch{1};
+constexpr int32_t batchNotProvided{0};
+constexpr int32_t defaultStreams{1};
+constexpr int32_t defaultIterations{10};
+constexpr float defaultWarmUp{200.F};
+constexpr float defaultDuration{3.F};
+constexpr float defaultSleep{};
+constexpr float defaultIdle{};
+
+// Reporting default params
+constexpr int32_t defaultAvgRuns{10};
+constexpr float defaultPercentile{99};
+
+enum class PrecisionConstraints { kNONE, kOBEY, kPREFER };
+
+enum class ModelFormat { kANY, kCAFFE, kONNX, kUFF };
+
+enum class SparsityFlag { kDISABLE, kENABLE, kFORCE };
+
+enum class TimingCacheMode { kDISABLE, kLOCAL, kGLOBAL };
+
+using Arguments = std::unordered_multimap<std::string, std::string>;
+
+using IOFormat = std::pair<nvinfer1::DataType, nvinfer1::TensorFormat>;
+
+using ShapeRange =
+    std::array<std::vector<int32_t>,
+               nvinfer1::EnumMax<nvinfer1::OptProfileSelector>()>;
+
+using LayerPrecisions = std::unordered_map<std::string, nvinfer1::DataType>;
+using LayerOutputTypes =
+    std::unordered_map<std::string, std::vector<nvinfer1::DataType>>;
+
+struct Options {
+  virtual void parse(Arguments& arguments) = 0;
+};
+
+struct BaseModelOptions : public Options {
+  ModelFormat format{ModelFormat::kANY};
+  std::string model;
+
+  void parse(Arguments& arguments) override;
+
+  static void help(std::ostream& out);
+};
+
+struct UffInput : public Options {
+  std::vector<std::pair<std::string, nvinfer1::Dims3>> inputs;
+  bool NHWC{false};
+
+  void parse(Arguments& arguments) override;
+
+  static void help(std::ostream& out);
+};
+
+struct ModelOptions : public Options {
+  BaseModelOptions baseModel;
+  std::string prototxt;
+  std::vector<std::string> outputs;
+  UffInput uffInputs;
+
+  void parse(Arguments& arguments) override;
+
+  static void help(std::ostream& out);
+};
+
+struct BuildOptions : public Options {
+  int32_t maxBatch{maxBatchNotProvided};
+  double workspace{-1.0};
+  double dlaSRAM{-1.0};
+  double dlaLocalDRAM{-1.0};
+  double dlaGlobalDRAM{-1.0};
+  int32_t minTiming{defaultMinTiming};
+  int32_t avgTiming{defaultAvgTiming};
+  bool tf32{true};
+  bool fp16{false};
+  bool int8{false};
+  bool directIO{false};
+  PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE};
+  LayerPrecisions layerPrecisions;
+  LayerOutputTypes layerOutputTypes;
+  bool safe{false};
+  bool consistency{false};
+  bool restricted{false};
+  bool save{false};
+  bool load{false};
+  bool refittable{false};
+  SparsityFlag sparsity{SparsityFlag::kDISABLE};
+  nvinfer1::ProfilingVerbosity profilingVerbosity{
+      nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY};
+  std::string engine;
+  std::string calibration;
+  std::unordered_map<std::string, ShapeRange> shapes;
+  std::unordered_map<std::string, ShapeRange> shapesCalib;
+  std::vector<IOFormat> inputFormats;
+  std::vector<IOFormat> outputFormats;
+  nvinfer1::TacticSources enabledTactics{0};
+  nvinfer1::TacticSources disabledTactics{0};
+  TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL};
+  std::string timingCacheFile{};
+
+  void parse(Arguments& arguments) override;
+
+  static void help(std::ostream& out);
+};
+
+struct SystemOptions : public Options {
+  int32_t device{defaultDevice};
+  int32_t DLACore{-1};
+  bool fallback{false};
+  std::vector<std::string> plugins;
+
+  void parse(Arguments& arguments) override;
+
+  static void help(std::ostream& out);
+};
+
+struct InferenceOptions : public Options {
+  int32_t batch{batchNotProvided};
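+  // Illustrative note (an assumption, not upstream documentation): with
+  // "--shapes=input:4x3x224x224 --loadInputs=input:in.bin",
+  // InferenceOptions::parse fills shapes["input"] = {4, 3, 224, 224} and
+  // inputs["input"] = "in.bin"; batch keeps batchNotProvided, since --batch
+  // is rejected whenever dynamic shapes are given.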
+  int32_t iterations{defaultIterations};
+  int32_t streams{defaultStreams};
+  float warmup{defaultWarmUp};
+  float duration{defaultDuration};
+  float sleep{defaultSleep};
+  float idle{defaultIdle};
+  bool overlap{true};
+  bool skipTransfers{false};
+  bool useManaged{false};
+  bool spin{false};
+  bool threads{false};
+  bool graph{false};
+  bool skip{false};
+  bool rerun{false};
+  bool timeDeserialize{false};
+  bool timeRefit{false};
+  std::unordered_map<std::string, std::string> inputs;
+  std::unordered_map<std::string, std::vector<int32_t>> shapes;
+
+  void parse(Arguments& arguments) override;
+
+  static void help(std::ostream& out);
+};
+
+struct ReportingOptions : public Options {
+  bool verbose{false};
+  int32_t avgs{defaultAvgRuns};
+  float percentile{defaultPercentile};
+  bool refit{false};
+  bool output{false};
+  bool profile{false};
+  bool layerInfo{false};
+  std::string exportTimes;
+  std::string exportOutput;
+  std::string exportProfile;
+  std::string exportLayerInfo;
+
+  void parse(Arguments& arguments) override;
+
+  static void help(std::ostream& out);
+};
+
+struct SafeBuilderOptions : public Options {
+  std::string serialized{};
+  std::string onnxModelFile{};
+  bool help{false};
+  bool verbose{false};
+  std::vector<IOFormat> inputFormats;
+  std::vector<IOFormat> outputFormats;
+  bool int8{false};
+  std::string calibFile{};
+  std::vector<std::string> plugins;
+  bool consistency{false};
+  bool standard{false};
+
+  void parse(Arguments& arguments) override;
+
+  static void printHelp(std::ostream& out);
+};
+
+struct AllOptions : public Options {
+  ModelOptions model;
+  BuildOptions build;
+  SystemOptions system;
+  InferenceOptions inference;
+  ReportingOptions reporting;
+  bool helps{false};
+
+  void parse(Arguments& arguments) override;
+
+  static void help(std::ostream& out);
+};
+
+Arguments argsToArgumentsMap(int32_t argc, char* argv[]);
+
+bool parseHelp(Arguments& arguments);
+
+void helpHelp(std::ostream& out);
+
+// Functions to print options
+
+std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const UffInput& input);
+
+std::ostream& operator<<(std::ostream& os, const IOFormat& format);
+
+std::ostream& operator<<(std::ostream& os, const ShapeRange& dims);
+
+std::ostream& operator<<(std::ostream& os, const ModelOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const BuildOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const SystemOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const InferenceOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const ReportingOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const AllOptions& options);
+
+std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options);
+
+inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) {
+  for (int32_t i = 0; i < dims.nbDims; ++i) {
"x" : "") << dims.d[i]; + } + return os; +} +inline std::ostream& operator<<(std::ostream& os, + const nvinfer1::WeightsRole role) { + switch (role) { + case nvinfer1::WeightsRole::kKERNEL: { + os << "Kernel"; + break; + } + case nvinfer1::WeightsRole::kBIAS: { + os << "Bias"; + break; + } + case nvinfer1::WeightsRole::kSHIFT: { + os << "Shift"; + break; + } + case nvinfer1::WeightsRole::kSCALE: { + os << "Scale"; + break; + } + case nvinfer1::WeightsRole::kCONSTANT: { + os << "Constant"; + break; + } + case nvinfer1::WeightsRole::kANY: { + os << "Any"; + break; + } + } + + return os; +} + +inline std::ostream& operator<<(std::ostream& os, + const std::vector& vec) { + for (int32_t i = 0, e = static_cast(vec.size()); i < e; ++i) { + os << (i ? "x" : "") << vec[i]; + } + return os; +} + +} // namespace sample + +#endif // TRT_SAMPLES_OPTIONS_H diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleReporting.cpp b/csrc/fastdeploy/backends/tensorrt/common/sampleReporting.cpp new file mode 100644 index 000000000..5e8e8619b --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/sampleReporting.cpp @@ -0,0 +1,480 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sampleInference.h" +#include "sampleOptions.h" +#include "sampleReporting.h" + +using namespace nvinfer1; + +namespace sample { + +namespace { + +//! +//! \brief Find percentile in an ascending sequence of timings +//! \note percentile must be in [0, 100]. Otherwise, an exception is thrown. +//! +template +float findPercentile(float percentile, + std::vector const& timings, + T const& toFloat) { + int32_t const all = static_cast(timings.size()); + int32_t const exclude = static_cast((1 - percentile / 100) * all); + if (timings.empty()) { + return std::numeric_limits::infinity(); + } + if (percentile < 0.0f || percentile > 100.0f) { + throw std::runtime_error("percentile is not in [0, 100]!"); + } + return toFloat(timings[std::max(all - 1 - exclude, 0)]); +} + +//! +//! \brief Find median in a sorted sequence of timings +//! +template +float findMedian(std::vector const& timings, T const& toFloat) { + if (timings.empty()) { + return std::numeric_limits::infinity(); + } + + int32_t const m = timings.size() / 2; + if (timings.size() % 2) { + return toFloat(timings[m]); + } + + return (toFloat(timings[m - 1]) + toFloat(timings[m])) / 2; +} + +//! +//! \brief Find coefficient of variance (which is std / mean) in a sorted +//! sequence of timings given the mean +//! 
+template +float findCoeffOfVariance(std::vector const& timings, + T const& toFloat, float mean) { + if (timings.empty()) { + return 0; + } + + if (mean == 0.F) { + return std::numeric_limits::infinity(); + } + + auto const metricAccumulator = [toFloat, mean](float acc, + InferenceTime const& a) { + float const diff = toFloat(a) - mean; + return acc + diff * diff; + }; + float const variance = + std::accumulate(timings.begin(), timings.end(), 0.F, metricAccumulator) / + timings.size(); + + return std::sqrt(variance) / mean * 100.F; +} + +inline InferenceTime traceToTiming(const InferenceTrace& a) { + return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), + (a.computeEnd - a.computeStart), (a.d2hEnd - a.d2hStart), + (a.d2hEnd - a.h2dStart)); +} + +} // namespace + +void printProlog(int32_t warmups, int32_t timings, float warmupMs, + float benchTimeMs, std::ostream& os) { + os << "Warmup completed " << warmups << " queries over " << warmupMs << " ms" + << std::endl; + os << "Timing trace has " << timings << " queries over " << benchTimeMs / 1000 + << " s" << std::endl; +} + +void printTiming(std::vector const& timings, int32_t runsPerAvg, + std::ostream& os) { + int32_t count = 0; + InferenceTime sum; + + os << std::endl; + os << "=== Trace details ===" << std::endl; + os << "Trace averages of " << runsPerAvg << " runs:" << std::endl; + for (auto const& t : timings) { + sum += t; + + if (++count == runsPerAvg) { + // clang-format off + os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg + << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " << sum.e2e / runsPerAvg + << " ms, enqueue " << sum.enq / runsPerAvg << " ms)" << std::endl; + // clang-format on + count = 0; + sum.enq = 0; + sum.h2d = 0; + sum.compute = 0; + sum.d2h = 0; + sum.e2e = 0; + } + } +} + +void printMetricExplanations(std::ostream& os) { + os << std::endl; + os << "=== Explanations of the performance metrics ===" << std::endl; + os << "Total Host Walltime: the host walltime from when the first query " + "(after warmups) is enqueued to when the " + "last query is completed." + << std::endl; + os << "GPU Compute Time: the GPU latency to execute the kernels for a query." + << std::endl; + os << "Total GPU Compute Time: the summation of the GPU Compute Time of all " + "the queries. If this is significantly " + "shorter than Total Host Walltime, the GPU may be under-utilized " + "because of host-side overheads or data " + "transfers." + << std::endl; + os << "Throughput: the observed throughput computed by dividing the number " + "of queries by the Total Host Walltime. " + "If this is significantly lower than the reciprocal of GPU Compute " + "Time, the GPU may be under-utilized " + "because of host-side overheads or data transfers." + << std::endl; + os << "Enqueue Time: the host latency to enqueue a query. If this is longer " + "than GPU Compute Time, the GPU may be " + "under-utilized." + << std::endl; + os << "H2D Latency: the latency for host-to-device data transfers for input " + "tensors of a single query." + << std::endl; + os << "D2H Latency: the latency for device-to-host data transfers for output " + "tensors of a single query." + << std::endl; + os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H " + "Latency. This is the latency to infer a " + "single query." 
+ << std::endl; + os << "End-to-End Host Latency: the duration from when the H2D of a query is " + "called to when the D2H of the same " + "query is completed, which includes the latency to wait for the " + "completion of the previous query. This is " + "the latency of a query if multiple queries are enqueued consecutively." + << std::endl; +} + +PerformanceResult +getPerformanceResult(std::vector const& timings, + std::function metricGetter, + float percentile) { + auto const metricComparator = [metricGetter](InferenceTime const& a, + InferenceTime const& b) { + return metricGetter(a) < metricGetter(b); + }; + auto const metricAccumulator = [metricGetter](float acc, + InferenceTime const& a) { + return acc + metricGetter(a); + }; + std::vector newTimings = timings; + std::sort(newTimings.begin(), newTimings.end(), metricComparator); + PerformanceResult result; + result.min = metricGetter(newTimings.front()); + result.max = metricGetter(newTimings.back()); + result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, + metricAccumulator) / + newTimings.size(); + result.median = findMedian(newTimings, metricGetter); + result.percentile = findPercentile(percentile, newTimings, metricGetter); + result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean); + return result; +} + +void printEpilog(std::vector const& timings, float walltimeMs, + float percentile, int32_t batchSize, std::ostream& osInfo, + std::ostream& osWarning, std::ostream& osVerbose) { + float const throughput = batchSize * timings.size() / walltimeMs * 1000; + + auto const getLatency = [](InferenceTime const& t) { return t.latency(); }; + auto const latencyResult = + getPerformanceResult(timings, getLatency, percentile); + + auto const getEndToEnd = [](InferenceTime const& t) { return t.e2e; }; + auto const e2eLatencyResult = + getPerformanceResult(timings, getEndToEnd, percentile); + + auto const getEnqueue = [](InferenceTime const& t) { return t.enq; }; + auto const enqueueResult = + getPerformanceResult(timings, getEnqueue, percentile); + + auto const getH2d = [](InferenceTime const& t) { return t.h2d; }; + auto const h2dResult = getPerformanceResult(timings, getH2d, percentile); + + auto const getCompute = [](InferenceTime const& t) { return t.compute; }; + auto const gpuComputeResult = + getPerformanceResult(timings, getCompute, percentile); + + auto const getD2h = [](InferenceTime const& t) { return t.d2h; }; + auto const d2hResult = getPerformanceResult(timings, getD2h, percentile); + + auto const toPerfString = [percentile](const PerformanceResult& r) { + std::stringstream s; + s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean + << " ms, " + << "median = " << r.median << " ms, percentile(" << percentile + << "%) = " << r.percentile << " ms"; + return s.str(); + }; + + osInfo << std::endl; + osInfo << "=== Performance summary ===" << std::endl; + osInfo << "Throughput: " << throughput << " qps" << std::endl; + osInfo << "Latency: " << toPerfString(latencyResult) << std::endl; + osInfo << "End-to-End Host Latency: " << toPerfString(e2eLatencyResult) + << std::endl; + osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl; + osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl; + osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl; + osInfo << "D2H Latency: " << toPerfString(d2hResult) << std::endl; + osInfo << "Total Host Walltime: " << walltimeMs / 1000 << " s" << std::endl; + osInfo << "Total GPU Compute 
Time: " + << gpuComputeResult.mean * timings.size() / 1000 << " s" << std::endl; + + // Report warnings if the throughput is bound by other factors than GPU + // Compute Time. + constexpr float kENQUEUE_BOUND_REPORTING_THRESHOLD{0.8F}; + if (enqueueResult.median > + kENQUEUE_BOUND_REPORTING_THRESHOLD * gpuComputeResult.median) { + osWarning << "* Throughput may be bound by Enqueue Time rather than GPU " + "Compute and the GPU may be under-utilized." + << std::endl; + osWarning << " If not already in use, --useCudaGraph (utilize CUDA graphs " + "where possible) may increase the " + "throughput." + << std::endl; + } + if (h2dResult.median >= gpuComputeResult.median) { + osWarning << "* Throughput may be bound by host-to-device transfers for " + "the inputs rather than GPU Compute and " + "the GPU may be under-utilized." + << std::endl; + osWarning << " Add --noDataTransfers flag to disable data transfers." + << std::endl; + } + if (d2hResult.median >= gpuComputeResult.median) { + osWarning << "* Throughput may be bound by device-to-host transfers for " + "the outputs rather than GPU Compute " + "and the GPU may be under-utilized." + << std::endl; + osWarning << " Add --noDataTransfers flag to disable data transfers." + << std::endl; + } + + // Report warnings if the GPU Compute Time is unstable. + constexpr float kUNSTABLE_PERF_REPORTING_THRESHOLD{1.0F}; + if (gpuComputeResult.coeffVar > kUNSTABLE_PERF_REPORTING_THRESHOLD) { + osWarning + << "* GPU compute time is unstable, with coefficient of variance = " + << gpuComputeResult.coeffVar << "%." << std::endl; + osWarning << " If not already in use, locking GPU clock frequency or " + "adding --useSpinWait may improve the " + << "stability." << std::endl; + } + + // Explain what the metrics mean. + osInfo << "Explanations of the performance metrics are printed in the " + "verbose logs." + << std::endl; + printMetricExplanations(osVerbose); + + osInfo << std::endl; +} + +void printPerformanceReport(std::vector const& trace, + const ReportingOptions& reporting, float warmupMs, + int32_t batchSize, std::ostream& osInfo, + std::ostream& osWarning, std::ostream& osVerbose) { + auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { + return a.computeStart >= warmupMs; + }; + auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup); + int32_t const warmups = noWarmup - trace.begin(); + float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart; + // when implicit batch used, batchSize = options.inference.batch, which is + // parsed through --batch + // when explicit batch used, batchSize = options.inference.batch = 0 + // treat inference with explicit batch as a single query and report the + // throughput + batchSize = batchSize ? batchSize : 1; + printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, + warmupMs, benchTime, osInfo); + + std::vector timings(trace.size() - warmups); + std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming); + printTiming(timings, reporting.avgs, osInfo); + printEpilog(timings, benchTime, reporting.percentile, batchSize, osInfo, + osWarning, osVerbose); + + if (!reporting.exportTimes.empty()) { + exportJSONTrace(trace, reporting.exportTimes); + } +} + +//! Printed format: +//! [ value, ...] +//! value ::= { "start enq : time, "end enq" : time, "start h2d" : time, "end +//! h2d" : time, "start compute" : time, +//! "end compute" : time, "start d2h" : time, "end d2h" : time, +//! "h2d" : time, "compute" : time, +//! 
"d2h" : time, "latency" : time, "end to end" : time } +//! +void exportJSONTrace(std::vector const& trace, + std::string const& fileName) { + std::ofstream os(fileName, std::ofstream::trunc); + os << "[" << std::endl; + char const* sep = " "; + for (auto const& t : trace) { + InferenceTime const it(traceToTiming(t)); + os << sep << "{ "; + sep = ", "; + // clang-format off + os << "\"startEnqMs\" : " << t.enqStart << sep << "\"endEnqMs\" : " << t.enqEnd << sep + << "\"startH2dMs\" : " << t.h2dStart << sep << "\"endH2dMs\" : " << t.h2dEnd << sep + << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep + << "\"startD2hMs\" : " << t.d2hStart << sep << "\"endD2hMs\" : " << t.d2hEnd << sep + << "\"h2dMs\" : " << it.h2d << sep << "\"computeMs\" : " << it.compute << sep + << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << sep + << "\"endToEndMs\" : " << it.e2e << " }" << std::endl; + // clang-format on + } + os << "]" << std::endl; +} + +void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept { + if (mIterator == mLayers.end()) { + bool const first = !mLayers.empty() && mLayers.begin()->name == layerName; + mUpdatesCount += mLayers.empty() || first; + if (first) { + mIterator = mLayers.begin(); + } else { + mLayers.emplace_back(); + mLayers.back().name = layerName; + mIterator = mLayers.end() - 1; + } + } + + mIterator->timeMs += timeMs; + ++mIterator; +} + +void Profiler::print(std::ostream& os) const noexcept { + std::string const nameHdr("Layer"); + std::string const timeHdr(" Time (ms)"); + std::string const avgHdr(" Avg. Time (ms)"); + std::string const percentageHdr(" Time %"); + + float const totalTimeMs = getTotalTime(); + + auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { + return a.name.size() < b.name.size(); + }; + auto const longestName = + std::max_element(mLayers.begin(), mLayers.end(), cmpLayer); + auto const nameLength = + std::max(longestName->name.size() + 1, nameHdr.size()); + auto const timeLength = timeHdr.size(); + auto const avgLength = avgHdr.size(); + auto const percentageLength = percentageHdr.size(); + + os << std::endl + << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl + << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr + << std::endl; + + for (auto const& p : mLayers) { + // clang-format off + os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs + << std::setw(avgLength) << std::fixed << std::setprecision(4) << p.timeMs / mUpdatesCount + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100 + << std::endl; + } + { + os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2) + << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl; + // clang-format on + } + os << std::endl; +} + +void Profiler::exportJSONProfile(std::string const& fileName) const noexcept { + std::ofstream os(fileName, std::ofstream::trunc); + os << "[" << std::endl + << " { \"count\" : " << mUpdatesCount << " }" << std::endl; + + auto const totalTimeMs = getTotalTime(); + + for (auto const& l : mLayers) { + // clang-format off + os << ", {" << " \"name\" : \"" << l.name << "\"" + ", \"timeMs\" : " << l.timeMs + << ", \"averageMs\" : " << l.timeMs / 
mUpdatesCount
+       << ", \"percentage\" : " << l.timeMs / totalTimeMs * 100
+       << " }" << std::endl;
+    // clang-format on
+  }
+  os << "]" << std::endl;
+}
+
+void dumpInputs(nvinfer1::IExecutionContext const& context,
+                Bindings const& bindings, std::ostream& os) {
+  os << "Input Tensors:" << std::endl;
+  bindings.dumpInputs(context, os);
+}
+
+void dumpOutputs(nvinfer1::IExecutionContext const& context,
+                 Bindings const& bindings, std::ostream& os) {
+  os << "Output Tensors:" << std::endl;
+  bindings.dumpOutputs(context, os);
+}
+
+void exportJSONOutput(nvinfer1::IExecutionContext const& context,
+                      Bindings const& bindings, std::string const& fileName,
+                      int32_t batch) {
+  std::ofstream os(fileName, std::ofstream::trunc);
+  std::string sep = " ";
+  auto const output = bindings.getOutputBindings();
+  os << "[" << std::endl;
+  for (auto const& binding : output) {
+    // clang-format off
+    os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl;
+    sep = ", ";
+    os << "  " << sep << "\"dimensions\" : \"";
+    bindings.dumpBindingDimensions(binding.second, context, os);
+    os << "\"" << std::endl;
+    os << "  " << sep << "\"values\" : [ ";
+    bindings.dumpBindingValues(context, binding.second, os, sep, batch);
+    os << " ]" << std::endl << "  }" << std::endl;
+    // clang-format on
+  }
+  os << "]" << std::endl;
+}
+
+} // namespace sample
diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleReporting.h b/csrc/fastdeploy/backends/tensorrt/common/sampleReporting.h
new file mode 100644
index 000000000..68b78af9c
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/common/sampleReporting.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_SAMPLE_REPORTING_H
+#define TRT_SAMPLE_REPORTING_H
+
+#include <functional>
+#include <iostream>
+
+#include "NvInfer.h"
+
+#include "sampleOptions.h"
+#include "sampleUtils.h"
+
+namespace sample {
+
+//!
+//! \struct InferenceTime
+//! \brief Measurement times in milliseconds
+//!
+struct InferenceTime {
+  InferenceTime(float q, float i, float c, float o, float e)
+      : enq(q), h2d(i), compute(c), d2h(o), e2e(e) {}
+
+  InferenceTime() = default;
+  InferenceTime(InferenceTime const&) = default;
+  InferenceTime(InferenceTime&&) = default;
+  InferenceTime& operator=(InferenceTime const&) = default;
+  InferenceTime& operator=(InferenceTime&&) = default;
+  ~InferenceTime() = default;
+
+  float enq{0};     // Enqueue
+  float h2d{0};     // Host to Device
+  float compute{0}; // Compute
+  float d2h{0};     // Device to Host
+  float e2e{0};     // end to end
+
+  // ideal latency
+  float latency() const { return h2d + compute + d2h; }
+};
+
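For orientation, here is a minimal usage sketch (editorial, not part of the patch) of the `InferenceTime` record defined above. It relies only on the constructor, the `operator+=` overload, and `latency()` declared in this header; the include path is assumed.

```cpp
#include <iostream>
#include "sampleReporting.h"  // assumed to be on the include path

int main() {
  // Two per-query timings: enq, h2d, compute, d2h, e2e (all milliseconds).
  sample::InferenceTime a(0.1F, 0.2F, 1.5F, 0.2F, 2.1F);
  sample::InferenceTime b(0.1F, 0.3F, 1.7F, 0.2F, 2.4F);
  a += b;  // component-wise accumulation via operator+= below
  // latency() is the "ideal" H2D + compute + D2H sum, ignoring queueing.
  std::cout << "accumulated latency: " << a.latency() << " ms\n";  // 4.1 ms
  return 0;
}
```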
+//!
+//! \struct InferenceTrace
+//! \brief Measurement points in milliseconds
+//!
+struct InferenceTrace {
+  InferenceTrace(int32_t s, float es, float ee, float is, float ie, float cs,
+                 float ce, float os, float oe)
+      : stream(s), enqStart(es), enqEnd(ee), h2dStart(is), h2dEnd(ie),
+        computeStart(cs), computeEnd(ce), d2hStart(os), d2hEnd(oe) {}
+
+  InferenceTrace() = default;
+  InferenceTrace(InferenceTrace const&) = default;
+  InferenceTrace(InferenceTrace&&) = default;
+  InferenceTrace& operator=(InferenceTrace const&) = default;
+  InferenceTrace& operator=(InferenceTrace&&) = default;
+  ~InferenceTrace() = default;
+
+  int32_t stream{0};
+  float enqStart{0};
+  float enqEnd{0};
+  float h2dStart{0};
+  float h2dEnd{0};
+  float computeStart{0};
+  float computeEnd{0};
+  float d2hStart{0};
+  float d2hEnd{0};
+};
+
+inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b) {
+  return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute,
+                       a.d2h + b.d2h, a.e2e + b.e2e);
+}
+
+inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b) {
+  return a = a + b;
+}
+
+//!
+//! \struct PerformanceResult
+//! \brief Performance result of a performance metric
+//!
+struct PerformanceResult {
+  float min{0};
+  float max{0};
+  float mean{0};
+  float median{0};
+  float percentile{0};
+  float coeffVar{0}; // coefficient of variation
+};
+
+//!
+//! \brief Print benchmarking time and number of traces collected
+//!
+void printProlog(int32_t warmups, int32_t timings, float warmupMs,
+                 float walltime, std::ostream& os);
+
+//!
+//! \brief Print a timing trace
+//!
+void printTiming(std::vector<InferenceTime> const& timings, int32_t runsPerAvg,
+                 std::ostream& os);
+
+//!
+//! \brief Print the performance summary of a trace
+//!
+void printEpilog(std::vector<InferenceTime> const& timings, float walltimeMs,
+                 float percentile, int32_t batchSize, std::ostream& osInfo,
+                 std::ostream& osWarning, std::ostream& osVerbose);
+
+//!
+//! \brief Get the result of a specific performance metric from a trace
+//!
+PerformanceResult
+getPerformanceResult(std::vector<InferenceTime> const& timings,
+                     std::function<float(InferenceTime const&)> metricGetter,
+                     float percentile);
+
+//!
+//! \brief Print the explanations of the performance metrics printed in
+//! printEpilog() function.
+//!
+void printMetricExplanations(std::ostream& os);
+
+//!
+//! \brief Print and summarize a timing trace
+//!
+void printPerformanceReport(std::vector<InferenceTrace> const& trace,
+                            ReportingOptions const& reporting, float warmupMs,
+                            int32_t batchSize, std::ostream& osInfo,
+                            std::ostream& osWarning, std::ostream& osVerbose);
+
+//!
+//! \brief Export a timing trace to JSON file
+//!
+void exportJSONTrace(std::vector<InferenceTrace> const& trace,
+                     std::string const& fileName);
+
+//!
+//! \brief Print input tensors to stream
+//!
+void dumpInputs(nvinfer1::IExecutionContext const& context,
+                Bindings const& bindings, std::ostream& os);
+
+//!
+//! \brief Print output tensors to stream
+//!
+void dumpOutputs(nvinfer1::IExecutionContext const& context,
+                 Bindings const& bindings, std::ostream& os);
+
+//!
+//! \brief Export output tensors to JSON file
+//!
+void exportJSONOutput(nvinfer1::IExecutionContext const& context,
+                      Bindings const& bindings, std::string const& fileName,
+                      int32_t batch);
+
+//!
+//! \struct LayerProfile
+//! \brief Layer profile information
+//!
+struct LayerProfile {
+  std::string name;
+  float timeMs{0};
+};
+
+//!
+//! \class Profiler
+//! \brief Collect per-layer profile information, assuming times are reported in
+//! the same order
+//!
+class Profiler : public nvinfer1::IProfiler { + public: + void reportLayerTime(char const* layerName, float timeMs) noexcept override; + + void print(std::ostream& os) const noexcept; + + //! + //! \brief Export a profile to JSON file + //! + void exportJSONProfile(std::string const& fileName) const noexcept; + + private: + float getTotalTime() const noexcept { + auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { + return accumulator + lp.timeMs; + }; + return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime); + } + + std::vector mLayers; + std::vector::iterator mIterator{mLayers.begin()}; + int32_t mUpdatesCount{0}; +}; + +} // namespace sample + +#endif // TRT_SAMPLE_REPORTING_H diff --git a/csrc/fastdeploy/backends/tensorrt/common/sampleUtils.h b/csrc/fastdeploy/backends/tensorrt/common/sampleUtils.h new file mode 100644 index 000000000..2c6f415bc --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/sampleUtils.h @@ -0,0 +1,494 @@ +/* + * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_UTILS_H +#define TRT_SAMPLE_UTILS_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "NvInfer.h" + +#include "common.h" +#include "logger.h" +#include "sampleDevice.h" +#include "sampleOptions.h" + +namespace sample { + +inline int dataTypeSize(nvinfer1::DataType dataType) { + switch (dataType) { + case nvinfer1::DataType::kINT32: + case nvinfer1::DataType::kFLOAT: + return 4; + case nvinfer1::DataType::kHALF: + return 2; + case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kINT8: + return 1; + } + return 0; +} + +template inline T roundUp(T m, T n) { + return ((m + n - 1) / n) * n; +} + +inline int volume(const nvinfer1::Dims& d) { + return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); +} + +//! comps is the number of components in a vector. Ignored if vecDim < 0. +inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, + int vecDim, int comps, int batch) { + int maxNbElems = 1; + for (int i = 0; i < dims.nbDims; ++i) { + // Get effective length of axis. + int d = dims.d[i]; + // Any dimension is 0, it is an empty tensor. + if (d == 0) { + return 0; + } + if (i == vecDim) { + d = samplesCommon::divUp(d, comps); + } + maxNbElems = std::max(maxNbElems, d * strides.d[i]); + } + return static_cast(maxNbElems) * batch * (vecDim < 0 ? 1 : comps); +} + +inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch) { + if (vecDim != -1) { + dims.d[vecDim] = roundUp(dims.d[vecDim], comps); + } + return volume(dims) * std::max(batch, 1); +} + +inline nvinfer1::Dims toDims(const std::vector& vec) { + int limit = static_cast(nvinfer1::Dims::MAX_DIMS); + if (static_cast(vec.size()) > limit) { + sample::gLogWarning + << "Vector too long, only first 8 elements are used in dimension." 
+ << std::endl; + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +template +inline void fillBuffer(void* buffer, int64_t volume, T min, T max) { + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + if (std::is_integral::value) { + std::uniform_int_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { + return static_cast(distribution(engine)); + }; + std::generate(typedBuffer, typedBuffer + volume, generator); + } else { + std::uniform_real_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { + return static_cast(distribution(engine)); + }; + std::generate(typedBuffer, typedBuffer + volume, generator); + } +} + +// Specialization needed for custom type __half +template +inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max) { + H* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_real_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { + return static_cast(distribution(engine)); + }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} +template <> +inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, + __half max) { + fillBufferHalf(buffer, volume, min, max); +} + +template +inline void dumpBuffer(const void* buffer, const std::string& separator, + std::ostream& os, const Dims& dims, const Dims& strides, + int32_t vectorDim, int32_t spv) { + const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, 1, + std::multiplies()); + const T* typedBuffer = static_cast(buffer); + std::string sep; + for (int64_t v = 0; v < volume; ++v) { + int64_t curV = v; + int32_t dataOffset = 0; + for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) { + int32_t dimVal = curV % dims.d[dimIndex]; + if (dimIndex == vectorDim) { + dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; + } else { + dataOffset += + dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 
1 : spv); + } + curV /= dims.d[dimIndex]; + ASSERT(curV >= 0); + } + + os << sep << typedBuffer[dataOffset]; + sep = separator; + } +} + +inline void loadFromFile(std::string const& fileName, char* dst, size_t size) { + ASSERT(dst); + + std::ifstream file(fileName, std::ios::in | std::ios::binary); + if (file.is_open()) { + file.read(dst, size); + file.close(); + } else { + std::stringstream msg; + msg << "Cannot open file " << fileName << "!"; + throw std::invalid_argument(msg.str()); + } +} + +struct Binding { + bool isInput{false}; + std::unique_ptr buffer; + int64_t volume{0}; + nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; + + void fill(const std::string& fileName) { + loadFromFile(fileName, static_cast(buffer->getHostBuffer()), + buffer->getSize()); + } + + void fill() { + switch (dataType) { + case nvinfer1::DataType::kBOOL: { + fillBuffer(buffer->getHostBuffer(), volume, 0, 1); + break; + } + case nvinfer1::DataType::kINT32: { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kINT8: { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kFLOAT: { + fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kHALF: { + fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + } + } + + void dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, + int32_t spv, const std::string separator = " ") const { + switch (dataType) { + case nvinfer1::DataType::kBOOL: { + dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, + vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT32: { + dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, + vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT8: { + dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, + vectorDim, spv); + break; + } + case nvinfer1::DataType::kFLOAT: { + dumpBuffer(buffer->getHostBuffer(), separator, os, dims, strides, + vectorDim, spv); + break; + } + case nvinfer1::DataType::kHALF: { + dumpBuffer<__half>(buffer->getHostBuffer(), separator, os, dims, strides, + vectorDim, spv); + break; + } + } + } +}; + +class Bindings { + public: + Bindings() = delete; + explicit Bindings(bool useManaged) : mUseManaged(useManaged) {} + + void addBinding(int b, const std::string& name, bool isInput, int64_t volume, + nvinfer1::DataType dataType, + const std::string& fileName = "") { + while (mBindings.size() <= static_cast(b)) { + mBindings.emplace_back(); + mDevicePointers.emplace_back(); + } + mNames[name] = b; + if (mBindings[b].buffer == nullptr) { + if (mUseManaged) { + mBindings[b].buffer.reset(new UnifiedMirroredBuffer); + } else { + mBindings[b].buffer.reset(new DiscreteMirroredBuffer); + } + } + mBindings[b].isInput = isInput; + // Some memory allocators return nullptr when allocating zero bytes, but + // TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. 
+ if (volume == 0) { + mBindings[b].buffer->allocate(1); + } else { + mBindings[b].buffer->allocate( + static_cast(volume) * + static_cast(dataTypeSize(dataType))); + } + mBindings[b].volume = volume; + mBindings[b].dataType = dataType; + mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer(); + if (isInput) { + if (fileName.empty()) { + fill(b); + } else { + fill(b, fileName); + } + } + } + + void** getDeviceBuffers() { return mDevicePointers.data(); } + + void transferInputToDevice(TrtCudaStream& stream) { + for (auto& b : mNames) { + if (mBindings[b.second].isInput) { + mBindings[b.second].buffer->hostToDevice(stream); + } + } + } + + void transferOutputToHost(TrtCudaStream& stream) { + for (auto& b : mNames) { + if (!mBindings[b.second].isInput) { + mBindings[b.second].buffer->deviceToHost(stream); + } + } + } + + void fill(int binding, const std::string& fileName) { + mBindings[binding].fill(fileName); + } + + void fill(int binding) { mBindings[binding].fill(); } + + void dumpBindingDimensions(int binding, + const nvinfer1::IExecutionContext& context, + std::ostream& os) const { + const auto dims = context.getBindingDimensions(binding); + // Do not add a newline terminator, because the caller may be outputting a + // JSON string. + os << dims; + } + + void dumpBindingValues(const nvinfer1::IExecutionContext& context, + int binding, std::ostream& os, + const std::string& separator = " ", + int32_t batch = 1) const { + Dims dims = context.getBindingDimensions(binding); + Dims strides = context.getStrides(binding); + int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding); + const int32_t spv = + context.getEngine().getBindingComponentsPerElement(binding); + + if (context.getEngine().hasImplicitBatchDimension()) { + auto insertN = [](Dims& d, int32_t bs) { + const int32_t nbDims = d.nbDims; + ASSERT(nbDims < Dims::MAX_DIMS); + std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]); + d.d[0] = bs; + d.nbDims = nbDims + 1; + }; + int32_t batchStride = 0; + for (int32_t i = 0; i < strides.nbDims; ++i) { + if (strides.d[i] * dims.d[i] > batchStride) { + batchStride = strides.d[i] * dims.d[i]; + } + } + insertN(dims, batch); + insertN(strides, batchStride); + vectorDim = (vectorDim == -1) ? 
-1 : vectorDim + 1; + } + + mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); + } + + void dumpInputs(const nvinfer1::IExecutionContext& context, + std::ostream& os) const { + auto isInput = [](const Binding& b) { return b.isInput; }; + dumpBindings(context, isInput, os); + } + + void dumpOutputs(const nvinfer1::IExecutionContext& context, + std::ostream& os) const { + auto isOutput = [](const Binding& b) { return !b.isInput; }; + dumpBindings(context, isOutput, os); + } + + void dumpBindings(const nvinfer1::IExecutionContext& context, + std::ostream& os) const { + auto all = [](const Binding& b) { return true; }; + dumpBindings(context, all, os); + } + + void dumpBindings(const nvinfer1::IExecutionContext& context, + bool (*predicate)(const Binding& b), + std::ostream& os) const { + for (const auto& n : mNames) { + const auto binding = n.second; + if (predicate(mBindings[binding])) { + os << n.first << ": ("; + dumpBindingDimensions(binding, context, os); + os << ")" << std::endl; + + dumpBindingValues(context, binding, os); + os << std::endl; + } + } + } + + std::unordered_map getInputBindings() const { + auto isInput = [](const Binding& b) { return b.isInput; }; + return getBindings(isInput); + } + + std::unordered_map getOutputBindings() const { + auto isOutput = [](const Binding& b) { return !b.isInput; }; + return getBindings(isOutput); + } + + std::unordered_map getBindings() const { + auto all = [](const Binding& b) { return true; }; + return getBindings(all); + } + + std::unordered_map + getBindings(bool (*predicate)(const Binding& b)) const { + std::unordered_map bindings; + for (const auto& n : mNames) { + const auto binding = n.second; + if (predicate(mBindings[binding])) { + bindings.insert(n); + } + } + return bindings; + } + + private: + std::unordered_map mNames; + std::vector mBindings; + std::vector mDevicePointers; + bool mUseManaged{false}; +}; + +template struct TrtDestroyer { + void operator()(T* t) { t->destroy(); } +}; + +template using TrtUniquePtr = std::unique_ptr>; + +inline bool broadcastIOFormats(const std::vector& formats, + size_t nbBindings, bool isInput = true) { + bool broadcast = formats.size() == 1; + bool validFormatsCount = broadcast || (formats.size() == nbBindings); + if (!formats.empty() && !validFormatsCount) { + if (isInput) { + throw std::invalid_argument( + "The number of inputIOFormats must match network's inputs or be one " + "for broadcasting."); + } else { + throw std::invalid_argument( + "The number of outputIOFormats must match network's outputs or be " + "one for broadcasting."); + } + } + return broadcast; +} + +inline std::vector loadTimingCacheFile(const std::string inFileName) { + std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); + if (!iFile) { + sample::gLogWarning << "Could not read timing cache from: " << inFileName + << ". A new timing cache will be generated and written." 
+ << std::endl; + return std::vector(); + } + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " + << inFileName << std::endl; + return content; +} + +inline void saveTimingCacheFile(const std::string outFileName, + const IHostMemory* blob) { + std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); + if (!oFile) { + sample::gLogWarning << "Could not write timing cache to: " << outFileName + << std::endl; + return; + } + oFile.write((char*)blob->data(), blob->size()); + oFile.close(); + sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " + << outFileName << std::endl; +} + +inline int32_t getCudaDriverVersion() { + int32_t version{-1}; + cudaCheck(cudaDriverGetVersion(&version)); + return version; +} + +inline int32_t getCudaRuntimeVersion() { + int32_t version{-1}; + cudaCheck(cudaRuntimeGetVersion(&version)); + return version; +} + +} // namespace sample + +#endif // TRT_SAMPLE_UTILS_H diff --git a/csrc/fastdeploy/backends/tensorrt/common/windows/getopt.c b/csrc/fastdeploy/backends/tensorrt/common/windows/getopt.c new file mode 100644 index 000000000..515a55bb1 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/windows/getopt.c @@ -0,0 +1,568 @@ +/* $OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $ */ +/* $NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $ */ + +/* + * Copyright (c) 2002 Todd C. Miller + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F39502-99-1-0512. + */ +/*- + * Copyright (c) 2000 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Dieter Baron and Thomas Klausner. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ + +#ifdef REPLACE_GETOPT +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ +#undef optreset /* see getopt.h */ +#define optreset __mingw_optreset +int optreset; /* reset getopt */ +char* optarg; /* argument associated with option */ +#endif + +#define PRINT_ERROR ((opterr) && (*options != ':')) + +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ + +/* return values */ +#define BADCH (int) '?' +#define BADARG ((*options == ':') ? (int) ':' : (int) '?') +#define INORDER (int) 1 + +#ifndef __CYGWIN__ +#define __progname __argv[0] +#else +extern char __declspec(dllimport) * __progname; +#endif + +#ifdef __CYGWIN__ +static char EMSG[] = ""; +#else +#define EMSG "" +#endif + +static int getopt_internal(int, char* const*, const char*, const struct option*, int*, int); +static int parse_long_options(char* const*, const char*, const struct option*, int*, int); +static int gcd(int, int); +static void permute_args(int, int, int, char* const*); + +static char* place = EMSG; /* option letter processing */ + +/* XXX: set optreset to 1 rather than these two */ +static int nonopt_start = -1; /* first non option argument (for permute) */ +static int nonopt_end = -1; /* first option after non options (for permute) */ + +/* Error messages */ +static const char recargchar[] = "option requires an argument -- %c"; +static const char recargstring[] = "option requires an argument -- %s"; +static const char ambig[] = "ambiguous option -- %.*s"; +static const char noarg[] = "option doesn't take an argument -- %.*s"; +static const char illoptchar[] = "unknown option -- %c"; +static const char illoptstring[] = "unknown option -- %s"; + +static void _vwarnx(const char* fmt, va_list ap) +{ + (void) fprintf(stderr, "%s: ", __progname); + if (fmt != NULL) + (void) vfprintf(stderr, fmt, ap); + (void) fprintf(stderr, "\n"); +} + +static void warnx(const char* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + _vwarnx(fmt, ap); + va_end(ap); +} + +/* + * Compute the greatest common divisor of a and b. + */ +static int gcd(int a, int b) +{ + int c; + + c = a % b; + while (c != 0) + { + a = b; + b = c; + c = a % b; + } + + return (b); +} + +/* + * Exchange the block from nonopt_start to nonopt_end with the block + * from nonopt_end to opt_end (keeping the same order of arguments + * in each block). 
+ */ +static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char* const* nargv) +{ + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char* swap; + + /* + * compute lengths of blocks and number and size of cycles + */ + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) + { + cstart = panonopt_end + i; + pos = cstart; + for (j = 0; j < cyclelen; j++) + { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char**) nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char**) nargv)[cstart] = swap; + } + } +} + +/* + * parse_long_options -- + * Parse long options in argc/argv argument vector. + * Returns -1 if short_too is set and the option does not match long_options. + */ +static int parse_long_options( + char* const* nargv, const char* options, const struct option* long_options, int* idx, int short_too) +{ + char *current_argv, *has_equal; + size_t current_argv_len; + int i, ambiguous, match; + +#define IDENTICAL_INTERPRETATION(_x, _y) \ + (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag \ + && long_options[(_x)].val == long_options[(_y)].val) + + current_argv = place; + match = -1; + ambiguous = 0; + + optind++; + + if ((has_equal = strchr(current_argv, '=')) != NULL) + { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } + else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) + { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) + { + /* exact match */ + match = i; + ambiguous = 0; + break; + } + /* + * If this is a known short option, don't allow + * a partial match of a single character. + */ + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else if (!IDENTICAL_INTERPRETATION(i, match)) + ambiguous = 1; + } + if (ambiguous) + { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int) current_argv_len, current_argv); + optopt = 0; + return (BADCH); + } + if (match != -1) + { /* option found */ + if (long_options[match].has_arg == no_argument && has_equal) + { + if (PRINT_ERROR) + warnx(noarg, (int) current_argv_len, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return (BADARG); + } + if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument) + { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == required_argument) + { + /* + * optional argument doesn't use next nargv + */ + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) && (optarg == NULL)) + { + /* + * Missing argument; leading ':' indicates no error + * should be generated. 
+ */ + if (PRINT_ERROR) + warnx(recargstring, current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return (BADARG); + } + } + else + { /* unknown option */ + if (short_too) + { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return (BADCH); + } + if (idx) + *idx = match; + if (long_options[match].flag) + { + *long_options[match].flag = long_options[match].val; + return (0); + } + else + return (long_options[match].val); +#undef IDENTICAL_INTERPRETATION +} + +/* + * getopt_internal -- + * Parse argc/argv argument vector. Called by user level routines. + */ +static int getopt_internal( + int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx, int flags) +{ + const char* oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; + + if (options == NULL) + return (-1); + + /* + * XXX Some GNU programs (like cvs) set optind to 0 instead of + * XXX using optreset. Work around this braindamage. + */ + if (optind == 0) + optind = optreset = 1; + + /* + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + * + * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or + * optreset != 0 for GNU compatibility. + */ + if (posixly_correct == -1 || optreset != 0) + posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); + if (*options == '-') + flags |= FLAG_ALLARGS; + else if (posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + if (*options == '+' || *options == '-') + options++; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; +start: + if (optreset || !*place) + { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) + { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) + { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + else if (nonopt_start != -1) + { + /* + * If we skipped non-options, set optind + * to the first of them. + */ + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + if (*(place = nargv[optind]) != '-' || (place[1] == '\0' && strchr(options, '-') == NULL)) + { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) + { + /* + * GNU extension: + * return non-option as argument to option 1 + */ + optarg = nargv[optind++]; + return (INORDER); + } + if (!(flags & FLAG_PERMUTE)) + { + /* + * If no permutation wanted, stop parsing + * at first non-option. + */ + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + nonopt_start = optind - (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* + * If we have "-" do nothing, if "--" we are done. + */ + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') + { + optind++; + place = EMSG; + /* + * We found an option (--), so if we skipped + * non-options, we have to permute. 
+ */ + if (nonopt_end != -1) + { + permute_args(nonopt_start, nonopt_end, optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ + if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY))) + { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, idx, short_too); + if (optchar != -1) + { + place = EMSG; + return (optchar); + } + } + + if ((optchar = (int) *place++) == (int) ':' || (optchar == (int) '-' && *place != '\0') + || (oli = strchr(options, optchar)) == NULL) + { + /* + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). + */ + if (optchar == (int) '-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return (BADCH); + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') + { + /* -W long-option */ + if (*place) /* no space */ + /* NOTHING */; + else if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, idx, 0); + place = EMSG; + return (optchar); + } + if (*++oli != ':') + { /* doesn't take argument */ + if (!*place) + ++optind; + } + else + { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') + { /* arg not optional */ + if (++optind >= nargc) + { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } + else + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return (optchar); +} + +#ifdef REPLACE_GETOPT +/* + * getopt -- + * Parse argc/argv argument vector. + * + * [eventually this will replace the BSD getopt] + */ +int getopt(int nargc, char* const* nargv, const char* options) +{ + + /* + * We don't pass FLAG_PERMUTE to getopt_internal() since + * the BSD getopt(3) (unlike GNU) has never done this. + * + * Furthermore, since many privileged programs call getopt() + * before dropping privileges it makes sense to keep things + * as simple (and bug-free) as possible. + */ + return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); +} +#endif /* REPLACE_GETOPT */ + +/* + * getopt_long -- + * Parse argc/argv argument vector. + */ +int getopt_long(int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE)); +} + +/* + * getopt_long_only -- + * Parse argc/argv argument vector. 
+ */ +int getopt_long_only(int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE | FLAG_LONGONLY)); +} diff --git a/csrc/fastdeploy/backends/tensorrt/common/windows/getopt.h b/csrc/fastdeploy/backends/tensorrt/common/windows/getopt.h new file mode 100644 index 000000000..baa1d61b5 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/common/windows/getopt.h @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __GETOPT_H__ +/** + * DISCLAIMER + * This file has no copyright assigned and is placed in the Public Domain. + * This file is a part of the w64 mingw-runtime package. + * + * The w64 mingw-runtime package and its code is distributed in the hope that it + * will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR + * IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to + * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#define __GETOPT_H__ + +/* All the headers include this file. */ +#include + +#if defined(WINGETOPT_SHARED_LIB) +#if defined(BUILDING_WINGETOPT_DLL) +#define WINGETOPT_API __declspec(dllexport) +#else +#define WINGETOPT_API __declspec(dllimport) +#endif +#else +#define WINGETOPT_API +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + + WINGETOPT_API extern int optind; /* index of first non-option in argv */ + WINGETOPT_API extern int optopt; /* single option character, as parsed */ + WINGETOPT_API extern int opterr; /* flag to enable built-in diagnostics... */ + /* (user may set to zero, to suppress) */ + + WINGETOPT_API extern char* optarg; /* pointer to argument of current option */ + + extern int getopt(int nargc, char* const* nargv, const char* options); + +#ifdef _BSD_SOURCE +/* + * BSD adds the non-standard `optreset' feature, for reinitialisation + * of `getopt' parsing. We support this feature, for applications which + * proclaim their BSD heritage, before including this header; however, + * to maintain portability, developers are advised to avoid it. + */ +#define optreset __mingw_optreset + extern int optreset; +#endif +#ifdef __cplusplus +} +#endif +/* + * POSIX requires the `getopt' API to be specified in `unistd.h'; + * thus, `unistd.h' includes this header. However, we do not want + * to expose the `getopt_long' or `getopt_long_only' APIs, when + * included in this manner. Thus, close the standard __GETOPT_H__ + * declarations block, and open an additional __GETOPT_LONG_H__ + * specific block, only when *not* __UNISTD_H_SOURCED__, in which + * to declare the extended API. 
+ */ +#endif /* !defined(__GETOPT_H__) */ + +#if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) +#define __GETOPT_LONG_H__ + +#ifdef __cplusplus +extern "C" +{ +#endif + + struct option /* specification for a long form option... */ + { + const char* name; /* option name, without leading hyphens */ + int has_arg; /* does it take an argument? */ + int* flag; /* where to save its status, or NULL */ + int val; /* its associated status value */ + }; + + enum /* permitted values for its `has_arg' field... */ + { + no_argument = 0, /* option never takes an argument */ + required_argument, /* option always requires an argument */ + optional_argument /* option may take an argument */ + }; + + extern int getopt_long( + int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx); + extern int getopt_long_only( + int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx); +/* + * Previous MinGW implementation had... + */ +#ifndef HAVE_DECL_GETOPT +/* + * ...for the long form API only; keep this for compatibility. + */ +#define HAVE_DECL_GETOPT 1 +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */ diff --git a/csrc/fastdeploy/backends/tensorrt/trt_backend.cc b/csrc/fastdeploy/backends/tensorrt/trt_backend.cc new file mode 100644 index 000000000..dd3f837d9 --- /dev/null +++ b/csrc/fastdeploy/backends/tensorrt/trt_backend.cc @@ -0,0 +1,528 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "fastdeploy/backends/tensorrt/trt_backend.h"
+#include "fastdeploy/utils/utils.h"
+#ifdef ENABLE_PADDLE_FRONTEND
+#include "paddle2onnx/converter.h"
+#endif
+
+namespace fastdeploy {
+size_t TrtDataTypeSize(const nvinfer1::DataType& dtype) {
+  if (dtype == nvinfer1::DataType::kFLOAT) {
+    return sizeof(float);
+  } else if (dtype == nvinfer1::DataType::kHALF) {
+    return sizeof(float) / 2;
+  } else if (dtype == nvinfer1::DataType::kINT8) {
+    return sizeof(int8_t);
+  } else if (dtype == nvinfer1::DataType::kINT32) {
+    return sizeof(int32_t);
+  }
+  // kBOOL
+  return sizeof(bool);
+}
+
+FDDataType GetFDDataType(const nvinfer1::DataType& dtype) {
+  if (dtype == nvinfer1::DataType::kFLOAT) {
+    return FDDataType::FP32;
+  } else if (dtype == nvinfer1::DataType::kHALF) {
+    return FDDataType::FP16;
+  } else if (dtype == nvinfer1::DataType::kINT8) {
+    return FDDataType::INT8;
+  } else if (dtype == nvinfer1::DataType::kINT32) {
+    return FDDataType::INT32;
+  }
+  // kBOOL
+  return FDDataType::BOOL;
+}
+
+std::vector<int> toVec(const nvinfer1::Dims& dim) {
+  std::vector<int> out(dim.d, dim.d + dim.nbDims);
+  return out;
+}
+
+bool CheckDynamicShapeConfig(const paddle2onnx::OnnxReader& reader,
+                             const TrtBackendOption& option) {
+  // paddle2onnx::ModelTensorInfo inputs[reader.NumInputs()];
+  // std::string input_shapes[reader.NumInputs()];
+  std::vector<paddle2onnx::ModelTensorInfo> inputs(reader.NumInputs());
+  std::vector<std::string> input_shapes(reader.NumInputs());
+  for (int i = 0; i < reader.NumInputs(); ++i) {
+    reader.GetInputInfo(i, &inputs[i]);
+
+    // Change non-positive dims to -1; when an input dimension is symbolic
+    // (a string), the ONNX model records it as 0.
+    for (int j = 0; j < inputs[i].rank; ++j) {
+      if (inputs[i].shape[j] <= 0) {
+        inputs[i].shape[j] = -1;
+      }
+    }
+
+    input_shapes[i] = "";
+    for (int j = 0; j < inputs[i].rank; ++j) {
+      if (j != inputs[i].rank - 1) {
+        input_shapes[i] += (std::to_string(inputs[i].shape[j]) + ", ");
+      } else {
+        input_shapes[i] += std::to_string(inputs[i].shape[j]);
+      }
+    }
+  }
+
+  bool all_check_passed = true;
+  for (int i = 0; i < reader.NumInputs(); ++i) {
+    bool contain_unknown_dim = false;
+    for (int j = 0; j < inputs[i].rank; ++j) {
+      if (inputs[i].shape[j] < 0) {
+        contain_unknown_dim = true;
+      }
+    }
+
+    std::string name(inputs[i].name, strlen(inputs[i].name));
+    FDINFO << "The loaded model's input tensor: " << name
+           << " has shape [" + input_shapes[i] << "]." << std::endl;
+    if (contain_unknown_dim) {
+      auto iter1 = option.min_shape.find(name);
+      auto iter2 = option.max_shape.find(name);
+      auto iter3 = option.opt_shape.find(name);
+      if (iter1 == option.min_shape.end() || iter2 == option.max_shape.end() ||
+          iter3 == option.opt_shape.end()) {
+        FDERROR << "The loaded model's input tensor: " << name
+                << " has dynamic shape [" + input_shapes[i] +
+                       "], but its shape was not configured for TensorRT "
+                       "via SetTrtInputShape."
+                << std::endl;
+        all_check_passed = false;
+      }
+    }
+  }
+
+  return all_check_passed;
+}
+
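Before the initializers below, a hedged sketch (editorial, not part of the patch) of an option setup that would satisfy `CheckDynamicShapeConfig` for a model with a dynamic batch dimension. The tensor name `"image"` and the dims are hypothetical; the field names follow their use in the function above.

```cpp
#include "fastdeploy/backends/tensorrt/trt_backend.h"

// Sketch only: populate the shape maps that CheckDynamicShapeConfig inspects.
// Assumes TrtBackendOption::{min,opt,max}_shape map tensor names to int32 dims.
fastdeploy::TrtBackendOption MakeDynamicShapeOption() {
  fastdeploy::TrtBackendOption option;
  option.min_shape["image"] = {1, 3, 224, 224};  // lower bound per dimension
  option.opt_shape["image"] = {4, 3, 224, 224};  // shape TensorRT tunes for
  option.max_shape["image"] = {8, 3, 224, 224};  // upper bound per dimension
  // With all three entries present for every dynamic input, the check passes.
  return option;
}
```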
+            << std::endl;
+    return false;
+  }
+  cudaSetDevice(option.gpu_id);
+
+  std::ifstream fin(trt_engine_file, std::ios::binary | std::ios::in);
+  if (!fin) {
+    FDERROR << "Failed to open TensorRT Engine file " << trt_engine_file
+            << std::endl;
+    return false;
+  }
+  fin.seekg(0, std::ios::end);
+  std::string engine_buffer;
+  engine_buffer.resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(engine_buffer.at(0)), engine_buffer.size());
+  fin.close();
+  SampleUniquePtr<nvinfer1::IRuntime> runtime{
+      createInferRuntime(sample::gLogger.getTRTLogger())};
+  if (!runtime) {
+    FDERROR << "Failed to call createInferRuntime()." << std::endl;
+    return false;
+  }
+  engine_ = std::shared_ptr<nvinfer1::ICudaEngine>(
+      runtime->deserializeCudaEngine(engine_buffer.data(),
+                                     engine_buffer.size()),
+      samplesCommon::InferDeleter());
+  if (!engine_) {
+    FDERROR << "Failed to call deserializeCudaEngine()." << std::endl;
+    return false;
+  }
+
+  context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
+      engine_->createExecutionContext());
+  FDASSERT(cudaStreamCreate(&stream_) == 0,
+           "[ERROR] Error occurs while calling cudaStreamCreate().");
+  GetInputOutputInfo();
+  initialized_ = true;
+  return true;
+}
+
+bool TrtBackend::InitFromPaddle(const std::string& model_file,
+                                const std::string& params_file,
+                                const TrtBackendOption& option, bool verbose) {
+  if (initialized_) {
+    FDERROR << "TrtBackend is already initialized, cannot initialize again."
+            << std::endl;
+    return false;
+  }
+
+#ifdef ENABLE_PADDLE_FRONTEND
+  std::vector<paddle2onnx::CustomOp> custom_ops;
+  for (auto& item : option.custom_op_info_) {
+    paddle2onnx::CustomOp op;
+    std::strcpy(op.op_name, item.first.c_str());
+    std::strcpy(op.export_op_name, item.second.c_str());
+    custom_ops.emplace_back(op);
+  }
+  char* model_content_ptr;
+  int model_content_size = 0;
+  if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
+                           &model_content_ptr, &model_content_size, 11, true,
+                           verbose, true, true, true, custom_ops.data(),
+                           custom_ops.size())) {
+    FDERROR << "Error occurred while exporting the PaddlePaddle model to ONNX."
+            << std::endl;
+    return false;
+  }
+
+  if (option.remove_multiclass_nms_) {
+    char* new_model = nullptr;
+    int new_model_size = 0;
+    if (!paddle2onnx::RemoveMultiClassNMS(model_content_ptr,
+                                          model_content_size, &new_model,
+                                          &new_model_size)) {
+      FDERROR << "Failed to remove MultiClassNMS." << std::endl;
+      return false;
+    }
+    delete[] model_content_ptr;
+    std::string onnx_model_proto(new_model, new_model + new_model_size);
+    delete[] new_model;
+    return InitFromOnnx(onnx_model_proto, option, true);
+  }
+
+  std::string onnx_model_proto(model_content_ptr,
+                               model_content_ptr + model_content_size);
+  delete[] model_content_ptr;
+  model_content_ptr = nullptr;
+  return InitFromOnnx(onnx_model_proto, option, true);
+#else
+  FDERROR << "Didn't compile with PaddlePaddle frontend, you can try to "
+             "call `InitFromOnnx` instead."
+          << std::endl;
+  return false;
+#endif
+}
+
+bool TrtBackend::InitFromOnnx(const std::string& model_file,
+                              const TrtBackendOption& option,
+                              bool from_memory_buffer) {
+  if (initialized_) {
+    FDERROR << "TrtBackend is already initialized, cannot initialize again."
+            << std::endl;
+    return false;
+  }
+  cudaSetDevice(option.gpu_id);
+
+  std::string onnx_content = "";
+  if (!from_memory_buffer) {
+    std::ifstream fin(model_file.c_str(), std::ios::binary | std::ios::in);
+    if (!fin) {
+      FDERROR << "[ERROR] Failed to open ONNX model file: " << model_file
+              << std::endl;
+      return false;
+    }
+    fin.seekg(0, std::ios::end);
+    onnx_content.resize(fin.tellg());
+    fin.seekg(0, std::ios::beg);
+    fin.read(&(onnx_content.at(0)), onnx_content.size());
+    fin.close();
+  } else {
+    onnx_content = model_file;
+  }
+
+  // Record the original output order, because the converted TensorRT
+  // network may order its outputs differently from the original model.
+  outputs_order_.clear();
+  auto onnx_reader =
+      paddle2onnx::OnnxReader(onnx_content.c_str(), onnx_content.size());
+  for (int i = 0; i < onnx_reader.NumOutputs(); ++i) {
+    std::string name(
+        onnx_reader.output_names[i],
+        onnx_reader.output_names[i] + strlen(onnx_reader.output_names[i]));
+    outputs_order_[name] = i;
+  }
+  if (!CheckDynamicShapeConfig(onnx_reader, option)) {
+    FDERROR << "TrtBackend::CheckDynamicShapeConfig failed." << std::endl;
+    return false;
+  }
+
+  if (option.serialize_file != "") {
+    std::ifstream fin(option.serialize_file, std::ios::binary | std::ios::in);
+    if (fin) {
+      FDINFO << "Detected serialized TensorRT Engine file in "
+             << option.serialize_file << ", will load it directly."
+             << std::endl;
+      fin.close();
+      return InitFromTrt(option.serialize_file);
+    }
+  }
+
+  if (!CreateTrtEngine(onnx_content, option)) {
+    return false;
+  }
+
+  context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
+      engine_->createExecutionContext());
+  FDASSERT(cudaStreamCreate(&stream_) == 0,
+           "[ERROR] Error occurs while calling cudaStreamCreate().");
+  GetInputOutputInfo();
+  initialized_ = true;
+  return true;
+}
+
+bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
+                       std::vector<FDTensor>* outputs) {
+  AllocateBufferInDynamicShape(inputs, outputs);
+  std::vector<void*> input_binds(inputs.size());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (inputs[i].dtype == FDDataType::INT64) {
+      // TensorRT does not support INT64, so narrow the data to INT32 first;
+      // this halves the byte count, hence Nbytes() / 2 below.
+      int64_t* data = static_cast<int64_t*>(inputs[i].Data());
+      std::vector<int32_t> casted_data(data, data + inputs[i].Numel());
+      FDASSERT(cudaMemcpyAsync(inputs_buffer_[inputs[i].name].data(),
+                               static_cast<void*>(casted_data.data()),
+                               inputs[i].Nbytes() / 2, cudaMemcpyHostToDevice,
+                               stream_) == 0,
+               "[ERROR] Error occurs while copy memory from CPU to GPU.");
+    } else {
+      FDASSERT(cudaMemcpyAsync(inputs_buffer_[inputs[i].name].data(),
+                               inputs[i].Data(), inputs[i].Nbytes(),
+                               cudaMemcpyHostToDevice, stream_) == 0,
+               "[ERROR] Error occurs while copy memory from CPU to GPU.");
+    }
+  }
+  if (!context_->enqueueV2(bindings_.data(), stream_, nullptr)) {
+    FDERROR << "Failed to Infer with TensorRT."
+            << std::endl;
+    return false;
+  }
+  for (size_t i = 0; i < outputs->size(); ++i) {
+    FDASSERT(cudaMemcpyAsync((*outputs)[i].Data(),
+                             outputs_buffer_[(*outputs)[i].name].data(),
+                             (*outputs)[i].Nbytes(), cudaMemcpyDeviceToHost,
+                             stream_) == 0,
+             "[ERROR] Error occurs while copy memory from GPU to CPU.");
+  }
+  return true;
+}
+
+void TrtBackend::GetInputOutputInfo() {
+  inputs_desc_.clear();
+  outputs_desc_.clear();
+  auto num_binds = engine_->getNbBindings();
+  for (auto i = 0; i < num_binds; ++i) {
+    std::string name = std::string(engine_->getBindingName(i));
+    auto shape = toVec(engine_->getBindingDimensions(i));
+    auto dtype = engine_->getBindingDataType(i);
+    if (engine_->bindingIsInput(i)) {
+      inputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
+      inputs_buffer_[name] = DeviceBuffer(dtype);
+    } else {
+      outputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
+      outputs_buffer_[name] = DeviceBuffer(dtype);
+    }
+  }
+  bindings_.resize(num_binds);
+}
+
+void TrtBackend::AllocateBufferInDynamicShape(
+    const std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs) {
+  for (const auto& item : inputs) {
+    auto idx = engine_->getBindingIndex(item.name.c_str());
+    std::vector<int64_t> shape(item.shape.begin(), item.shape.end());
+    auto dims = sample::toDims(shape);
+    context_->setBindingDimensions(idx, dims);
+    if (item.Nbytes() > inputs_buffer_[item.name].nbBytes()) {
+      inputs_buffer_[item.name].resize(dims);
+      bindings_[idx] = inputs_buffer_[item.name].data();
+    }
+  }
+  if (outputs->size() != outputs_desc_.size()) {
+    outputs->resize(outputs_desc_.size());
+  }
+  for (size_t i = 0; i < outputs_desc_.size(); ++i) {
+    auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
+    auto output_dims = context_->getBindingDimensions(idx);
+
+    // find the original index of this output in the source model
+    auto iter = outputs_order_.find(outputs_desc_[i].name);
+    FDASSERT(iter != outputs_order_.end(),
+             "Cannot find output:" + outputs_desc_[i].name +
+                 " of tensorrt network from the original model.");
+    auto ori_idx = iter->second;
+    (*outputs)[ori_idx].dtype = GetFDDataType(outputs_desc_[i].dtype);
+    (*outputs)[ori_idx].shape.assign(output_dims.d,
+                                     output_dims.d + output_dims.nbDims);
+    (*outputs)[ori_idx].name = outputs_desc_[i].name;
+    (*outputs)[ori_idx].data.resize(volume(output_dims) *
+                                    TrtDataTypeSize(outputs_desc_[i].dtype));
+    if ((*outputs)[ori_idx].Nbytes() >
+        outputs_buffer_[outputs_desc_[i].name].nbBytes()) {
+      outputs_buffer_[outputs_desc_[i].name].resize(output_dims);
+      bindings_[idx] = outputs_buffer_[outputs_desc_[i].name].data();
+    }
+  }
+}
+
+bool TrtBackend::CreateTrtEngine(const std::string& onnx_model,
+                                 const TrtBackendOption& option) {
+  const auto explicitBatch =
+      1U << static_cast<uint32_t>(
+          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+
+  builder_ = SampleUniquePtr<nvinfer1::IBuilder>(
+      nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
+  if (!builder_) {
+    FDERROR << "Failed to call createInferBuilder()." << std::endl;
+    return false;
+  }
+  network_ = SampleUniquePtr<nvinfer1::INetworkDefinition>(
+      builder_->createNetworkV2(explicitBatch));
+  if (!network_) {
+    FDERROR << "Failed to call createNetworkV2()." << std::endl;
+    return false;
+  }
+  auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(
+      builder_->createBuilderConfig());
+  if (!config) {
+    FDERROR << "Failed to call createBuilderConfig()." << std::endl;
+    return false;
+  }
+
+  if (option.enable_fp16) {
+    if (!builder_->platformHasFastFp16()) {
+      FDWARNING << "Detected that FP16 is not supported on the current GPU, "
+                   "will use FP32 instead."
+                << std::endl;
+    } else {
+      config->setFlag(nvinfer1::BuilderFlag::kFP16);
+    }
+  }
+
+  parser_ = SampleUniquePtr<nvonnxparser::IParser>(
+      nvonnxparser::createParser(*network_, sample::gLogger.getTRTLogger()));
+  if (!parser_) {
+    FDERROR << "Failed to call createParser()." << std::endl;
+    return false;
+  }
+  if (!parser_->parse(onnx_model.data(), onnx_model.size())) {
+    FDERROR << "Failed to parse ONNX model by TensorRT." << std::endl;
+    return false;
+  }
+
+  FDINFO << "Start building TensorRT Engine..." << std::endl;
+  builder_->setMaxBatchSize(option.max_batch_size);
+
+  config->setMaxWorkspaceSize(option.max_workspace_size);
+
+  if (option.max_shape.size() > 0) {
+    auto profile = builder_->createOptimizationProfile();
+    FDASSERT(option.max_shape.size() == option.min_shape.size() &&
+                 option.min_shape.size() == option.opt_shape.size(),
+             "[TrtBackend] Sizes of max_shape/opt_shape/min_shape in "
+             "TrtBackendOption must be the same.");
+    for (const auto& item : option.min_shape) {
+      // set min shape
+      FDASSERT(profile->setDimensions(item.first.c_str(),
+                                      nvinfer1::OptProfileSelector::kMIN,
+                                      sample::toDims(item.second)),
+               "[TrtBackend] Failed to set min_shape for input: " +
+                   item.first + " in TrtBackend.");
+
+      // set optimization shape
+      auto iter = option.opt_shape.find(item.first);
+      FDASSERT(iter != option.opt_shape.end(),
+               "[TrtBackend] Cannot find input name: " + item.first +
+                   " in TrtBackendOption::opt_shape.");
+      FDASSERT(profile->setDimensions(item.first.c_str(),
+                                      nvinfer1::OptProfileSelector::kOPT,
+                                      sample::toDims(iter->second)),
+               "[TrtBackend] Failed to set opt_shape for input: " +
+                   item.first + " in TrtBackend.");
+      // set max shape
+      iter = option.max_shape.find(item.first);
+      FDASSERT(iter != option.max_shape.end(),
+               "[TrtBackend] Cannot find input name: " + item.first +
+                   " in TrtBackendOption::max_shape.");
+      FDASSERT(profile->setDimensions(item.first.c_str(),
+                                      nvinfer1::OptProfileSelector::kMAX,
+                                      sample::toDims(iter->second)),
+               "[TrtBackend] Failed to set max_shape for input: " +
+                   item.first + " in TrtBackend.");
+    }
+    config->addOptimizationProfile(profile);
+  }
+
+  SampleUniquePtr<nvinfer1::IHostMemory> plan{
+      builder_->buildSerializedNetwork(*network_, *config)};
+  if (!plan) {
+    FDERROR << "Failed to call buildSerializedNetwork()." << std::endl;
+    return false;
+  }
+
+  SampleUniquePtr<nvinfer1::IRuntime> runtime{
+      createInferRuntime(sample::gLogger.getTRTLogger())};
+  if (!runtime) {
+    FDERROR << "Failed to call createInferRuntime()." << std::endl;
+    return false;
+  }
+
+  engine_ = std::shared_ptr<nvinfer1::ICudaEngine>(
+      runtime->deserializeCudaEngine(plan->data(), plan->size()),
+      samplesCommon::InferDeleter());
+  if (!engine_) {
+    FDERROR << "Failed to call deserializeCudaEngine()." << std::endl;
+    return false;
+  }
+
+  FDINFO << "TensorRT Engine is built successfully." << std::endl;
+  if (option.serialize_file != "") {
+    FDINFO << "Serialize TensorRT Engine to local file "
+           << option.serialize_file << "." << std::endl;
+    // the serialized engine is binary data, so open the file in binary mode
+    std::ofstream engine_file(option.serialize_file.c_str(),
+                              std::ios::binary | std::ios::out);
+    if (!engine_file) {
+      FDERROR << "Failed to open " << option.serialize_file << " to write."
+              << std::endl;
+      return false;
+    }
+    engine_file.write(static_cast<char*>(plan->data()), plan->size());
+    engine_file.close();
+    FDINFO << "TensorRT Engine is serialized to local file "
+           << option.serialize_file
+           << ", we can load this model from the serialized engine "
+              "directly next time."
+           << std::endl;
+  }
+  return true;
+}
+
+TensorInfo TrtBackend::GetInputInfo(int index) {
+  FDASSERT(index < NumInputs(),
+           "The index:" + std::to_string(index) +
+               " should be less than the number of inputs:" +
+               std::to_string(NumInputs()) + ".");
+  TensorInfo info;
+  info.name = inputs_desc_[index].name;
+  info.shape.assign(inputs_desc_[index].shape.begin(),
+                    inputs_desc_[index].shape.end());
+  info.dtype = GetFDDataType(inputs_desc_[index].dtype);
+  return info;
+}
+
+TensorInfo TrtBackend::GetOutputInfo(int index) {
+  FDASSERT(index < NumOutputs(),
+           "The index:" + std::to_string(index) +
+               " should be less than the number of outputs:" +
+               std::to_string(NumOutputs()) + ".");
+  TensorInfo info;
+  info.name = outputs_desc_[index].name;
+  info.shape.assign(outputs_desc_[index].shape.begin(),
+                    outputs_desc_[index].shape.end());
+  info.dtype = GetFDDataType(outputs_desc_[index].dtype);
+  return info;
+}
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/backends/tensorrt/trt_backend.h b/csrc/fastdeploy/backends/tensorrt/trt_backend.h
new file mode 100644
index 000000000..376da241f
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/trt_backend.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
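+
+// Engine-caching sketch (illustrative comment, not part of the API docs):
+// when serialize_file is set, the engine built by InitFromOnnx is written
+// to that path, and later runs load the serialized engine directly instead
+// of rebuilding it.
+//
+//   fastdeploy::TrtBackendOption option;
+//   option.serialize_file = "model.trt";  // hypothetical cache path
+//   fastdeploy::TrtBackend backend;
+//   backend.InitFromOnnx("model.onnx", option);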
+
+#pragma once
+
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "fastdeploy/backends/backend.h"
+
+#include "fastdeploy/backends/tensorrt/common/argsParser.h"
+#include "fastdeploy/backends/tensorrt/common/buffers.h"
+#include "fastdeploy/backends/tensorrt/common/common.h"
+#include "fastdeploy/backends/tensorrt/common/logger.h"
+#include "fastdeploy/backends/tensorrt/common/parserOnnxConfig.h"
+#include "fastdeploy/backends/tensorrt/common/sampleUtils.h"
+
+#include <cuda_runtime_api.h>
+#include "NvInfer.h"
+
+namespace fastdeploy {
+using namespace samplesCommon;
+
+struct TrtValueInfo {
+  std::string name;
+  std::vector<int> shape;
+  nvinfer1::DataType dtype;
+};
+
+struct TrtBackendOption {
+  int gpu_id = 0;
+  bool enable_fp16 = false;
+  bool enable_int8 = false;
+  size_t max_batch_size = 32;
+  size_t max_workspace_size = 1 << 30;
+  std::map<std::string, std::vector<int32_t>> max_shape;
+  std::map<std::string, std::vector<int32_t>> min_shape;
+  std::map<std::string, std::vector<int32_t>> opt_shape;
+  std::string serialize_file = "";
+
+  // internal parameters, may be removed in the next version
+  bool remove_multiclass_nms_ = false;
+  std::map<std::string, std::string> custom_op_info_;
+};
+
+std::vector<int> toVec(const nvinfer1::Dims& dim);
+size_t TrtDataTypeSize(const nvinfer1::DataType& dtype);
+FDDataType GetFDDataType(const nvinfer1::DataType& dtype);
+
+class TrtBackend : public BaseBackend {
+ public:
+  TrtBackend() : engine_(nullptr), context_(nullptr) {}
+  virtual ~TrtBackend() = default;
+  void BuildOption(const TrtBackendOption& option);
+
+  bool InitFromPaddle(const std::string& model_file,
+                      const std::string& params_file,
+                      const TrtBackendOption& option = TrtBackendOption(),
+                      bool verbose = false);
+  bool InitFromOnnx(const std::string& model_file,
+                    const TrtBackendOption& option = TrtBackendOption(),
+                    bool from_memory_buffer = false);
+  bool InitFromTrt(const std::string& trt_engine_file,
+                   const TrtBackendOption& option = TrtBackendOption());
+
+  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs);
+
+  int NumInputs() const { return inputs_desc_.size(); }
+  int NumOutputs() const { return outputs_desc_.size(); }
+  TensorInfo GetInputInfo(int index);
+  TensorInfo GetOutputInfo(int index);
+
+ private:
+  std::shared_ptr<nvinfer1::ICudaEngine> engine_;
+  std::shared_ptr<nvinfer1::IExecutionContext> context_;
+  SampleUniquePtr<nvonnxparser::IParser> parser_;
+  SampleUniquePtr<nvinfer1::IBuilder> builder_;
+  SampleUniquePtr<nvinfer1::INetworkDefinition> network_;
+  cudaStream_t stream_{};
+  std::vector<void*> bindings_;
+  std::vector<TrtValueInfo> inputs_desc_;
+  std::vector<TrtValueInfo> outputs_desc_;
+  std::map<std::string, DeviceBuffer> inputs_buffer_;
+  std::map<std::string, DeviceBuffer> outputs_buffer_;
+
+  // When a model has more than one output, the output order of the
+  // TensorRT network may differ from the original ONNX model. This map
+  // records the original output order so it can be restored.
+  std::map<std::string, int> outputs_order_;
+
+  void GetInputOutputInfo();
+  void AllocateBufferInDynamicShape(const std::vector<FDTensor>& inputs,
+                                    std::vector<FDTensor>* outputs);
+  bool CreateTrtEngine(const std::string& onnx_model,
+                       const TrtBackendOption& option);
+};
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/core/config.h b/csrc/fastdeploy/core/config.h
new file mode 100644
index 000000000..7fa16d577
--- /dev/null
+++ b/csrc/fastdeploy/core/config.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#ifndef FASTDEPLOY_DEBUG +/* #undef FASTDEPLOY_DEBUG */ +#endif + +#ifndef FASTDEPLOY_LIB +/* #undef FASTDEPLOY_LIB */ +#endif + +#ifndef ENABLE_PADDLE_FRONTEND +#define ENABLE_PADDLE_FRONTEND +#endif + +#ifndef ENABLE_ORT_BACKEND +#define ENABLE_ORT_BACKEND +#endif + +#ifndef ENABLE_PADDLE_BACKEND +#define ENABLE_PADDLE_BACKEND +#endif + +#ifndef WITH_GPU +#define WITH_GPU +#endif + +#ifndef ENABLE_TRT_BACKEND +/* #undef ENABLE_TRT_BACKEND */ +#endif + +#ifndef ENABLE_VISION +#define ENABLE_VISION +#endif + +#ifndef ENABLE_OPENCV_CUDA +/* #undef ENABLE_OPENCV_CUDA */ +#endif + +#ifndef ENABLE_VISION_VISUALIZE +#define ENABLE_VISION_VISUALIZE +#endif diff --git a/csrc/fastdeploy/core/config.h.in b/csrc/fastdeploy/core/config.h.in new file mode 100644 index 000000000..771392586 --- /dev/null +++ b/csrc/fastdeploy/core/config.h.in @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#ifndef FASTDEPLOY_DEBUG +#cmakedefine FASTDEPLOY_DEBUG +#endif + +#ifndef FASTDEPLOY_LIB +#cmakedefine FASTDEPLOY_LIB +#endif + +#ifndef ENABLE_PADDLE_FRONTEND +#cmakedefine ENABLE_PADDLE_FRONTEND +#endif + +#ifndef ENABLE_ORT_BACKEND +#cmakedefine ENABLE_ORT_BACKEND +#endif + +#ifndef ENABLE_PADDLE_BACKEND +#cmakedefine ENABLE_PADDLE_BACKEND +#endif + +#ifndef WITH_GPU +#cmakedefine WITH_GPU +#endif + +#ifndef ENABLE_TRT_BACKEND +#cmakedefine ENABLE_TRT_BACKEND +#endif + +#ifndef ENABLE_VISION +#cmakedefine ENABLE_VISION +#endif + +#ifndef ENABLE_OPENCV_CUDA +#cmakedefine ENABLE_OPENCV_CUDA +#endif + +#ifndef ENABLE_VISION_VISUALIZE +#cmakedefine ENABLE_VISION_VISUALIZE +#endif diff --git a/csrc/fastdeploy/core/fd_tensor.cc b/csrc/fastdeploy/core/fd_tensor.cc new file mode 100644 index 000000000..c6f7a4739 --- /dev/null +++ b/csrc/fastdeploy/core/fd_tensor.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
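+
+// Sketch of the two ways an FDTensor can hold data (illustrative comment
+// only; the tensor name "x" and the buffer below are placeholders):
+//
+//   fastdeploy::FDTensor t("x");
+//   t.Allocate({2, 3}, fastdeploy::FDDataType::FP32);  // tensor-owned buffer
+//   float* p = static_cast<float*>(t.MutableData());
+//
+//   float user_buf[6] = {0.f};
+//   t.SetExternalData({2, 3}, fastdeploy::FDDataType::FP32, user_buf);
+//   // Data()/MutableData() now return user_buf; the caller keeps ownership.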
+ +#include "fastdeploy/core/fd_tensor.h" +#include "fastdeploy/utils/utils.h" + +#ifdef WITH_GPU +#include +#endif + +namespace fastdeploy { + +void* FDTensor::MutableData() { + if (external_data_ptr != nullptr) { + return external_data_ptr; + } + return data.data(); +} + +void* FDTensor::Data() { + if (external_data_ptr != nullptr) { + if (device == Device::GPU) { +#ifdef WITH_GPU + // need to copy cuda mem to cpu first + temporary_cpu_buffer.resize(Nbytes()); + FDASSERT(cudaMemcpy(temporary_cpu_buffer.data(), external_data_ptr, + Nbytes(), cudaMemcpyDeviceToHost) == 0, + "[ERROR] Error occurs while copy memory from GPU to CPU"); + return temporary_cpu_buffer.data(); +#else + FDASSERT(false, + "The FastDeploy didn't compile under -DWITH_GPU=ON, so this is " + "an unexpected problem happend."); +#endif + } else { + return external_data_ptr; + } + } + return data.data(); +} + +const void* FDTensor::Data() const { + if (external_data_ptr != nullptr) { + return external_data_ptr; + } + return data.data(); +} + +void FDTensor::SetExternalData(const std::vector& new_shape, + const FDDataType& data_type, void* data_buffer) { + dtype = data_type; + shape.assign(new_shape.begin(), new_shape.end()); + external_data_ptr = data_buffer; +} + +void FDTensor::Allocate(const std::vector& new_shape, + const FDDataType& data_type, + const std::string& tensor_name) { + dtype = data_type; + name = tensor_name; + shape.assign(new_shape.begin(), new_shape.end()); + int unit = FDDataTypeSize(data_type); + int total_size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + data.resize(total_size * unit); +} + +int FDTensor::Nbytes() const { return Numel() * FDDataTypeSize(dtype); } + +int FDTensor::Numel() const { + return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); +} + +template +void CalculateStatisInfo(void* src_ptr, int size, double* mean, double* max, + double* min) { + T* ptr = static_cast(src_ptr); + *mean = 0; + *max = -99999999; + *min = 99999999; + for (int i = 0; i < size; ++i) { + if (*(ptr + i) > *max) { + *max = *(ptr + i); + } + if (*(ptr + i) < *min) { + *min = *(ptr + i); + } + *mean += *(ptr + i); + } + *mean = *mean / size; +} + +void FDTensor::PrintInfo(const std::string& prefix) { + double mean = 0; + double max = -99999999; + double min = 99999999; + if (dtype == FDDataType::FP32) { + CalculateStatisInfo(Data(), Numel(), &mean, &max, &min); + } else if (dtype == FDDataType::FP64) { + CalculateStatisInfo(Data(), Numel(), &mean, &max, &min); + } else if (dtype == FDDataType::INT8) { + CalculateStatisInfo(Data(), Numel(), &mean, &max, &min); + } else if (dtype == FDDataType::UINT8) { + CalculateStatisInfo(Data(), Numel(), &mean, &max, &min); + } else if (dtype == FDDataType::INT32) { + CalculateStatisInfo(Data(), Numel(), &mean, &max, &min); + } else if (dtype == FDDataType::INT64) { + CalculateStatisInfo(Data(), Numel(), &mean, &max, &min); + } else { + FDASSERT(false, + "PrintInfo function doesn't support current situation, maybe you " + "need enhance this function now.") + } + std::cout << prefix << ": shape="; + for (int i = 0; i < shape.size(); ++i) { + std::cout << shape[i] << " "; + } + std::cout << ", dtype=" << Str(dtype) << ", mean=" << mean << ", max=" << max + << ", min=" << min << std::endl; +} + +FDTensor::FDTensor(const std::string& tensor_name) { name = tensor_name; } +} // namespace fastdeploy diff --git a/csrc/fastdeploy/core/fd_tensor.h b/csrc/fastdeploy/core/fd_tensor.h new file mode 100644 index 000000000..84e8c7ff0 --- 
+++ b/csrc/fastdeploy/core/fd_tensor.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <cstdint>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "fastdeploy/core/fd_type.h"
+
+namespace fastdeploy {
+
+struct FASTDEPLOY_DECL FDTensor {
+  std::vector<int8_t> data;
+  std::vector<int64_t> shape;
+  std::string name = "";
+  FDDataType dtype;
+
+  // Used to skip the memory copy step: external_data_ptr points to
+  // user-allocated memory, and the user is responsible for allocating
+  // and releasing it.
+  void* external_data_ptr = nullptr;
+  // The internal data is always on the CPU. Sometimes the external data
+  // is on the GPU and inference also runs on the GPU, so the data
+  // transfer can be skipped, which may improve efficiency.
+  Device device = Device::CPU;
+
+  // If the external data is not on the CPU, this temporary buffer is
+  // used to transfer the data to the CPU whenever we need to access it
+  // there.
+  std::vector<int8_t> temporary_cpu_buffer;
+
+  // Get data buffer pointer
+  void* MutableData();
+
+  // Use this function to get the tensor data for processing.
+  // Since the most common scenario is processing data on the CPU, this
+  // function will return a pointer to a CPU memory buffer. If the
+  // original data is on another device, it is copied to the CPU and
+  // stored in `temporary_cpu_buffer`.
+  void* Data();
+
+  const void* Data() const;
+
+  // Set a user memory buffer for the tensor. The memory is managed by
+  // the user, and the tensor shares it with the user, so take care with
+  // the buffer's lifetime.
+  void SetExternalData(const std::vector<int64_t>& new_shape,
+                       const FDDataType& data_type, void* data_buffer);
+
+  // Initialize the tensor: set its attributes and allocate the CPU
+  // memory buffer.
+  void Allocate(const std::vector<int64_t>& new_shape,
+                const FDDataType& data_type,
+                const std::string& tensor_name = "");
+
+  // Total size of tensor memory buffer in bytes
+  int Nbytes() const;
+
+  // Total number of elements in this tensor
+  int Numel() const;
+
+  // Debug function: print shape, dtype, mean, max, min;
+  // prefix will also be printed as a tag.
+  void PrintInfo(const std::string& prefix = "TensorInfo: ");
+
+  FDTensor() {}
+  explicit FDTensor(const std::string& tensor_name);
+};
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/core/fd_type.cc b/csrc/fastdeploy/core/fd_type.cc
new file mode 100644
index 000000000..ae70fa6e5
--- /dev/null
+++ b/csrc/fastdeploy/core/fd_type.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/core/fd_type.h"
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+
+int FDDataTypeSize(const FDDataType& data_type) {
+  FDASSERT(data_type != FDDataType::FP16, "Float16 is not supported.");
+  if (data_type == FDDataType::BOOL) {
+    return sizeof(bool);
+  } else if (data_type == FDDataType::INT16) {
+    return sizeof(int16_t);
+  } else if (data_type == FDDataType::INT32) {
+    return sizeof(int32_t);
+  } else if (data_type == FDDataType::INT64) {
+    return sizeof(int64_t);
+  } else if (data_type == FDDataType::FP32) {
+    return sizeof(float);
+  } else if (data_type == FDDataType::FP64) {
+    return sizeof(double);
+  } else if (data_type == FDDataType::UINT8) {
+    return sizeof(uint8_t);
+  } else {
+    FDASSERT(false, "Unexpected data type: " + Str(data_type));
+  }
+  return -1;
+}
+
+std::string Str(const Device& d) {
+  std::string out;
+  switch (d) {
+    case Device::DEFAULT:
+      out = "Device::DEFAULT";
+      break;
+    case Device::CPU:
+      out = "Device::CPU";
+      break;
+    case Device::GPU:
+      out = "Device::GPU";
+      break;
+    default:
+      out = "Device::UNKNOWN";
+  }
+  return out;
+}
+
+std::string Str(const FDDataType& fdt) {
+  std::string out;
+  switch (fdt) {
+    case FDDataType::BOOL:
+      out = "FDDataType::BOOL";
+      break;
+    case FDDataType::INT16:
+      out = "FDDataType::INT16";
+      break;
+    case FDDataType::INT32:
+      out = "FDDataType::INT32";
+      break;
+    case FDDataType::INT64:
+      out = "FDDataType::INT64";
+      break;
+    case FDDataType::FP32:
+      out = "FDDataType::FP32";
+      break;
+    case FDDataType::FP64:
+      out = "FDDataType::FP64";
+      break;
+    case FDDataType::FP16:
+      out = "FDDataType::FP16";
+      break;
+    case FDDataType::UINT8:
+      out = "FDDataType::UINT8";
+      break;
+    case FDDataType::INT8:
+      out = "FDDataType::INT8";
+      break;
+    default:
+      out = "FDDataType::UNKNOWN";
+  }
+  return out;
+}
+
+template <typename T>
+const FDDataType TypeToDataType<T>::dtype = UNKNOWN1;
+
+template <>
+const FDDataType TypeToDataType<bool>::dtype = BOOL;
+
+template <>
+const FDDataType TypeToDataType<int16_t>::dtype = INT16;
+
+template <>
+const FDDataType TypeToDataType<int32_t>::dtype = INT32;
+
+template <>
+const FDDataType TypeToDataType<int64_t>::dtype = INT64;
+
+template <>
+const FDDataType TypeToDataType<float>::dtype = FP32;
+
+template <>
+const FDDataType TypeToDataType<double>::dtype = FP64;
+
+template <>
+const FDDataType TypeToDataType<uint8_t>::dtype = UINT8;
+
+template <>
+const FDDataType TypeToDataType<int8_t>::dtype = INT8;
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/core/fd_type.h b/csrc/fastdeploy/core/fd_type.h
new file mode 100644
index 000000000..50b00dca8
--- /dev/null
+++ b/csrc/fastdeploy/core/fd_type.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "fastdeploy/core/config.h"
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+
+enum FASTDEPLOY_DECL Device { DEFAULT, CPU, GPU };
+
+FASTDEPLOY_DECL std::string Str(const Device& d);
+
+enum FASTDEPLOY_DECL FDDataType {
+  BOOL,
+  INT16,
+  INT32,
+  INT64,
+  FP16,
+  FP32,
+  FP64,
+  UNKNOWN1,
+  UNKNOWN2,
+  UNKNOWN3,
+  UNKNOWN4,
+  UNKNOWN5,
+  UNKNOWN6,
+  UNKNOWN7,
+  UNKNOWN8,
+  UNKNOWN9,
+  UNKNOWN10,
+  UNKNOWN11,
+  UNKNOWN12,
+  UNKNOWN13,
+  UINT8,
+  INT8
+};
+
+FASTDEPLOY_DECL std::string Str(const FDDataType& fdt);
+
+FASTDEPLOY_DECL int32_t FDDataTypeSize(const FDDataType& data_dtype);
+
+template <typename T>
+struct FASTDEPLOY_DECL TypeToDataType {
+  static const FDDataType dtype;
+};
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/fastdeploy_model.cc b/csrc/fastdeploy/fastdeploy_model.cc
new file mode 100644
index 000000000..c4dbc70a7
--- /dev/null
+++ b/csrc/fastdeploy/fastdeploy_model.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/utils/unique_ptr.h"
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+
+bool FastDeployModel::InitRuntime() {
+  FDASSERT(
+      CheckModelFormat(runtime_option.model_file, runtime_option.model_format),
+      "ModelFormatCheck Failed.");
+  if (runtime_initialized_) {
+    FDERROR << "The model is already initialized, cannot be initialized "
+               "again."
+            << std::endl;
+    return false;
+  }
+  if (runtime_option.backend != Backend::UNKNOWN) {
+    if (runtime_option.backend == Backend::ORT) {
+      if (!IsBackendAvailable(Backend::ORT)) {
+        FDERROR << "Backend::ORT is not compiled into the current FastDeploy "
+                   "library."
+                << std::endl;
+        return false;
+      }
+    } else if (runtime_option.backend == Backend::TRT) {
+      if (!IsBackendAvailable(Backend::TRT)) {
+        FDERROR << "Backend::TRT is not compiled into the current FastDeploy "
+                   "library."
+                << std::endl;
+        return false;
+      }
+    } else if (runtime_option.backend == Backend::PDINFER) {
+      if (!IsBackendAvailable(Backend::PDINFER)) {
+        FDERROR << "Backend::PDINFER is not compiled into the current "
+                   "FastDeploy library."
+                << std::endl;
+        return false;
+      }
+    } else {
+      FDERROR
+          << "Only support Backend::ORT / Backend::TRT / Backend::PDINFER now."
+          << std::endl;
+      return false;
+    }
+    runtime_ = utils::make_unique<Runtime>();
+    if (!runtime_->Init(runtime_option)) {
+      return false;
+    }
+    runtime_initialized_ = true;
+    return true;
+  }
+
+  if (runtime_option.device == Device::CPU) {
+    return CreateCpuBackend();
+  } else if (runtime_option.device == Device::GPU) {
+#ifdef WITH_GPU
+    return CreateGpuBackend();
+#else
+    FDERROR << "The compiled FastDeploy library doesn't support GPU now."
+            << std::endl;
+    return false;
+#endif
+  }
+  FDERROR << "Only support CPU/GPU now." << std::endl;
+  return false;
+}
+
+bool FastDeployModel::CreateCpuBackend() {
+  if (valid_cpu_backends.size() == 0) {
+    FDERROR << "There are no valid CPU backends for model: " << ModelName()
+            << std::endl;
+    return false;
+  }
+
+  for (size_t i = 0; i < valid_cpu_backends.size(); ++i) {
+    if (!IsBackendAvailable(valid_cpu_backends[i])) {
+      continue;
+    }
+    runtime_option.backend = valid_cpu_backends[i];
+    runtime_ = std::unique_ptr<Runtime>(new Runtime());
+    if (!runtime_->Init(runtime_option)) {
+      return false;
+    }
+    runtime_initialized_ = true;
+    return true;
+  }
+  FDERROR << "Found no valid backend for model: " << ModelName() << std::endl;
+  return false;
+}
+
+bool FastDeployModel::CreateGpuBackend() {
+  if (valid_gpu_backends.size() == 0) {
+    FDERROR << "There are no valid GPU backends for model: " << ModelName()
+            << std::endl;
+    return false;
+  }
+
+  for (size_t i = 0; i < valid_gpu_backends.size(); ++i) {
+    if (!IsBackendAvailable(valid_gpu_backends[i])) {
+      continue;
+    }
+    runtime_option.backend = valid_gpu_backends[i];
+    runtime_ = std::unique_ptr<Runtime>(new Runtime());
+    if (!runtime_->Init(runtime_option)) {
+      return false;
+    }
+    runtime_initialized_ = true;
+    return true;
+  }
+  FDERROR << "Cannot find an available GPU backend to load this model."
+          << std::endl;
+  return false;
+}
+
+bool FastDeployModel::Infer(std::vector<FDTensor>& input_tensors,
+                            std::vector<FDTensor>* output_tensors) {
+  return runtime_->Infer(input_tensors, output_tensors);
+}
+
+void FastDeployModel::EnableDebug() {
+#ifdef FASTDEPLOY_DEBUG
+  debug_ = true;
+#else
+  FDWARNING << "FastDeploy was not compiled with -DENABLE_DEBUG=ON, so "
+               "debug mode cannot be enabled."
+            << std::endl;
+  debug_ = false;
+#endif
+}
+
+bool FastDeployModel::DebugEnabled() { return debug_; }
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/fastdeploy_model.h b/csrc/fastdeploy/fastdeploy_model.h
new file mode 100644
index 000000000..df83ac525
--- /dev/null
+++ b/csrc/fastdeploy/fastdeploy_model.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
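+
+// Sketch of how a concrete model is expected to subclass FastDeployModel
+// (illustrative comment; "MyModel" is hypothetical, not part of this diff):
+//
+//   class MyModel : public fastdeploy::FastDeployModel {
+//    public:
+//     MyModel(const std::string& model, const std::string& params) {
+//       runtime_option.SetModelPath(model, params);
+//       valid_cpu_backends = {Backend::ORT, Backend::PDINFER};
+//       valid_gpu_backends = {Backend::ORT, Backend::TRT};
+//       initialized = InitRuntime();
+//     }
+//     std::string ModelName() const override { return "MyModel"; }
+//   };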
+#pragma once
+#include "fastdeploy/fastdeploy_runtime.h"
+
+namespace fastdeploy {
+
+class FASTDEPLOY_DECL FastDeployModel {
+ public:
+  virtual std::string ModelName() const { return "NameUndefined"; }
+
+  virtual bool InitRuntime();
+  virtual bool CreateCpuBackend();
+  virtual bool CreateGpuBackend();
+  virtual bool Infer(std::vector<FDTensor>& input_tensors,
+                     std::vector<FDTensor>* output_tensors);
+
+  RuntimeOption runtime_option;
+  std::vector<Backend> valid_cpu_backends = {Backend::ORT};
+  std::vector<Backend> valid_gpu_backends = {Backend::ORT};
+  std::vector<Backend> valid_external_backends;
+  bool initialized = false;
+  virtual int NumInputsOfRuntime() { return runtime_->NumInputs(); }
+  virtual int NumOutputsOfRuntime() { return runtime_->NumOutputs(); }
+  virtual TensorInfo InputInfoOfRuntime(int index) {
+    return runtime_->GetInputInfo(index);
+  }
+  virtual TensorInfo OutputInfoOfRuntime(int index) {
+    return runtime_->GetOutputInfo(index);
+  }
+  virtual bool Initialized() const {
+    return runtime_initialized_ && initialized;
+  }
+
+  virtual void EnableDebug();
+  virtual bool DebugEnabled();
+
+ private:
+  std::unique_ptr<Runtime> runtime_;
+  bool runtime_initialized_ = false;
+  bool debug_ = false;
+};
+
+#define TIMERECORD_START(id) \
+  TimeCounter tc_##id;       \
+  tc_##id.Start();
+
+#define TIMERECORD_END(id, prefix)                                          \
+  if (DebugEnabled()) {                                                     \
+    tc_##id.End();                                                          \
+    FDLogger() << __FILE__ << "(" << __LINE__ << "):" << __FUNCTION__       \
+               << " " << prefix << " duration = " << tc_##id.Duration()     \
+               << "s." << std::endl;                                        \
+  }
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/fastdeploy_runtime.cc b/csrc/fastdeploy/fastdeploy_runtime.cc
new file mode 100644
index 000000000..e5c41a29a
--- /dev/null
+++ b/csrc/fastdeploy/fastdeploy_runtime.cc
@@ -0,0 +1,365 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
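+
+// End-to-end Runtime usage sketch (illustrative comment; the paths and the
+// input name "x" below are placeholders):
+//
+//   fastdeploy::RuntimeOption option;
+//   option.SetModelPath("model.pdmodel", "model.pdiparams");  // Paddle format
+//   option.UseGpu(0);
+//   option.UseTrtBackend();
+//   option.SetTrtInputShape("x", {1, 3, 224, 224});
+//   fastdeploy::Runtime runtime;
+//   if (!runtime.Init(option)) { /* handle failure */ }
+//   std::vector<fastdeploy::FDTensor> inputs(runtime.NumInputs()), outputs;
+//   // ... fill inputs[i] via Allocate()/SetExternalData() ...
+//   runtime.Infer(inputs, &outputs);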
+ +#include "fastdeploy/fastdeploy_runtime.h" +#include "fastdeploy/utils/unique_ptr.h" +#include "fastdeploy/utils/utils.h" + +#ifdef ENABLE_ORT_BACKEND +#include "fastdeploy/backends/ort/ort_backend.h" +#endif + +#ifdef ENABLE_TRT_BACKEND +#include "fastdeploy/backends/tensorrt/trt_backend.h" +#endif + +#ifdef ENABLE_PADDLE_BACKEND +#include "fastdeploy/backends/paddle/paddle_backend.h" +#endif + +namespace fastdeploy { + +std::vector GetAvailableBackends() { + std::vector backends; +#ifdef ENABLE_ORT_BACKEND + backends.push_back(Backend::ORT); +#endif +#ifdef ENABLE_TRT_BACKEND + backends.push_back(Backend::TRT); +#endif +#ifdef ENABLE_PADDLE_BACKEND + backends.push_back(Backend::PDINFER); +#endif + return backends; +} + +bool IsBackendAvailable(const Backend& backend) { + std::vector backends = GetAvailableBackends(); + for (size_t i = 0; i < backends.size(); ++i) { + if (backend == backends[i]) { + return true; + } + } + return false; +} + +std::string Str(const Backend& b) { + if (b == Backend::ORT) { + return "Backend::ORT"; + } else if (b == Backend::TRT) { + return "Backend::TRT"; + } else if (b == Backend::PDINFER) { + return "Backend::PDINFER"; + } + return "UNKNOWN-Backend"; +} + +std::string Str(const Frontend& f) { + if (f == Frontend::PADDLE) { + return "Frontend::PADDLE"; + } else if (f == Frontend::ONNX) { + return "Frontend::ONNX"; + } + return "UNKNOWN-Frontend"; +} + +bool CheckModelFormat(const std::string& model_file, + const Frontend& model_format) { + if (model_format == Frontend::PADDLE) { + if (model_file.size() < 8 || + model_file.substr(model_file.size() - 8, 8) != ".pdmodel") { + FDERROR << "With model format of Frontend::PADDLE, the model file " + "should ends with `.pdmodel`, but now it's " + << model_file << std::endl; + return false; + } + } else if (model_format == Frontend::ONNX) { + if (model_file.size() < 5 || + model_file.substr(model_file.size() - 5, 5) != ".onnx") { + FDERROR << "With model format of Frontend::ONNX, the model file " + "should ends with `.onnx`, but now it's " + << model_file << std::endl; + return false; + } + } else { + FDERROR << "Only support model format with frontend Frontend::PADDLE / " + "Frontend::ONNX." + << std::endl; + return false; + } + return true; +} + +Frontend GuessModelFormat(const std::string& model_file) { + if (model_file.size() > 8 && + model_file.substr(model_file.size() - 8, 8) == ".pdmodel") { + FDLogger() << "Model Format: PaddlePaddle." << std::endl; + return Frontend::PADDLE; + } else if (model_file.size() > 5 && + model_file.substr(model_file.size() - 5, 5) == ".onnx") { + FDLogger() << "Model Format: ONNX." << std::endl; + return Frontend::ONNX; + } + + FDERROR << "Cannot guess which model format you are using, please set " + "RuntimeOption::model_format manually." + << std::endl; + return Frontend::PADDLE; +} + +void RuntimeOption::SetModelPath(const std::string& model_path, + const std::string& params_path, + const std::string& _model_format) { + if (_model_format == "paddle") { + model_file = model_path; + params_file = params_path; + model_format = Frontend::PADDLE; + } else if (_model_format == "onnx") { + model_file = model_path; + model_format = Frontend::ONNX; + } else { + FDASSERT(false, "The model format only can be 'paddle' or 'onnx'."); + } +} + +void RuntimeOption::UseGpu(int gpu_id) { +#ifdef WITH_GPU + device = Device::GPU; + device_id = gpu_id; +#else + FDWARNING << "The FastDeploy didn't compile with GPU, will force to use CPU." 
+            << std::endl;
+  device = Device::CPU;
+#endif
+}
+
+void RuntimeOption::UseCpu() { device = Device::CPU; }
+
+void RuntimeOption::SetCpuThreadNum(int thread_num) {
+  FDASSERT(thread_num > 0, "The thread_num must be greater than 0.");
+  cpu_thread_num = thread_num;
+}
+
+// use paddle inference backend
+void RuntimeOption::UsePaddleBackend() {
+#ifdef ENABLE_PADDLE_BACKEND
+  backend = Backend::PDINFER;
+#else
+  FDASSERT(false, "FastDeploy was not compiled with Paddle Inference.");
+#endif
+}
+
+// use onnxruntime backend
+void RuntimeOption::UseOrtBackend() {
+#ifdef ENABLE_ORT_BACKEND
+  backend = Backend::ORT;
+#else
+  FDASSERT(false, "FastDeploy was not compiled with OrtBackend.");
+#endif
+}
+
+void RuntimeOption::UseTrtBackend() {
+#ifdef ENABLE_TRT_BACKEND
+  backend = Backend::TRT;
+#else
+  FDASSERT(false, "FastDeploy was not compiled with TrtBackend.");
+#endif
+}
+
+void RuntimeOption::EnablePaddleMKLDNN() { pd_enable_mkldnn = true; }
+
+void RuntimeOption::DisablePaddleMKLDNN() { pd_enable_mkldnn = false; }
+
+void RuntimeOption::SetPaddleMKLDNNCacheSize(int size) {
+  FDASSERT(size > 0, "Parameter size must be greater than 0.");
+  pd_mkldnn_cache_size = size;
+}
+
+void RuntimeOption::SetTrtInputShape(const std::string& input_name,
+                                     const std::vector<int32_t>& min_shape,
+                                     const std::vector<int32_t>& opt_shape,
+                                     const std::vector<int32_t>& max_shape) {
+  trt_min_shape[input_name].clear();
+  trt_max_shape[input_name].clear();
+  trt_opt_shape[input_name].clear();
+  trt_min_shape[input_name].assign(min_shape.begin(), min_shape.end());
+  if (opt_shape.size() == 0) {
+    trt_opt_shape[input_name].assign(min_shape.begin(), min_shape.end());
+  } else {
+    trt_opt_shape[input_name].assign(opt_shape.begin(), opt_shape.end());
+  }
+  if (max_shape.size() == 0) {
+    trt_max_shape[input_name].assign(min_shape.begin(), min_shape.end());
+  } else {
+    trt_max_shape[input_name].assign(max_shape.begin(), max_shape.end());
+  }
+}
+
+void RuntimeOption::EnableTrtFP16() { trt_enable_fp16 = true; }
+
+void RuntimeOption::DisableTrtFP16() { trt_enable_fp16 = false; }
+
+void RuntimeOption::SetTrtCacheFile(const std::string& cache_file_path) {
+  trt_serialize_file = cache_file_path;
+}
+
+bool Runtime::Init(const RuntimeOption& _option) {
+  option = _option;
+  if (option.model_format == Frontend::AUTOREC) {
+    option.model_format = GuessModelFormat(_option.model_file);
+  }
+  if (option.backend == Backend::UNKNOWN) {
+    if (IsBackendAvailable(Backend::ORT)) {
+      option.backend = Backend::ORT;
+    } else if (IsBackendAvailable(Backend::PDINFER)) {
+      option.backend = Backend::PDINFER;
+    } else {
+      FDERROR << "Please define backend in RuntimeOption, currently it's "
+                 "Backend::UNKNOWN."
+              << std::endl;
+      return false;
+    }
+  }
+  if (option.backend == Backend::ORT) {
+    FDASSERT(option.device == Device::CPU || option.device == Device::GPU,
+             "Backend::ORT only supports Device::CPU/Device::GPU.");
+    CreateOrtBackend();
+  } else if (option.backend == Backend::TRT) {
+    FDASSERT(option.device == Device::GPU,
+             "Backend::TRT only supports Device::GPU.");
+    CreateTrtBackend();
+  } else if (option.backend == Backend::PDINFER) {
+    FDASSERT(option.device == Device::CPU || option.device == Device::GPU,
+             "Backend::PDINFER only supports Device::CPU/Device::GPU.");
+    FDASSERT(
+        option.model_format == Frontend::PADDLE,
+        "Backend::PDINFER only supports model format of Frontend::PADDLE.");
+    CreatePaddleBackend();
+  } else {
+    FDERROR << "Runtime only supports "
+               "Backend::ORT/Backend::TRT/Backend::PDINFER as backend now."
+            << std::endl;
+    return false;
+  }
+  return true;
+}
+
+TensorInfo Runtime::GetInputInfo(int index) {
+  return backend_->GetInputInfo(index);
+}
+
+TensorInfo Runtime::GetOutputInfo(int index) {
+  return backend_->GetOutputInfo(index);
+}
+
+bool Runtime::Infer(std::vector<FDTensor>& input_tensors,
+                    std::vector<FDTensor>* output_tensors) {
+  return backend_->Infer(input_tensors, output_tensors);
+}
+
+void Runtime::CreatePaddleBackend() {
+#ifdef ENABLE_PADDLE_BACKEND
+  auto pd_option = PaddleBackendOption();
+  pd_option.enable_mkldnn = option.pd_enable_mkldnn;
+  pd_option.mkldnn_cache_size = option.pd_mkldnn_cache_size;
+  pd_option.use_gpu = (option.device == Device::GPU) ? true : false;
+  pd_option.gpu_id = option.device_id;
+  pd_option.cpu_thread_num = option.cpu_thread_num;
+  FDASSERT(option.model_format == Frontend::PADDLE,
+           "PaddleBackend only supports model format of Frontend::PADDLE.");
+  backend_ = utils::make_unique<PaddleBackend>();
+  auto casted_backend = dynamic_cast<PaddleBackend*>(backend_.get());
+  FDASSERT(casted_backend->InitFromPaddle(option.model_file,
+                                          option.params_file, pd_option),
+           "Load model from Paddle failed while initializing PaddleBackend.");
+#else
+  FDASSERT(false,
+           "PaddleBackend is not available, please compile with "
+           "ENABLE_PADDLE_BACKEND=ON.");
+#endif
+}
+
+void Runtime::CreateOrtBackend() {
+#ifdef ENABLE_ORT_BACKEND
+  auto ort_option = OrtBackendOption();
+  ort_option.graph_optimization_level = option.ort_graph_opt_level;
+  ort_option.intra_op_num_threads = option.cpu_thread_num;
+  ort_option.inter_op_num_threads = option.ort_inter_op_num_threads;
+  ort_option.execution_mode = option.ort_execution_mode;
+  ort_option.use_gpu = (option.device == Device::GPU) ? true : false;
+  ort_option.gpu_id = option.device_id;
+
+  // TODO(jiangjiajun): inside usage, maybe remove this later
+  ort_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
+  ort_option.custom_op_info_ = option.custom_op_info_;
+
+  FDASSERT(option.model_format == Frontend::PADDLE ||
+               option.model_format == Frontend::ONNX,
+           "OrtBackend only supports model format of Frontend::PADDLE / "
+           "Frontend::ONNX.");
+  backend_ = utils::make_unique<OrtBackend>();
+  auto casted_backend = dynamic_cast<OrtBackend*>(backend_.get());
+  if (option.model_format == Frontend::ONNX) {
+    FDASSERT(casted_backend->InitFromOnnx(option.model_file, ort_option),
+             "Load model from ONNX failed while initializing OrtBackend.");
+  } else {
+    FDASSERT(casted_backend->InitFromPaddle(option.model_file,
+                                            option.params_file, ort_option),
+             "Load model from Paddle failed while initializing OrtBackend.");
+  }
+#else
+  FDASSERT(false,
+           "OrtBackend is not available, please compile with "
+           "ENABLE_ORT_BACKEND=ON.");
+#endif
+}
+
+void Runtime::CreateTrtBackend() {
+#ifdef ENABLE_TRT_BACKEND
+  auto trt_option = TrtBackendOption();
+  trt_option.gpu_id = option.device_id;
+  trt_option.enable_fp16 = option.trt_enable_fp16;
+  trt_option.enable_int8 = option.trt_enable_int8;
+  trt_option.max_batch_size = option.trt_max_batch_size;
+  trt_option.max_workspace_size = option.trt_max_workspace_size;
+  trt_option.max_shape = option.trt_max_shape;
+  trt_option.min_shape = option.trt_min_shape;
+  trt_option.opt_shape = option.trt_opt_shape;
+  trt_option.serialize_file = option.trt_serialize_file;
+
+  // TODO(jiangjiajun): inside usage, maybe remove this later
+  trt_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
+  trt_option.custom_op_info_ = option.custom_op_info_;
+
+  FDASSERT(option.model_format == Frontend::PADDLE ||
+               option.model_format == Frontend::ONNX,
+           "TrtBackend only supports model format of Frontend::PADDLE / "
+           "Frontend::ONNX.");
+  backend_ = utils::make_unique<TrtBackend>();
+  auto casted_backend = dynamic_cast<TrtBackend*>(backend_.get());
+  if (option.model_format == Frontend::ONNX) {
+    FDASSERT(casted_backend->InitFromOnnx(option.model_file, trt_option),
+             "Load model from ONNX failed while initializing TrtBackend.");
+  } else {
+    FDASSERT(casted_backend->InitFromPaddle(option.model_file,
                                            option.params_file, trt_option),
+             "Load model from Paddle failed while initializing TrtBackend.");
+  }
+#else
+  FDASSERT(false,
+           "TrtBackend is not available, please compile with "
+           "ENABLE_TRT_BACKEND=ON.");
+#endif
+}
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/fastdeploy_runtime.h b/csrc/fastdeploy/fastdeploy_runtime.h
new file mode 100644
index 000000000..780945458
--- /dev/null
+++ b/csrc/fastdeploy/fastdeploy_runtime.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include "fastdeploy/backends/backend.h"
+#include "fastdeploy/utils/perf.h"
+
+namespace fastdeploy {
+
+enum FASTDEPLOY_DECL Backend { UNKNOWN, ORT, TRT, PDINFER };
+// AUTOREC decides which Frontend to use according to the model file name
+enum FASTDEPLOY_DECL Frontend { AUTOREC, PADDLE, ONNX };
+
+FASTDEPLOY_DECL std::string Str(const Backend& b);
+FASTDEPLOY_DECL std::string Str(const Frontend& f);
+FASTDEPLOY_DECL std::vector<Backend> GetAvailableBackends();
+
+FASTDEPLOY_DECL bool IsBackendAvailable(const Backend& backend);
+
+bool CheckModelFormat(const std::string& model_file,
+                      const Frontend& model_format);
+Frontend GuessModelFormat(const std::string& model_file);
+
+struct FASTDEPLOY_DECL RuntimeOption {
+  // set paths of the model file and the params file
+  // for onnx, only model_file needs to be defined, but model_format
+  // also needs to be set
+  // model_format supports 'paddle' / 'onnx' for now
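+  // e.g. (sketch): option.SetModelPath("model.pdmodel", "model.pdiparams");
+  //      or:       option.SetModelPath("model.onnx", "", "onnx");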
+  void SetModelPath(const std::string& model_path,
+                    const std::string& params_path = "",
+                    const std::string& _model_format = "paddle");
+
+  // set model inference on CPU
+  void UseCpu();
+
+  // set model inference on GPU
+  void UseGpu(int gpu_id = 0);
+
+  // set number of threads while inference on CPU
+  void SetCpuThreadNum(int thread_num);
+
+  // use paddle inference backend
+  void UsePaddleBackend();
+
+  // use onnxruntime backend
+  void UseOrtBackend();
+
+  // use tensorrt backend
+  void UseTrtBackend();
+
+  // enable mkldnn while using paddle inference on CPU
+  void EnablePaddleMKLDNN();
+  // disable mkldnn while using paddle inference on CPU
+  void DisablePaddleMKLDNN();
+
+  // set size of cached shape while enable mkldnn with paddle inference backend
+  void SetPaddleMKLDNNCacheSize(int size);
+
+  // set tensorrt shapes while the inputs of model contain dynamic shape
+  // min_shape: the minimum shape
+  // opt_shape: the most common shape while inference, defaults to empty
+  // max_shape: the maximum shape, defaults to empty
+
+  // if opt_shape and max_shape are empty, they will be kept the same as
+  // min_shape, which means the shape will be fixed as min_shape while
+  // inference
+  void SetTrtInputShape(
+      const std::string& input_name, const std::vector<int32_t>& min_shape,
+      const std::vector<int32_t>& opt_shape = std::vector<int32_t>(),
+      const std::vector<int32_t>& max_shape = std::vector<int32_t>());
+
+  // enable half precision while using tensorrt backend
+  void EnableTrtFP16();
+  // disable half precision, change to full precision (float32)
+  void DisableTrtFP16();
+
+  void SetTrtCacheFile(const std::string& cache_file_path);
+
+  Backend backend = Backend::UNKNOWN;
+  // for cpu inference and preprocess
+  int cpu_thread_num = 8;
+  int device_id = 0;
+
+  Device device = Device::CPU;
+
+  // ======Only for ORT Backend========
+  // -1 means use default value by ort
+  // 0: ORT_DISABLE_ALL 1: ORT_ENABLE_BASIC 2: ORT_ENABLE_EXTENDED 3:
+  // ORT_ENABLE_ALL
+  int ort_graph_opt_level = -1;
+  int ort_inter_op_num_threads = -1;
+  // 0: ORT_SEQUENTIAL 1: ORT_PARALLEL
+  int ort_execution_mode = -1;
+
+  // ======Only for Paddle Backend=====
+  bool pd_enable_mkldnn = true;
+  int pd_mkldnn_cache_size = 1;
+
+  // ======Only for Trt Backend=======
+  std::map<std::string, std::vector<int32_t>> trt_max_shape;
+  std::map<std::string, std::vector<int32_t>> trt_min_shape;
+  std::map<std::string, std::vector<int32_t>> trt_opt_shape;
+  std::string trt_serialize_file = "";
+  bool trt_enable_fp16 = false;
+  bool trt_enable_int8 = false;
+  size_t trt_max_batch_size = 32;
+  size_t trt_max_workspace_size = 1 << 30;
+
+  std::string model_file = "";   // Path of model file
+  std::string params_file = "";  // Path of parameters file, can be empty
+  Frontend model_format = Frontend::AUTOREC;  // format of input model
+
+  // inside parameters, only for inside usage
+  // remove multiclass_nms in Paddle2ONNX
+  bool remove_multiclass_nms_ = false;
+  // for Paddle2ONNX to export custom operators
+  std::map<std::string, std::string> custom_op_info_;
+};
+
+struct FASTDEPLOY_DECL Runtime {
+ public:
+  // explicit Runtime(const RuntimeOption& _option = RuntimeOption());
+
+  bool Init(const RuntimeOption& _option);
+
+  bool Infer(std::vector<FDTensor>& input_tensors,
+             std::vector<FDTensor>* output_tensors);
+
+  void CreateOrtBackend();
+
+  void CreatePaddleBackend();
+
+  void CreateTrtBackend();
+
+  int NumInputs() { return backend_->NumInputs(); }
+  int NumOutputs() { return backend_->NumOutputs(); }
+  TensorInfo GetInputInfo(int index);
+  TensorInfo GetOutputInfo(int index);
+
+  RuntimeOption option;
+
+ private:
+  std::unique_ptr<BaseBackend> backend_;
+};
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/function/eigen.cc b/csrc/fastdeploy/function/eigen.cc
new file mode 100644
index 000000000..adcfbb195
--- /dev/null
+++ b/csrc/fastdeploy/function/eigen.cc
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/function/eigen.h"
+
+namespace fastdeploy {
+
+std::shared_ptr<EigenDeviceWrapper> EigenDeviceWrapper::instance_ = nullptr;
+
+std::shared_ptr<EigenDeviceWrapper> EigenDeviceWrapper::GetInstance() {
+  if (instance_ == nullptr) {
+    instance_ = std::make_shared<EigenDeviceWrapper>();
+  }
+  return instance_;
+}
+
+const Eigen::DefaultDevice* EigenDeviceWrapper::GetDevice() const {
+  return &device_;
+}
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/function/eigen.h b/csrc/fastdeploy/function/eigen.h
new file mode 100644
index 000000000..32bacf064
--- /dev/null
+++ b/csrc/fastdeploy/function/eigen.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "fastdeploy/core/fd_tensor.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace fastdeploy {
+// EigenDim converts shape into Eigen::DSizes.
+template <int D>
+struct EigenDim {
+  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
+
+  static Type From(const std::vector<int64_t>& dims) {
+    Type ret;
+    for (int64_t d = 0; d < dims.size(); d++) {
+      ret[d] = dims[d];
+    }
+    return ret;
+  }
+};
+
+// Interpret FDTensor as EigenTensor and EigenConstTensor.
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenTensor {
+  using Type = Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, IndexType>>;
+
+  using ConstType =
+      Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
+
+  static Type From(FDTensor& tensor,
+                   const std::vector<int64_t>& dims) {  // NOLINT
+    return Type(reinterpret_cast<T*>(tensor.Data()), EigenDim<D>::From(dims));
+  }
+
+  static Type From(FDTensor& tensor) {  // NOLINT
+    return From(tensor, tensor.shape);
+  }  // NOLINT
+
+  static ConstType From(const FDTensor& tensor,
+                        const std::vector<int64_t>& dims) {
+    return ConstType(reinterpret_cast<const T*>(tensor.Data()),
+                     EigenDim<D>::From(dims));
+  }
+
+  static ConstType From(const FDTensor& tensor) {
+    return From(tensor, tensor.shape);
+  }
+};
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenScalar {
+  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+  using Type = Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
+  using ConstType = Eigen::TensorMap<
+      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
+
+  static Type From(FDTensor& tensor) {
+    return Type(reinterpret_cast<T*>(tensor.Data()));
+  }  // NOLINT
+
+  static ConstType From(const FDTensor& tensor) {
+    return ConstType(reinterpret_cast<const T*>(tensor.Data()));
+  }
+};
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
+  // Flatten reshapes a Tensor into an EigenVector.
+  static typename EigenVector::Type Flatten(FDTensor& tensor) {  // NOLINT
+    return EigenVector::From(tensor, {tensor.Numel()});
+  }
+
+  static typename EigenVector::ConstType Flatten(
+      const FDTensor& tensor) {  // NOLINT
+    return EigenVector::From(tensor, {tensor.Numel()});
+  }
+};
+
+class EigenDeviceWrapper {
+ public:
+  static std::shared_ptr<EigenDeviceWrapper> GetInstance();
+  const Eigen::DefaultDevice* GetDevice() const;
+
+ private:
+  Eigen::DefaultDevice device_;
+  static std::shared_ptr<EigenDeviceWrapper> instance_;
+};
+
+}  // namespace fastdeploy
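A sketch of how these adapters are typically used (illustrative; assumes a
float FDTensor already allocated and filled by the caller):

#include "fastdeploy/function/eigen.h"

void ScaleInPlace(fastdeploy::FDTensor& t) {
  // View the tensor as a flat Eigen vector and run an expression on it.
  auto vec = fastdeploy::EigenVector<float>::Flatten(t);
  const auto& dev =
      *fastdeploy::EigenDeviceWrapper::GetInstance()->GetDevice();
  vec.device(dev) = vec * 2.0f;
}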
diff --git a/csrc/fastdeploy/function/reduce.cc b/csrc/fastdeploy/function/reduce.cc
new file mode 100644
index 000000000..897504e05
--- /dev/null
+++ b/csrc/fastdeploy/function/reduce.cc
@@ -0,0 +1,246 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <set>
+
+#include "fastdeploy/function/eigen.h"
+#include "fastdeploy/function/reduce.h"
+#include "fastdeploy/function/reduce_functor.h"
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+
+#ifdef ENABLE_FDTENSOR_FUNC
+
+template <typename OutT, size_t D, size_t R_D, typename Functor>
+void ReduceFunctor(const FDTensor& input, FDTensor* output,
+                   const std::vector<int64_t>& dims, bool keep_dim) {
+  auto x = EigenTensor<OutT, D>::From(input);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto reduce_dim = Eigen::array<int, R_D>();
+  std::vector<int64_t> dims_ref = dims;
+
+  auto out_dims = input.shape;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
+    reduce_dim[i] = dims_ref[i];
+    out_dims[dims_ref[i]] = 1;
+  }
+  auto origin_output_dims = out_dims;
+  output->Allocate(origin_output_dims, TypeToDataType<OutT>::dtype);
+  // construct the squeezed output tensor
+  if (x_rank > 1) {
+    const int kDelFlag = -2;
+    for (size_t i = 0; i < dims_ref.size(); ++i) {
+      out_dims[dims_ref[i]] = kDelFlag;
+    }
+    out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag),
+                   out_dims.end());
+  }
+
+  auto& place = *EigenDeviceWrapper::GetInstance()->GetDevice();
+  Functor functor;
+  if (D == 1) {
+    auto out = EigenScalar<OutT>::From(*output);
+    functor(place, &x, &out, reduce_dim);
+  } else {
+    auto out = EigenTensor<OutT, (D - R_D)>::From(*output, out_dims);
+    functor(place, &x, &out, reduce_dim);
+    if (!keep_dim) {
+      output->shape = std::move(out_dims);
+    }
+  }
+}
+
+#define HANDLE_REDUCE_DIM(NDIM, RDIM)                                       \
+  if (ndim == NDIM && rdim == RDIM) {                                       \
+    ReduceFunctor<OutT, NDIM, RDIM, Functor>(input, output, dims, keep_dim); \
+  }
+
+inline void GetShuffledDim(const std::vector<int64_t>& src_dims,
+                           std::vector<int64_t>* dst_dims,
+                           const std::vector<int64_t>& reduced_dims,
+                           std::vector<int64_t>* perm_axis) {
+  // check if it's a reduced dim
+  std::vector<bool> src_dims_check(src_dims.size(), false);
+  size_t src_size = src_dims.size();
+  size_t reduce_size = reduced_dims.size();
+  std::vector<int64_t> regular_reduced_dims = reduced_dims;
+  for (size_t i = 0; i < regular_reduced_dims.size(); i++) {
+    if (regular_reduced_dims[i] < 0) {
+      regular_reduced_dims[i] = src_size + regular_reduced_dims[i];
+    }
+  }
+
+  for (size_t i = 0; i < reduce_size; ++i) {
+    dst_dims->at(src_size - reduce_size + i) =
+        src_dims[regular_reduced_dims[i]];
+    (*perm_axis)[src_size - reduce_size + i] = regular_reduced_dims[i];
+    src_dims_check[regular_reduced_dims[i]] = true;
+  }
+
+  size_t offset = 0;
+  for (size_t i = 0; i < src_dims_check.size(); ++i) {
+    bool is_reduced = src_dims_check[i];
+    if (!is_reduced) {
+      (*perm_axis)[offset] = i;
+      dst_dims->at(offset++) = src_dims[i];
+    }
+  }
+}
+
+template <typename OutT>
+void GetShuffledInput(const FDTensor& input, FDTensor* shuffled_input,
+                      const std::vector<int64_t>& dims) {
+  auto shuffled_dims = input.shape;
+  std::vector<int64_t> perm_axis(input.shape.size());
+  GetShuffledDim(input.shape, &shuffled_dims, dims, &perm_axis);
+
+  shuffled_input->Allocate(shuffled_dims, input.dtype);
+  // TODO(zhoushunjie) : Need to implement trans function
+  // phi::funcs::TransposeNormal trans;
+  // trans(dev_ctx, input, shuffled_input, perm_axis);
+}
+
+//////////////// HandleLargeDim
+template <typename OutT, typename Functor>
+void HandleLargeDim(const FDTensor& input, FDTensor* output,
+                    const std::vector<int64_t>& dims, bool keep_dim) {
+  // shuffle the reduced dim to the end
+  FDTensor shuffled_input;
+  GetShuffledInput<OutT>(input, &shuffled_input, dims);
+
+  // transpose to 2D tensor whose shape is {unreduced, reduced}.
+  const int64_t unreduced = output->Numel();
+  const int64_t reduced = shuffled_input.Numel() / unreduced;
+  shuffled_input.Allocate({unreduced, reduced}, TypeToDataType<OutT>::dtype);
+
+  auto output_dim = output->shape;
+  output->Allocate({unreduced}, TypeToDataType<OutT>::dtype);
+
+  ReduceFunctor<OutT, 2, 1, Functor>(shuffled_input, output, {1}, keep_dim);
+  output->shape = output_dim;
+}
+
+////////////// ReduceKernel
+
+template <typename OutT, typename Functor>
+void ReduceKernelImpl(const FDTensor& input, FDTensor* output,
+                      const std::vector<int64_t>& dims, bool keep_dim,
+                      bool reduce_all) {
+  output->Allocate({1}, TypeToDataType<OutT>::dtype);
+  const auto& dev = *EigenDeviceWrapper::GetInstance()->GetDevice();
+  if (reduce_all) {
+    // Flatten and reduce 1-D tensor
+    auto x = EigenVector<OutT>::Flatten(input);
+    auto out = EigenScalar<OutT>::From(*output);
+    auto reduce_dim = Eigen::array<int, 1>({{0}});
+
+    Functor functor;
+    functor(dev, &x, &out, reduce_dim);
+  } else {
+    int ndim = input.shape.size();
+    int rdim = dims.size();
+    if (ndim > 3) {
+      HandleLargeDim<OutT, Functor>(input, output, dims, keep_dim);
+    } else {
+      HANDLE_REDUCE_DIM(4, 3);
+      HANDLE_REDUCE_DIM(4, 2);
+      HANDLE_REDUCE_DIM(4, 1);
+      HANDLE_REDUCE_DIM(3, 2);
+      HANDLE_REDUCE_DIM(3, 1);
+      HANDLE_REDUCE_DIM(2, 1);
+      HANDLE_REDUCE_DIM(1, 1);
+    }
+  }
+}
+
+template <typename Functor>
+void BoolReduceKernel(const FDTensor& input, FDTensor* output,
+                      const std::vector<int64_t>& dims, bool keep_dim,
+                      bool reduce_all) {
+  // If the dims has full dim, set the reduce_all to be True
+  const auto& input_dim_size = input.shape.size();
+  std::set<int64_t> dims_set(dims.begin(), dims.end());
+  bool full_dim = true;
+  for (size_t i = 0; i < input_dim_size; i++) {
+    if (dims_set.find(i) == dims_set.end()) {
+      full_dim = false;
+      break;
+    }
+  }
+  reduce_all = (reduce_all || full_dim);
+
+  ReduceKernelImpl<bool, Functor>(input, output, dims, keep_dim, reduce_all);
+}
+
+template <typename Functor>
+void Reduce(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
+            bool keep_dim, bool reduce_all) {
+  // If the dims has full dim, set the reduce_all to be True
+  const int& input_dim_size = x.shape.size();
+  std::set<int64_t> dims_set(dims.begin(), dims.end());
+  bool full_dim = true;
+  for (int i = 0; i < input_dim_size; ++i) {
+    if (dims_set.find(i) == dims_set.end() &&
+        dims_set.find(i - input_dim_size) == dims_set.end()) {
+      full_dim = false;
+      break;
+    }
+  }
+  reduce_all = (reduce_all || full_dim);
+
+  FD_VISIT_ALL_TYPES(x.dtype, "ReduceKernelImpl", ([&] {
+                       ReduceKernelImpl<data_t, Functor>(x, out, dims,
+                                                         keep_dim, reduce_all);
+                     }));
+}
+
+void Max(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
+         bool keep_dim, bool reduce_all) {
+  Reduce<MaxFunctor>(x, out, dims, keep_dim, reduce_all);
+}
+
+void Min(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
+         bool keep_dim, bool reduce_all) {
+  Reduce<MinFunctor>(x, out, dims, keep_dim, reduce_all);
+}
+
+void Sum(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
+         bool keep_dim, bool reduce_all) {
+  Reduce<SumFunctor>(x, out, dims, keep_dim, reduce_all);
+}
+
+void All(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
+         bool keep_dim, bool reduce_all) {
+  BoolReduceKernel<AllFunctor>(x, out, dims, keep_dim, reduce_all);
+}
+
+void Any(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
+         bool keep_dim, bool reduce_all) {
+  BoolReduceKernel<AnyFunctor>(x, out, dims, keep_dim, reduce_all);
+}
+
+void Mean(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
+          bool keep_dim, bool reduce_all) {
+  Reduce<MeanFunctor>(x, out, dims, keep_dim, reduce_all);
+}
+
+void Prod(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
+          bool keep_dim, bool reduce_all) {
+  Reduce<ProdFunctor>(x, out, dims, keep_dim, reduce_all);
+}
+#endif
+
+}  // namespace fastdeploy
\ No newline at end of file
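A quick sketch of the reduction API in use (illustrative; assumes the library
was built with ENABLE_FDTENSOR_FUNC):

#include "fastdeploy/function/reduce.h"

void SumExample(const fastdeploy::FDTensor& x) {
  fastdeploy::FDTensor out;
  // reduce over the last axis, keeping it as a size-1 dimension
  fastdeploy::Sum(x, &out, {-1}, /* keep_dim = */ true);
}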
diff --git a/csrc/fastdeploy/function/reduce.h b/csrc/fastdeploy/function/reduce.h
new file mode 100644
index 000000000..af8810c6b
--- /dev/null
+++ b/csrc/fastdeploy/function/reduce.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fastdeploy/core/fd_tensor.h"
+
+namespace fastdeploy {
+
+#ifdef ENABLE_FDTENSOR_FUNC
+/** Execute the maximum operation for input FDTensor along given dims.
+    @param x The input tensor.
+    @param out The output tensor which stores the result.
+    @param dims The vector of axes which will be reduced.
+    @param keep_dim Whether to keep the reduced dims, default false.
+    @param reduce_all Whether to reduce all dims, default false.
+*/
+FASTDEPLOY_DECL void Max(const FDTensor& x, FDTensor* out,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim = false, bool reduce_all = false);
+
+/** Execute the minimum operation for input FDTensor along given dims.
+    @param x The input tensor.
+    @param out The output tensor which stores the result.
+    @param dims The vector of axes which will be reduced.
+    @param keep_dim Whether to keep the reduced dims, default false.
+    @param reduce_all Whether to reduce all dims, default false.
+*/
+FASTDEPLOY_DECL void Min(const FDTensor& x, FDTensor* out,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim = false, bool reduce_all = false);
+
+/** Execute the sum operation for input FDTensor along given dims.
+    @param x The input tensor.
+    @param out The output tensor which stores the result.
+    @param dims The vector of axes which will be reduced.
+    @param keep_dim Whether to keep the reduced dims, default false.
+    @param reduce_all Whether to reduce all dims, default false.
+*/
+FASTDEPLOY_DECL void Sum(const FDTensor& x, FDTensor* out,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim = false, bool reduce_all = false);
+
+/** Execute the all operation for input FDTensor along given dims.
+    @param x The input tensor.
+    @param out The output tensor which stores the result.
+    @param dims The vector of axes which will be reduced.
+    @param keep_dim Whether to keep the reduced dims, default false.
+    @param reduce_all Whether to reduce all dims, default false.
+*/
+FASTDEPLOY_DECL void All(const FDTensor& x, FDTensor* out,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim = false, bool reduce_all = false);
+
+/** Execute the any operation for input FDTensor along given dims.
+    @param x The input tensor.
+    @param out The output tensor which stores the result.
+    @param dims The vector of axes which will be reduced.
+    @param keep_dim Whether to keep the reduced dims, default false.
+    @param reduce_all Whether to reduce all dims, default false.
+*/
+FASTDEPLOY_DECL void Any(const FDTensor& x, FDTensor* out,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim = false, bool reduce_all = false);
+
+/** Execute the mean operation for input FDTensor along given dims.
+    @param x The input tensor.
+    @param out The output tensor which stores the result.
+    @param dims The vector of axes which will be reduced.
+    @param keep_dim Whether to keep the reduced dims, default false.
+    @param reduce_all Whether to reduce all dims, default false.
+*/
+FASTDEPLOY_DECL void Mean(const FDTensor& x, FDTensor* out,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim = false, bool reduce_all = false);
+
+/** Execute the product operation for input FDTensor along given dims.
+    @param x The input tensor.
+    @param out The output tensor which stores the result.
+    @param dims The vector of axes which will be reduced.
+    @param keep_dim Whether to keep the reduced dims, default false.
+    @param reduce_all Whether to reduce all dims, default false.
+*/
+FASTDEPLOY_DECL void Prod(const FDTensor& x, FDTensor* out,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim = false, bool reduce_all = false);
+
+#endif
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/function/reduce_functor.h b/csrc/fastdeploy/function/reduce_functor.h
new file mode 100644
index 000000000..de0c45bb3
--- /dev/null
+++ b/csrc/fastdeploy/function/reduce_functor.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fastdeploy/function/eigen.h"
+namespace fastdeploy {
+
+//////// Max Functor ///////
+struct MaxFunctor {
+  template <typename X, typename Y, typename Dim>
+  void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y,
+                  const Dim& dim) {
+    y->device(dev) = x->maximum(dim);
+  }
+};
+
+//////// Min Functor ///////
+struct MinFunctor {
+  template <typename X, typename Y, typename Dim>
+  void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y,
+                  const Dim& dim) {
+    y->device(dev) = x->minimum(dim);
+  }
+};
+
+//////// Sum Functor ///////
+struct SumFunctor {
+  template <typename X, typename Y, typename Dim>
+  void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y,
+                  const Dim& dim) {
+    y->device(dev) = x->sum(dim);
+  }
+};
+
+//////// All Functor ///////
+struct AllFunctor {
+  template <typename X, typename Y, typename Dim>
+  void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y,
+                  const Dim& dim) {
+    y->device(dev) = x->all(dim);
+  }
+};
+
+//////// Any Functor ///////
+struct AnyFunctor {
+  template <typename X, typename Y, typename Dim>
+  void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y,
+                  const Dim& dim) {
+    y->device(dev) = x->any(dim);
+  }
+};
+
+//////// Mean Functor ///////
+struct MeanFunctor {
+  template <typename X, typename Y, typename Dim>
+  void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y,
+                  const Dim& dim) {
+    y->device(dev) = x->mean(dim);
+  }
+};
+
+//////// Prod Functor ///////
+struct ProdFunctor {
+  template <typename X, typename Y, typename Dim>
+  void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y,
+                  const Dim& dim) {
+    y->device(dev) = x->prod(dim);
+  }
+};
+
+}  // namespace fastdeploy
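Because Reduce is parameterized on a functor, a new reduction only needs
another functor with the same call shape. A hypothetical log-sum sketch
(LogSumFunctor and LogSum are assumptions, not part of this patch):

#include "fastdeploy/function/eigen.h"

// Hypothetical functor; mirrors MaxFunctor above.
struct LogSumFunctor {
  template <typename X, typename Y, typename Dim>
  void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y,
                  const Dim& dim) {
    // log(sum(x)) along the reduced axes
    y->device(dev) = x->sum(dim).log();
  }
};

// Then, inside the reduce translation unit, a wrapper like:
// void LogSum(const FDTensor& x, FDTensor* out,
//             const std::vector<int64_t>& dims, bool keep_dim,
//             bool reduce_all) {
//   Reduce<LogSumFunctor>(x, out, dims, keep_dim, reduce_all);
// }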
diff --git a/csrc/fastdeploy/pybind/fastdeploy_model.cc b/csrc/fastdeploy/pybind/fastdeploy_model.cc
new file mode 100644
index 000000000..b59c0fd0f
--- /dev/null
+++ b/csrc/fastdeploy/pybind/fastdeploy_model.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+#include "fastdeploy/fastdeploy_model.h"
+
+namespace fastdeploy {
+
+void BindFDModel(pybind11::module& m) {
+  pybind11::class_<FastDeployModel>(m, "FastDeployModel")
+      .def(pybind11::init<>(), "Default Constructor")
+      .def("model_name", &FastDeployModel::ModelName)
+      .def("num_inputs_of_runtime", &FastDeployModel::NumInputsOfRuntime)
+      .def("num_outputs_of_runtime", &FastDeployModel::NumOutputsOfRuntime)
+      .def("input_info_of_runtime", &FastDeployModel::InputInfoOfRuntime)
+      .def("output_info_of_runtime", &FastDeployModel::OutputInfoOfRuntime)
+      .def("initialized", &FastDeployModel::Initialized)
+      .def_readwrite("runtime_option", &FastDeployModel::runtime_option)
+      .def_readwrite("valid_cpu_backends", &FastDeployModel::valid_cpu_backends)
+      .def_readwrite("valid_gpu_backends",
+                     &FastDeployModel::valid_gpu_backends);
+}
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/pybind/fastdeploy_runtime.cc b/csrc/fastdeploy/pybind/fastdeploy_runtime.cc
new file mode 100644
index 000000000..412b1ccef
--- /dev/null
+++ b/csrc/fastdeploy/pybind/fastdeploy_runtime.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+
+void BindRuntime(pybind11::module& m) {
+  pybind11::class_<RuntimeOption>(m, "RuntimeOption")
+      .def(pybind11::init<>())
+      .def("set_model_path", &RuntimeOption::SetModelPath)
+      .def("use_gpu", &RuntimeOption::UseGpu)
+      .def("use_cpu", &RuntimeOption::UseCpu)
+      .def("set_cpu_thread_num", &RuntimeOption::SetCpuThreadNum)
+      .def("use_paddle_backend", &RuntimeOption::UsePaddleBackend)
+      .def("use_ort_backend", &RuntimeOption::UseOrtBackend)
+      .def("use_trt_backend", &RuntimeOption::UseTrtBackend)
+      .def("enable_paddle_mkldnn", &RuntimeOption::EnablePaddleMKLDNN)
+      .def("disable_paddle_mkldnn", &RuntimeOption::DisablePaddleMKLDNN)
+      .def("set_paddle_mkldnn_cache_size",
+           &RuntimeOption::SetPaddleMKLDNNCacheSize)
+      .def("set_trt_input_shape", &RuntimeOption::SetTrtInputShape)
+      .def("enable_trt_fp16", &RuntimeOption::EnableTrtFP16)
+      .def("disable_trt_fp16", &RuntimeOption::DisableTrtFP16)
+      .def("set_trt_cache_file", &RuntimeOption::SetTrtCacheFile)
+      .def_readwrite("model_file", &RuntimeOption::model_file)
+      .def_readwrite("params_file", &RuntimeOption::params_file)
+      .def_readwrite("model_format", &RuntimeOption::model_format)
+      .def_readwrite("backend", &RuntimeOption::backend)
+      .def_readwrite("cpu_thread_num", &RuntimeOption::cpu_thread_num)
+      .def_readwrite("device_id", &RuntimeOption::device_id)
+      .def_readwrite("device", &RuntimeOption::device)
+      .def_readwrite("ort_graph_opt_level", &RuntimeOption::ort_graph_opt_level)
+      .def_readwrite("ort_inter_op_num_threads",
+                     &RuntimeOption::ort_inter_op_num_threads)
+      .def_readwrite("ort_execution_mode", &RuntimeOption::ort_execution_mode)
+      .def_readwrite("trt_max_shape", &RuntimeOption::trt_max_shape)
+      .def_readwrite("trt_opt_shape", &RuntimeOption::trt_opt_shape)
+      .def_readwrite("trt_min_shape", &RuntimeOption::trt_min_shape)
+      .def_readwrite("trt_serialize_file", &RuntimeOption::trt_serialize_file)
+      .def_readwrite("trt_enable_fp16", &RuntimeOption::trt_enable_fp16)
+      .def_readwrite("trt_enable_int8", &RuntimeOption::trt_enable_int8)
+      .def_readwrite("trt_max_batch_size", &RuntimeOption::trt_max_batch_size)
+      .def_readwrite("trt_max_workspace_size",
+                     &RuntimeOption::trt_max_workspace_size);
+
+  pybind11::class_<TensorInfo>(m, "TensorInfo")
+      .def_readwrite("name", &TensorInfo::name)
+      .def_readwrite("shape", &TensorInfo::shape)
+      .def_readwrite("dtype", &TensorInfo::dtype);
+
+  pybind11::class_<Runtime>(m, "Runtime")
+      .def(pybind11::init<>())
+      .def("init", &Runtime::Init)
+      .def("infer",
+           [](Runtime& self, std::map<std::string, pybind11::array>& data) {
+             std::vector<FDTensor> inputs(data.size());
+             int index = 0;
+             for (auto iter = data.begin(); iter != data.end(); ++iter) {
+               inputs[index].dtype =
+                   NumpyDataTypeToFDDataType(iter->second.dtype());
+               inputs[index].shape.insert(
+                   inputs[index].shape.begin(), iter->second.shape(),
+                   iter->second.shape() + iter->second.ndim());
+               // TODO(jiangjiajun) Maybe skip memory copy is a better choice
+               // use SetExternalData
+               inputs[index].data.resize(iter->second.nbytes());
+               memcpy(inputs[index].data.data(), iter->second.mutable_data(),
+                      iter->second.nbytes());
+               inputs[index].name = iter->first;
+               index += 1;
+             }
+
+             std::vector<FDTensor> outputs(self.NumOutputs());
+             self.Infer(inputs, &outputs);
+
+             std::vector<pybind11::array> results;
+             results.reserve(outputs.size());
+             for (size_t i = 0; i < outputs.size(); ++i) {
+               auto numpy_dtype = FDDataTypeToNumpyDataType(outputs[i].dtype);
+               results.emplace_back(
+                   pybind11::array(numpy_dtype, outputs[i].shape));
+               memcpy(results[i].mutable_data(), outputs[i].data.data(),
+                      outputs[i].Numel() * FDDataTypeSize(outputs[i].dtype));
+             }
+             return results;
+           })
+      .def("num_inputs", &Runtime::NumInputs)
+      .def("num_outputs", &Runtime::NumOutputs)
+      .def("get_input_info", &Runtime::GetInputInfo)
+      .def("get_output_info", &Runtime::GetOutputInfo)
+      .def_readonly("option", &Runtime::option);
+
+  pybind11::enum_<Backend>(m, "Backend", pybind11::arithmetic(),
+                           "Backend for inference.")
+      .value("UNKNOWN", Backend::UNKNOWN)
+      .value("ORT", Backend::ORT)
+      .value("TRT", Backend::TRT)
+      .value("PDINFER", Backend::PDINFER);
+  pybind11::enum_<Frontend>(m, "Frontend", pybind11::arithmetic(),
+                            "Frontend for inference.")
+      .value("PADDLE", Frontend::PADDLE)
+      .value("ONNX", Frontend::ONNX);
+  pybind11::enum_<Device>(m, "Device", pybind11::arithmetic(),
+                          "Device for inference.")
+      .value("CPU", Device::CPU)
+      .value("GPU", Device::GPU);
+
+  pybind11::enum_<FDDataType>(m, "FDDataType", pybind11::arithmetic(),
+                              "Data type of FastDeploy.")
+      .value("BOOL", FDDataType::BOOL)
+      .value("INT8", FDDataType::INT8)
+      .value("INT16", FDDataType::INT16)
+      .value("INT32", FDDataType::INT32)
+      .value("INT64", FDDataType::INT64)
+      .value("FP32", FDDataType::FP32)
+      .value("FP64", FDDataType::FP64)
+      .value("UINT8", FDDataType::UINT8);
+
+  m.def("get_available_backends", []() { return GetAvailableBackends(); });
+}
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/pybind/main.cc b/csrc/fastdeploy/pybind/main.cc
new file mode 100644
index 000000000..14ff03ef7
--- /dev/null
+++ b/csrc/fastdeploy/pybind/main.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+
+void BindRuntime(pybind11::module&);
+void BindFDModel(pybind11::module&);
+void BindVision(pybind11::module&);
+
+pybind11::dtype FDDataTypeToNumpyDataType(const FDDataType& fd_dtype) {
+  pybind11::dtype dt;
+  if (fd_dtype == FDDataType::INT32) {
+    dt = pybind11::dtype::of<int32_t>();
+  } else if (fd_dtype == FDDataType::INT64) {
+    dt = pybind11::dtype::of<int64_t>();
+  } else if (fd_dtype == FDDataType::FP32) {
+    dt = pybind11::dtype::of<float>();
+  } else if (fd_dtype == FDDataType::FP64) {
+    dt = pybind11::dtype::of<double>();
+  } else if (fd_dtype == FDDataType::UINT8) {
+    dt = pybind11::dtype::of<uint8_t>();
+  } else {
+    FDASSERT(false, "The function doesn't support data type of " +
+                        Str(fd_dtype) + ".");
+  }
+  return dt;
+}
+
+FDDataType NumpyDataTypeToFDDataType(const pybind11::dtype& np_dtype) {
+  if (np_dtype.is(pybind11::dtype::of<int32_t>())) {
+    return FDDataType::INT32;
+  } else if (np_dtype.is(pybind11::dtype::of<int64_t>())) {
+    return FDDataType::INT64;
+  } else if (np_dtype.is(pybind11::dtype::of<float>())) {
+    return FDDataType::FP32;
+  } else if (np_dtype.is(pybind11::dtype::of<double>())) {
+    return FDDataType::FP64;
+  } else if (np_dtype.is(pybind11::dtype::of<uint8_t>())) {
+    return FDDataType::UINT8;
+  }
+  FDASSERT(false,
+           "NumpyDataTypeToFDDataType() only supports "
+           "int32/int64/float32/float64/uint8 now.");
+  return FDDataType::FP32;
+}
+
+void PyArrayToTensor(pybind11::array& pyarray, FDTensor* tensor,
+                     bool share_buffer) {
+  tensor->dtype = NumpyDataTypeToFDDataType(pyarray.dtype());
+  tensor->shape.insert(tensor->shape.begin(), pyarray.shape(),
+                       pyarray.shape() + pyarray.ndim());
+  if (share_buffer) {
+    tensor->external_data_ptr = pyarray.mutable_data();
+  } else {
+    tensor->data.resize(pyarray.nbytes());
+    memcpy(tensor->data.data(), pyarray.mutable_data(), pyarray.nbytes());
+  }
+}
+
+pybind11::array TensorToPyArray(const FDTensor& tensor) {
+  auto numpy_dtype = FDDataTypeToNumpyDataType(tensor.dtype);
+  auto out = pybind11::array(numpy_dtype, tensor.shape);
+  memcpy(out.mutable_data(), tensor.Data(),
+         tensor.Numel() * FDDataTypeSize(tensor.dtype));
+  return out;
+}
+
+#ifdef ENABLE_VISION
+int NumpyDataTypeToOpenCvType(const pybind11::dtype& np_dtype) {
+  if (np_dtype.is(pybind11::dtype::of<int32_t>())) {
+    return CV_32S;
+  } else if (np_dtype.is(pybind11::dtype::of<int8_t>())) {
+    return CV_8U;
+  } else if (np_dtype.is(pybind11::dtype::of<uint8_t>())) {
+    return CV_8U;
+  } else if (np_dtype.is(pybind11::dtype::of<float>())) {
+    return CV_32F;
+  } else {
+    FDASSERT(
+        false,
+        "NumpyDataTypeToOpenCvType() only supports int32/int8/uint8/float32 "
+        "now.");
+  }
+  return CV_8U;
+}
+
+cv::Mat PyArrayToCvMat(pybind11::array& pyarray) {
+  auto cv_type = NumpyDataTypeToOpenCvType(pyarray.dtype());
+  FDASSERT(
+      pyarray.ndim() == 3,
+      "Require rank of array to be 3 with HWC format while converting it to "
+      "cv::Mat.");
+  int channel = *(pyarray.shape() + 2);
+  int height = *(pyarray.shape());
+  int width = *(pyarray.shape() + 1);
+  return cv::Mat(height, width, CV_MAKETYPE(cv_type, channel),
+                 pyarray.mutable_data());
+}
+#endif
+
+PYBIND11_MODULE(fastdeploy_main, m) {
+  m.doc() =
+      "Make it easier for programmers to deploy deep learning models, and "
+      "save time to save the world!";
+
+  BindRuntime(m);
+  BindFDModel(m);
+#ifdef ENABLE_VISION
+  auto vision_module =
+      m.def_submodule("vision", "Vision module of FastDeploy.");
+  BindVision(vision_module);
+#endif
+}
+
+}  // namespace fastdeploy
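The conversion helpers above are the bridge used by all bindings; a sketch of
a round trip (illustrative only):

// Inside binding code, with pybind11 already initialized:
void RoundTrip(pybind11::array& arr) {
  fastdeploy::FDTensor tensor;
  // zero-copy view of the numpy buffer; pass false to copy instead
  fastdeploy::PyArrayToTensor(arr, &tensor, /* share_buffer = */ true);
  pybind11::array back = fastdeploy::TensorToPyArray(tensor);
}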
diff --git a/csrc/fastdeploy/pybind/main.cc.in b/csrc/fastdeploy/pybind/main.cc.in
new file mode 100644
index 000000000..13e0a31c4
--- /dev/null
+++ b/csrc/fastdeploy/pybind/main.cc.in
@@ -0,0 +1,127 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+
+void BindRuntime(pybind11::module&);
+void BindFDModel(pybind11::module&);
+void BindVision(pybind11::module&);
+
+pybind11::dtype FDDataTypeToNumpyDataType(const FDDataType& fd_dtype) {
+  pybind11::dtype dt;
+  if (fd_dtype == FDDataType::INT32) {
+    dt = pybind11::dtype::of<int32_t>();
+  } else if (fd_dtype == FDDataType::INT64) {
+    dt = pybind11::dtype::of<int64_t>();
+  } else if (fd_dtype == FDDataType::FP32) {
+    dt = pybind11::dtype::of<float>();
+  } else if (fd_dtype == FDDataType::FP64) {
+    dt = pybind11::dtype::of<double>();
+  } else if (fd_dtype == FDDataType::UINT8) {
+    dt = pybind11::dtype::of<uint8_t>();
+  } else {
+    FDASSERT(false, "The function doesn't support data type of " +
+                        Str(fd_dtype) + ".");
+  }
+  return dt;
+}
+
+FDDataType NumpyDataTypeToFDDataType(const pybind11::dtype& np_dtype) {
+  if (np_dtype.is(pybind11::dtype::of<int32_t>())) {
+    return FDDataType::INT32;
+  } else if (np_dtype.is(pybind11::dtype::of<int64_t>())) {
+    return FDDataType::INT64;
+  } else if (np_dtype.is(pybind11::dtype::of<float>())) {
+    return FDDataType::FP32;
+  } else if (np_dtype.is(pybind11::dtype::of<double>())) {
+    return FDDataType::FP64;
+  } else if (np_dtype.is(pybind11::dtype::of<uint8_t>())) {
+    return FDDataType::UINT8;
+  }
+  FDASSERT(false,
+           "NumpyDataTypeToFDDataType() only supports "
+           "int32/int64/float32/float64/uint8 now.");
+  return FDDataType::FP32;
+}
+
+void PyArrayToTensor(pybind11::array& pyarray, FDTensor* tensor,
+                     bool share_buffer) {
+  tensor->dtype = NumpyDataTypeToFDDataType(pyarray.dtype());
+  tensor->shape.insert(tensor->shape.begin(), pyarray.shape(),
+                       pyarray.shape() + pyarray.ndim());
+  if (share_buffer) {
+    tensor->external_data_ptr = pyarray.mutable_data();
+  } else {
+    tensor->data.resize(pyarray.nbytes());
+    memcpy(tensor->data.data(), pyarray.mutable_data(), pyarray.nbytes());
+  }
+}
+
+pybind11::array TensorToPyArray(const FDTensor& tensor) {
+  auto numpy_dtype = FDDataTypeToNumpyDataType(tensor.dtype);
+  auto out = pybind11::array(numpy_dtype, tensor.shape);
+  memcpy(out.mutable_data(), tensor.Data(),
+         tensor.Numel() * FDDataTypeSize(tensor.dtype));
+  return out;
+}
+
+#ifdef ENABLE_VISION
+int NumpyDataTypeToOpenCvType(const pybind11::dtype& np_dtype) {
+  if (np_dtype.is(pybind11::dtype::of<int32_t>())) {
+    return CV_32S;
+  } else if (np_dtype.is(pybind11::dtype::of<int8_t>())) {
+    return CV_8U;
+  } else if (np_dtype.is(pybind11::dtype::of<uint8_t>())) {
+    return CV_8U;
+  } else if (np_dtype.is(pybind11::dtype::of<float>())) {
+    return CV_32F;
+  } else {
+    FDASSERT(
+        false,
+        "NumpyDataTypeToOpenCvType() only supports int32/int8/uint8/float32 "
+        "now.");
+  }
+  return CV_8U;
+}
+
+cv::Mat PyArrayToCvMat(pybind11::array& pyarray) {
+  auto cv_type = NumpyDataTypeToOpenCvType(pyarray.dtype());
+  FDASSERT(
+      pyarray.ndim() == 3,
+      "Require rank of array to be 3 with HWC format while converting it to "
+      "cv::Mat.");
+  int channel = *(pyarray.shape() + 2);
+  int height = *(pyarray.shape());
+  int width = *(pyarray.shape() + 1);
+  return cv::Mat(height, width, CV_MAKETYPE(cv_type, channel),
+                 pyarray.mutable_data());
+}
+#endif
+
+PYBIND11_MODULE(@PY_LIBRARY_NAME@, m) {
+  m.doc() =
+      "Make it easier for programmers to deploy deep learning models, and "
+      "save time to save the world!";
+
+  BindRuntime(m);
+  BindFDModel(m);
+#ifdef ENABLE_VISION
+  auto vision_module =
+      m.def_submodule("vision", "Vision module of FastDeploy.");
+  BindVision(vision_module);
+#endif
+}
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/pybind/main.h b/csrc/fastdeploy/pybind/main.h
new file mode 100644
index 000000000..23f0eccc2
--- /dev/null
+++ b/csrc/fastdeploy/pybind/main.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <type_traits>
+
+#include "fastdeploy/fastdeploy_runtime.h"
+
+#ifdef ENABLE_VISION
+#include "fastdeploy/vision.h"
+#endif
+
+namespace fastdeploy {
+
+void BindBackend(pybind11::module&);
+void BindVision(pybind11::module&);
+
+pybind11::dtype FDDataTypeToNumpyDataType(const FDDataType& fd_dtype);
+
+FDDataType NumpyDataTypeToFDDataType(const pybind11::dtype& np_dtype);
+
+void PyArrayToTensor(pybind11::array& pyarray, FDTensor* tensor,
+                     bool share_buffer = false);
+pybind11::array TensorToPyArray(const FDTensor& tensor);
+
+#ifdef ENABLE_VISION
+cv::Mat PyArrayToCvMat(pybind11::array& pyarray);
+#endif
+
+template <typename T>
+FDDataType CTypeToFDDataType() {
+  if (std::is_same<T, int32_t>::value) {
+    return FDDataType::INT32;
+  } else if (std::is_same<T, int64_t>::value) {
+    return FDDataType::INT64;
+  } else if (std::is_same<T, float>::value) {
+    return FDDataType::FP32;
+  } else if (std::is_same<T, double>::value) {
+    return FDDataType::FP64;
+  }
+  FDASSERT(false,
+           "CTypeToFDDataType only supports int32/int64/float32/float64 now.");
+  return FDDataType::FP32;
+}
+
+template <typename T>
+std::vector<pybind11::array> PyBackendInfer(
+    T& self, const std::vector<std::string>& names,
+    std::vector<pybind11::array>& data) {
+  std::vector<FDTensor> inputs(data.size());
+  for (size_t i = 0; i < data.size(); ++i) {
+    // TODO(jiangjiajun) here is considered to use user memory directly
+    inputs[i].dtype = NumpyDataTypeToFDDataType(data[i].dtype());
+    inputs[i].shape.insert(inputs[i].shape.begin(), data[i].shape(),
+                           data[i].shape() + data[i].ndim());
+    inputs[i].data.resize(data[i].nbytes());
+    memcpy(inputs[i].data.data(), data[i].mutable_data(), data[i].nbytes());
+    inputs[i].name = names[i];
+  }
+
+  std::vector<FDTensor> outputs(self.NumOutputs());
+  self.Infer(inputs, &outputs);
+
+  std::vector<pybind11::array> results;
+  results.reserve(outputs.size());
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    auto numpy_dtype = FDDataTypeToNumpyDataType(outputs[i].dtype);
+    results.emplace_back(pybind11::array(numpy_dtype, outputs[i].shape));
+    memcpy(results[i].mutable_data(), outputs[i].data.data(),
+           outputs[i].Numel() * FDDataTypeSize(outputs[i].dtype));
+  }
+  return results;
+}
+
+}  // namespace fastdeploy
diff --git
a/csrc/fastdeploy/text.h b/csrc/fastdeploy/text.h new file mode 100644 index 000000000..184f0f4f9 --- /dev/null +++ b/csrc/fastdeploy/text.h @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "fastdeploy/core/config.h" +#ifdef ENABLE_TEXT +#include "fastdeploy/text/text_model.h" +#endif diff --git a/csrc/fastdeploy/text/common/option.h b/csrc/fastdeploy/text/common/option.h new file mode 100644 index 000000000..a795fd066 --- /dev/null +++ b/csrc/fastdeploy/text/common/option.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/utils/utils.h" + +namespace fastdeploy { +namespace text { + +struct FASTDEPLOY_DECL TextPreprocessOption {}; +struct FASTDEPLOY_DECL TextPostprocessOption {}; +struct FASTDEPLOY_DECL PredictionOption {}; + +} // namespace text +} // namespace fastdeploy diff --git a/csrc/fastdeploy/text/common/result.cc b/csrc/fastdeploy/text/common/result.cc new file mode 100644 index 000000000..cb7efbb73 --- /dev/null +++ b/csrc/fastdeploy/text/common/result.cc @@ -0,0 +1,18 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "fastdeploy/text/common/result.h" + +namespace fastdeploy { +namespace text {} // namespace text +} // namespace fastdeploy \ No newline at end of file diff --git a/csrc/fastdeploy/text/common/result.h b/csrc/fastdeploy/text/common/result.h new file mode 100644 index 000000000..4a6f716a3 --- /dev/null +++ b/csrc/fastdeploy/text/common/result.h @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+namespace text {
+
+struct FASTDEPLOY_DECL Result {};
+
+}  // namespace text
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/text/postprocessor/postprocessor.cc b/csrc/fastdeploy/text/postprocessor/postprocessor.cc
new file mode 100644
index 000000000..e8f717743
--- /dev/null
+++ b/csrc/fastdeploy/text/postprocessor/postprocessor.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/text/postprocessor/postprocessor.h"
+
+namespace fastdeploy {
+namespace text {
+
+bool Postprocessor::Decode(const std::vector<FDTensor>& model_result,
+                           Result* decoded_result) const {
+  return true;
+}
+
+bool Postprocessor::DecodeBatch(const std::vector<FDTensor>& model_result,
+                                Result* decoded_result) const {
+  return true;
+}
+
+}  // namespace text
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/text/postprocessor/postprocessor.h b/csrc/fastdeploy/text/postprocessor/postprocessor.h
new file mode 100644
index 000000000..76f6a7090
--- /dev/null
+++ b/csrc/fastdeploy/text/postprocessor/postprocessor.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/text/common/result.h"
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+namespace text {
+
+class Postprocessor {
+ public:
+  virtual bool Decode(const std::vector<FDTensor>& model_result,
+                      Result* decoded_result) const;
+  virtual bool DecodeBatch(const std::vector<FDTensor>& model_result,
+                           Result* decoded_result) const;
+};
+
+}  // namespace text
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/text/preprocessor/preprocessor.cc b/csrc/fastdeploy/text/preprocessor/preprocessor.cc
new file mode 100644
index 000000000..2e2715f61
--- /dev/null
+++ b/csrc/fastdeploy/text/preprocessor/preprocessor.cc
@@ -0,0 +1,32 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/text/preprocessor/preprocessor.h"
+
+namespace fastdeploy {
+namespace text {
+
+bool Preprocessor::Encode(const std::string& raw_text,
+                          std::vector<FDTensor>* encoded_tensor) const {
+  return true;
+}
+
+bool Preprocessor::EncodeBatch(const std::vector<std::string>& raw_texts,
+                               std::vector<FDTensor>* encoded_tensor) const {
+  return true;
+}
+
+}  // namespace text
+}  // namespace fastdeploy
\ No newline at end of file
diff --git a/csrc/fastdeploy/text/preprocessor/preprocessor.h b/csrc/fastdeploy/text/preprocessor/preprocessor.h
new file mode 100644
index 000000000..799967093
--- /dev/null
+++ b/csrc/fastdeploy/text/preprocessor/preprocessor.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+namespace text {
+
+class Preprocessor {
+ public:
+  virtual bool Encode(const std::string& raw_text,
+                      std::vector<FDTensor>* encoded_tensor) const;
+  virtual bool EncodeBatch(const std::vector<std::string>& raw_texts,
+                           std::vector<FDTensor>* encoded_tensor) const;
+};
+
+}  // namespace text
+}  // namespace fastdeploy
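How the pieces compose with the TextModel defined below (a sketch; the
MyPreprocessor/MyPostprocessor subclasses are assumptions, not part of this
patch):

// Hypothetical subclasses; Encode/Decode are the virtual hooks above.
class MyPreprocessor : public fastdeploy::text::Preprocessor {};
class MyPostprocessor : public fastdeploy::text::Postprocessor {};

void Run() {
  fastdeploy::text::TextModel model;
  model.SetPreprocessor<MyPreprocessor>();
  model.SetPostprocessor<MyPostprocessor>();

  fastdeploy::text::Result result;
  fastdeploy::text::PredictionOption option;
  model.Predict("some raw text", &result, option);
}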
diff --git a/csrc/fastdeploy/text/text_model.cc b/csrc/fastdeploy/text/text_model.cc
new file mode 100644
index 000000000..d5a40c0e5
--- /dev/null
+++ b/csrc/fastdeploy/text/text_model.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/text/text_model.h"
+#include "fastdeploy/text/common/option.h"
+#include "fastdeploy/text/common/result.h"
+#include "fastdeploy/text/postprocessor/postprocessor.h"
+#include "fastdeploy/text/preprocessor/preprocessor.h"
+
+namespace fastdeploy {
+namespace text {
+
+bool TextModel::Predict(const std::string& raw_text, Result* result,
+                        const PredictionOption& option) {
+  // Preprocess
+  std::vector<FDTensor> input_tensor;
+  std::vector<FDTensor> output_tensor;
+  if (!preprocessor_->Encode(raw_text, &input_tensor)) {
+    FDERROR << "Failed to preprocess input data while using model:"
+            << ModelName() << "." << std::endl;
+    return false;
+  }
+
+  // Inference Runtime
+  if (!Infer(input_tensor, &output_tensor)) {
+    FDERROR << "Failed to run inference while using model:" << ModelName()
+            << "." << std::endl;
+    return false;
+  }
+
+  // Postprocess
+  if (!postprocessor_->Decode(output_tensor, result)) {
+    FDERROR << "Failed to postprocess while using model:" << ModelName() << "."
+            << std::endl;
+    return false;
+  }
+  return true;
+}
+
+bool TextModel::PredictBatch(const std::vector<std::string>& raw_text_array,
+                             Result* results, const PredictionOption& option) {
+  // Preprocess
+  std::vector<FDTensor> input_tensor;
+  std::vector<FDTensor> output_tensor;
+  if (!preprocessor_->EncodeBatch(raw_text_array, &input_tensor)) {
+    FDERROR << "Failed to preprocess input data while using model:"
+            << ModelName() << "." << std::endl;
+    return false;
+  }
+
+  // Inference Runtime
+  if (!Infer(input_tensor, &output_tensor)) {
+    FDERROR << "Failed to run inference while using model:" << ModelName()
+            << "." << std::endl;
+    return false;
+  }
+
+  // Postprocess
+  if (!postprocessor_->DecodeBatch(output_tensor, results)) {
+    FDERROR << "Failed to postprocess while using model:" << ModelName() << "."
+            << std::endl;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace text
+}  // namespace fastdeploy
\ No newline at end of file
diff --git a/csrc/fastdeploy/text/text_model.h b/csrc/fastdeploy/text/text_model.h
new file mode 100644
index 000000000..b7fbd5929
--- /dev/null
+++ b/csrc/fastdeploy/text/text_model.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/utils/unique_ptr.h"
+
+namespace fastdeploy {
+namespace text {
+
+class Preprocessor;
+class Postprocessor;
+struct Result;
+struct PredictionOption;
+
+class FASTDEPLOY_DECL TextModel : public FastDeployModel {
+ public:
+  virtual std::string ModelName() const { return "TextModel"; }
+  virtual bool Predict(const std::string& raw_text, Result* result,
+                       const PredictionOption& option);
+  virtual bool PredictBatch(const std::vector<std::string>& raw_text_array,
+                            Result* result, const PredictionOption& option);
+  template <typename T, typename... Args>
+  void SetPreprocessor(Args&&... args) {
+    preprocessor_ = utils::make_unique<T>(std::forward<Args>(args)...);
+  }
+  template <typename T, typename... Args>
+  void SetPostprocessor(Args&&... args) {
+    postprocessor_ = utils::make_unique<T>(std::forward<Args>(args)...);
+  }
+
+ private:
+  std::unique_ptr<Preprocessor> preprocessor_;
+  std::unique_ptr<Postprocessor> postprocessor_;
+};
+
+}  // namespace text
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/text/text_pybind.cc b/csrc/fastdeploy/text/text_pybind.cc
new file mode 100644
index 000000000..564892f16
--- /dev/null
+++ b/csrc/fastdeploy/text/text_pybind.cc
@@ -0,0 +1,13 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
\ No newline at end of file
diff --git a/csrc/fastdeploy/utils/perf.h b/csrc/fastdeploy/utils/perf.h
new file mode 100644
index 000000000..9f451c3a9
--- /dev/null
+++ b/csrc/fastdeploy/utils/perf.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fastdeploy/utils/utils.h"
+#include <chrono>  // NOLINT
+
+namespace fastdeploy {
+
+class FASTDEPLOY_DECL TimeCounter {
+ public:
+  void Start() { begin_ = std::chrono::system_clock::now(); }
+
+  void End() { end_ = std::chrono::system_clock::now(); }
+
+  double Duration() {
+    auto duration =
+        std::chrono::duration_cast<std::chrono::microseconds>(end_ - begin_);
+    return static_cast<double>(duration.count()) *
+           std::chrono::microseconds::period::num /
+           std::chrono::microseconds::period::den;
+  }
+
+  void PrintInfo(const std::string& prefix = "TimeCounter: ",
+                 bool print_out = true) {
+    if (!print_out) {
+      return;
+    }
+    FDLogger() << prefix << " duration = " << Duration() << "s." << std::endl;
+  }
+
+ private:
+  std::chrono::time_point<std::chrono::system_clock> begin_;
+  std::chrono::time_point<std::chrono::system_clock> end_;
+};
+
+}  // namespace fastdeploy
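A usage sketch for the timer (illustrative):

#include "fastdeploy/utils/perf.h"

void TimedRegion() {
  fastdeploy::TimeCounter tc;
  tc.Start();
  // ... work to measure ...
  tc.End();
  tc.PrintInfo("Inference:");  // prints "Inference: duration = <sec>s."
}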
diff --git a/csrc/fastdeploy/utils/unique_ptr.h b/csrc/fastdeploy/utils/unique_ptr.h
new file mode 100644
index 000000000..2f24ef70c
--- /dev/null
+++ b/csrc/fastdeploy/utils/unique_ptr.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+namespace fastdeploy {
+namespace utils {
+// Trait to select overloads and return types for MakeUnique.
+template <typename T>
+struct MakeUniqueResult {
+  using scalar = std::unique_ptr<T>;
+};
+template <typename T>
+struct MakeUniqueResult<T[]> {
+  using array = std::unique_ptr<T[]>;
+};
+template <typename T, size_t N>
+struct MakeUniqueResult<T[N]> {
+  using invalid = void;
+};
+
+// MakeUnique<T>(...) is an early implementation of C++14 std::make_unique.
+// It is designed to be 100% compatible with std::make_unique so that the
+// eventual switchover will be a simple renaming operation.
+template <typename T, typename... Args>
+typename MakeUniqueResult<T>::scalar make_unique(Args&&... args) {  // NOLINT
+  return std::unique_ptr<T>(
+      new T(std::forward<Args>(args)...));  // NOLINT(build/c++11)
+}
+
+// Overload for array of unknown bound.
+// The allocation of arrays needs to use the array form of new,
+// and cannot take element constructor arguments.
+template <typename T>
+typename MakeUniqueResult<T>::array make_unique(size_t n) {
+  return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
+}
+
+// Reject arrays of known bound.
+template <typename T, typename... Args>
+typename MakeUniqueResult<T>::invalid make_unique(Args&&... /* args */) =
+    delete;  // NOLINT
+
+}  // namespace utils
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/utils/utils.cc b/csrc/fastdeploy/utils/utils.cc
new file mode 100644
index 000000000..3899bcf5e
--- /dev/null
+++ b/csrc/fastdeploy/utils/utils.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+
+FDLogger::FDLogger(bool verbose, const std::string& prefix) {
+  verbose_ = verbose;
+  line_ = "";
+  prefix_ = prefix;
+}
+
+FDLogger& FDLogger::operator<<(std::ostream& (*os)(std::ostream&)) {
+  if (!verbose_) {
+    return *this;
+  }
+  std::cout << prefix_ << " " << line_ << std::endl;
+  line_ = "";
+  return *this;
+}
+
+bool ReadBinaryFromFile(const std::string& file, std::string* contents) {
+  std::ifstream fin(file, std::ios::in | std::ios::binary);
+  if (!fin.is_open()) {
+    FDERROR << "Failed to open file: " << file << " to read." << std::endl;
+    return false;
+  }
+  fin.seekg(0, std::ios::end);
+  contents->clear();
+  contents->resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(contents->at(0)), contents->size());
+  fin.close();
+  return true;
+}
+
+}  // namespace fastdeploy
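A sketch of the helpers above in use (illustrative; the file name is a
hypothetical placeholder):

#include "fastdeploy/utils/unique_ptr.h"
#include "fastdeploy/utils/utils.h"

void UtilsExample() {
  // make_unique works like C++14 std::make_unique
  auto value = fastdeploy::utils::make_unique<int>(42);

  std::string bytes;
  if (!fastdeploy::ReadBinaryFromFile("params.bin", &bytes)) {
    FDERROR << "read failed" << std::endl;
  }
}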
diff --git a/csrc/fastdeploy/utils/utils.h b/csrc/fastdeploy/utils/utils.h
new file mode 100644
index 000000000..3e309a12a
--- /dev/null
+++ b/csrc/fastdeploy/utils/utils.h
@@ -0,0 +1,150 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdlib.h>
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#if defined(_WIN32)
+#ifdef FASTDEPLOY_LIB
+#define FASTDEPLOY_DECL __declspec(dllexport)
+#else
+#define FASTDEPLOY_DECL __declspec(dllimport)
+#endif  // FASTDEPLOY_LIB
+#else
+#define FASTDEPLOY_DECL __attribute__((visibility("default")))
+#endif  // _WIN32
+
+namespace fastdeploy {
+
+class FASTDEPLOY_DECL FDLogger {
+ public:
+  FDLogger() {
+    line_ = "";
+    prefix_ = "[FastDeploy]";
+    verbose_ = true;
+  }
+  explicit FDLogger(bool verbose, const std::string& prefix = "[FastDeploy]");
+
+  template <typename T>
+  FDLogger& operator<<(const T& val) {
+    if (!verbose_) {
+      return *this;
+    }
+    std::stringstream ss;
+    ss << val;
+    line_ += ss.str();
+    return *this;
+  }
+  FDLogger& operator<<(std::ostream& (*os)(std::ostream&));
+  ~FDLogger() {
+    if (!verbose_ && line_ != "") {
+      std::cout << line_ << std::endl;
+    }
+  }
+
+ private:
+  std::string line_;
+  std::string prefix_;
+  bool verbose_ = true;
+};
+
+FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
+                                        std::string* contents);
+
+#ifndef __REL_FILE__
+#define __REL_FILE__ __FILE__
+#endif
+
+#define FDERROR                                                 \
+  FDLogger(true, "[ERROR]") << __REL_FILE__ << "(" << __LINE__  \
+                            << ")::" << __FUNCTION__ << "\t"
+
+#define FDWARNING                                                 \
+  FDLogger(true, "[WARNING]") << __REL_FILE__ << "(" << __LINE__  \
+                              << ")::" << __FUNCTION__ << "\t"
+
+#define FDINFO                                                 \
+  FDLogger(true, "[INFO]") << __REL_FILE__ << "(" << __LINE__  \
+                           << ")::" << __FUNCTION__ << "\t"
+
+#define FDASSERT(condition, message)   \
+  if (!(condition)) {                  \
+    FDERROR << message << std::endl;   \
+    std::abort();                      \
+  }
+
+///////// Basic Macros ///////////
+
+#define FD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \
+  case enum_type: {                                                       \
+    using HINT = type;                                                    \
+    __VA_ARGS__();                                                        \
+    break;                                                                \
+  }
+
+#define FD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \
+  FD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, data_t, __VA_ARGS__)
+
+#define FD_VISIT_ALL_TYPES(TYPE, NAME, ...)                                  \
+  [&] {                                                                      \
+    const auto& __dtype__ = TYPE;                                            \
+    switch (__dtype__) {                                                     \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::BOOL, bool,       \
+                           __VA_ARGS__)                                      \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,   \
+                           __VA_ARGS__)                                      \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t,   \
+                           __VA_ARGS__)                                      \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float,      \
+                           __VA_ARGS__)                                      \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,     \
+                           __VA_ARGS__)                                      \
+      default:                                                               \
+        FDASSERT(false, "Invalid enum data type.")                           \
+    }                                                                        \
+  }()
+
+#define FD_VISIT_FLOAT_TYPES(TYPE, NAME, ...)                                \
+  [&] {                                                                      \
+    const auto& __dtype__ = TYPE;                                            \
+    switch (__dtype__) {                                                     \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float,      \
+                           __VA_ARGS__)                                      \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,     \
+                           __VA_ARGS__)                                      \
+      default:                                                               \
+        FDASSERT(false, "Invalid enum data type.")                           \
+    }                                                                        \
+  }()
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t, \ + __VA_ARGS__) \ + FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t, \ + __VA_ARGS__) \ + default: \ + FDASSERT(false, "Invalid enum data type.") \ + } \ + }() + +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision.h b/csrc/fastdeploy/vision.h new file mode 100644 index 000000000..21371b5a1 --- /dev/null +++ b/csrc/fastdeploy/vision.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "fastdeploy/core/config.h" +#ifdef ENABLE_VISION +#include "fastdeploy/vision/detection/contrib/nanodet_plus.h" +#include "fastdeploy/vision/detection/contrib/scaledyolov4.h" +#include "fastdeploy/vision/detection/contrib/yolor.h" +#include "fastdeploy/vision/detection/contrib/yolov5.h" +#include "fastdeploy/vision/detection/contrib/yolov5lite.h" +#include "fastdeploy/vision/detection/contrib/yolov6.h" +#include "fastdeploy/vision/detection/contrib/yolov7.h" +#include "fastdeploy/vision/detection/contrib/yolox.h" +#include "fastdeploy/vision/facedet/contrib/retinaface.h" +#include "fastdeploy/vision/facedet/contrib/scrfd.h" +#include "fastdeploy/vision/facedet/contrib/ultraface.h" +#include "fastdeploy/vision/facedet/contrib/yolov5face.h" +#include "fastdeploy/vision/faceid/contrib/arcface.h" +#include "fastdeploy/vision/faceid/contrib/cosface.h" +#include "fastdeploy/vision/faceid/contrib/insightface_rec.h" +#include "fastdeploy/vision/faceid/contrib/partial_fc.h" +#include "fastdeploy/vision/faceid/contrib/vpl.h" +#include "fastdeploy/vision/matting/contrib/modnet.h" +#include "fastdeploy/vision/ppcls/model.h" +#include "fastdeploy/vision/detection/ppdet/model.h" +#include "fastdeploy/vision/ppseg/model.h" +#endif + +#include "fastdeploy/vision/visualize/visualize.h" diff --git a/csrc/fastdeploy/vision/AddModel.md b/csrc/fastdeploy/vision/AddModel.md new file mode 100644 index 000000000..30080bd5e --- /dev/null +++ b/csrc/fastdeploy/vision/AddModel.md @@ -0,0 +1,3 @@ +# How to Add a New Model + +This document takes [yolov5](https://github.com/ultralytics/yolov5) as an example to show how to add support for a new model. diff --git a/csrc/fastdeploy/vision/common/processors/base.cc b/csrc/fastdeploy/vision/common/processors/base.cc new file mode 100644 index 000000000..d770522d8 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/base.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/common/processors/base.h" +#include "fastdeploy/utils/utils.h" + +namespace fastdeploy { +namespace vision { + +ProcLib Processor::default_lib = ProcLib::DEFAULT; + +bool Processor::CpuRun(Mat* mat) { + FDERROR << "Unimplemented CpuRun." << std::endl; + return false; +} + +#ifdef ENABLE_OPENCV_CUDA +bool Processor::GpuRun(Mat* mat) { + FDERROR << "Unimplemented GpuRun." << std::endl; + return false; +} +#endif + +bool Processor::operator()(Mat* mat, ProcLib lib) { + // If default_lib is set, use it instead of the passed-in lib. + ProcLib target = lib; + if (default_lib != ProcLib::DEFAULT) { + target = default_lib; + } + + if (target == ProcLib::OPENCV_CUDA) { +#ifdef ENABLE_OPENCV_CUDA + bool ret = GpuRun(mat); + mat->device = Device::GPU; + return ret; +#else + FDERROR + << "OpenCV is not compiled with CUDA, cannot process image with CUDA." + << std::endl; + return false; +#endif + } + bool ret = CpuRun(mat); + mat->device = Device::CPU; + return ret; +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/base.h b/csrc/fastdeploy/vision/common/processors/base.h new file mode 100644 index 000000000..d4138864a --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/base.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fastdeploy/utils/utils.h" +#include "fastdeploy/vision/common/processors/mat.h" +#include "opencv2/highgui/highgui.hpp" +#include "opencv2/imgproc/imgproc.hpp" + +namespace fastdeploy { +namespace vision { + +enum ProcLib { DEFAULT, OPENCV_CPU, OPENCV_CUDA }; + +class Processor { + public: + // default_lib has the highest priority: + // all functions in `Processor` are forced to use + // default_lib if this flag is set. + // DEFAULT means this flag is not set. + static ProcLib default_lib; + + // virtual bool ShapeInfer(const std::vector<int>& in_shape, + // std::vector<int>* out_shape) = 0; + virtual std::string Name() = 0; + virtual bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + virtual bool GpuRun(Mat* mat); +#endif + + virtual bool operator()(Mat* mat, + ProcLib lib = ProcLib::OPENCV_CPU); +}; + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/cast.cc b/csrc/fastdeploy/vision/common/processors/cast.cc new file mode 100644 index 000000000..b9a757f14 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/cast.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/common/processors/cast.h" + +namespace fastdeploy { +namespace vision { + +bool Cast::CpuRun(Mat* mat) { + cv::Mat* im = mat->GetCpuMat(); + int c = im->channels(); + if (dtype_ == "float") { + if (im->type() != CV_32FC(c)) { + im->convertTo(*im, CV_32FC(c)); + } + } else if (dtype_ == "double") { + if (im->type() != CV_64FC(c)) { + im->convertTo(*im, CV_64FC(c)); + } + } else { + FDWARNING << "Cast: data type " << dtype_ + << " is not supported yet, this operation will be skipped." + << std::endl; + } + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool Cast::GpuRun(Mat* mat) { + cv::cuda::GpuMat* im = mat->GetGpuMat(); + int c = im->channels(); + if (dtype_ == "float") { + if (im->type() != CV_32FC(c)) { + im->convertTo(*im, CV_32FC(c)); + } + } else if (dtype_ == "double") { + if (im->type() != CV_64FC(c)) { + im->convertTo(*im, CV_64FC(c)); + } + } else { + FDWARNING << "Cast: data type " << dtype_ + << " is not supported yet, this operation will be skipped." + << std::endl; + } + return true; +} +#endif + +bool Cast::Run(Mat* mat, const std::string& dtype, ProcLib lib) { + auto c = Cast(dtype); + return c(mat, lib); +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/cast.h b/csrc/fastdeploy/vision/common/processors/cast.h new file mode 100644 index 000000000..1111f08a6 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/cast.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { + +class Cast : public Processor { + public: + explicit Cast(const std::string& dtype = "float") : dtype_(dtype) {} + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "Cast"; } + static bool Run(Mat* mat, const std::string& dtype, + ProcLib lib = ProcLib::OPENCV_CPU); + + private: + std::string dtype_; +}; +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/center_crop.cc b/csrc/fastdeploy/vision/common/processors/center_crop.cc new file mode 100644 index 000000000..27b86ca2d --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/center_crop.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/common/processors/center_crop.h" + +namespace fastdeploy { +namespace vision { + +bool CenterCrop::CpuRun(Mat* mat) { + cv::Mat* im = mat->GetCpuMat(); + int height = static_cast<int>(im->rows); + int width = static_cast<int>(im->cols); + if (height < height_ || width < width_) { + FDERROR << "[CenterCrop] Image size less than crop size" << std::endl; + return false; + } + int offset_x = static_cast<int>((width - width_) / 2); + int offset_y = static_cast<int>((height - height_) / 2); + cv::Rect crop_roi(offset_x, offset_y, width_, height_); + *im = (*im)(crop_roi); + mat->SetWidth(width_); + mat->SetHeight(height_); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool CenterCrop::GpuRun(Mat* mat) { + cv::cuda::GpuMat* im = mat->GetGpuMat(); + int height = static_cast<int>(im->rows); + int width = static_cast<int>(im->cols); + if (height < height_ || width < width_) { + FDERROR << "[CenterCrop] Image size less than crop size" << std::endl; + return false; + } + int offset_x = static_cast<int>((width - width_) / 2); + int offset_y = static_cast<int>((height - height_) / 2); + cv::Rect crop_roi(offset_x, offset_y, width_, height_); + *im = (*im)(crop_roi); + mat->SetWidth(width_); + mat->SetHeight(height_); + return true; +} +#endif + +bool CenterCrop::Run(Mat* mat, const int& width, const int& height, + ProcLib lib) { + auto c = CenterCrop(width, height); + return c(mat, lib); +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/center_crop.h b/csrc/fastdeploy/vision/common/processors/center_crop.h new file mode 100644 index 000000000..86ad0e20d --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/center_crop.h @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
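For reference, the crop-window arithmetic in `CenterCrop::CpuRun`/`GpuRun` above reduces to two offsets and a ROI; a minimal standalone sketch (plain OpenCV, hypothetical sizes) of the same math:

```cpp
#include <cassert>
#include "opencv2/core/core.hpp"

int main() {
  // A 256x256 input cropped to 224x224: offsets are (256 - 224) / 2 = 16,
  // so the kept region is the centered cv::Rect(16, 16, 224, 224).
  cv::Mat im(256, 256, CV_8UC3);
  int width = 224, height = 224;
  int offset_x = (im.cols - width) / 2;
  int offset_y = (im.rows - height) / 2;
  cv::Mat cropped = im(cv::Rect(offset_x, offset_y, width, height));
  assert(cropped.cols == width && cropped.rows == height);
  return 0;
}
```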
+ +#pragma once + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { + +class CenterCrop : public Processor { + public: + CenterCrop(int width, int height) : height_(height), width_(width) {} + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "CenterCrop"; } + + static bool Run(Mat* mat, const int& width, const int& height, + ProcLib lib = ProcLib::OPENCV_CPU); + + private: + int height_; + int width_; +}; + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/color_space_convert.cc b/csrc/fastdeploy/vision/common/processors/color_space_convert.cc new file mode 100644 index 000000000..bcdaf365a --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/color_space_convert.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/common/processors/color_space_convert.h" + +namespace fastdeploy { +namespace vision { +bool BGR2RGB::CpuRun(Mat* mat) { + cv::Mat* im = mat->GetCpuMat(); + cv::cvtColor(*im, *im, cv::COLOR_BGR2RGB); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool BGR2RGB::GpuRun(Mat* mat) { + cv::cuda::GpuMat* im = mat->GetGpuMat(); + cv::cuda::cvtColor(*im, *im, cv::COLOR_BGR2RGB); + return true; +} +#endif + +bool RGB2BGR::CpuRun(Mat* mat) { + cv::Mat* im = mat->GetCpuMat(); + cv::cvtColor(*im, *im, cv::COLOR_RGB2BGR); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool RGB2BGR::GpuRun(Mat* mat) { + cv::cuda::GpuMat* im = mat->GetGpuMat(); + cv::cuda::cvtColor(*im, *im, cv::COLOR_RGB2BGR); + return true; +} +#endif + +bool BGR2RGB::Run(Mat* mat, ProcLib lib) { + auto b = BGR2RGB(); + return b(mat, lib); +} + +bool RGB2BGR::Run(Mat* mat, ProcLib lib) { + auto r = RGB2BGR(); + return r(mat, lib); +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/color_space_convert.h b/csrc/fastdeploy/vision/common/processors/color_space_convert.h new file mode 100644 index 000000000..472bcf16d --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/color_space_convert.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
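A hedged usage sketch for the converters above, assuming an image on disk at the hypothetical path `test.jpg` (OpenCV loads images as BGR, so a BGR2RGB pass is typical before RGB-trained models):

```cpp
#include "fastdeploy/vision/common/processors/color_space_convert.h"

int main() {
  cv::Mat im = cv::imread("test.jpg");  // hypothetical path; loads as BGR
  if (im.empty()) {
    return -1;
  }
  fastdeploy::vision::Mat mat(im);         // wraps the cv::Mat without copying
  fastdeploy::vision::BGR2RGB::Run(&mat);  // in-place, defaults to OPENCV_CPU
  mat.PrintInfo("after BGR2RGB");          // per-channel means now reordered
  return 0;
}
```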
+ +#pragma once + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { + +class BGR2RGB : public Processor { + public: + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + virtual std::string Name() { return "BGR2RGB"; } + + static bool Run(Mat* mat, ProcLib lib = ProcLib::OPENCV_CPU); +}; + +class RGB2BGR : public Processor { + public: + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "RGB2BGR"; } + + static bool Run(Mat* mat, ProcLib lib = ProcLib::OPENCV_CPU); +}; +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/convert.cc b/csrc/fastdeploy/vision/common/processors/convert.cc new file mode 100644 index 000000000..a7ca6de07 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/convert.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/common/processors/convert.h" + +namespace fastdeploy { + +namespace vision { + +Convert::Convert(const std::vector<float>& alpha, + const std::vector<float>& beta) { + FDASSERT(alpha.size() == beta.size(), + "Convert: requires the size of alpha equal to the size of beta."); + FDASSERT(alpha.size() != 0, + "Convert: requires the size of alpha and beta > 0."); + alpha_.assign(alpha.begin(), alpha.end()); + beta_.assign(beta.begin(), beta.end()); +} + +bool Convert::CpuRun(Mat* mat) { + cv::Mat* im = mat->GetCpuMat(); + std::vector<cv::Mat> split_im; + cv::split(*im, split_im); + for (int c = 0; c < im->channels(); c++) { + split_im[c].convertTo(split_im[c], CV_32FC1, alpha_[c], beta_[c]); + } + cv::merge(split_im, *im); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool Convert::GpuRun(Mat* mat) { + cv::cuda::GpuMat* im = mat->GetGpuMat(); + std::vector<cv::cuda::GpuMat> split_im; + cv::cuda::split(*im, split_im); + for (int c = 0; c < im->channels(); c++) { + split_im[c].convertTo(split_im[c], CV_32FC1, alpha_[c], beta_[c]); + } + cv::cuda::merge(split_im, *im); + return true; +} +#endif + +bool Convert::Run(Mat* mat, const std::vector<float>& alpha, + const std::vector<float>& beta, ProcLib lib) { + auto c = Convert(alpha, beta); + return c(mat, lib); +} + +} // namespace vision +} // namespace fastdeploy \ No newline at end of file diff --git a/csrc/fastdeploy/vision/common/processors/convert.h b/csrc/fastdeploy/vision/common/processors/convert.h new file mode 100644 index 000000000..5d5a5276f --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/convert.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { +class Convert : public Processor { + public: + Convert(const std::vector<float>& alpha, const std::vector<float>& beta); + + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "Convert"; } + + // Compute `result = mat * alpha + beta` directly by channel. + // The default behavior is the same as OpenCV's convertTo method. + static bool Run(Mat* mat, const std::vector<float>& alpha, + const std::vector<float>& beta, + ProcLib lib = ProcLib::OPENCV_CPU); + + private: + std::vector<float> alpha_; + std::vector<float> beta_; +}; +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/hwc2chw.cc b/csrc/fastdeploy/vision/common/processors/hwc2chw.cc new file mode 100644 index 000000000..5bea87e18 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/hwc2chw.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/common/processors/hwc2chw.h" + +namespace fastdeploy { +namespace vision { +bool HWC2CHW::CpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "HWC2CHW: The input data is not Layout::HWC format!" + << std::endl; + return false; + } + cv::Mat* im = mat->GetCpuMat(); + cv::Mat im_clone = im->clone(); + int rh = im->rows; + int rw = im->cols; + int rc = im->channels(); + + // float* data = reinterpret_cast<float*>(im->data); + for (int i = 0; i < rc; ++i) { + // cv::extractChannel(im_clone, cv::Mat(rh, rw, im->type() % 8, data + i + // * rh * rw), + // i); + cv::extractChannel( + im_clone, + cv::Mat(rh, rw, im->type() % 8, + im->ptr() + i * rh * rw * FDDataTypeSize(mat->Type())), + i); + } + mat->layout = Layout::CHW; + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool HWC2CHW::GpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "HWC2CHW: The input data is not Layout::HWC format!"
+ << std::endl; + return false; + } + cv::cuda::GpuMat* im = mat->GetGpuMat(); + cv::cuda::GpuMat im_clone = im->clone(); + int rh = im->rows; + int rw = im->cols; + int rc = im->channels(); + int num_pixels = rh * rw; + std::vector<cv::cuda::GpuMat> channels{ + cv::cuda::GpuMat(rh, rw, im->type() % 8, &(im->ptr()[0])), + cv::cuda::GpuMat(rh, rw, im->type() % 8, &(im->ptr()[num_pixels])), + cv::cuda::GpuMat(rh, rw, im->type() % 8, &(im->ptr()[num_pixels * 2]))}; + cv::cuda::split(im_clone, channels); + mat->layout = Layout::CHW; + return true; +} +#endif + +bool HWC2CHW::Run(Mat* mat, ProcLib lib) { + auto h = HWC2CHW(); + return h(mat, lib); +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/hwc2chw.h b/csrc/fastdeploy/vision/common/processors/hwc2chw.h new file mode 100644 index 000000000..56fa3ede8 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/hwc2chw.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { + +class HWC2CHW : public Processor { + public: + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "HWC2CHW"; } + + static bool Run(Mat* mat, ProcLib lib = ProcLib::OPENCV_CPU); +}; +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/mat.cc b/csrc/fastdeploy/vision/common/processors/mat.cc new file mode 100644 index 000000000..2afffa416 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/mat.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
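The relayout done by `HWC2CHW` above can be hard to read through the `cv::Mat` header trick; here is a standalone sketch of the same idea, where element `(h, w, c)` of the packed image moves to plane `c` at offset `h * W + w`:

```cpp
#include <cassert>
#include "opencv2/core/core.hpp"

int main() {
  const int H = 2, W = 3, C = 3;
  cv::Mat hwc(H, W, CV_32FC3);
  cv::randu(hwc, 0.0f, 1.0f);

  cv::Mat chw(1, H * W * C, CV_32FC1);  // planar destination buffer
  for (int c = 0; c < C; ++c) {
    // Same trick as HWC2CHW::CpuRun: build a Mat header over plane c of the
    // destination and let cv::extractChannel fill it.
    cv::Mat plane(H, W, CV_32FC1, chw.ptr<float>() + c * H * W);
    cv::extractChannel(hwc, plane, c);
  }
  assert(chw.ptr<float>()[0] == hwc.at<cv::Vec3f>(0, 0)[0]);
  return 0;
}
```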
+#include "fastdeploy/vision/common/processors/mat.h" +#include "fastdeploy/utils/utils.h" +namespace fastdeploy { +namespace vision { + +#ifdef ENABLE_OPENCV_CUDA +cv::cuda::GpuMat* Mat::GetGpuMat() { + if (device == Device::CPU) { + gpu_mat.upload(cpu_mat); + } + return &gpu_mat; +} +#endif + +cv::Mat* Mat::GetCpuMat() { +#ifdef ENABLE_OPENCV_CUDA + if (device == Device::GPU) { + gpu_mat.download(cpu_mat); + } +#endif + return &cpu_mat; +} + +void Mat::ShareWithTensor(FDTensor* tensor) { + if (device == Device::GPU) { +#ifdef ENABLE_OPENCV_CUDA + tensor->SetExternalData({Channels(), Height(), Width()}, Type(), + GetGpuMat()->ptr()); + tensor->device = Device::GPU; +#endif + } else { + tensor->SetExternalData({Channels(), Height(), Width()}, Type(), + GetCpuMat()->ptr()); + tensor->device = Device::CPU; + } + if (layout == Layout::HWC) { + tensor->shape = {Height(), Width(), Channels()}; + } +} + +bool Mat::CopyToTensor(FDTensor* tensor) { + cv::Mat* im = GetCpuMat(); + int total_bytes = im->total() * im->elemSize(); + if (total_bytes != tensor->Nbytes()) { + FDERROR << "While copy Mat to Tensor, requires the memory size be same, " + "but now size of Tensor = " + << tensor->Nbytes() << ", size of Mat = " << total_bytes << "." + << std::endl; + return false; + } + memcpy(tensor->MutableData(), im->ptr(), im->total() * im->elemSize()); + return true; +} + +void Mat::PrintInfo(const std::string& flag) { + cv::Mat* im = GetCpuMat(); + cv::Scalar mean = cv::mean(*im); + std::cout << flag << ": " + << "Channel=" << Channels() << ", height=" << Height() + << ", width=" << Width() << ", mean="; + for (int i = 0; i < Channels(); ++i) { + std::cout << mean[i] << " "; + } + std::cout << std::endl; +} + +FDDataType Mat::Type() { + int type = -1; + if (device == Device::GPU) { +#ifdef ENABLE_OPENCV_CUDA + type = gpu_mat.type(); +#endif + } else { + type = cpu_mat.type(); + } + if (type < 0) { + FDASSERT(false, + "While calling Mat::Type(), get negative value, which is not " + "expected!."); + } + type = type % 8; + if (type == 0) { + return FDDataType::UINT8; + } else if (type == 1) { + return FDDataType::INT8; + } else if (type == 2) { + FDASSERT(false, "While calling Mat::Type(), get UINT16 type which is not " + "supported now."); + } else if (type == 3) { + return FDDataType::INT16; + } else if (type == 4) { + return FDDataType::INT32; + } else if (type == 5) { + return FDDataType::FP32; + } else if (type == 6) { + return FDDataType::FP64; + } else { + FDASSERT(false, "While calling Mat::Type(), get type = " + + std::to_string(type) + ", which is not expected!."); + } +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/mat.h b/csrc/fastdeploy/vision/common/processors/mat.h new file mode 100644 index 000000000..cf4736238 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/mat.h @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include "fastdeploy/core/fd_tensor.h" +#include "opencv2/core/core.hpp" + +#ifdef ENABLE_OPENCV_CUDA +#include "opencv2/core/cuda.hpp" +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudaimgproc.hpp" +#include "opencv2/cudawarping.hpp" +#endif + +namespace fastdeploy { +namespace vision { + +enum Layout { HWC, CHW }; + +struct FASTDEPLOY_DECL Mat { + explicit Mat(cv::Mat& mat) { + cpu_mat = mat; + device = Device::CPU; + layout = Layout::HWC; + height = cpu_mat.rows; + width = cpu_mat.cols; + channels = cpu_mat.channels(); + } + + private: + int channels; + int height; + int width; + cv::Mat cpu_mat; +#ifdef ENABLE_OPENCV_CUDA + cv::cuda::GpuMat gpu_mat; +#endif + + public: +#ifdef ENABLE_OPENCV_CUDA + cv::cuda::GpuMat* GetGpuMat(); +#endif + cv::Mat* GetCpuMat(); + + FDDataType Type(); + int Channels() const { return channels; } + int Width() const { return width; } + int Height() const { return height; } + void SetChannels(int s) { channels = s; } + void SetWidth(int w) { width = w; } + void SetHeight(int h) { height = h; } + + // Transfer the vision::Mat to FDTensor + void ShareWithTensor(FDTensor* tensor); + // Only support copy to cpu tensor now + bool CopyToTensor(FDTensor* tensor); + + // debug functions + // TODO(jiangjiajun) Develop a right process pipeline with c++ is not a easy + // things + // Will add more debug function here to help debug processed image + // This function will print shape / mean of each channels of the Mat + void PrintInfo(const std::string& flag); + + Layout layout = Layout::HWC; + Device device = Device::CPU; +}; + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/normalize.cc b/csrc/fastdeploy/vision/common/processors/normalize.cc new file mode 100644 index 000000000..b75406070 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/normalize.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "fastdeploy/vision/common/processors/normalize.h" + +namespace fastdeploy { +namespace vision { +Normalize::Normalize(const std::vector& mean, + const std::vector& std, bool is_scale, + const std::vector& min, + const std::vector& max) { + FDASSERT(mean.size() == std.size(), + "Normalize: requires the size of mean equal to the size of std."); + std::vector mean_(mean.begin(), mean.end()); + std::vector std_(std.begin(), std.end()); + std::vector min_(mean.size(), 0.0); + std::vector max_(mean.size(), 255.0); + if (min.size() != 0) { + FDASSERT( + min.size() == mean.size(), + "Normalize: while min is defined, requires the size of min equal to " + "the size of mean."); + min_.assign(min.begin(), min.end()); + } + if (max.size() != 0) { + FDASSERT( + min.size() == mean.size(), + "Normalize: while max is defined, requires the size of max equal to " + "the size of mean."); + max_.assign(max.begin(), max.end()); + } + for (auto c = 0; c < mean_.size(); ++c) { + double alpha = 1.0; + if (is_scale) { + alpha /= (max_[c] - min_[c]); + } + double beta = -1.0 * (mean_[c] + min_[c] * alpha) / std_[c]; + alpha /= std_[c]; + alpha_.push_back(alpha); + beta_.push_back(beta); + } +} + +bool Normalize::CpuRun(Mat* mat) { + cv::Mat* im = mat->GetCpuMat(); + std::vector split_im; + cv::split(*im, split_im); + for (int c = 0; c < im->channels(); c++) { + split_im[c].convertTo(split_im[c], CV_32FC1, alpha_[c], beta_[c]); + } + cv::merge(split_im, *im); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool Normalize::GpuRun(Mat* mat) { + cv::cuda::GpuMat* im = mat->GetGpuMat(); + std::vector split_im; + cv::cuda::split(*im, split_im); + for (int c = 0; c < im->channels(); c++) { + split_im[c].convertTo(split_im[c], CV_32FC1, alpha_[c], beta_[c]); + } + cv::cuda::merge(split_im, *im); + return true; +} +#endif + +bool Normalize::Run(Mat* mat, const std::vector& mean, + const std::vector& std, bool is_scale, + const std::vector& min, + const std::vector& max, ProcLib lib) { + auto n = Normalize(mean, std, is_scale, min, max); + return n(mat, lib); +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/normalize.h b/csrc/fastdeploy/vision/common/processors/normalize.h new file mode 100644 index 000000000..b8a66e945 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/normalize.h @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { +class Normalize : public Processor { + public: + Normalize(const std::vector<float>& mean, const std::vector<float>& std, + bool is_scale = true, + const std::vector<float>& min = std::vector<float>(), + const std::vector<float>& max = std::vector<float>()); + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "Normalize"; } + + // When normalizing, it is recommended not to call this static function, + // since it has to compute result = ((mat / 255) - mean) / std on every + // call. If we instead write + // ``` + // auto norm = Normalize(...); + // norm(mat); + // ``` + // the constructor precomputes alpha and beta once, and `norm(mat)` only + // needs to compute result = mat * alpha + beta, which saves a lot of time. + static bool Run(Mat* mat, const std::vector<float>& mean, + const std::vector<float>& std, bool is_scale = true, + const std::vector<float>& min = std::vector<float>(), + const std::vector<float>& max = std::vector<float>(), + ProcLib lib = ProcLib::OPENCV_CPU); + private: + std::vector<float> alpha_; + std::vector<float> beta_; +}; +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/pad.cc b/csrc/fastdeploy/vision/common/processors/pad.cc new file mode 100644 index 000000000..3b26d28bc --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/pad.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/common/processors/pad.h" + +namespace fastdeploy { +namespace vision { + +bool Pad::CpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "Pad: The input data must be Layout::HWC format!" << std::endl; + return false; + } + if (mat->Channels() > 4) { + FDERROR << "Pad: Only support channels <= 4." << std::endl; + return false; + } + if (mat->Channels() != value_.size()) { + FDERROR << "Pad: Require input channels equals to size of padding value, " + "but now channels = " + << mat->Channels() + << ", the size of padding values = " << value_.size() << "." + << std::endl; + return false; + } + cv::Mat* im = mat->GetCpuMat(); + cv::Scalar value; + if (value_.size() == 1) { + value = cv::Scalar(value_[0]); + } else if (value_.size() == 2) { + value = cv::Scalar(value_[0], value_[1]); + } else if (value_.size() == 3) { + value = cv::Scalar(value_[0], value_[1], value_[2]); + } else { + value = cv::Scalar(value_[0], value_[1], value_[2], value_[3]); + } + cv::copyMakeBorder(*im, *im, top_, bottom_, left_, right_, + cv::BORDER_CONSTANT, value); + mat->SetHeight(im->rows); + mat->SetWidth(im->cols); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool Pad::GpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "Pad: The input data must be Layout::HWC format!" << std::endl; + return false; + } + if (mat->Channels() > 4) { + FDERROR << "Pad: Only support channels <= 4."
<< std::endl; + return false; + } + if (mat->Channels() != value_.size()) { + FDERROR << "Pad: Require input channels equals to size of padding value, " + "but now channels = " + << mat->Channels() + << ", the size of padding values = " << value_.size() << "." + << std::endl; + return false; + } + cv::cuda::GpuMat* im = mat->GetGpuMat(); + cv::Scalar value; + if (value_.size() == 1) { + value = cv::Scalar(value_[0]); + } else if (value_.size() == 2) { + value = cv::Scalar(value_[0], value_[1]); + } else if (value_.size() == 3) { + value = cv::Scalar(value_[0], value_[1], value_[2]); + } else { + value = cv::Scalar(value_[0], value_[1], value_[2], value_[3]); + } + cv::cuda::copyMakeBorder(*im, *im, top_, bottom_, left_, right_, + cv::BORDER_CONSTANT, value); + mat->SetHeight(im->rows); + mat->SetWidth(im->cols); + return true; +} +#endif + +bool Pad::Run(Mat* mat, const int& top, const int& bottom, const int& left, + const int& right, const std::vector<float>& value, + ProcLib lib) { + auto p = Pad(top, bottom, left, right, value); + return p(mat, lib); +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/pad.h b/csrc/fastdeploy/vision/common/processors/pad.h new file mode 100644 index 000000000..110365960 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/pad.h @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { + +class Pad : public Processor { + public: + Pad(int top, int bottom, int left, int right, + const std::vector<float>& value) { + top_ = top; + bottom_ = bottom; + left_ = left; + right_ = right; + value_ = value; + } + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "Pad"; } + + static bool Run(Mat* mat, const int& top, const int& bottom, const int& left, + const int& right, const std::vector<float>& value, + ProcLib lib = ProcLib::OPENCV_CPU); + + private: + int top_; + int bottom_; + int left_; + int right_; + std::vector<float> value_; +}; +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/pad_to_size.cc b/csrc/fastdeploy/vision/common/processors/pad_to_size.cc new file mode 100644 index 000000000..d4cbacd87 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/pad_to_size.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/common/processors/pad_to_size.h" + +namespace fastdeploy { +namespace vision { + +bool PadToSize::CpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "PadToSize: The input data must be Layout::HWC format!" + << std::endl; + return false; + } + if (mat->Channels() > 4) { + FDERROR << "PadToSize: Only support channels <= 4." << std::endl; + return false; + } + if (mat->Channels() != value_.size()) { + FDERROR + << "PadToSize: Require input channels equals to size of padding value, " + "but now channels = " + << mat->Channels() << ", the size of padding values = " << value_.size() + << "." << std::endl; + return false; + } + int origin_w = mat->Width(); + int origin_h = mat->Height(); + if (origin_w > width_) { + FDERROR << "PadToSize: the input width:" << origin_w + << " is greater than the target width: " << width_ << "." + << std::endl; + return false; + } + if (origin_h > height_) { + FDERROR << "PadToSize: the input height:" << origin_h + << " is greater than the target height: " << height_ << "." + << std::endl; + return false; + } + if (origin_w == width_ && origin_h == height_) { + return true; + } + + cv::Mat* im = mat->GetCpuMat(); + cv::Scalar value; + if (value_.size() == 1) { + value = cv::Scalar(value_[0]); + } else if (value_.size() == 2) { + value = cv::Scalar(value_[0], value_[1]); + } else if (value_.size() == 3) { + value = cv::Scalar(value_[0], value_[1], value_[2]); + } else { + value = cv::Scalar(value_[0], value_[1], value_[2], value_[3]); + } + // top, bottom, left, right + cv::copyMakeBorder(*im, *im, 0, height_ - origin_h, 0, width_ - origin_w, + cv::BORDER_CONSTANT, value); + mat->SetHeight(height_); + mat->SetWidth(width_); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool PadToSize::GpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "PadToSize: The input data must be Layout::HWC format!" + << std::endl; + return false; + } + if (mat->Channels() > 4) { + FDERROR << "PadToSize: Only support channels <= 4." << std::endl; + return false; + } + if (mat->Channels() != value_.size()) { + FDERROR + << "PadToSize: Require input channels equals to size of padding value, " + "but now channels = " + << mat->Channels() << ", the size of padding values = " << value_.size() + << "." << std::endl; + return false; + } + + int origin_w = mat->Width(); + int origin_h = mat->Height(); + if (origin_w > width_) { + FDERROR << "PadToSize: the input width:" << origin_w + << " is greater than the target width: " << width_ << "." + << std::endl; + return false; + } + if (origin_h > height_) { + FDERROR << "PadToSize: the input height:" << origin_h + << " is greater than the target height: " << height_ << "." 
+ << std::endl; + return false; + } + if (origin_w == width_ && origin_h == height_) { + return true; + } + + cv::cuda::GpuMat* im = mat->GetGpuMat(); + cv::Scalar value; + if (value_.size() == 1) { + value = cv::Scalar(value_[0]); + } else if (value_.size() == 2) { + value = cv::Scalar(value_[0], value_[1]); + } else if (value_.size() == 3) { + value = cv::Scalar(value_[0], value_[1], value_[2]); + } else { + value = cv::Scalar(value_[0], value_[1], value_[2], value_[3]); + } + + // top, bottom, left, right + cv::cuda::copyMakeBorder(*im, *im, 0, height_ - origin_h, 0, + width_ - origin_w, cv::BORDER_CONSTANT, value); + mat->SetHeight(height_); + mat->SetWidth(width_); + return true; +} +#endif + +bool PadToSize::Run(Mat* mat, int width, int height, + const std::vector<float>& value, ProcLib lib) { + auto p = PadToSize(width, height, value); + return p(mat, lib); +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/pad_to_size.h b/csrc/fastdeploy/vision/common/processors/pad_to_size.h new file mode 100644 index 000000000..ece0158f7 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/pad_to_size.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { + +class PadToSize : public Processor { + public: + // Only supports the left-top padding mode. + PadToSize(int width, int height, const std::vector<float>& value) { + width_ = width; + height_ = height; + value_ = value; + } + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "PadToSize"; } + + static bool Run(Mat* mat, int width, int height, + const std::vector<float>& value, + ProcLib lib = ProcLib::OPENCV_CPU); + + private: + int width_; + int height_; + std::vector<float> value_; +}; +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/resize.cc b/csrc/fastdeploy/vision/common/processors/resize.cc new file mode 100644 index 000000000..d6b8b9e2f --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/resize.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
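A hedged usage sketch for `PadToSize` above: left-top mode keeps the original pixels at `(0, 0)` and adds borders only on the bottom and right, so a 400x300 image padded to 416x416 gains 16 columns and 116 rows of fill (114 is a common fill value in YOLO-style pipelines, used here purely as an assumption):

```cpp
#include <vector>
#include "fastdeploy/vision/common/processors/pad_to_size.h"

void PadExample(fastdeploy::vision::Mat* mat) {
  // One fill value per channel is required by the implementation above.
  std::vector<float> value = {114.0f, 114.0f, 114.0f};
  fastdeploy::vision::PadToSize::Run(mat, 416, 416, value);
}
```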
+ +#include "fastdeploy/vision/common/processors/resize.h" + +namespace fastdeploy { +namespace vision { + +bool Resize::CpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "Resize: The format of input is not HWC." << std::endl; + return false; + } + cv::Mat* im = mat->GetCpuMat(); + int origin_w = im->cols; + int origin_h = im->rows; + if (width_ > 0 && height_ > 0) { + if (use_scale_) { + float scale_w = width_ * 1.0 / origin_w; + float scale_h = height_ * 1.0 / origin_h; + cv::resize(*im, *im, cv::Size(0, 0), scale_w, scale_h, interp_); + } else { + cv::resize(*im, *im, cv::Size(width_, height_), 0, 0, interp_); + } + } else if (scale_w_ > 0 && scale_h_ > 0) { + cv::resize(*im, *im, cv::Size(0, 0), scale_w_, scale_h_, interp_); + } else { + FDERROR << "Resize: the parameters must satisfy (width > 0 && height > 0) " + "or (scale_w > 0 && scale_h > 0)." + << std::endl; + return false; + } + mat->SetWidth(im->cols); + mat->SetHeight(im->rows); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool Resize::GpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "Resize: The format of input is not HWC." << std::endl; + return false; + } + cv::cuda::GpuMat* im = mat->GetGpuMat(); + int origin_w = im->cols; + int origin_h = im->rows; + if (width_ > 0 && height_ > 0) { + if (use_scale_) { + float scale_w = width_ * 1.0 / origin_w; + float scale_h = height_ * 1.0 / origin_h; + cv::cuda::resize(*im, *im, cv::Size(0, 0), scale_w, scale_h, interp_); + } else { + cv::cuda::resize(*im, *im, cv::Size(width_, height_), 0, 0, interp_); + } + } else if (scale_w_ > 0 && scale_h_ > 0) { + cv::cuda::resize(*im, *im, cv::Size(0, 0), scale_w_, scale_h_, interp_); + } else { + FDERROR << "Resize: the parameters must satisfy (width > 0 && height > 0) " + "or (scale_w > 0 && scale_h > 0)." + << std::endl; + return false; + } + mat->SetWidth(im->cols); + mat->SetHeight(im->rows); + return true; +} +#endif + +bool Resize::Run(Mat* mat, int width, int height, float scale_w, float scale_h, + int interp, bool use_scale, ProcLib lib) { + if (mat->Height() == height && mat->Width() == width) { + return true; + } + auto r = Resize(width, height, scale_w, scale_h, interp, use_scale); + return r(mat, lib); +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/resize.h b/csrc/fastdeploy/vision/common/processors/resize.h new file mode 100644 index 000000000..5b6e9c025 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/resize.h @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <tuple> + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { + +class Resize : public Processor { + public: + Resize(int width, int height, float scale_w = -1.0, float scale_h = -1.0, + int interp = 1, bool use_scale = false) { + width_ = width; + height_ = height; + scale_w_ = scale_w; + scale_h_ = scale_h; + interp_ = interp; + use_scale_ = use_scale; + } + + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "Resize"; } + + static bool Run(Mat* mat, int width, int height, float scale_w = -1.0, + float scale_h = -1.0, int interp = 1, bool use_scale = false, + ProcLib lib = ProcLib::OPENCV_CPU); + + bool SetWidthAndHeight(int width, int height) { + width_ = width; + height_ = height; + return true; + } + + std::tuple<int, int> GetWidthAndHeight() { + return std::make_tuple(width_, height_); + } + + private: + int width_; + int height_; + float scale_w_ = -1.0; + float scale_h_ = -1.0; + int interp_ = 1; + bool use_scale_ = false; +}; +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/resize_by_short.cc b/csrc/fastdeploy/vision/common/processors/resize_by_short.cc new file mode 100644 index 000000000..8e850425f --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/resize_by_short.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "fastdeploy/vision/common/processors/resize_by_short.h" + +namespace fastdeploy { +namespace vision { + +bool ResizeByShort::CpuRun(Mat* mat) { + cv::Mat* im = mat->GetCpuMat(); + int origin_w = im->cols; + int origin_h = im->rows; + double scale = GenerateScale(origin_w, origin_h); + if (use_scale_) { + cv::resize(*im, *im, cv::Size(), scale, scale, interp_); + } else { + int width = static_cast(round(scale * im->cols)); + int height = static_cast(round(scale * im->rows)); + cv::resize(*im, *im, cv::Size(width, height), 0, 0, interp_); + } + mat->SetWidth(im->cols); + mat->SetHeight(im->rows); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool ResizeByShort::GpuRun(Mat* mat) { + cv::cuda::GpuMat* im = mat->GetGpuMat(); + int origin_w = im->cols; + int origin_h = im->rows; + double scale = GenerateScale(origin_w, origin_h); + im->convertTo(*im, CV_32FC(im->channels())); + if (use_scale_) { + cv::cuda::resize(*im, *im, cv::Size(), scale, scale, interp_); + } else { + int width = static_cast(round(scale * im->cols)); + int height = static_cast(round(scale * im->rows)); + cv::cuda::resize(*im, *im, cv::Size(width, height), 0, 0, interp_); + } + mat->SetWidth(im->cols); + mat->SetHeight(im->rows); + return true; +} +#endif + +double ResizeByShort::GenerateScale(const int origin_w, const int origin_h) { + int im_size_max = std::max(origin_w, origin_h); + int im_size_min = std::min(origin_w, origin_h); + double scale = + static_cast(target_size_) / static_cast(im_size_min); + if (max_size_ > 0) { + if (round(scale * im_size_max) > max_size_) { + scale = static_cast(max_size_) / static_cast(im_size_max); + } + } + return scale; +} + +bool ResizeByShort::Run(Mat* mat, int target_size, int interp, bool use_scale, + int max_size, ProcLib lib) { + auto r = ResizeByShort(target_size, interp, use_scale, max_size); + return r(mat, lib); +} +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/resize_by_short.h b/csrc/fastdeploy/vision/common/processors/resize_by_short.h new file mode 100644 index 000000000..023748e9e --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/resize_by_short.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "fastdeploy/vision/common/processors/base.h" + +namespace fastdeploy { +namespace vision { + +class ResizeByShort : public Processor { + public: + ResizeByShort(int target_size, int interp = 1, bool use_scale = true, + int max_size = -1) { + target_size_ = target_size; + max_size_ = max_size; + interp_ = interp; + use_scale_ = use_scale; + } + bool CpuRun(Mat* mat); +#ifdef ENABLE_OPENCV_CUDA + bool GpuRun(Mat* mat); +#endif + std::string Name() { return "ResizeByShort"; } + + static bool Run(Mat* mat, int target_size, int interp = 1, + bool use_scale = true, int max_size = -1, + ProcLib lib = ProcLib::OPENCV_CPU); + + private: + double GenerateScale(const int origin_w, const int origin_h); + int target_size_; + int max_size_; + int interp_; + bool use_scale_; +}; +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/processors/stride_pad.cc b/csrc/fastdeploy/vision/common/processors/stride_pad.cc new file mode 100644 index 000000000..8597c8375 --- /dev/null +++ b/csrc/fastdeploy/vision/common/processors/stride_pad.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/common/processors/stride_pad.h" + +namespace fastdeploy { +namespace vision { + +bool StridePad::CpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "StridePad: The input data must be Layout::HWC format!" + << std::endl; + return false; + } + if (mat->Channels() > 4) { + FDERROR << "StridePad: Only support channels <= 4." << std::endl; + return false; + } + if (mat->Channels() != value_.size()) { + FDERROR + << "StridePad: Require input channels equals to size of padding value, " + "but now channels = " + << mat->Channels() << ", the size of padding values = " << value_.size() + << "." << std::endl; + return false; + } + int origin_w = mat->Width(); + int origin_h = mat->Height(); + + int pad_h = (mat->Height() / stride_) * stride_ + + (mat->Height() % stride_ != 0) * stride_ - mat->Height(); + int pad_w = (mat->Width() / stride_) * stride_ + + (mat->Width() % stride_ != 0) * stride_ - mat->Width(); + if (pad_h == 0 && pad_w == 0) { + return true; + } + cv::Mat* im = mat->GetCpuMat(); + cv::Scalar value; + if (value_.size() == 1) { + value = cv::Scalar(value_[0]); + } else if (value_.size() == 2) { + value = cv::Scalar(value_[0], value_[1]); + } else if (value_.size() == 3) { + value = cv::Scalar(value_[0], value_[1], value_[2]); + } else { + value = cv::Scalar(value_[0], value_[1], value_[2], value_[3]); + } + // top, bottom, left, right + cv::copyMakeBorder(*im, *im, 0, pad_h, 0, pad_w, cv::BORDER_CONSTANT, value); + mat->SetHeight(origin_h + pad_h); + mat->SetWidth(origin_w + pad_w); + return true; +} + +#ifdef ENABLE_OPENCV_CUDA +bool StridePad::GpuRun(Mat* mat) { + if (mat->layout != Layout::HWC) { + FDERROR << "StridePad: The input data must be Layout::HWC format!" 
+            << std::endl;
+    return false;
+  }
+  if (mat->Channels() > 4) {
+    FDERROR << "StridePad: Only support channels <= 4." << std::endl;
+    return false;
+  }
+  if (mat->Channels() != value_.size()) {
+    FDERROR
+        << "StridePad: Require the number of input channels to equal the "
+           "size of padding values, but now channels = "
+        << mat->Channels() << ", the size of padding values = " << value_.size()
+        << "." << std::endl;
+    return false;
+  }
+
+  int origin_w = mat->Width();
+  int origin_h = mat->Height();
+  // Pad height/width up to the next multiple of stride (same as CpuRun).
+  int pad_h = (mat->Height() / stride_) * stride_ +
+              (mat->Height() % stride_ != 0) * stride_ - mat->Height();
+  int pad_w = (mat->Width() / stride_) * stride_ +
+              (mat->Width() % stride_ != 0) * stride_ - mat->Width();
+  if (pad_h == 0 && pad_w == 0) {
+    return true;
+  }
+
+  cv::cuda::GpuMat* im = mat->GetGpuMat();
+  cv::Scalar value;
+  if (value_.size() == 1) {
+    value = cv::Scalar(value_[0]);
+  } else if (value_.size() == 2) {
+    value = cv::Scalar(value_[0], value_[1]);
+  } else if (value_.size() == 3) {
+    value = cv::Scalar(value_[0], value_[1], value_[2]);
+  } else {
+    value = cv::Scalar(value_[0], value_[1], value_[2], value_[3]);
+  }
+
+  // top, bottom, left, right
+  cv::cuda::copyMakeBorder(*im, *im, 0, pad_h, 0, pad_w, cv::BORDER_CONSTANT,
+                           value);
+  mat->SetHeight(origin_h + pad_h);
+  mat->SetWidth(origin_w + pad_w);
+  return true;
+}
+#endif
+
+bool StridePad::Run(Mat* mat, int stride, const std::vector<float>& value,
+                    ProcLib lib) {
+  auto p = StridePad(stride, value);
+  return p(mat, lib);
+}
+
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/common/processors/stride_pad.h b/csrc/fastdeploy/vision/common/processors/stride_pad.h
new file mode 100644
index 000000000..c002ca697
--- /dev/null
+++ b/csrc/fastdeploy/vision/common/processors/stride_pad.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fastdeploy/vision/common/processors/base.h"
+
+namespace fastdeploy {
+namespace vision {
+
+class StridePad : public Processor {
+ public:
+  // only supports the left-top padding mode
+  StridePad(int stride, const std::vector<float>& value) {
+    stride_ = stride;
+    value_ = value;
+  }
+  bool CpuRun(Mat* mat);
+#ifdef ENABLE_OPENCV_CUDA
+  bool GpuRun(Mat* mat);
+#endif
+  std::string Name() { return "StridePad"; }
+
+  static bool Run(Mat* mat, int stride,
+                  const std::vector<float>& value = std::vector<float>(),
+                  ProcLib lib = ProcLib::OPENCV_CPU);
+
+ private:
+  int stride_ = 32;
+  std::vector<float> value_;
+};
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/common/processors/transform.h b/csrc/fastdeploy/vision/common/processors/transform.h
new file mode 100644
index 000000000..fed3d0c9a
--- /dev/null
+++ b/csrc/fastdeploy/vision/common/processors/transform.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fastdeploy/vision/common/processors/cast.h" +#include "fastdeploy/vision/common/processors/center_crop.h" +#include "fastdeploy/vision/common/processors/color_space_convert.h" +#include "fastdeploy/vision/common/processors/convert.h" +#include "fastdeploy/vision/common/processors/hwc2chw.h" +#include "fastdeploy/vision/common/processors/normalize.h" +#include "fastdeploy/vision/common/processors/pad.h" +#include "fastdeploy/vision/common/processors/pad_to_size.h" +#include "fastdeploy/vision/common/processors/resize.h" +#include "fastdeploy/vision/common/processors/resize_by_short.h" +#include "fastdeploy/vision/common/processors/stride_pad.h" diff --git a/csrc/fastdeploy/vision/common/result.cc b/csrc/fastdeploy/vision/common/result.cc new file mode 100644 index 000000000..854d6fcab --- /dev/null +++ b/csrc/fastdeploy/vision/common/result.cc @@ -0,0 +1,306 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
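Note: the pad_h/pad_w expressions in StridePad above round an extent up to the next multiple of stride and take the difference. An equivalent, shorter formulation (hypothetical helper, shown only for illustration):

    // Padding needed to round extent up to the next multiple of stride.
    int PadToMultiple(int extent, int stride) {
      return (stride - extent % stride) % stride;
    }
    // e.g. extent = 600, stride = 32: 600 % 32 = 24, pad = 8, 600 + 8 = 608.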
+#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { +namespace vision { + +void ClassifyResult::Clear() { + std::vector().swap(label_ids); + std::vector().swap(scores); +} + +std::string ClassifyResult::Str() { + std::string out; + out = "ClassifyResult(\nlabel_ids: "; + for (size_t i = 0; i < label_ids.size(); ++i) { + out = out + std::to_string(label_ids[i]) + ", "; + } + out += "\nscores: "; + for (size_t i = 0; i < label_ids.size(); ++i) { + out = out + std::to_string(scores[i]) + ", "; + } + out += "\n)"; + return out; +} + +DetectionResult::DetectionResult(const DetectionResult& res) { + boxes.assign(res.boxes.begin(), res.boxes.end()); + scores.assign(res.scores.begin(), res.scores.end()); + label_ids.assign(res.label_ids.begin(), res.label_ids.end()); +} + +void DetectionResult::Clear() { + std::vector>().swap(boxes); + std::vector().swap(scores); + std::vector().swap(label_ids); +} + +void DetectionResult::Reserve(int size) { + boxes.reserve(size); + scores.reserve(size); + label_ids.reserve(size); +} + +void DetectionResult::Resize(int size) { + boxes.resize(size); + scores.resize(size); + label_ids.resize(size); +} + +std::string DetectionResult::Str() { + std::string out; + out = "DetectionResult: [xmin, ymin, xmax, ymax, score, label_id]\n"; + for (size_t i = 0; i < boxes.size(); ++i) { + out = out + std::to_string(boxes[i][0]) + "," + + std::to_string(boxes[i][1]) + ", " + std::to_string(boxes[i][2]) + + ", " + std::to_string(boxes[i][3]) + ", " + + std::to_string(scores[i]) + ", " + std::to_string(label_ids[i]) + + "\n"; + } + return out; +} + +FaceDetectionResult::FaceDetectionResult(const FaceDetectionResult& res) { + boxes.assign(res.boxes.begin(), res.boxes.end()); + landmarks.assign(res.landmarks.begin(), res.landmarks.end()); + scores.assign(res.scores.begin(), res.scores.end()); + landmarks_per_face = res.landmarks_per_face; +} + +void FaceDetectionResult::Clear() { + std::vector>().swap(boxes); + std::vector().swap(scores); + std::vector>().swap(landmarks); + landmarks_per_face = 0; +} + +void FaceDetectionResult::Reserve(int size) { + boxes.reserve(size); + scores.reserve(size); + if (landmarks_per_face > 0) { + landmarks.reserve(size * landmarks_per_face); + } +} + +void FaceDetectionResult::Resize(int size) { + boxes.resize(size); + scores.resize(size); + if (landmarks_per_face > 0) { + landmarks.resize(size * landmarks_per_face); + } +} + +std::string FaceDetectionResult::Str() { + std::string out; + // format without landmarks + if (landmarks_per_face <= 0) { + out = "FaceDetectionResult: [xmin, ymin, xmax, ymax, score]\n"; + for (size_t i = 0; i < boxes.size(); ++i) { + out = out + std::to_string(boxes[i][0]) + "," + + std::to_string(boxes[i][1]) + ", " + std::to_string(boxes[i][2]) + + ", " + std::to_string(boxes[i][3]) + ", " + + std::to_string(scores[i]) + "\n"; + } + return out; + } + // format with landmarks + FDASSERT((landmarks.size() == boxes.size() * landmarks_per_face), + "The size of landmarks != boxes.size * landmarks_per_face."); + out = "FaceDetectionResult: [xmin, ymin, xmax, ymax, score, (x, y) x " + + std::to_string(landmarks_per_face) + "]\n"; + for (size_t i = 0; i < boxes.size(); ++i) { + out = out + std::to_string(boxes[i][0]) + "," + + std::to_string(boxes[i][1]) + ", " + std::to_string(boxes[i][2]) + + ", " + std::to_string(boxes[i][3]) + ", " + + std::to_string(scores[i]) + ", "; + for (size_t j = 0; j < landmarks_per_face; ++j) { + out = out + "(" + + std::to_string(landmarks[i * landmarks_per_face + j][0]) + "," + + 
+            std::to_string(landmarks[i * landmarks_per_face + j][1]);
+      if (j < landmarks_per_face - 1) {
+        out = out + "), ";
+      } else {
+        out = out + ")\n";
+      }
+    }
+  }
+  return out;
+}
+
+void SegmentationResult::Clear() {
+  std::vector<uint8_t>().swap(label_map);
+  std::vector<float>().swap(score_map);
+  std::vector<int64_t>().swap(shape);
+  contain_score_map = false;
+}
+
+void SegmentationResult::Reserve(int size) {
+  label_map.reserve(size);
+  if (contain_score_map) {
+    score_map.reserve(size);
+  }
+}
+
+void SegmentationResult::Resize(int size) {
+  label_map.resize(size);
+  if (contain_score_map) {
+    score_map.resize(size);
+  }
+}
+
+std::string SegmentationResult::Str() {
+  std::string out;
+  out = "SegmentationResult Image masks 10 rows x 10 cols: \n";
+  for (size_t i = 0; i < 10; ++i) {
+    out += "[";
+    for (size_t j = 0; j < 10; ++j) {
+      out = out + std::to_string(label_map[i * 10 + j]) + ", ";
+    }
+    out += ".....]\n";
+  }
+  out += "...........\n";
+  if (contain_score_map) {
+    out += "SegmentationResult Score map 10 rows x 10 cols: \n";
+    for (size_t i = 0; i < 10; ++i) {
+      out += "[";
+      for (size_t j = 0; j < 10; ++j) {
+        out = out + std::to_string(score_map[i * 10 + j]) + ", ";
+      }
+      out += ".....]\n";
+    }
+    out += "...........\n";
+  }
+  out += "result shape is: [" + std::to_string(shape[0]) + " " +
+         std::to_string(shape[1]) + "]";
+  return out;
+}
+
+FaceRecognitionResult::FaceRecognitionResult(const FaceRecognitionResult& res) {
+  embedding.assign(res.embedding.begin(), res.embedding.end());
+}
+
+void FaceRecognitionResult::Clear() { std::vector<float>().swap(embedding); }
+
+void FaceRecognitionResult::Reserve(int size) { embedding.reserve(size); }
+
+void FaceRecognitionResult::Resize(int size) { embedding.resize(size); }
+
+std::string FaceRecognitionResult::Str() {
+  std::string out;
+  out = "FaceRecognitionResult: [";
+  size_t numel = embedding.size();
+  if (numel <= 0) {
+    return out + "Empty Result]";
+  }
+  // max, min, mean
+  float min_val = embedding.at(0);
+  float max_val = embedding.at(0);
+  float total_val = embedding.at(0);
+  for (size_t i = 1; i < numel; ++i) {
+    float val = embedding.at(i);
+    total_val += val;
+    if (val < min_val) {
+      min_val = val;
+    }
+    if (val > max_val) {
+      max_val = val;
+    }
+  }
+  float mean_val = total_val / static_cast<float>(numel);
+  out = out + "Dim(" + std::to_string(numel) + "), " + "Min(" +
+        std::to_string(min_val) + "), " + "Max(" + std::to_string(max_val) +
+        "), " + "Mean(" + std::to_string(mean_val) + ")]\n";
+  return out;
+}
+
+MattingResult::MattingResult(const MattingResult& res) {
+  alpha.assign(res.alpha.begin(), res.alpha.end());
+  foreground.assign(res.foreground.begin(), res.foreground.end());
+  shape.assign(res.shape.begin(), res.shape.end());
+  contain_foreground = res.contain_foreground;
+}
+
+void MattingResult::Clear() {
+  std::vector<float>().swap(alpha);
+  std::vector<float>().swap(foreground);
+  std::vector<int64_t>().swap(shape);
+  contain_foreground = false;
+}
+
+void MattingResult::Reserve(int size) {
+  alpha.reserve(size);
+  if (contain_foreground) {
+    FDASSERT((shape.size() == 3),
+             "Please initialize shape (h, w, c) before calling Reserve.");
+    int c = static_cast<int>(shape[2]);  // (h, w, c): channel is index 2
+    foreground.reserve(size * c);
+  }
+}
+
+void MattingResult::Resize(int size) {
+  alpha.resize(size);
+  if (contain_foreground) {
+    FDASSERT((shape.size() == 3),
+             "Please initialize shape (h, w, c) before calling Resize.");
+    int c = static_cast<int>(shape[2]);  // (h, w, c): channel is index 2
+    foreground.resize(size * c);
+  }
+}
+
+std::string MattingResult::Str() {
+  std::string out;
+  out = "MattingResult[";
+  if (contain_foreground) {
+    out +=
"Foreground(true)"; + } else { + out += "Foreground(false)"; + } + out += ", Alpha("; + size_t numel = alpha.size(); + if (numel <= 0) { + return out + "[Empty Result]"; + } + // max, min, mean + float min_val = alpha.at(0); + float max_val = alpha.at(0); + float total_val = alpha.at(0); + for (size_t i = 1; i < numel; ++i) { + float val = alpha.at(i); + total_val += val; + if (val < min_val) { + min_val = val; + } + if (val > max_val) { + max_val = val; + } + } + float mean_val = total_val / static_cast(numel); + // shape + std::string shape_str = "Shape("; + for (size_t i = 0; i < shape.size(); ++i) { + if ((i + 1) != shape.size()) { + shape_str += std::to_string(shape[i]) + ","; + } else { + shape_str += std::to_string(shape[i]) + ")"; + } + } + out = out + "Numel(" + std::to_string(numel) + "), " + shape_str + ", Min(" + + std::to_string(min_val) + "), " + "Max(" + std::to_string(max_val) + + "), " + "Mean(" + std::to_string(mean_val) + "))]\n"; + return out; +} + +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/common/result.h b/csrc/fastdeploy/vision/common/result.h new file mode 100644 index 000000000..f57178cee --- /dev/null +++ b/csrc/fastdeploy/vision/common/result.h @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "opencv2/core/core.hpp" + +namespace fastdeploy { +namespace vision { +enum FASTDEPLOY_DECL ResultType { + UNKNOWN_RESULT, + CLASSIFY, + DETECTION, + SEGMENTATION, + FACE_DETECTION, + FACE_RECOGNITION, + MATTING +}; + +struct FASTDEPLOY_DECL BaseResult { + ResultType type = ResultType::UNKNOWN_RESULT; +}; + +struct FASTDEPLOY_DECL ClassifyResult : public BaseResult { + std::vector label_ids; + std::vector scores; + ResultType type = ResultType::CLASSIFY; + + void Clear(); + std::string Str(); +}; + +struct FASTDEPLOY_DECL DetectionResult : public BaseResult { + // box: xmin, ymin, xmax, ymax + std::vector> boxes; + std::vector scores; + std::vector label_ids; + ResultType type = ResultType::DETECTION; + + DetectionResult() {} + DetectionResult(const DetectionResult& res); + + void Clear(); + + void Reserve(int size); + + void Resize(int size); + + std::string Str(); +}; + +struct FASTDEPLOY_DECL FaceDetectionResult : public BaseResult { + // box: xmin, ymin, xmax, ymax + std::vector> boxes; + // landmark: x, y, landmarks may empty if the + // model don't detect face with landmarks. + // Note, one face might have multiple landmarks, + // such as 5/19/21/68/98/..., etc. + std::vector> landmarks; + std::vector scores; + ResultType type = ResultType::FACE_DETECTION; + // set landmarks_per_face manually in your post processes. 
+  int landmarks_per_face;
+
+  FaceDetectionResult() { landmarks_per_face = 0; }
+  FaceDetectionResult(const FaceDetectionResult& res);
+
+  void Clear();
+
+  void Reserve(int size);
+
+  void Resize(int size);
+
+  std::string Str();
+};
+
+struct FASTDEPLOY_DECL SegmentationResult : public BaseResult {
+  // mask
+  std::vector<uint8_t> label_map;
+  std::vector<float> score_map;
+  std::vector<int64_t> shape;
+  bool contain_score_map = false;
+
+  ResultType type = ResultType::SEGMENTATION;
+
+  void Clear();
+
+  void Reserve(int size);
+
+  void Resize(int size);
+
+  std::string Str();
+};
+
+struct FASTDEPLOY_DECL FaceRecognitionResult : public BaseResult {
+  // face embedding vector with 128/256/512 ... dim
+  std::vector<float> embedding;
+
+  ResultType type = ResultType::FACE_RECOGNITION;
+
+  FaceRecognitionResult() {}
+  FaceRecognitionResult(const FaceRecognitionResult& res);
+
+  void Clear();
+
+  void Reserve(int size);
+
+  void Resize(int size);
+
+  std::string Str();
+};
+
+struct FASTDEPLOY_DECL MattingResult : public BaseResult {
+  // alpha matte and fgr (predicted foreground: HWC/BGR float32)
+  std::vector<float> alpha;       // h x w
+  std::vector<float> foreground;  // h x w x c (c=3 default)
+  // height, width, channel for foreground and alpha
+  // must be (h, w, c) and set up before Reserve and Resize
+  // c is only for foreground if contain_foreground is true.
+  std::vector<int64_t> shape;
+  bool contain_foreground = false;
+
+  ResultType type = ResultType::MATTING;
+
+  MattingResult() {}
+  MattingResult(const MattingResult& res);
+
+  void Clear();
+
+  void Reserve(int size);
+
+  void Resize(int size);
+
+  std::string Str();
+};
+
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/nanodet_plus.cc b/csrc/fastdeploy/vision/detection/contrib/nanodet_plus.cc
new file mode 100644
index 000000000..267012c11
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/nanodet_plus.cc
@@ -0,0 +1,355 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
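Note: a brief usage sketch for the DetectionResult struct defined above, filling it the same way the postprocess functions later in this patch do (hypothetical snippet; assumes only the headers this patch adds):

    #include <array>
    #include <iostream>
    #include "fastdeploy/vision/common/result.h"

    int main() {
      fastdeploy::vision::DetectionResult res;
      res.Reserve(2);  // pre-allocate to avoid reallocation while decoding
      res.boxes.emplace_back(std::array<float, 4>{10.f, 20.f, 110.f, 220.f});
      res.scores.push_back(0.87f);
      res.label_ids.push_back(3);
      std::cout << res.Str();  // human-readable dump of boxes/scores/labels
      res.Clear();             // swap-with-empty releases the storage
      return 0;
    }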
+
+#include "fastdeploy/vision/detection/contrib/nanodet_plus.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace detection {
+
+struct NanoDetPlusCenterPoint {
+  int grid0;
+  int grid1;
+  int stride;
+};
+
+void GenerateNanoDetPlusCenterPoints(
+    const std::vector<int>& size, const std::vector<int>& downsample_strides,
+    std::vector<NanoDetPlusCenterPoint>* center_points) {
+  // size: tuple of input (width, height), e.g. (320, 320)
+  // downsample_strides: downsample strides in NanoDet and
+  // NanoDet-Plus, e.g. (8, 16, 32, 64)
+  const int width = size[0];
+  const int height = size[1];
+  for (const auto& ds : downsample_strides) {
+    int num_grid_w = width / ds;
+    int num_grid_h = height / ds;
+    for (int g1 = 0; g1 < num_grid_h; ++g1) {
+      for (int g0 = 0; g0 < num_grid_w; ++g0) {
+        (*center_points).emplace_back(NanoDetPlusCenterPoint{g0, g1, ds});
+      }
+    }
+  }
+}
+
+void WrapAndResize(Mat* mat, std::vector<int> size, std::vector<float> color,
+                   bool keep_ratio = false) {
+  // Reference: nanodet/data/transform/warp.py#L139
+  // size: tuple of input (width, height)
+  // The default value of `keep_ratio` is `false` in
+  // `config/nanodet-plus-m-1.5x_320.yml` for both
+  // train and val processes, so we let this option
+  // default to `false`, matching the official
+  // implementation in NanoDet and NanoDet-Plus.
+  // Note, this function applies a normal resize to the
+  // input Mat if keep_ratio is false, and behaves the
+  // same as yolov5's letterbox if keep_ratio is true.
+
+  // with keep_ratio = false (default)
+  if (!keep_ratio) {
+    int resize_h = size[1];
+    int resize_w = size[0];
+    if (resize_h != mat->Height() || resize_w != mat->Width()) {
+      Resize::Run(mat, resize_w, resize_h);
+    }
+    return;
+  }
+  // with keep_ratio = true, same as yolov5's letterbox
+  float r = std::min(size[1] * 1.0f / static_cast<float>(mat->Height()),
+                     size[0] * 1.0f / static_cast<float>(mat->Width()));
+
+  int resize_h = int(round(static_cast<float>(mat->Height()) * r));
+  int resize_w = int(round(static_cast<float>(mat->Width()) * r));
+
+  if (resize_h != mat->Height() || resize_w != mat->Width()) {
+    Resize::Run(mat, resize_w, resize_h);
+  }
+
+  int pad_w = size[0] - resize_w;
+  int pad_h = size[1] - resize_h;
+  if (pad_h > 0 || pad_w > 0) {
+    float half_h = pad_h * 1.0 / 2;
+    int top = int(round(half_h - 0.1));
+    int bottom = int(round(half_h + 0.1));
+    float half_w = pad_w * 1.0 / 2;
+    int left = int(round(half_w - 0.1));
+    int right = int(round(half_w + 0.1));
+    Pad::Run(mat, top, bottom, left, right, color);
+  }
+}
+
+void GFLRegression(const float* logits, size_t reg_num, float* offset) {
+  // Hint: reg_num = reg_max + 1
+  FDASSERT(((nullptr != logits) && (reg_num != 0)),
+           "NanoDetPlus: logits is nullptr or reg_num is 0 in GFLRegression.");
+  // softmax
+  float total_exp = 0.f;
+  std::vector<float> softmax_probs(reg_num);
+  for (size_t i = 0; i < reg_num; ++i) {
+    softmax_probs[i] = std::exp(logits[i]);
+    total_exp += softmax_probs[i];
+  }
+  for (size_t i = 0; i < reg_num; ++i) {
+    softmax_probs[i] = softmax_probs[i] / total_exp;
+  }
+  // gfl regression -> offset
+  for (size_t i = 0; i < reg_num; ++i) {
+    (*offset) += static_cast<float>(i) * softmax_probs[i];
+  }
+}
+
+NanoDetPlus::NanoDetPlus(const std::string& model_file,
+                         const std::string& params_file,
+                         const RuntimeOption& custom_option,
+                         const Frontend& model_format) {
+  if (model_format == Frontend::ONNX) {
+    valid_cpu_backends = {Backend::ORT};  // specify available CPU backends
+    valid_gpu_backends = {Backend::ORT,
+                          Backend::TRT};  // specify available GPU backends
+  } else {
+    valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
+    valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
+  }
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool NanoDetPlus::Initialize() {
+  // parameters for preprocess
+  size = {320, 320};
+  padding_value = {0.0f, 0.0f, 0.0f};
+  keep_ratio = false;
+  downsample_strides = {8, 16, 32, 64};
+  max_wh = 4096.0f;
+  reg_max = 7;
+
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // Check if the input shape is dynamic after the Runtime is initialized.
+  is_dynamic_input_ = false;
+  auto shape = InputInfoOfRuntime(0).shape;
+  for (int i = 0; i < shape.size(); ++i) {
+    // if height or width is dynamic
+    if (i >= 2 && shape[i] <= 0) {
+      is_dynamic_input_ = true;
+      break;
+    }
+  }
+  return true;
+}
+
+bool NanoDetPlus::Preprocess(
+    Mat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
+  // NanoDet-Plus preprocess steps
+  // 1. WrapAndResize
+  // 2. Normalize or Convert (keep BGR order)
+  // 3. HWC->CHW
+  WrapAndResize(mat, size, padding_value, keep_ratio);
+  // Record output shape of preprocessed image
+  (*im_info)["output_shape"] = {static_cast<float>(mat->Height()),
+                                static_cast<float>(mat->Width())};
+
+  // Compute `result = mat * alpha + beta` directly by channel
+  // Reference: /config/nanodet-plus-m-1.5x_320.yml#L89
+  // from mean: [103.53, 116.28, 123.675], std: [57.375, 57.12, 58.395]
+  // x' = (x - mean) / std to x' = x * alpha + beta,
+  // e.g. alpha[0] = 0.017429f = 1.0f / 57.375f
+  // e.g. beta[0] = -103.53f * 0.0174291f
+  std::vector<float> alpha = {0.017429f, 0.017507f, 0.017125f};
+  std::vector<float> beta = {-103.53f * 0.0174291f, -116.28f * 0.0175070f,
+                             -123.675f * 0.0171247f};  // BGR order
+  Convert::Run(mat, alpha, beta);
+
+  HWC2CHW::Run(mat);
+  Cast::Run(mat, "float");
+  mat->ShareWithTensor(output);
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
+  return true;
+}
+
+bool NanoDetPlus::Postprocess(
+    FDTensor& infer_result, DetectionResult* result,
+    const std::map<std::string, std::array<float, 2>>& im_info,
+    float conf_threshold, float nms_iou_threshold) {
+  FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
+  result->Clear();
+  result->Reserve(infer_result.shape[1]);
+  if (infer_result.dtype != FDDataType::FP32) {
+    FDERROR << "Only support post process with float32 data."
+            << std::endl;
+    return false;
+  }
+  // generate center points with downsample strides
+  std::vector<NanoDetPlusCenterPoint> center_points;
+  GenerateNanoDetPlusCenterPoints(size, downsample_strides, &center_points);
+
+  // infer_result shape might look like (1,2125,112)
+  const int num_cls_reg = infer_result.shape[2];            // e.g. 112
+  const int num_classes = num_cls_reg - (reg_max + 1) * 4;  // e.g. 80
+  float* data = static_cast<float*>(infer_result.Data());
+  for (size_t i = 0; i < infer_result.shape[1]; ++i) {
+    float* scores = data + i * num_cls_reg;
+    float* max_class_score = std::max_element(scores, scores + num_classes);
+    float confidence = (*max_class_score);
+    // filter boxes by conf_threshold
+    if (confidence <= conf_threshold) {
+      continue;
+    }
+    int32_t label_id = std::distance(scores, max_class_score);
+    // fetch i-th center point
+    float grid0 = static_cast<float>(center_points.at(i).grid0);
+    float grid1 = static_cast<float>(center_points.at(i).grid1);
+    float downsample_stride = static_cast<float>(center_points.at(i).stride);
+    // apply gfl regression to get offsets (l,t,r,b)
+    float* logits = data + i * num_cls_reg + num_classes;  // 4*(reg_max+1) logits
+    std::vector<float> offsets(4);
+    for (size_t j = 0; j < 4; ++j) {
+      GFLRegression(logits + j * (reg_max + 1), reg_max + 1, &offsets[j]);
+    }
+    // convert from offsets to [x1, y1, x2, y2]
+    float l = offsets[0];  // left
+    float t = offsets[1];  // top
+    float r = offsets[2];  // right
+    float b = offsets[3];  // bottom
+
+    float x1 = (grid0 - l) * downsample_stride;  // cx - l -> x1
+    float y1 = (grid1 - t) * downsample_stride;  // cy - t -> y1
+    float x2 = (grid0 + r) * downsample_stride;  // cx + r -> x2
+    float y2 = (grid1 + b) * downsample_stride;  // cy + b -> y2
+
+    result->boxes.emplace_back(
+        std::array<float, 4>{x1 + label_id * max_wh, y1 + label_id * max_wh,
+                             x2 + label_id * max_wh, y2 + label_id * max_wh});
+    // label_id * max_wh for multi classes NMS
+    result->label_ids.push_back(label_id);
+    result->scores.push_back(confidence);
+  }
+  utils::NMS(result, nms_iou_threshold);
+
+  // scale the boxes to the origin image shape
+  auto iter_out = im_info.find("output_shape");
+  auto iter_ipt = im_info.find("input_shape");
+  FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(),
+           "Cannot find input_shape or output_shape from im_info.");
+  float out_h = iter_out->second[0];
+  float out_w = iter_out->second[1];
+  float ipt_h = iter_ipt->second[0];
+  float ipt_w = iter_ipt->second[1];
+  // without keep_ratio
+  if (!keep_ratio) {
+    // x' = (x / out_w) * ipt_w = x / (out_w / ipt_w)
+    // y' = (y / out_h) * ipt_h = y / (out_h / ipt_h)
+    float r_w = out_w / ipt_w;
+    float r_h = out_h / ipt_h;
+    for (size_t i = 0; i < result->boxes.size(); ++i) {
+      int32_t label_id = (result->label_ids)[i];
+      // clip box
+      result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id;
+      result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id;
+      result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id;
+      result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id;
+      result->boxes[i][0] = std::max(result->boxes[i][0] / r_w, 0.0f);
+      result->boxes[i][1] = std::max(result->boxes[i][1] / r_h, 0.0f);
+      result->boxes[i][2] = std::max(result->boxes[i][2] / r_w, 0.0f);
+      result->boxes[i][3] = std::max(result->boxes[i][3] / r_h, 0.0f);
+      result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f);
+      result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f);
+      result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f);
+      result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f);
+    }
+    return true;
+  }
+  // with keep_ratio
+  float r = std::min(out_h / ipt_h, out_w / ipt_w);
+  float pad_h = (out_h - ipt_h * r) / 2;
+  float pad_w = (out_w - ipt_w * r) / 2;
+  for (size_t i = 0; i < result->boxes.size(); ++i) {
+    int32_t label_id = (result->label_ids)[i];
+    // clip box
+    result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id;
+    result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id;
+    result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id;
+    result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id;
+    result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / r, 0.0f);
+    result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / r, 0.0f);
+    result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / r, 0.0f);
+    result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / r, 0.0f);
+    result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f);
+    result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f);
+    result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f);
+    result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f);
+  }
+  return true;
+}
+
+bool NanoDetPlus::Predict(cv::Mat* im, DetectionResult* result,
+                          float conf_threshold, float nms_iou_threshold) {
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_START(0)
+#endif
+
+  Mat mat(*im);
+  std::vector<FDTensor> input_tensors(1);
+
+  std::map<std::string, std::array<float, 2>> im_info;
+
+  // Record the shape of image and the shape of preprocessed image
+  im_info["input_shape"] = {static_cast<float>(mat.Height()),
+                            static_cast<float>(mat.Width())};
+  im_info["output_shape"] = {static_cast<float>(mat.Height()),
+                             static_cast<float>(mat.Width())};
+
+  if (!Preprocess(&mat, &input_tensors[0], &im_info)) {
+    FDERROR << "Failed to preprocess input image." << std::endl;
+    return false;
+  }
+
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(0, "Preprocess")
+  TIMERECORD_START(1)
+#endif
+
+  input_tensors[0].name = InputInfoOfRuntime(0).name;
+  std::vector<FDTensor> output_tensors;
+  if (!Infer(input_tensors, &output_tensors)) {
+    FDERROR << "Failed to run inference." << std::endl;
+    return false;
+  }
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(1, "Inference")
+  TIMERECORD_START(2)
+#endif
+
+  if (!Postprocess(output_tensors[0], result, im_info, conf_threshold,
+                   nms_iou_threshold)) {
+    FDERROR << "Failed to post process." << std::endl;
+    return false;
+  }
+
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(2, "Postprocess")
+#endif
+  return true;
+}
+
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
\ No newline at end of file
diff --git a/csrc/fastdeploy/vision/detection/contrib/nanodet_plus.h b/csrc/fastdeploy/vision/detection/contrib/nanodet_plus.h
new file mode 100644
index 000000000..a407b8715
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/nanodet_plus.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
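Note: GFLRegression in nanodet_plus.cc above decodes a Generalized Focal Loss distribution: offset = sum_i i * softmax(logits)_i over reg_num = reg_max + 1 bins, i.e. the expected bin index. A condensed sketch of the same computation (hypothetical helper; like the patch, it uses an unshifted softmax):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    float DecodeGFL(const float* logits, size_t reg_num) {
      std::vector<float> probs(reg_num);
      float total = 0.f;
      for (size_t i = 0; i < reg_num; ++i) {
        probs[i] = std::exp(logits[i]);
        total += probs[i];
      }
      float offset = 0.f;  // expected bin index
      for (size_t i = 0; i < reg_num; ++i) {
        offset += static_cast<float>(i) * probs[i] / total;
      }
      return offset;  // later scaled by the downsample stride
    }
    // e.g. logits = {0, 0, 10, 0, 0, 0, 0, 0}: softmax is nearly one-hot at
    // bin 2, so the decoded offset is ~2.0.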
+
+#pragma once
+
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace detection {
+
+class FASTDEPLOY_DECL NanoDetPlus : public FastDeployModel {
+ public:
+  // When model_format is ONNX, params_file is not needed.
+  // When model_format is Paddle, both model_file and params_file are required.
+  NanoDetPlus(const std::string& model_file,
+              const std::string& params_file = "",
+              const RuntimeOption& custom_option = RuntimeOption(),
+              const Frontend& model_format = Frontend::ONNX);
+
+  // Name of the model
+  std::string ModelName() const { return "nanodet"; }
+
+  // Prediction interface, i.e. the interface called by users
+  // im: input data; for CV tasks this is currently a cv::Mat
+  // result: output structure of the model prediction
+  // conf_threshold: postprocessing parameter
+  // nms_iou_threshold: postprocessing parameter
+  virtual bool Predict(cv::Mat* im, DetectionResult* result,
+                       float conf_threshold = 0.35f,
+                       float nms_iou_threshold = 0.5f);
+
+  // The following are parameters used during prediction, mostly for
+  // pre/postprocessing. After creating the model, users may modify them
+  // according to the model's requirements and their own needs.
+  // tuple of input size (width, height), e.g. (320, 320)
+  std::vector<int> size;
+  // padding value, size should be the same as Channels
+  std::vector<float> padding_value;
+  // keep aspect ratio or not when performing resize operation.
+  // This option is set as `false` by default in NanoDet-Plus.
+  bool keep_ratio;
+  // downsample strides for NanoDet-Plus to generate anchors, will
+  // take (8, 16, 32, 64) as default values.
+  std::vector<int> downsample_strides;
+  // for offsetting the boxes by classes when using NMS, default 4096.
+  float max_wh;
+  // reg_max for GFL regression, default 7
+  int reg_max;
+
+ private:
+  // Initialization, including backend setup and any other operations
+  // required for inference
+  bool Initialize();
+
+  // Input image preprocessing
+  // Mat is the data structure defined by FastDeploy
+  // FDTensor is the preprocessed tensor passed to the backend for inference
+  // im_info holds data saved during preprocessing, needed by the postprocess
+  bool Preprocess(Mat* mat, FDTensor* output,
+                  std::map<std::string, std::array<float, 2>>* im_info);
+
+  // Postprocess the backend inference result and return it to users
+  // infer_result: output tensor from the backend
+  // result: model prediction result
+  // im_info: information recorded during preprocessing, used to restore boxes
+  // conf_threshold: confidence threshold used to filter boxes
+  // nms_iou_threshold: IoU threshold used by NMS
+  bool Postprocess(FDTensor& infer_result, DetectionResult* result,
+                   const std::map<std::string, std::array<float, 2>>& im_info,
+                   float conf_threshold, float nms_iou_threshold);
+
+  // Check whether the input shape is dynamic. Direct use is not recommended,
+  // as the logic may differ across models.
+  bool IsDynamicInput() const { return is_dynamic_input_; }
+
+  // whether to inference with dynamic shape (e.g. ONNX exported with dynamic
+  // shape or not.)
+  // RangiLyu/nanodet official 'export_onnx.py' script will export static ONNX
+  // by default.
+  // This value is checked automatically by FastDeploy after the internal
+  // Runtime is initialized.
+  bool is_dynamic_input_;
+};
+
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/nanodet_plus_pybind.cc b/csrc/fastdeploy/vision/detection/contrib/nanodet_plus_pybind.cc
new file mode 100644
index 000000000..b415c0b3b
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/nanodet_plus_pybind.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindNanoDetPlus(pybind11::module& m) {
+  pybind11::class_<vision::detection::NanoDetPlus, FastDeployModel>(
+      m, "NanoDetPlus")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::detection::NanoDetPlus& self, pybind11::array& data,
+              float conf_threshold, float nms_iou_threshold) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res, conf_threshold, nms_iou_threshold);
+             return res;
+           })
+      .def_readwrite("size", &vision::detection::NanoDetPlus::size)
+      .def_readwrite("padding_value",
+                     &vision::detection::NanoDetPlus::padding_value)
+      .def_readwrite("keep_ratio", &vision::detection::NanoDetPlus::keep_ratio)
+      .def_readwrite("downsample_strides",
+                     &vision::detection::NanoDetPlus::downsample_strides)
+      .def_readwrite("max_wh", &vision::detection::NanoDetPlus::max_wh)
+      .def_readwrite("reg_max", &vision::detection::NanoDetPlus::reg_max);
+}
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/scaledyolov4.cc b/csrc/fastdeploy/vision/detection/contrib/scaledyolov4.cc
new file mode 100644
index 000000000..dff2118f3
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/scaledyolov4.cc
@@ -0,0 +1,255 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
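Note: the alpha/beta constants hard-coded in NanoDetPlus::Preprocess above come from folding x' = (x - mean) / std into a single x' = x * alpha + beta pass, with alpha = 1 / std and beta = -mean / std. A small verification snippet (hypothetical, not part of this patch):

    #include <cstdio>

    int main() {
      const float mean[3] = {103.53f, 116.28f, 123.675f};  // BGR order
      const float stdv[3] = {57.375f, 57.12f, 58.395f};
      for (int c = 0; c < 3; ++c) {
        std::printf("alpha[%d] = %.6f, beta[%d] = %.6f\n",
                    c, 1.0f / stdv[c], c, -mean[c] / stdv[c]);
      }
      // prints alpha ~ {0.017429, 0.017507, 0.017125}, matching the constants
      // passed to Convert::Run in the preprocess.
      return 0;
    }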
+
+#include "fastdeploy/vision/detection/contrib/scaledyolov4.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+void ScaledYOLOv4::LetterBox(Mat* mat, const std::vector<int>& size,
+                             const std::vector<float>& color, bool _auto,
+                             bool scale_fill, bool scale_up, int stride) {
+  float scale =
+      std::min(size[1] * 1.0 / mat->Height(), size[0] * 1.0 / mat->Width());
+  if (!scale_up) {
+    scale = std::min(scale, 1.0f);
+  }
+
+  int resize_h = int(round(mat->Height() * scale));
+  int resize_w = int(round(mat->Width() * scale));
+
+  int pad_w = size[0] - resize_w;
+  int pad_h = size[1] - resize_h;
+  if (_auto) {
+    pad_h = pad_h % stride;
+    pad_w = pad_w % stride;
+  } else if (scale_fill) {
+    pad_h = 0;
+    pad_w = 0;
+    resize_h = size[1];
+    resize_w = size[0];
+  }
+  if (resize_h != mat->Height() || resize_w != mat->Width()) {
+    Resize::Run(mat, resize_w, resize_h);
+  }
+  if (pad_h > 0 || pad_w > 0) {
+    float half_h = pad_h * 1.0 / 2;
+    int top = int(round(half_h - 0.1));
+    int bottom = int(round(half_h + 0.1));
+    float half_w = pad_w * 1.0 / 2;
+    int left = int(round(half_w - 0.1));
+    int right = int(round(half_w + 0.1));
+    Pad::Run(mat, top, bottom, left, right, color);
+  }
+}
+
+ScaledYOLOv4::ScaledYOLOv4(const std::string& model_file,
+                           const std::string& params_file,
+                           const RuntimeOption& custom_option,
+                           const Frontend& model_format) {
+  if (model_format == Frontend::ONNX) {
+    valid_cpu_backends = {Backend::ORT};  // specify available CPU backends
+    valid_gpu_backends = {Backend::ORT,
+                          Backend::TRT};  // specify available GPU backends
+  } else {
+    valid_cpu_backends = {Backend::PDINFER};
+    valid_gpu_backends = {Backend::PDINFER};
+  }
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool ScaledYOLOv4::Initialize() {
+  // parameters for preprocess
+  size = {640, 640};
+  padding_value = {114.0, 114.0, 114.0};
+  is_mini_pad = false;
+  is_no_pad = false;
+  is_scale_up = false;
+  stride = 32;
+  max_wh = 7680.0;
+
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // Check if the input shape is dynamic after the Runtime is initialized.
+  // Note that we need to force is_mini_pad to 'false' to keep a static
+  // shape after padding (LetterBox) when is_dynamic_input_ is 'false'.
+  is_dynamic_input_ = false;
+  auto shape = InputInfoOfRuntime(0).shape;
+  for (int i = 0; i < shape.size(); ++i) {
+    // if height or width is dynamic
+    if (i >= 2 && shape[i] <= 0) {
+      is_dynamic_input_ = true;
+      break;
+    }
+  }
+  if (!is_dynamic_input_) {
+    is_mini_pad = false;
+  }
+  return true;
+}
+
+bool ScaledYOLOv4::Preprocess(
+    Mat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
+  // process after image load
+  float ratio = std::min(size[1] * 1.0f / static_cast<float>(mat->Height()),
+                         size[0] * 1.0f / static_cast<float>(mat->Width()));
+  if (ratio != 1.0) {
+    int interp = cv::INTER_AREA;
+    if (ratio > 1.0) {
+      interp = cv::INTER_LINEAR;
+    }
+    int resize_h = int(mat->Height() * ratio);
+    int resize_w = int(mat->Width() * ratio);
+    Resize::Run(mat, resize_w, resize_h, -1, -1, interp);
+  }
+  // ScaledYOLOv4's preprocess steps
+  // 1. letterbox
+  // 2. BGR->RGB
+  // 3. HWC->CHW
+  ScaledYOLOv4::LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad,
+                          is_scale_up, stride);
+  BGR2RGB::Run(mat);
+  // Normalize::Run(mat, std::vector<float>(mat->Channels(), 0.0),
+  //                std::vector<float>(mat->Channels(), 1.0));
+  // Compute `result = mat * alpha + beta` directly by channel
+  std::vector<float> alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f};
+  std::vector<float> beta = {0.0f, 0.0f, 0.0f};
+  Convert::Run(mat, alpha, beta);
+
+  // Record output shape of preprocessed image
+  (*im_info)["output_shape"] = {static_cast<float>(mat->Height()),
+                                static_cast<float>(mat->Width())};
+
+  HWC2CHW::Run(mat);
+  Cast::Run(mat, "float");
+  mat->ShareWithTensor(output);
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
+  return true;
+}
+
+bool ScaledYOLOv4::Postprocess(
+    FDTensor& infer_result, DetectionResult* result,
+    const std::map<std::string, std::array<float, 2>>& im_info,
+    float conf_threshold, float nms_iou_threshold) {
+  FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
+  result->Clear();
+  result->Reserve(infer_result.shape[1]);
+  if (infer_result.dtype != FDDataType::FP32) {
+    FDERROR << "Only support post process with float32 data." << std::endl;
+    return false;
+  }
+  float* data = static_cast<float*>(infer_result.Data());
+  for (size_t i = 0; i < infer_result.shape[1]; ++i) {
+    int s = i * infer_result.shape[2];
+    float confidence = data[s + 4];
+    float* max_class_score =
+        std::max_element(data + s + 5, data + s + infer_result.shape[2]);
+    confidence *= (*max_class_score);
+    // filter boxes by conf_threshold
+    if (confidence <= conf_threshold) {
+      continue;
+    }
+    int32_t label_id = std::distance(data + s + 5, max_class_score);
+    // convert from [x, y, w, h] to [x1, y1, x2, y2]
+    result->boxes.emplace_back(std::array<float, 4>{
+        data[s] - data[s + 2] / 2.0f + label_id * max_wh,
+        data[s + 1] - data[s + 3] / 2.0f + label_id * max_wh,
+        data[s + 0] + data[s + 2] / 2.0f + label_id * max_wh,
+        data[s + 1] + data[s + 3] / 2.0f + label_id * max_wh});
+    result->label_ids.push_back(label_id);
+    result->scores.push_back(confidence);
+  }
+  utils::NMS(result, nms_iou_threshold);
+
+  // scale the boxes to the origin image shape
+  auto iter_out = im_info.find("output_shape");
+  auto iter_ipt = im_info.find("input_shape");
+  FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(),
+           "Cannot find input_shape or output_shape from im_info.");
+  float out_h = iter_out->second[0];
+  float out_w = iter_out->second[1];
+  float ipt_h = iter_ipt->second[0];
+  float ipt_w = iter_ipt->second[1];
+  float scale = std::min(out_h / ipt_h, out_w / ipt_w);
+  float pad_h = (out_h - ipt_h * scale) / 2.0f;
+  float pad_w = (out_w - ipt_w * scale) / 2.0f;
+  if (is_mini_pad) {
+    // matches the _auto = true branch in LetterBox
+    pad_h = static_cast<float>(static_cast<int>(pad_h) % stride);
+    pad_w = static_cast<float>(static_cast<int>(pad_w) % stride);
+  }
+  for (size_t i = 0; i < result->boxes.size(); ++i) {
+    int32_t label_id = (result->label_ids)[i];
+    // clip box
+    result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id;
+    result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id;
+    result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id;
+    result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id;
+    result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / scale, 0.0f);
+    result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f);
+    result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f);
+    result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f);
+    result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f);
+    result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f);
+    result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f);
+    result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f);
+  }
+  return true;
+}
+
+bool ScaledYOLOv4::Predict(cv::Mat* im, DetectionResult* result,
+                           float conf_threshold, float nms_iou_threshold) {
+  Mat mat(*im);
+  std::vector<FDTensor> input_tensors(1);
+
+  std::map<std::string, std::array<float, 2>> im_info;
+
+  // Record the shape of image and the shape of preprocessed image
+  im_info["input_shape"] = {static_cast<float>(mat.Height()),
+                            static_cast<float>(mat.Width())};
+  im_info["output_shape"] = {static_cast<float>(mat.Height()),
+                             static_cast<float>(mat.Width())};
+
+  if (!Preprocess(&mat, &input_tensors[0], &im_info)) {
+    FDERROR << "Failed to preprocess input image." << std::endl;
+    return false;
+  }
+
+  input_tensors[0].name = InputInfoOfRuntime(0).name;
+  std::vector<FDTensor> output_tensors;
+  if (!Infer(input_tensors, &output_tensors)) {
+    FDERROR << "Failed to run inference." << std::endl;
+    return false;
+  }
+  if (!Postprocess(output_tensors[0], result, im_info, conf_threshold,
+                   nms_iou_threshold)) {
+    FDERROR << "Failed to post process." << std::endl;
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/scaledyolov4.h b/csrc/fastdeploy/vision/detection/contrib/scaledyolov4.h
new file mode 100644
index 000000000..bb7ff0a28
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/scaledyolov4.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
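Note: the LetterBox helper in ScaledYOLOv4 above (YOLOR below uses an identical copy) derives one scale from the limiting side and splits the leftover padding across both sides; the round(half - 0.1) / round(half + 0.1) pair sends any odd pixel to the bottom or right. A standalone sketch of the geometry (hypothetical helper):

    #include <algorithm>
    #include <cmath>

    void LetterboxGeometry(int in_w, int in_h, int out_w, int out_h,
                           int* resize_w, int* resize_h, int* top, int* bottom) {
      float scale = std::min(out_h * 1.0f / in_h, out_w * 1.0f / in_w);
      *resize_w = static_cast<int>(std::round(in_w * scale));
      *resize_h = static_cast<int>(std::round(in_h * scale));
      float half_h = (out_h - *resize_h) / 2.0f;
      *top = static_cast<int>(std::round(half_h - 0.1f));
      *bottom = static_cast<int>(std::round(half_h + 0.1f));
    }
    // e.g. 1280x710 -> 640x640: scale = 0.5, resize = 640x355, pad_h = 285,
    // top = round(142.4) = 142, bottom = round(142.6) = 143.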
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+class FASTDEPLOY_DECL ScaledYOLOv4 : public FastDeployModel {
+ public:
+  // When model_format is ONNX, params_file is not needed.
+  // When model_format is Paddle, both model_file and params_file are required.
+  ScaledYOLOv4(const std::string& model_file,
+               const std::string& params_file = "",
+               const RuntimeOption& custom_option = RuntimeOption(),
+               const Frontend& model_format = Frontend::ONNX);
+
+  // Name of the model
+  virtual std::string ModelName() const { return "ScaledYOLOv4"; }
+
+  // Prediction interface, i.e. the interface called by users
+  // im: input data; for CV tasks this is currently a cv::Mat
+  // result: output structure of the model prediction
+  // conf_threshold: postprocessing parameter
+  // nms_iou_threshold: postprocessing parameter
+  virtual bool Predict(cv::Mat* im, DetectionResult* result,
+                       float conf_threshold = 0.25,
+                       float nms_iou_threshold = 0.5);
+
+  // The following are parameters used during prediction, mostly for
+  // pre/postprocessing. After creating the model, users may modify them
+  // according to the model's requirements and their own needs.
+  // tuple of (width, height)
+  std::vector<int> size;
+  // padding value, size should be the same as Channels
+  std::vector<float> padding_value;
+  // only pad to the minimum rectangle whose height and width are multiples
+  // of stride
+  bool is_mini_pad;
+  // when is_mini_pad = false and is_no_pad = true, the image will be resized
+  // to the set size
+  bool is_no_pad;
+  // if is_scale_up is false, the input image can only be zoomed out; the
+  // maximum resize scale cannot exceed 1.0
+  bool is_scale_up;
+  // padding stride, for is_mini_pad
+  int stride;
+  // for offsetting the boxes by classes when using NMS
+  float max_wh;
+
+ private:
+  // Initialization, including backend setup and any other operations
+  // required for inference
+  bool Initialize();
+
+  // Input image preprocessing
+  // Mat is the data structure defined by FastDeploy
+  // FDTensor is the preprocessed tensor passed to the backend for inference
+  // im_info holds data saved during preprocessing, needed by the postprocess
+  bool Preprocess(Mat* mat, FDTensor* output,
+                  std::map<std::string, std::array<float, 2>>* im_info);
+
+  // Postprocess the backend inference result and return it to users
+  // infer_result: output tensor from the backend
+  // result: model prediction result
+  // im_info: information recorded during preprocessing, used to restore boxes
+  // conf_threshold: confidence threshold used to filter boxes
+  // nms_iou_threshold: IoU threshold used by NMS
+  bool Postprocess(FDTensor& infer_result, DetectionResult* result,
+                   const std::map<std::string, std::array<float, 2>>& im_info,
+                   float conf_threshold, float nms_iou_threshold);
+
+  // Apply the LetterBox transform to the image
+  // mat: the original input image
+  // size: the input image size of the model
+  void LetterBox(Mat* mat, const std::vector<int>& size,
+                 const std::vector<float>& color, bool _auto,
+                 bool scale_fill = false, bool scale_up = true,
+                 int stride = 32);
+
+  // whether to inference with dynamic shape (e.g. ONNX exported with dynamic
+  // shape or not.)
+  // When is_dynamic_input_ is 'false', is_mini_pad is forced to 'false'. This
+  // value is checked automatically by FastDeploy after the internal Runtime
+  // is initialized.
+  bool is_dynamic_input_;
+};
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/scaledyolov4_pybind.cc b/csrc/fastdeploy/vision/detection/contrib/scaledyolov4_pybind.cc
new file mode 100644
index 000000000..3e8e43b9e
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/scaledyolov4_pybind.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindScaledYOLOv4(pybind11::module& m) {
+  pybind11::class_<vision::detection::ScaledYOLOv4, FastDeployModel>(
+      m, "ScaledYOLOv4")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::detection::ScaledYOLOv4& self, pybind11::array& data,
+              float conf_threshold, float nms_iou_threshold) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res, conf_threshold, nms_iou_threshold);
+             return res;
+           })
+      .def_readwrite("size", &vision::detection::ScaledYOLOv4::size)
+      .def_readwrite("padding_value",
+                     &vision::detection::ScaledYOLOv4::padding_value)
+      .def_readwrite("is_mini_pad",
+                     &vision::detection::ScaledYOLOv4::is_mini_pad)
+      .def_readwrite("is_no_pad", &vision::detection::ScaledYOLOv4::is_no_pad)
+      .def_readwrite("is_scale_up",
+                     &vision::detection::ScaledYOLOv4::is_scale_up)
+      .def_readwrite("stride", &vision::detection::ScaledYOLOv4::stride)
+      .def_readwrite("max_wh", &vision::detection::ScaledYOLOv4::max_wh);
+}
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/yolor.cc b/csrc/fastdeploy/vision/detection/contrib/yolor.cc
new file mode 100644
index 000000000..5e6fa2fdd
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/yolor.cc
@@ -0,0 +1,253 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
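Note: the label_id * max_wh term added to each box before utils::NMS in the postprocess functions is a class-offset trick: shifting every class into its own coordinate range makes one class-agnostic NMS pass behave like per-class NMS; the offset is subtracted again right after. A sketch (hypothetical helper):

    #include <array>
    #include <cstdint>

    std::array<float, 4> OffsetBoxForNMS(const std::array<float, 4>& xyxy,
                                         int32_t label_id, float max_wh) {
      float d = label_id * max_wh;
      return {xyxy[0] + d, xyxy[1] + d, xyxy[2] + d, xyxy[3] + d};
    }
    // Two identical boxes with labels 0 and 1 end up max_wh (= 7680 here)
    // pixels apart, so their IoU is 0 and neither suppresses the other.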
+
+#include "fastdeploy/vision/detection/contrib/yolor.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+void YOLOR::LetterBox(Mat* mat, const std::vector<int>& size,
+                      const std::vector<float>& color, bool _auto,
+                      bool scale_fill, bool scale_up, int stride) {
+  float scale =
+      std::min(size[1] * 1.0 / mat->Height(), size[0] * 1.0 / mat->Width());
+  if (!scale_up) {
+    scale = std::min(scale, 1.0f);
+  }
+
+  int resize_h = int(round(mat->Height() * scale));
+  int resize_w = int(round(mat->Width() * scale));
+
+  int pad_w = size[0] - resize_w;
+  int pad_h = size[1] - resize_h;
+  if (_auto) {
+    pad_h = pad_h % stride;
+    pad_w = pad_w % stride;
+  } else if (scale_fill) {
+    pad_h = 0;
+    pad_w = 0;
+    resize_h = size[1];
+    resize_w = size[0];
+  }
+  if (resize_h != mat->Height() || resize_w != mat->Width()) {
+    Resize::Run(mat, resize_w, resize_h);
+  }
+  if (pad_h > 0 || pad_w > 0) {
+    float half_h = pad_h * 1.0 / 2;
+    int top = int(round(half_h - 0.1));
+    int bottom = int(round(half_h + 0.1));
+    float half_w = pad_w * 1.0 / 2;
+    int left = int(round(half_w - 0.1));
+    int right = int(round(half_w + 0.1));
+    Pad::Run(mat, top, bottom, left, right, color);
+  }
+}
+
+YOLOR::YOLOR(const std::string& model_file, const std::string& params_file,
+             const RuntimeOption& custom_option, const Frontend& model_format) {
+  if (model_format == Frontend::ONNX) {
+    valid_cpu_backends = {Backend::ORT};  // specify available CPU backends
+    valid_gpu_backends = {Backend::ORT,
+                          Backend::TRT};  // specify available GPU backends
+  } else {
+    valid_cpu_backends = {Backend::PDINFER};
+    valid_gpu_backends = {Backend::PDINFER};
+  }
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool YOLOR::Initialize() {
+  // parameters for preprocess
+  size = {640, 640};
+  padding_value = {114.0, 114.0, 114.0};
+  is_mini_pad = false;
+  is_no_pad = false;
+  is_scale_up = false;
+  stride = 32;
+  max_wh = 7680.0;
+
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // Check if the input shape is dynamic after the Runtime is initialized.
+  // Note that we need to force is_mini_pad to 'false' to keep a static
+  // shape after padding (LetterBox) when is_dynamic_input_ is 'false'.
+  is_dynamic_input_ = false;
+  auto shape = InputInfoOfRuntime(0).shape;
+  for (int i = 0; i < shape.size(); ++i) {
+    // if height or width is dynamic
+    if (i >= 2 && shape[i] <= 0) {
+      is_dynamic_input_ = true;
+      break;
+    }
+  }
+  if (!is_dynamic_input_) {
+    is_mini_pad = false;
+  }
+  return true;
+}
+
+bool YOLOR::Preprocess(Mat* mat, FDTensor* output,
+                       std::map<std::string, std::array<float, 2>>* im_info) {
+  // process after image load
+  float ratio = std::min(size[1] * 1.0f / static_cast<float>(mat->Height()),
+                         size[0] * 1.0f / static_cast<float>(mat->Width()));
+  if (ratio != 1.0) {
+    int interp = cv::INTER_AREA;
+    if (ratio > 1.0) {
+      interp = cv::INTER_LINEAR;
+    }
+    int resize_h = int(mat->Height() * ratio);
+    int resize_w = int(mat->Width() * ratio);
+    Resize::Run(mat, resize_w, resize_h, -1, -1, interp);
+  }
+  // yolor's preprocess steps
+  // 1. letterbox
+  // 2. BGR->RGB
+  // 3. HWC->CHW
+  YOLOR::LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad,
+                   is_scale_up, stride);
+  BGR2RGB::Run(mat);
+  // Normalize::Run(mat, std::vector<float>(mat->Channels(), 0.0),
+  //                std::vector<float>(mat->Channels(), 1.0));
+  // Compute `result = mat * alpha + beta` directly by channel
+  std::vector<float> alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f};
+  std::vector<float> beta = {0.0f, 0.0f, 0.0f};
+  Convert::Run(mat, alpha, beta);
+
+  // Record output shape of preprocessed image
+  (*im_info)["output_shape"] = {static_cast<float>(mat->Height()),
+                                static_cast<float>(mat->Width())};
+
+  HWC2CHW::Run(mat);
+  Cast::Run(mat, "float");
+  mat->ShareWithTensor(output);
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
+  return true;
+}
+
+bool YOLOR::Postprocess(
+    FDTensor& infer_result, DetectionResult* result,
+    const std::map<std::string, std::array<float, 2>>& im_info,
+    float conf_threshold, float nms_iou_threshold) {
+  FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
+  result->Clear();
+  result->Reserve(infer_result.shape[1]);
+  if (infer_result.dtype != FDDataType::FP32) {
+    FDERROR << "Only support post process with float32 data." << std::endl;
+    return false;
+  }
+  float* data = static_cast<float*>(infer_result.Data());
+  for (size_t i = 0; i < infer_result.shape[1]; ++i) {
+    int s = i * infer_result.shape[2];
+    float confidence = data[s + 4];
+    float* max_class_score =
+        std::max_element(data + s + 5, data + s + infer_result.shape[2]);
+    confidence *= (*max_class_score);
+    // filter boxes by conf_threshold
+    if (confidence <= conf_threshold) {
+      continue;
+    }
+    int32_t label_id = std::distance(data + s + 5, max_class_score);
+    // convert from [x, y, w, h] to [x1, y1, x2, y2]
+    result->boxes.emplace_back(std::array<float, 4>{
+        data[s] - data[s + 2] / 2.0f + label_id * max_wh,
+        data[s + 1] - data[s + 3] / 2.0f + label_id * max_wh,
+        data[s + 0] + data[s + 2] / 2.0f + label_id * max_wh,
+        data[s + 1] + data[s + 3] / 2.0f + label_id * max_wh});
+    result->label_ids.push_back(label_id);
+    result->scores.push_back(confidence);
+  }
+  utils::NMS(result, nms_iou_threshold);
+
+  // scale the boxes to the origin image shape
+  auto iter_out = im_info.find("output_shape");
+  auto iter_ipt = im_info.find("input_shape");
+  FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(),
+           "Cannot find input_shape or output_shape from im_info.");
+  float out_h = iter_out->second[0];
+  float out_w = iter_out->second[1];
+  float ipt_h = iter_ipt->second[0];
+  float ipt_w = iter_ipt->second[1];
+  float scale = std::min(out_h / ipt_h, out_w / ipt_w);
+  float pad_h = (out_h - ipt_h * scale) / 2.0f;
+  float pad_w = (out_w - ipt_w * scale) / 2.0f;
+  if (is_mini_pad) {
+    // matches the _auto = true branch in LetterBox
+    pad_h = static_cast<float>(static_cast<int>(pad_h) % stride);
+    pad_w = static_cast<float>(static_cast<int>(pad_w) % stride);
+  }
+  for (size_t i = 0; i < result->boxes.size(); ++i) {
+    int32_t label_id = (result->label_ids)[i];
+    // clip box
+    result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id;
+    result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id;
+    result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id;
+    result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id;
+    result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / scale, 0.0f);
+    result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f);
+    result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f);
+    result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f);
+    result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f);
std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + return true; +} + +bool YOLOR::Predict(cv::Mat* im, DetectionResult* result, float conf_threshold, + float nms_iou_threshold) { + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if (!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." << std::endl; + return false; + } + + if (!Postprocess(output_tensors[0], result, im_info, conf_threshold, + nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + + return true; +} + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolor.h b/csrc/fastdeploy/vision/detection/contrib/yolor.h new file mode 100644 index 000000000..2de7a456f --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolor.h @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
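+
+// A minimal usage sketch for the class declared below (illustrative only:
+// the model path, image path and the Initialized() call pattern are
+// assumptions of this note, not part of the patch itself):
+//
+//   auto model = fastdeploy::vision::detection::YOLOR("yolor.onnx");
+//   if (model.Initialized()) {
+//     cv::Mat im = cv::imread("test.jpg");
+//     fastdeploy::vision::DetectionResult res;
+//     model.Predict(&im, &res, 0.25f, 0.5f);  // conf / NMS IoU thresholds
+//   }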
+ +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +class FASTDEPLOY_DECL YOLOR : public FastDeployModel { + public: + // 当model_format为ONNX时,无需指定params_file + // 当model_format为Paddle时,则需同时指定model_file & params_file + YOLOR(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX); + + // 定义模型的名称 + virtual std::string ModelName() const { return "YOLOR"; } + + // 模型预测接口,即用户调用的接口 + // im 为用户的输入数据,目前对于CV均定义为cv::Mat + // result 为模型预测的输出结构体 + // conf_threshold 为后处理的参数 + // nms_iou_threshold 为后处理的参数 + virtual bool Predict(cv::Mat* im, DetectionResult* result, + float conf_threshold = 0.25, + float nms_iou_threshold = 0.5); + + // 以下为模型在预测时的一些参数,基本是前后处理所需 + // 用户在创建模型后,可根据模型的要求,以及自己的需求 + // 对参数进行修改 + // tuple of (width, height) + std::vector size; + // padding value, size should be same with Channels + std::vector padding_value; + // only pad to the minimum rectange which height and width is times of stride + bool is_mini_pad; + // while is_mini_pad = false and is_no_pad = true, will resize the image to + // the set size + bool is_no_pad; + // if is_scale_up is false, the input image only can be zoom out, the maximum + // resize scale cannot exceed 1.0 + bool is_scale_up; + // padding stride, for is_mini_pad + int stride; + // for offseting the boxes by classes when using NMS + float max_wh; + + private: + // 初始化函数,包括初始化后端,以及其它模型推理需要涉及的操作 + bool Initialize(); + + // 输入图像预处理操作 + // Mat为FastDeploy定义的数据结构 + // FDTensor为预处理后的Tensor数据,传给后端进行推理 + // im_info为预处理过程保存的数据,在后处理中需要用到 + bool Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info); + + // 后端推理结果后处理,输出给用户 + // infer_result 为后端推理后的输出Tensor + // result 为模型预测的结果 + // im_info 为预处理记录的信息,后处理用于还原box + // conf_threshold 后处理时过滤box的置信度阈值 + // nms_iou_threshold 后处理时NMS设定的iou阈值 + bool Postprocess(FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold); + + // 对图片进行LetterBox处理 + // mat 为读取到的原图 + // size 为输入模型的图像尺寸 + void LetterBox(Mat* mat, const std::vector& size, + const std::vector& color, bool _auto, + bool scale_fill = false, bool scale_up = true, + int stride = 32); + + // whether to inference with dynamic shape (e.g ONNX export with dynamic shape + // or not.) + // while is_dynamic_shape if 'false', is_mini_pad will force 'false'. This + // value will + // auto check by fastdeploy after the internal Runtime already initialized. + bool is_dynamic_input_; +}; +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolor_pybind.cc b/csrc/fastdeploy/vision/detection/contrib/yolor_pybind.cc new file mode 100644 index 000000000..0e0a21ca5 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolor_pybind.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindYOLOR(pybind11::module& m) { + pybind11::class_(m, "YOLOR") + .def(pybind11::init()) + .def("predict", + [](vision::detection::YOLOR& self, pybind11::array& data, + float conf_threshold, float nms_iou_threshold) { + auto mat = PyArrayToCvMat(data); + vision::DetectionResult res; + self.Predict(&mat, &res, conf_threshold, nms_iou_threshold); + return res; + }) + .def_readwrite("size", &vision::detection::YOLOR::size) + .def_readwrite("padding_value", &vision::detection::YOLOR::padding_value) + .def_readwrite("is_mini_pad", &vision::detection::YOLOR::is_mini_pad) + .def_readwrite("is_no_pad", &vision::detection::YOLOR::is_no_pad) + .def_readwrite("is_scale_up", &vision::detection::YOLOR::is_scale_up) + .def_readwrite("stride", &vision::detection::YOLOR::stride) + .def_readwrite("max_wh", &vision::detection::YOLOR::max_wh); +} +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov5.cc b/csrc/fastdeploy/vision/detection/contrib/yolov5.cc new file mode 100644 index 000000000..306051e80 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolov5.cc @@ -0,0 +1,295 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
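+
+// Worked example of the LetterBox arithmetic implemented below (numbers
+// assume the defaults set in Initialize(): size = {640, 640}, stride = 32):
+// a 1280x720 input gives scale = min(640/720, 640/1280) = 0.5, so the image
+// is resized to 640x360 with pad_w = 0 and pad_h = 640 - 360 = 280. With
+// _auto (is_mini_pad) the pad shrinks to 280 % 32 = 24; otherwise the full
+// 280 rows are split into top = bottom = 140 rows of padding_value.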
+ +#include "fastdeploy/vision/detection/contrib/yolov5.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +void YOLOv5::LetterBox(Mat* mat, std::vector size, + std::vector color, bool _auto, bool scale_fill, + bool scale_up, int stride) { + float scale = + std::min(size[1] * 1.0 / mat->Height(), size[0] * 1.0 / mat->Width()); + if (!scale_up) { + scale = std::min(scale, 1.0f); + } + + int resize_h = int(round(mat->Height() * scale)); + int resize_w = int(round(mat->Width() * scale)); + + int pad_w = size[0] - resize_w; + int pad_h = size[1] - resize_h; + if (_auto) { + pad_h = pad_h % stride; + pad_w = pad_w % stride; + } else if (scale_fill) { + pad_h = 0; + pad_w = 0; + resize_h = size[1]; + resize_w = size[0]; + } + Resize::Run(mat, resize_w, resize_h); + if (pad_h > 0 || pad_w > 0) { + float half_h = pad_h * 1.0 / 2; + int top = int(round(half_h - 0.1)); + int bottom = int(round(half_h + 0.1)); + float half_w = pad_w * 1.0 / 2; + int left = int(round(half_w - 0.1)); + int right = int(round(half_w + 0.1)); + Pad::Run(mat, top, bottom, left, right, color); + } +} + +YOLOv5::YOLOv5(const std::string& model_file, const std::string& params_file, + const RuntimeOption& custom_option, + const Frontend& model_format) { + if (model_format == Frontend::ONNX) { + valid_cpu_backends = {Backend::ORT}; // 指定可用的CPU后端 + valid_gpu_backends = {Backend::ORT, Backend::TRT}; // 指定可用的GPU后端 + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool YOLOv5::Initialize() { + // parameters for preprocess + size = {640, 640}; + padding_value = {114.0, 114.0, 114.0}; + is_mini_pad = false; + is_no_pad = false; + is_scale_up = false; + stride = 32; + max_wh = 7680.0; + multi_label = true; + + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + // Check if the input shape is dynamic after Runtime already initialized, + // Note that, We need to force is_mini_pad 'false' to keep static + // shape after padding (LetterBox) when the is_dynamic_shape is 'false'. + is_dynamic_input_ = false; + auto shape = InputInfoOfRuntime(0).shape; + for (int i = 0; i < shape.size(); ++i) { + // if height or width is dynamic + if (i >= 2 && shape[i] <= 0) { + is_dynamic_input_ = true; + break; + } + } + if (!is_dynamic_input_) { + is_mini_pad = false; + } + return true; +} + +bool YOLOv5::Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info) { + // process after image load + double ratio = (size[0] * 1.0) / std::max(static_cast(mat->Height()), + static_cast(mat->Width())); + if (ratio != 1.0) { + int interp = cv::INTER_AREA; + if (ratio > 1.0) { + interp = cv::INTER_LINEAR; + } + int resize_h = int(mat->Height() * ratio); + int resize_w = int(mat->Width() * ratio); + Resize::Run(mat, resize_w, resize_h, -1, -1, interp); + } + // yolov5's preprocess steps + // 1. letterbox + // 2. BGR->RGB + // 3. 
HWC->CHW + LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad, is_scale_up, + stride); + BGR2RGB::Run(mat); + // Normalize::Run(mat, std::vector(mat->Channels(), 0.0), + // std::vector(mat->Channels(), 1.0)); + // Compute `result = mat * alpha + beta` directly by channel + std::vector alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; + std::vector beta = {0.0f, 0.0f, 0.0f}; + Convert::Run(mat, alpha, beta); + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); // reshape to n, h, w, c + return true; +} + +bool YOLOv5::Postprocess( + FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold, bool multi_label) { + FDASSERT(infer_result.shape[0] == 1, "Only support batch =1 now."); + result->Clear(); + if (multi_label) { + result->Reserve(infer_result.shape[1] * (infer_result.shape[2] - 5)); + } else { + result->Reserve(infer_result.shape[1]); + } + if (infer_result.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." << std::endl; + return false; + } + float* data = static_cast(infer_result.Data()); + for (size_t i = 0; i < infer_result.shape[1]; ++i) { + int s = i * infer_result.shape[2]; + float confidence = data[s + 4]; + if (multi_label) { + for (size_t j = 5; j < infer_result.shape[2]; ++j) { + confidence = data[s + 4]; + float* class_score = data + s + j; + confidence *= (*class_score); + // filter boxes by conf_threshold + if (confidence <= conf_threshold) { + continue; + } + int32_t label_id = std::distance(data + s + 5, class_score); + + // convert from [x, y, w, h] to [x1, y1, x2, y2] + result->boxes.emplace_back(std::array{ + data[s] - data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] - data[s + 3] / 2.0f + label_id * max_wh, + data[s + 0] + data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] + data[s + 3] / 2.0f + label_id * max_wh}); + result->label_ids.push_back(label_id); + result->scores.push_back(confidence); + } + } else { + float* max_class_score = + std::max_element(data + s + 5, data + s + infer_result.shape[2]); + confidence *= (*max_class_score); + // filter boxes by conf_threshold + if (confidence <= conf_threshold) { + continue; + } + int32_t label_id = std::distance(data + s + 5, max_class_score); + // convert from [x, y, w, h] to [x1, y1, x2, y2] + result->boxes.emplace_back(std::array{ + data[s] - data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] - data[s + 3] / 2.0f + label_id * max_wh, + data[s + 0] + data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] + data[s + 3] / 2.0f + label_id * max_wh}); + result->label_ids.push_back(label_id); + result->scores.push_back(confidence); + } + } + + if (result->boxes.size() == 0) { + return true; + } + + utils::NMS(result, nms_iou_threshold); + + // scale the boxes to the origin image shape + auto iter_out = im_info.find("output_shape"); + auto iter_ipt = im_info.find("input_shape"); + FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(), + "Cannot find input_shape or output_shape from im_info."); + float out_h = iter_out->second[0]; + float out_w = iter_out->second[1]; + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + float scale = std::min(out_h / ipt_h, out_w / ipt_w); + for (size_t i = 0; i < result->boxes.size(); ++i) { + float pad_h = (out_h - 
ipt_h * scale) / 2; + float pad_w = (out_w - ipt_w * scale) / 2; + int32_t label_id = (result->label_ids)[i]; + // clip box + result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id; + result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id; + result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id; + result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id; + result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / scale, 0.0f); + result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f); + result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f); + result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h); + } + return true; +} + +bool YOLOv5::Predict(cv::Mat* im, DetectionResult* result, float conf_threshold, + float nms_iou_threshold) { +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_START(0) +#endif + + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if (!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(0, "Preprocess") + TIMERECORD_START(1) +#endif + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." << std::endl; + return false; + } +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(1, "Inference") + TIMERECORD_START(2) +#endif + + if (!Postprocess(output_tensors[0], result, im_info, conf_threshold, + nms_iou_threshold, multi_label)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(2, "Postprocess") +#endif + return true; +} + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov5.h b/csrc/fastdeploy/vision/detection/contrib/yolov5.h new file mode 100644 index 000000000..68c910d23 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolov5.h @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
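+
+// A usage sketch for the class declared below (illustrative; the file names
+// are hypothetical). The public preprocess/postprocess fields can be
+// adjusted after construction and before Predict():
+//
+//   fastdeploy::vision::detection::YOLOv5 model("yolov5s.onnx");
+//   model.size = {1280, 1280};   // (width, height) fed to the network
+//   model.multi_label = false;   // keep only the best class per box
+//   cv::Mat im = cv::imread("test.jpg");
+//   fastdeploy::vision::DetectionResult res;
+//   model.Predict(&im, &res);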
+ +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +class FASTDEPLOY_DECL YOLOv5 : public FastDeployModel { + public: + // 当model_format为ONNX时,无需指定params_file + // 当model_format为Paddle时,则需同时指定model_file & params_file + YOLOv5(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX); + + // 定义模型的名称 + std::string ModelName() const { return "yolov5"; } + + // 模型预测接口,即用户调用的接口 + // im 为用户的输入数据,目前对于CV均定义为cv::Mat + // result 为模型预测的输出结构体 + // conf_threshold 为后处理的参数 + // nms_iou_threshold 为后处理的参数 + virtual bool Predict(cv::Mat* im, DetectionResult* result, + float conf_threshold = 0.25, + float nms_iou_threshold = 0.5); + + // 以下为模型在预测时的一些参数,基本是前后处理所需 + // 用户在创建模型后,可根据模型的要求,以及自己的需求 + // 对参数进行修改 + // tuple of (width, height) + std::vector size; + // padding value, size should be same with Channels + std::vector padding_value; + // only pad to the minimum rectange which height and width is times of stride + bool is_mini_pad; + // while is_mini_pad = false and is_no_pad = true, will resize the image to + // the set size + bool is_no_pad; + // if is_scale_up is false, the input image only can be zoom out, the maximum + // resize scale cannot exceed 1.0 + bool is_scale_up; + // padding stride, for is_mini_pad + int stride; + // for offseting the boxes by classes when using NMS + float max_wh; + // for different strategies to get boxes when postprocessing + bool multi_label; + + private: + // 初始化函数,包括初始化后端,以及其它模型推理需要涉及的操作 + bool Initialize(); + + // 输入图像预处理操作 + // Mat为FastDeploy定义的数据结构 + // FDTensor为预处理后的Tensor数据,传给后端进行推理 + // im_info为预处理过程保存的数据,在后处理中需要用到 + bool Preprocess(Mat* mat, FDTensor* outputs, + std::map>* im_info); + + // 后端推理结果后处理,输出给用户 + // infer_result 为后端推理后的输出Tensor + // result 为模型预测的结果 + // im_info 为预处理记录的信息,后处理用于还原box + // conf_threshold 后处理时过滤box的置信度阈值 + // nms_iou_threshold 后处理时NMS设定的iou阈值 + // multi_label 后处理时box选取是否采用多标签方式 + bool Postprocess(FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold, + bool multi_label); + + // 查看输入是否为动态维度的 不建议直接使用 不同模型的逻辑可能不一致 + bool IsDynamicInput() const { return is_dynamic_input_; } + + void LetterBox(Mat* mat, std::vector size, std::vector color, + bool _auto, bool scale_fill = false, bool scale_up = true, + int stride = 32); + + // whether to inference with dynamic shape (e.g ONNX export with dynamic shape + // or not.) + // YOLOv5 official 'export_onnx.py' script will export dynamic ONNX by + // default. + // while is_dynamic_shape if 'false', is_mini_pad will force 'false'. This + // value will + // auto check by fastdeploy after the internal Runtime already initialized. + bool is_dynamic_input_; +}; + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov5_pybind.cc b/csrc/fastdeploy/vision/detection/contrib/yolov5_pybind.cc new file mode 100644 index 000000000..65ba538b8 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolov5_pybind.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindYOLOv5(pybind11::module& m) { + pybind11::class_(m, "YOLOv5") + .def(pybind11::init()) + .def("predict", + [](vision::detection::YOLOv5& self, pybind11::array& data, + float conf_threshold, float nms_iou_threshold) { + auto mat = PyArrayToCvMat(data); + vision::DetectionResult res; + self.Predict(&mat, &res, conf_threshold, nms_iou_threshold); + return res; + }) + .def_readwrite("size", &vision::detection::YOLOv5::size) + .def_readwrite("padding_value", &vision::detection::YOLOv5::padding_value) + .def_readwrite("is_mini_pad", &vision::detection::YOLOv5::is_mini_pad) + .def_readwrite("is_no_pad", &vision::detection::YOLOv5::is_no_pad) + .def_readwrite("is_scale_up", &vision::detection::YOLOv5::is_scale_up) + .def_readwrite("stride", &vision::detection::YOLOv5::stride) + .def_readwrite("max_wh", &vision::detection::YOLOv5::max_wh) + .def_readwrite("multi_label", &vision::detection::YOLOv5::multi_label); +} +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov5lite.cc b/csrc/fastdeploy/vision/detection/contrib/yolov5lite.cc new file mode 100644 index 000000000..26ca15f1e --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolov5lite.cc @@ -0,0 +1,399 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
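+
+// Numeric sketch of the anchor/decode scheme implemented below (assuming the
+// defaults set in Initialize(): size = {640, 640}, downsample_strides =
+// {8, 16, 32}, 3 anchors per stride). GenerateAnchors then yields
+// 3 * (80*80 + 40*40 + 20*20) = 25200 anchors, matching an undecoded output
+// of shape (1, 25200, 85). For one anchor with stride 8, grid0 = 3,
+// anchor_w = 10 and raw offsets dx = 0.6, dw = 0.7, the decode gives:
+//   x = (0.6 * 2 - 0.5 + 3) * 8 = 29.6
+//   w = (0.7 * 2)^2 * 10        = 19.6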
+ +#include "fastdeploy/vision/detection/contrib/yolov5lite.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +void YOLOv5Lite::LetterBox(Mat* mat, const std::vector& size, + const std::vector& color, bool _auto, + bool scale_fill, bool scale_up, int stride) { + float scale = + std::min(size[1] * 1.0 / mat->Height(), size[0] * 1.0 / mat->Width()); + if (!scale_up) { + scale = std::min(scale, 1.0f); + } + + int resize_h = int(round(mat->Height() * scale)); + int resize_w = int(round(mat->Width() * scale)); + + int pad_w = size[0] - resize_w; + int pad_h = size[1] - resize_h; + if (_auto) { + pad_h = pad_h % stride; + pad_w = pad_w % stride; + } else if (scale_fill) { + pad_h = 0; + pad_w = 0; + resize_h = size[1]; + resize_w = size[0]; + } + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } + if (pad_h > 0 || pad_w > 0) { + float half_h = pad_h * 1.0 / 2; + int top = int(round(half_h - 0.1)); + int bottom = int(round(half_h + 0.1)); + float half_w = pad_w * 1.0 / 2; + int left = int(round(half_w - 0.1)); + int right = int(round(half_w + 0.1)); + Pad::Run(mat, top, bottom, left, right, color); + } +} + +void YOLOv5Lite::GenerateAnchors(const std::vector& size, + const std::vector& downsample_strides, + std::vector* anchors, + int num_anchors) { + // size: tuple of input (width, height) + // downsample_strides: downsample strides in YOLOv5Lite, e.g (8,16,32) + const int width = size[0]; + const int height = size[1]; + for (int i = 0; i < downsample_strides.size(); ++i) { + const int ds = downsample_strides[i]; + int num_grid_w = width / ds; + int num_grid_h = height / ds; + for (int an = 0; an < num_anchors; ++an) { + float anchor_w = anchor_config[i][an * 2]; + float anchor_h = anchor_config[i][an * 2 + 1]; + for (int g1 = 0; g1 < num_grid_h; ++g1) { + for (int g0 = 0; g0 < num_grid_w; ++g0) { + (*anchors).emplace_back(Anchor{g0, g1, ds, anchor_w, anchor_h}); + } + } + } + } +} + +YOLOv5Lite::YOLOv5Lite(const std::string& model_file, + const std::string& params_file, + const RuntimeOption& custom_option, + const Frontend& model_format) { + if (model_format == Frontend::ONNX) { + valid_cpu_backends = {Backend::ORT}; // 指定可用的CPU后端 + valid_gpu_backends = {Backend::ORT, Backend::TRT}; // 指定可用的GPU后端 + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool YOLOv5Lite::Initialize() { + // parameters for preprocess + size = {640, 640}; + padding_value = {114.0, 114.0, 114.0}; + downsample_strides = {8, 16, 32}; + is_mini_pad = false; + is_no_pad = false; + is_scale_up = false; + stride = 32; + max_wh = 7680.0; + is_decode_exported = false; + anchor_config = {{10.0, 13.0, 16.0, 30.0, 33.0, 23.0}, + {30.0, 61.0, 62.0, 45.0, 59.0, 119.0}, + {116.0, 90.0, 156.0, 198.0, 373.0, 326.0}}; + + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + // Check if the input shape is dynamic after Runtime already initialized, + // Note that, We need to force is_mini_pad 'false' to keep static + // shape after padding (LetterBox) when the is_dynamic_shape is 'false'. 
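+  // For example, an ONNX input shape of [-1, 3, -1, -1] (dynamic batch,
+  // height and width) sets is_dynamic_input_ = true below, while a static
+  // [1, 3, 640, 640] leaves it false, which in turn forces is_mini_pad off.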
+ is_dynamic_input_ = false; + auto shape = InputInfoOfRuntime(0).shape; + for (int i = 0; i < shape.size(); ++i) { + // if height or width is dynamic + if (i >= 2 && shape[i] <= 0) { + is_dynamic_input_ = true; + break; + } + } + if (!is_dynamic_input_) { + is_mini_pad = false; + } + return true; +} + +bool YOLOv5Lite::Preprocess( + Mat* mat, FDTensor* output, + std::map>* im_info) { + // process after image load + float ratio = std::min(size[1] * 1.0f / static_cast(mat->Height()), + size[0] * 1.0f / static_cast(mat->Width())); + if (ratio != 1.0) { + int interp = cv::INTER_AREA; + if (ratio > 1.0) { + interp = cv::INTER_LINEAR; + } + int resize_h = int(mat->Height() * ratio); + int resize_w = int(mat->Width() * ratio); + Resize::Run(mat, resize_w, resize_h, -1, -1, interp); + } + // yolov5lite's preprocess steps + // 1. letterbox + // 2. BGR->RGB + // 3. HWC->CHW + YOLOv5Lite::LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad, + is_scale_up, stride); + BGR2RGB::Run(mat); + // Normalize::Run(mat, std::vector(mat->Channels(), 0.0), + // std::vector(mat->Channels(), 1.0)); + // Compute `result = mat * alpha + beta` directly by channel + std::vector alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; + std::vector beta = {0.0f, 0.0f, 0.0f}; + Convert::Run(mat, alpha, beta); + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); // reshape to n, h, w, c + return true; +} + +bool YOLOv5Lite::PostprocessWithDecode( + FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold) { + FDASSERT(infer_result.shape[0] == 1, "Only support batch =1 now."); + result->Clear(); + result->Reserve(infer_result.shape[1]); + if (infer_result.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." 
<< std::endl; + return false; + } + // generate anchors with dowmsample strides + std::vector anchors; + int num_anchors = anchor_config[0].size() / 2; + GenerateAnchors(size, downsample_strides, &anchors, num_anchors); + // infer_result shape might look like (1,n,85=5+80) + float* data = static_cast(infer_result.Data()); + for (size_t i = 0; i < infer_result.shape[1]; ++i) { + int s = i * infer_result.shape[2]; + float confidence = data[s + 4]; + float* max_class_score = + std::max_element(data + s + 5, data + s + infer_result.shape[2]); + confidence *= (*max_class_score); + // filter boxes by conf_threshold + if (confidence <= conf_threshold) { + continue; + } + int32_t label_id = std::distance(data + s + 5, max_class_score); + // fetch i-th anchor + float grid0 = static_cast(anchors.at(i).grid0); + float grid1 = static_cast(anchors.at(i).grid1); + float downsample_stride = static_cast(anchors.at(i).stride); + float anchor_w = static_cast(anchors.at(i).anchor_w); + float anchor_h = static_cast(anchors.at(i).anchor_h); + // convert from offsets to [x, y, w, h] + float dx = data[s]; + float dy = data[s + 1]; + float dw = data[s + 2]; + float dh = data[s + 3]; + + float x = (dx * 2.0f - 0.5f + grid0) * downsample_stride; + float y = (dy * 2.0f - 0.5f + grid1) * downsample_stride; + float w = std::pow(dw * 2.0f, 2.0f) * anchor_w; + float h = std::pow(dh * 2.0f, 2.0f) * anchor_h; + + // convert from [x, y, w, h] to [x1, y1, x2, y2] + result->boxes.emplace_back(std::array{ + x - w / 2.0f + label_id * max_wh, y - h / 2.0f + label_id * max_wh, + x + w / 2.0f + label_id * max_wh, y + h / 2.0f + label_id * max_wh}); + // label_id * max_wh for multi classes NMS + result->label_ids.push_back(label_id); + result->scores.push_back(confidence); + } + utils::NMS(result, nms_iou_threshold); + + // scale the boxes to the origin image shape + auto iter_out = im_info.find("output_shape"); + auto iter_ipt = im_info.find("input_shape"); + FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(), + "Cannot find input_shape or output_shape from im_info."); + float out_h = iter_out->second[0]; + float out_w = iter_out->second[1]; + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + float scale = std::min(out_h / ipt_h, out_w / ipt_w); + float pad_h = (out_h - ipt_h * scale) / 2.0f; + float pad_w = (out_w - ipt_w * scale) / 2.0f; + if (is_mini_pad) { + // 和 LetterBox中_auto=true的处理逻辑对应 + pad_h = static_cast(static_cast(pad_h) % stride); + pad_w = static_cast(static_cast(pad_w) % stride); + } + for (size_t i = 0; i < result->boxes.size(); ++i) { + int32_t label_id = (result->label_ids)[i]; + // clip box + result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id; + result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id; + result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id; + result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id; + result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / scale, 0.0f); + result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f); + result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f); + result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + return true; +} + +bool 
YOLOv5Lite::Postprocess( + FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold) { + FDASSERT(infer_result.shape[0] == 1, "Only support batch =1 now."); + result->Clear(); + result->Reserve(infer_result.shape[1]); + if (infer_result.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." << std::endl; + return false; + } + float* data = static_cast(infer_result.Data()); + for (size_t i = 0; i < infer_result.shape[1]; ++i) { + int s = i * infer_result.shape[2]; + float confidence = data[s + 4]; + float* max_class_score = + std::max_element(data + s + 5, data + s + infer_result.shape[2]); + confidence *= (*max_class_score); + // filter boxes by conf_threshold + if (confidence <= conf_threshold) { + continue; + } + int32_t label_id = std::distance(data + s + 5, max_class_score); + // convert from [x, y, w, h] to [x1, y1, x2, y2] + result->boxes.emplace_back(std::array{ + data[s] - data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] - data[s + 3] / 2.0f + label_id * max_wh, + data[s + 0] + data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] + data[s + 3] / 2.0f + label_id * max_wh}); + result->label_ids.push_back(label_id); + result->scores.push_back(confidence); + } + utils::NMS(result, nms_iou_threshold); + + // scale the boxes to the origin image shape + auto iter_out = im_info.find("output_shape"); + auto iter_ipt = im_info.find("input_shape"); + FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(), + "Cannot find input_shape or output_shape from im_info."); + float out_h = iter_out->second[0]; + float out_w = iter_out->second[1]; + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + float scale = std::min(out_h / ipt_h, out_w / ipt_w); + float pad_h = (out_h - ipt_h * scale) / 2.0f; + float pad_w = (out_w - ipt_w * scale) / 2.0f; + if (is_mini_pad) { + // 和 LetterBox中_auto=true的处理逻辑对应 + pad_h = static_cast(static_cast(pad_h) % stride); + pad_w = static_cast(static_cast(pad_w) % stride); + } + for (size_t i = 0; i < result->boxes.size(); ++i) { + int32_t label_id = (result->label_ids)[i]; + // clip box + result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id; + result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id; + result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id; + result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id; + result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / scale, 0.0f); + result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f); + result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f); + result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + return true; +} + +bool YOLOv5Lite::Predict(cv::Mat* im, DetectionResult* result, + float conf_threshold, float nms_iou_threshold) { +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_START(0) +#endif + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if 
(!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(0, "Preprocess") + TIMERECORD_START(1) +#endif + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." << std::endl; + return false; + } +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(1, "Inference") + TIMERECORD_START(2) +#endif + + if (is_decode_exported) { + if (!Postprocess(output_tensors[0], result, im_info, conf_threshold, + nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + } else { + if (!PostprocessWithDecode(output_tensors[0], result, im_info, + conf_threshold, nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(2, "Postprocess") +#endif + return true; +} + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov5lite.h b/csrc/fastdeploy/vision/detection/contrib/yolov5lite.h new file mode 100644 index 000000000..2add202f4 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolov5lite.h @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
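+
+// A usage sketch for the class declared below (illustrative; paths are
+// hypothetical). Note the is_decode_exported switch: the official
+// YOLOv5-Lite export script produces ONNX files without the decode module,
+// so the default (false) routes inference through PostprocessWithDecode():
+//
+//   fastdeploy::vision::detection::YOLOv5Lite model("v5lite-s.onnx");
+//   model.is_decode_exported = true;  // only if exported with decode
+//   cv::Mat im = cv::imread("test.jpg");
+//   fastdeploy::vision::DetectionResult res;
+//   model.Predict(&im, &res);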
+ +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +class FASTDEPLOY_DECL YOLOv5Lite : public FastDeployModel { + public: + // 当model_format为ONNX时,无需指定params_file + // 当model_format为Paddle时,则需同时指定model_file & params_file + YOLOv5Lite(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX); + + // 定义模型的名称 + virtual std::string ModelName() const { return "YOLOv5-Lite"; } + // 模型预测接口,即用户调用的接口 + // im 为用户的输入数据,目前对于CV均定义为cv::Mat + // result 为模型预测的输出结构体 + // conf_threshold 为后处理的参数 + // nms_iou_threshold 为后处理的参数 + virtual bool Predict(cv::Mat* im, DetectionResult* result, + float conf_threshold = 0.45, + float nms_iou_threshold = 0.25); + + // 以下为模型在预测时的一些参数,基本是前后处理所需 + // 用户在创建模型后,可根据模型的要求,以及自己的需求 + // 对参数进行修改 + // tuple of (width, height) + std::vector size; + // padding value, size should be same with Channels + std::vector padding_value; + // only pad to the minimum rectange which height and width is times of stride + bool is_mini_pad; + // while is_mini_pad = false and is_no_pad = true, will resize the image to + // the set size + bool is_no_pad; + // if is_scale_up is false, the input image only can be zoom out, the maximum + // resize scale cannot exceed 1.0 + bool is_scale_up; + // padding stride, for is_mini_pad + int stride; + // for offseting the boxes by classes when using NMS + float max_wh; + // downsample strides for YOLOv5Lite to generate anchors, will take + // (8,16,32) as default values, might have stride=64. + std::vector downsample_strides; + // anchors parameters, downsample_strides will take + // (8,16,32), each stride has three anchors with width and hight. + std::vector> anchor_config; + // whether the model_file was exported with decode module. The official + // YOLOv5Lite/export.py script will export ONNX file without + // decode module. Please set it 'true' manually if the model file + // was exported with decode module. + // false : ONNX files without decode module. + // true : ONNX file with decode module. + bool is_decode_exported; + + private: + // necessary parameters for GenerateAnchors to generate anchors when ONNX file + // without decode module. + struct Anchor { + int grid0; + int grid1; + int stride; + float anchor_w; + float anchor_h; + }; + + // 初始化函数,包括初始化后端,以及其它模型推理需要涉及的操作 + bool Initialize(); + + // 输入图像预处理操作 + // Mat为FastDeploy定义的数据结构 + // FDTensor为预处理后的Tensor数据,传给后端进行推理 + // im_info为预处理过程保存的数据,在后处理中需要用到 + bool Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info); + + // 后端推理结果后处理,输出给用户 + // infer_result 为后端推理后的输出Tensor + // result 为模型预测的结果 + // im_info 为预处理记录的信息,后处理用于还原box + // conf_threshold 后处理时过滤box的置信度阈值 + // nms_iou_threshold 后处理时NMS设定的iou阈值 + bool Postprocess(FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold); + + // YOLOv5Lite的官方脚本默认导出不带decode模块的模型文件 需要在后处理进行decode + // the official YOLOv5Lite/export.py will export ONNX file without decode + // module. + // this fuction support the postporocess for ONNX file without decode module. + // set the `is_decode_exported = false`, this function will work. 
+ bool PostprocessWithDecode( + FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold); + + // 对图片进行LetterBox处理 + // mat 为读取到的原图 + // size 为输入模型的图像尺寸 + void LetterBox(Mat* mat, const std::vector& size, + const std::vector& color, bool _auto, + bool scale_fill = false, bool scale_up = true, + int stride = 32); + // generate anchors for decodeing when ONNX file without decode module. + void GenerateAnchors(const std::vector& size, + const std::vector& downsample_strides, + std::vector* anchors, const int num_anchors = 3); + + // whether to inference with dynamic shape (e.g ONNX export with dynamic shape + // or not.) + // while is_dynamic_shape if 'false', is_mini_pad will force 'false'. This + // value will + // auto check by fastdeploy after the internal Runtime already initialized. + bool is_dynamic_input_; +}; +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov5lite_pybind.cc b/csrc/fastdeploy/vision/detection/contrib/yolov5lite_pybind.cc new file mode 100644 index 000000000..dd064e3be --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolov5lite_pybind.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindYOLOv5Lite(pybind11::module& m) { + pybind11::class_(m, + "YOLOv5Lite") + .def(pybind11::init()) + .def("predict", + [](vision::detection::YOLOv5Lite& self, pybind11::array& data, + float conf_threshold, float nms_iou_threshold) { + auto mat = PyArrayToCvMat(data); + vision::DetectionResult res; + self.Predict(&mat, &res, conf_threshold, nms_iou_threshold); + return res; + }) + .def_readwrite("size", &vision::detection::YOLOv5Lite::size) + .def_readwrite("padding_value", + &vision::detection::YOLOv5Lite::padding_value) + .def_readwrite("is_mini_pad", &vision::detection::YOLOv5Lite::is_mini_pad) + .def_readwrite("is_no_pad", &vision::detection::YOLOv5Lite::is_no_pad) + .def_readwrite("is_scale_up", &vision::detection::YOLOv5Lite::is_scale_up) + .def_readwrite("stride", &vision::detection::YOLOv5Lite::stride) + .def_readwrite("max_wh", &vision::detection::YOLOv5Lite::max_wh) + .def_readwrite("anchor_config", + &vision::detection::YOLOv5Lite::anchor_config) + .def_readwrite("is_decode_exported", + &vision::detection::YOLOv5Lite::is_decode_exported); +} +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov6.cc b/csrc/fastdeploy/vision/detection/contrib/yolov6.cc new file mode 100644 index 000000000..7c6827433 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolov6.cc @@ -0,0 +1,267 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/detection/contrib/yolov6.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { + +namespace vision { + +namespace detection { + +void YOLOv6::LetterBox(Mat* mat, std::vector size, + std::vector color, bool _auto, bool scale_fill, + bool scale_up, int stride) { + float scale = std::min(size[1] * 1.0f / static_cast(mat->Height()), + size[0] * 1.0f / static_cast(mat->Width())); + if (!scale_up) { + scale = std::min(scale, 1.0f); + } + + int resize_h = int(round(static_cast(mat->Height()) * scale)); + int resize_w = int(round(static_cast(mat->Width()) * scale)); + + int pad_w = size[0] - resize_w; + int pad_h = size[1] - resize_h; + if (_auto) { + pad_h = pad_h % stride; + pad_w = pad_w % stride; + } else if (scale_fill) { + pad_h = 0; + pad_w = 0; + resize_h = size[1]; + resize_w = size[0]; + } + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } + if (pad_h > 0 || pad_w > 0) { + float half_h = pad_h * 1.0 / 2; + int top = int(round(half_h - 0.1)); + int bottom = int(round(half_h + 0.1)); + float half_w = pad_w * 1.0 / 2; + int left = int(round(half_w - 0.1)); + int right = int(round(half_w + 0.1)); + Pad::Run(mat, top, bottom, left, right, color); + } +} + +YOLOv6::YOLOv6(const std::string& model_file, const std::string& params_file, + const RuntimeOption& custom_option, + const Frontend& model_format) { + if (model_format == Frontend::ONNX) { + valid_cpu_backends = {Backend::ORT}; // 指定可用的CPU后端 + valid_gpu_backends = {Backend::ORT, Backend::TRT}; // 指定可用的GPU后端 + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool YOLOv6::Initialize() { + // parameters for preprocess + size = {640, 640}; + padding_value = {114.0, 114.0, 114.0}; + is_mini_pad = false; + is_no_pad = false; + is_scale_up = false; + stride = 32; + max_wh = 4096.0f; + + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + // Check if the input shape is dynamic after Runtime already initialized, + // Note that, We need to force is_mini_pad 'false' to keep static + // shape after padding (LetterBox) when the is_dynamic_shape is 'false'. 
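+  // meituan/YOLOv6's official export script emits a static-shape ONNX by
+  // default, so is_dynamic_input_ will typically stay false here and
+  // is_mini_pad is forced off accordingly.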
+ is_dynamic_input_ = false; + auto shape = InputInfoOfRuntime(0).shape; + for (int i = 0; i < shape.size(); ++i) { + // if height or width is dynamic + if (i >= 2 && shape[i] <= 0) { + is_dynamic_input_ = true; + break; + } + } + if (!is_dynamic_input_) { + is_mini_pad = false; + } + return true; +} + +bool YOLOv6::Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info) { + // process after image load + float ratio = std::min(size[1] * 1.0f / static_cast(mat->Height()), + size[0] * 1.0f / static_cast(mat->Width())); + if (ratio != 1.0) { + int interp = cv::INTER_AREA; + if (ratio > 1.0) { + interp = cv::INTER_LINEAR; + } + int resize_h = int(round(static_cast(mat->Height()) * ratio)); + int resize_w = int(round(static_cast(mat->Width()) * ratio)); + Resize::Run(mat, resize_w, resize_h, -1, -1, interp); + } + // yolov6's preprocess steps + // 1. letterbox + // 2. BGR->RGB + // 3. HWC->CHW + LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad, is_scale_up, + stride); + BGR2RGB::Run(mat); + // Normalize::Run(mat, std::vector(mat->Channels(), 0.0), + // std::vector(mat->Channels(), 1.0)); + // Compute `result = mat * alpha + beta` directly by channel + std::vector alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; + std::vector beta = {0.0f, 0.0f, 0.0f}; + Convert::Run(mat, alpha, beta); + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); // reshape to n, h, w, c + return true; +} + +bool YOLOv6::Postprocess( + FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold) { + FDASSERT(infer_result.shape[0] == 1, "Only support batch =1 now."); + result->Clear(); + result->Reserve(infer_result.shape[1]); + if (infer_result.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." 
<< std::endl; + return false; + } + float* data = static_cast(infer_result.Data()); + for (size_t i = 0; i < infer_result.shape[1]; ++i) { + int s = i * infer_result.shape[2]; + float confidence = data[s + 4]; + float* max_class_score = + std::max_element(data + s + 5, data + s + infer_result.shape[2]); + confidence *= (*max_class_score); + // filter boxes by conf_threshold + if (confidence <= conf_threshold) { + continue; + } + int32_t label_id = std::distance(data + s + 5, max_class_score); + // convert from [x, y, w, h] to [x1, y1, x2, y2] + result->boxes.emplace_back(std::array{ + data[s] - data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] - data[s + 3] / 2.0f + label_id * max_wh, + data[s + 0] + data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] + data[s + 3] / 2.0f + label_id * max_wh}); + result->label_ids.push_back(label_id); + result->scores.push_back(confidence); + } + utils::NMS(result, nms_iou_threshold); + + // scale the boxes to the origin image shape + auto iter_out = im_info.find("output_shape"); + auto iter_ipt = im_info.find("input_shape"); + FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(), + "Cannot find input_shape or output_shape from im_info."); + float out_h = iter_out->second[0]; + float out_w = iter_out->second[1]; + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + float scale = std::min(out_h / ipt_h, out_w / ipt_w); + for (size_t i = 0; i < result->boxes.size(); ++i) { + float pad_h = (out_h - ipt_h * scale) / 2; + float pad_w = (out_w - ipt_w * scale) / 2; + int32_t label_id = (result->label_ids)[i]; + // clip box + result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id; + result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id; + result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id; + result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id; + result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / scale, 0.0f); + result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f); + result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f); + result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + return true; +} + +bool YOLOv6::Predict(cv::Mat* im, DetectionResult* result, float conf_threshold, + float nms_iou_threshold) { +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_START(0) +#endif + + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if (!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(0, "Preprocess") + TIMERECORD_START(1) +#endif + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." 
<< std::endl; + return false; + } +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(1, "Inference") + TIMERECORD_START(2) +#endif + + if (!Postprocess(output_tensors[0], result, im_info, conf_threshold, + nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(2, "Postprocess") +#endif + return true; +} + +} // namespace detection +} // namespace vision +} // namespace fastdeploy \ No newline at end of file diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov6.h b/csrc/fastdeploy/vision/detection/contrib/yolov6.h new file mode 100644 index 000000000..64af6e2eb --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolov6.h @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { + +namespace vision { + +namespace detection { + +class FASTDEPLOY_DECL YOLOv6 : public FastDeployModel { + public: + // 当model_format为ONNX时,无需指定params_file + // 当model_format为Paddle时,则需同时指定model_file & params_file + YOLOv6(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX); + + // 定义模型的名称 + std::string ModelName() const { return "YOLOv6"; } + + // 模型预测接口,即用户调用的接口 + // im 为用户的输入数据,目前对于CV均定义为cv::Mat + // result 为模型预测的输出结构体 + // conf_threshold 为后处理的参数 + // nms_iou_threshold 为后处理的参数 + virtual bool Predict(cv::Mat* im, DetectionResult* result, + float conf_threshold = 0.25, + float nms_iou_threshold = 0.5); + + // 以下为模型在预测时的一些参数,基本是前后处理所需 + // 用户在创建模型后,可根据模型的要求,以及自己的需求 + // 对参数进行修改 + // tuple of (width, height) + std::vector size; + // padding value, size should be same with Channels + std::vector padding_value; + // only pad to the minimum rectange which height and width is times of stride + bool is_mini_pad; + // while is_mini_pad = false and is_no_pad = true, will resize the image to + // the set size + bool is_no_pad; + // if is_scale_up is false, the input image only can be zoom out, the maximum + // resize scale cannot exceed 1.0 + bool is_scale_up; + // padding stride, for is_mini_pad + int stride; + // for offseting the boxes by classes when using NMS, default 4096 in + // meituan/YOLOv6 + float max_wh; + + private: + // 初始化函数,包括初始化后端,以及其它模型推理需要涉及的操作 + bool Initialize(); + + // 输入图像预处理操作 + // Mat为FastDeploy定义的数据结构 + // FDTensor为预处理后的Tensor数据,传给后端进行推理 + // im_info为预处理过程保存的数据,在后处理中需要用到 + bool Preprocess(Mat* mat, FDTensor* outputs, + std::map>* im_info); + + // 后端推理结果后处理,输出给用户 + // infer_result 为后端推理后的输出Tensor + // result 为模型预测的结果 + // im_info 为预处理记录的信息,后处理用于还原box + // conf_threshold 后处理时过滤box的置信度阈值 + // nms_iou_threshold 后处理时NMS设定的iou阈值 + bool Postprocess(FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + 
diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov6.h b/csrc/fastdeploy/vision/detection/contrib/yolov6.h
new file mode 100644
index 000000000..64af6e2eb
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/yolov6.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace detection {
+
+class FASTDEPLOY_DECL YOLOv6 : public FastDeployModel {
+ public:
+  // When model_format is ONNX, params_file is not required.
+  // When model_format is Paddle, both model_file and params_file are required.
+  YOLOv6(const std::string& model_file, const std::string& params_file = "",
+         const RuntimeOption& custom_option = RuntimeOption(),
+         const Frontend& model_format = Frontend::ONNX);
+
+  // Name of the model
+  std::string ModelName() const { return "YOLOv6"; }
+
+  // Prediction interface, i.e. the entry point called by users
+  // im: input image, defined as cv::Mat for all CV models
+  // result: the detection output of the model
+  // conf_threshold: confidence threshold used in postprocessing
+  // nms_iou_threshold: IoU threshold used by NMS in postprocessing
+  virtual bool Predict(cv::Mat* im, DetectionResult* result,
+                       float conf_threshold = 0.25,
+                       float nms_iou_threshold = 0.5);
+
+  // The parameters below are used at prediction time, mostly by the
+  // pre/post-processing steps. After creating the model, users may
+  // adjust them to match the model and their own requirements.
+  // tuple of (width, height)
+  std::vector<int> size;
+  // padding value, size should be the same as the number of channels
+  std::vector<float> padding_value;
+  // only pad to the minimum rectangle whose height and width are
+  // multiples of stride
+  bool is_mini_pad;
+  // when is_mini_pad = false and is_no_pad = true, resize the image
+  // to the target size directly
+  bool is_no_pad;
+  // if is_scale_up is false, the input image can only be scaled down;
+  // the resize scale cannot exceed 1.0
+  bool is_scale_up;
+  // padding stride, used with is_mini_pad
+  int stride;
+  // offset added to boxes per class id when running NMS; defaults to
+  // 4096 in meituan/YOLOv6
+  float max_wh;
+
+ private:
+  // Initialization, including backend setup and any other work the
+  // model needs before inference
+  bool Initialize();
+
+  // Preprocess the input image
+  // Mat is the image structure defined by FastDeploy
+  // FDTensor receives the preprocessed data passed to the backend
+  // im_info stores data recorded during preprocessing that the
+  // postprocessing step needs
+  bool Preprocess(Mat* mat, FDTensor* outputs,
+                  std::map<std::string, std::array<float, 2>>* im_info);
+
+  // Postprocess the backend inference output for users
+  // infer_result: the output tensor produced by the backend
+  // result: the final detection result
+  // im_info: information recorded during preprocessing, used to map
+  //          boxes back to the original image
+  // conf_threshold: confidence threshold for filtering boxes
+  // nms_iou_threshold: IoU threshold used by NMS
+  bool Postprocess(FDTensor& infer_result, DetectionResult* result,
+                   const std::map<std::string, std::array<float, 2>>& im_info,
+                   float conf_threshold, float nms_iou_threshold);
+
+  // Check whether the input is dynamic; direct use is discouraged, as
+  // the logic may differ from model to model
+  bool IsDynamicInput() const { return is_dynamic_input_; }
+
+  void LetterBox(Mat* mat, std::vector<int> size, std::vector<float> color,
+                 bool _auto, bool scale_fill = false, bool scale_up = true,
+                 int stride = 32);
+
+  // Whether the model takes dynamic-shape input (e.g. ONNX exported with
+  // dynamic shape). The official meituan/YOLOv6 'export_onnx.py' script
+  // exports static ONNX by default. When is_dynamic_input_ is false,
+  // is_mini_pad is forced to false. FastDeploy checks this value
+  // automatically after the internal Runtime is initialized.
+  bool is_dynamic_input_;
+};
+
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov6_pybind.cc b/csrc/fastdeploy/vision/detection/contrib/yolov6_pybind.cc
new file mode 100644
index 000000000..a1d0131df
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/yolov6_pybind.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindYOLOv6(pybind11::module& m) {
+  pybind11::class_<vision::detection::YOLOv6, FastDeployModel>(m, "YOLOv6")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::detection::YOLOv6& self, pybind11::array& data,
+              float conf_threshold, float nms_iou_threshold) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res, conf_threshold, nms_iou_threshold);
+             return res;
+           })
+      .def_readwrite("size", &vision::detection::YOLOv6::size)
+      .def_readwrite("padding_value", &vision::detection::YOLOv6::padding_value)
+      .def_readwrite("is_mini_pad", &vision::detection::YOLOv6::is_mini_pad)
+      .def_readwrite("is_no_pad", &vision::detection::YOLOv6::is_no_pad)
+      .def_readwrite("is_scale_up", &vision::detection::YOLOv6::is_scale_up)
+      .def_readwrite("stride", &vision::detection::YOLOv6::stride)
+      .def_readwrite("max_wh", &vision::detection::YOLOv6::max_wh);
+}
+}  // namespace fastdeploy
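The YOLOv6 postprocess above and the YOLOv7/YOLOX postprocesses below all shift each box by label_id * max_wh before running a single class-agnostic NMS, so boxes of different classes can never overlap and are never suppressed against each other. A standalone sketch of the idea; the IoU helper here is a simplified stand-in, not the utils::NMS this patch actually calls.

#include <algorithm>
#include <array>
#include <iostream>

// Plain IoU on [x1, y1, x2, y2] boxes (simplified stand-in).
float IoU(const std::array<float, 4>& a, const std::array<float, 4>& b) {
  float ix1 = std::max(a[0], b[0]), iy1 = std::max(a[1], b[1]);
  float ix2 = std::min(a[2], b[2]), iy2 = std::min(a[3], b[3]);
  float iw = std::max(0.0f, ix2 - ix1), ih = std::max(0.0f, iy2 - iy1);
  float inter = iw * ih;
  float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  return inter / (area_a + area_b - inter);
}

int main() {
  const float max_wh = 7680.0f;  // the offset YOLOv7 uses in this patch
  // Two fully overlapping boxes with different class ids.
  std::array<float, 4> cat = {100, 100, 200, 200};  // label_id = 0
  std::array<float, 4> dog = {100, 100, 200, 200};  // label_id = 1
  std::cout << "IoU before offset: " << IoU(cat, dog) << std::endl;  // 1.0
  for (int k = 0; k < 4; ++k) dog[k] += 1 * max_wh;  // label_id * max_wh
  std::cout << "IoU after offset:  " << IoU(cat, dog) << std::endl;  // 0.0
  // A single class-agnostic NMS pass now keeps both boxes; the offset is
  // subtracted again before the boxes are rescaled to the input image.
  return 0;
}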
diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov7.cc b/csrc/fastdeploy/vision/detection/contrib/yolov7.cc
new file mode 100644
index 000000000..edc1b9048
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/yolov7.cc
@@ -0,0 +1,253 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/detection/contrib/yolov7.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+void YOLOv7::LetterBox(Mat* mat, const std::vector<int>& size,
+                       const std::vector<float>& color, bool _auto,
+                       bool scale_fill, bool scale_up, int stride) {
+  float scale =
+      std::min(size[1] * 1.0f / mat->Height(), size[0] * 1.0f / mat->Width());
+  if (!scale_up) {
+    scale = std::min(scale, 1.0f);
+  }
+
+  int resize_h = int(round(mat->Height() * scale));
+  int resize_w = int(round(mat->Width() * scale));
+
+  int pad_w = size[0] - resize_w;
+  int pad_h = size[1] - resize_h;
+  if (_auto) {
+    pad_h = pad_h % stride;
+    pad_w = pad_w % stride;
+  } else if (scale_fill) {
+    pad_h = 0;
+    pad_w = 0;
+    resize_h = size[1];
+    resize_w = size[0];
+  }
+  if (resize_h != mat->Height() || resize_w != mat->Width()) {
+    Resize::Run(mat, resize_w, resize_h);
+  }
+  if (pad_h > 0 || pad_w > 0) {
+    float half_h = pad_h * 1.0f / 2;
+    int top = int(round(half_h - 0.1));
+    int bottom = int(round(half_h + 0.1));
+    float half_w = pad_w * 1.0f / 2;
+    int left = int(round(half_w - 0.1));
+    int right = int(round(half_w + 0.1));
+    Pad::Run(mat, top, bottom, left, right, color);
+  }
+}
+
+YOLOv7::YOLOv7(const std::string& model_file, const std::string& params_file,
+               const RuntimeOption& custom_option,
+               const Frontend& model_format) {
+  if (model_format == Frontend::ONNX) {
+    valid_cpu_backends = {Backend::ORT};  // supported CPU backends
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};  // supported GPU backends
+  } else {
+    valid_cpu_backends = {Backend::PDINFER};
+    valid_gpu_backends = {Backend::PDINFER};
+  }
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool YOLOv7::Initialize() {
+  // default parameters for preprocessing
+  size = {640, 640};
+  padding_value = {114.0, 114.0, 114.0};
+  is_mini_pad = false;
+  is_no_pad = false;
+  is_scale_up = false;
+  stride = 32;
+  max_wh = 7680.0;
+
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // Check if the input shape is dynamic after the Runtime is initialized.
+  // Note that we need to force is_mini_pad to 'false' to keep a static
+  // shape after padding (LetterBox) when is_dynamic_input_ is 'false'.
+  is_dynamic_input_ = false;
+  auto shape = InputInfoOfRuntime(0).shape;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    // if height or width is dynamic
+    if (i >= 2 && shape[i] <= 0) {
+      is_dynamic_input_ = true;
+      break;
+    }
+  }
+  if (!is_dynamic_input_) {
+    is_mini_pad = false;
+  }
+  return true;
+}
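To make the LetterBox geometry above concrete: for a 1280x720 input resized into a 640x640 canvas, scale = min(640/720, 640/1280) = 0.5, the image becomes 640x360, and 280 rows of padding are split between top and bottom; with _auto (is_mini_pad) the padding shrinks to 280 % 32 = 24 rows. A small sketch of just that arithmetic (image dimensions are illustrative):

#include <algorithm>
#include <cmath>
#include <iostream>

int main() {
  const int in_w = 1280, in_h = 720;   // original image (illustrative)
  const int out_w = 640, out_h = 640;  // target canvas, size = {640, 640}
  const int stride = 32;

  float scale = std::min(out_h * 1.0f / in_h, out_w * 1.0f / in_w);  // 0.5
  int resize_w = static_cast<int>(std::round(in_w * scale));         // 640
  int resize_h = static_cast<int>(std::round(in_h * scale));         // 360

  int pad_w = out_w - resize_w;  // 0
  int pad_h = out_h - resize_h;  // 280
  std::cout << "full pad_h: " << pad_h << " (top/bottom " << pad_h / 2 << ")\n";

  // is_mini_pad (_auto = true): only pad up to the next multiple of stride.
  std::cout << "mini pad_h: " << pad_h % stride << "\n";  // 24
  return 0;
}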
+
+bool YOLOv7::Preprocess(Mat* mat, FDTensor* output,
+                        std::map<std::string, std::array<float, 2>>* im_info) {
+  // process after image loading
+  float ratio = std::min(size[1] * 1.0f / static_cast<float>(mat->Height()),
+                         size[0] * 1.0f / static_cast<float>(mat->Width()));
+  if (ratio != 1.0) {
+    // use INTER_AREA when shrinking, INTER_LINEAR when enlarging
+    int interp = cv::INTER_AREA;
+    if (ratio > 1.0) {
+      interp = cv::INTER_LINEAR;
+    }
+    int resize_h = int(mat->Height() * ratio);
+    int resize_w = int(mat->Width() * ratio);
+    Resize::Run(mat, resize_w, resize_h, -1, -1, interp);
+  }
+  // yolov7's preprocess steps
+  // 1. letterbox
+  // 2. BGR->RGB
+  // 3. HWC->CHW
+  YOLOv7::LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad,
+                    is_scale_up, stride);
+  BGR2RGB::Run(mat);
+  // Instead of Normalize::Run(mat, std::vector<float>(mat->Channels(), 0.0),
+  //                           std::vector<float>(mat->Channels(), 1.0)),
+  // compute `result = mat * alpha + beta` directly by channel
+  std::vector<float> alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f};
+  std::vector<float> beta = {0.0f, 0.0f, 0.0f};
+  Convert::Run(mat, alpha, beta);
+
+  // Record output shape of preprocessed image
+  (*im_info)["output_shape"] = {static_cast<float>(mat->Height()),
+                                static_cast<float>(mat->Width())};
+
+  HWC2CHW::Run(mat);
+  Cast::Run(mat, "float");
+  mat->ShareWithTensor(output);
+  output->shape.insert(output->shape.begin(), 1);  // reshape to [1, c, h, w]
+  return true;
+}
+
+bool YOLOv7::Postprocess(
+    FDTensor& infer_result, DetectionResult* result,
+    const std::map<std::string, std::array<float, 2>>& im_info,
+    float conf_threshold, float nms_iou_threshold) {
+  FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
+  result->Clear();
+  result->Reserve(infer_result.shape[1]);
+  if (infer_result.dtype != FDDataType::FP32) {
+    FDERROR << "Only support post process with float32 data." << std::endl;
+    return false;
+  }
+  float* data = static_cast<float*>(infer_result.Data());
+  for (size_t i = 0; i < infer_result.shape[1]; ++i) {
+    int s = i * infer_result.shape[2];
+    float confidence = data[s + 4];
+    float* max_class_score =
+        std::max_element(data + s + 5, data + s + infer_result.shape[2]);
+    confidence *= (*max_class_score);
+    // filter boxes by conf_threshold
+    if (confidence <= conf_threshold) {
+      continue;
+    }
+    int32_t label_id = std::distance(data + s + 5, max_class_score);
+    // convert from [x, y, w, h] to [x1, y1, x2, y2]
+    result->boxes.emplace_back(std::array<float, 4>{
+        data[s] - data[s + 2] / 2.0f + label_id * max_wh,
+        data[s + 1] - data[s + 3] / 2.0f + label_id * max_wh,
+        data[s + 0] + data[s + 2] / 2.0f + label_id * max_wh,
+        data[s + 1] + data[s + 3] / 2.0f + label_id * max_wh});
+    result->label_ids.push_back(label_id);
+    result->scores.push_back(confidence);
+  }
+  utils::NMS(result, nms_iou_threshold);
+
+  // scale the boxes to the origin image shape
+  auto iter_out = im_info.find("output_shape");
+  auto iter_ipt = im_info.find("input_shape");
+  FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(),
+           "Cannot find input_shape or output_shape from im_info.");
+  float out_h = iter_out->second[0];
+  float out_w = iter_out->second[1];
+  float ipt_h = iter_ipt->second[0];
+  float ipt_w = iter_ipt->second[1];
+  float scale = std::min(out_h / ipt_h, out_w / ipt_w);
+  float pad_h = (out_h - ipt_h * scale) / 2.0f;
+  float pad_w = (out_w - ipt_w * scale) / 2.0f;
+  if (is_mini_pad) {
+    // matches the _auto=true branch in LetterBox
+    pad_h = static_cast<float>(static_cast<int>(pad_h) % stride);
+    pad_w = static_cast<float>(static_cast<int>(pad_w) % stride);
+  }
+  for (size_t i = 0; i < result->boxes.size(); ++i) {
+    int32_t label_id = (result->label_ids)[i];
+    // undo the per-class offset, then clip box to the original image
+    result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id;
+    result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id;
+    result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id;
+    result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id;
+    result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / scale, 0.0f);
+    result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f);
+    result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f);
+    result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f);
+    result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f);
+    result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f);
+    result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f);
+    result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f);
+  }
+  return true;
+}
+
+bool YOLOv7::Predict(cv::Mat* im, DetectionResult* result, float conf_threshold,
+                     float nms_iou_threshold) {
+  Mat mat(*im);
+  std::vector<FDTensor> input_tensors(1);
+
+  std::map<std::string, std::array<float, 2>> im_info;
+
+  // Record the shape of image and the shape of preprocessed image
+  im_info["input_shape"] = {static_cast<float>(mat.Height()),
+                            static_cast<float>(mat.Width())};
+  im_info["output_shape"] = {static_cast<float>(mat.Height()),
+                             static_cast<float>(mat.Width())};
+
+  if (!Preprocess(&mat, &input_tensors[0], &im_info)) {
+    FDERROR << "Failed to preprocess input image." << std::endl;
+    return false;
+  }
+
+  input_tensors[0].name = InputInfoOfRuntime(0).name;
+  std::vector<FDTensor> output_tensors;
+  if (!Infer(input_tensors, &output_tensors)) {
+    FDERROR << "Failed to run inference." << std::endl;
+    return false;
+  }
+
+  if (!Postprocess(output_tensors[0], result, im_info, conf_threshold,
+                   nms_iou_threshold)) {
+    FDERROR << "Failed to post process." << std::endl;
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov7.h b/csrc/fastdeploy/vision/detection/contrib/yolov7.h
new file mode 100644
index 000000000..02b874b2c
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/yolov7.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+class FASTDEPLOY_DECL YOLOv7 : public FastDeployModel {
+ public:
+  YOLOv7(const std::string& model_file, const std::string& params_file = "",
+         const RuntimeOption& custom_option = RuntimeOption(),
+         const Frontend& model_format = Frontend::ONNX);
+
+  // Name of the model
+  virtual std::string ModelName() const { return "yolov7"; }
+
+  // Prediction interface, i.e. the entry point called by users
+  // im: input image, defined as cv::Mat for all CV models
+  // result: the detection output of the model
+  // conf_threshold: confidence threshold used in postprocessing
+  // nms_iou_threshold: IoU threshold used by NMS in postprocessing
+  virtual bool Predict(cv::Mat* im, DetectionResult* result,
+                       float conf_threshold = 0.25,
+                       float nms_iou_threshold = 0.5);
+
+  // The parameters below are used at prediction time, mostly by the
+  // pre/post-processing steps. After creating the model, users may
+  // adjust them to match the model and their own requirements.
+  // tuple of (width, height)
+  std::vector<int> size;
+  // padding value, size should be the same as the number of channels
+  std::vector<float> padding_value;
+  // only pad to the minimum rectangle whose height and width are
+  // multiples of stride
+  bool is_mini_pad;
+  // when is_mini_pad = false and is_no_pad = true, resize the image
+  // to the target size directly
+  bool is_no_pad;
+  // if is_scale_up is false, the input image can only be scaled down;
+  // the resize scale cannot exceed 1.0
+  bool is_scale_up;
+  // padding stride, used with is_mini_pad
+  int stride;
+  // offset added to boxes per class id when running NMS
+  float max_wh;
+
+ private:
+  // Initialization, including backend setup and any other work the
+  // model needs before inference
+  bool Initialize();
+
+  // Preprocess the input image
+  // Mat is the image structure defined by FastDeploy
+  // FDTensor receives the preprocessed data passed to the backend
+  // im_info stores data recorded during preprocessing that the
+  // postprocessing step needs
+  bool Preprocess(Mat* mat, FDTensor* output,
+                  std::map<std::string, std::array<float, 2>>* im_info);
+
+  // Postprocess the backend inference output for users
+  // infer_result: the output tensor produced by the backend
+  // result: the final detection result
+  // im_info: information recorded during preprocessing, used to map
+  //          boxes back to the original image
+  // conf_threshold: confidence threshold for filtering boxes
+  // nms_iou_threshold: IoU threshold used by NMS
+  bool Postprocess(FDTensor& infer_result, DetectionResult* result,
+                   const std::map<std::string, std::array<float, 2>>& im_info,
+                   float conf_threshold, float nms_iou_threshold);
+
+  // Apply LetterBox processing to the image
+  // mat: the original loaded image
+  // size: the input size of the model
+  void LetterBox(Mat* mat, const std::vector<int>& size,
+                 const std::vector<float>& color, bool _auto,
+                 bool scale_fill = false, bool scale_up = true,
+                 int stride = 32);
+
+  // Whether the model takes dynamic-shape input (e.g. ONNX exported with
+  // dynamic shape). When is_dynamic_input_ is false, is_mini_pad is
+  // forced to false. FastDeploy checks this value automatically after
+  // the internal Runtime is initialized.
+  bool is_dynamic_input_;
+};
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/yolov7_pybind.cc b/csrc/fastdeploy/vision/detection/contrib/yolov7_pybind.cc
new file mode 100644
index 000000000..bf196fa9f
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/yolov7_pybind.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindYOLOv7(pybind11::module& m) {
+  pybind11::class_<vision::detection::YOLOv7, FastDeployModel>(m, "YOLOv7")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::detection::YOLOv7& self, pybind11::array& data,
+              float conf_threshold, float nms_iou_threshold) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res, conf_threshold, nms_iou_threshold);
+             return res;
+           })
+      .def_readwrite("size", &vision::detection::YOLOv7::size)
+      .def_readwrite("padding_value", &vision::detection::YOLOv7::padding_value)
+      .def_readwrite("is_mini_pad", &vision::detection::YOLOv7::is_mini_pad)
+      .def_readwrite("is_no_pad", &vision::detection::YOLOv7::is_no_pad)
+      .def_readwrite("is_scale_up", &vision::detection::YOLOv7::is_scale_up)
+      .def_readwrite("stride", &vision::detection::YOLOv7::stride)
+      .def_readwrite("max_wh", &vision::detection::YOLOv7::max_wh);
+}
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/contrib/yolox.cc b/csrc/fastdeploy/vision/detection/contrib/yolox.cc
new file mode 100644
index 000000000..5d3880657
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/contrib/yolox.cc
@@ -0,0 +1,339 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/detection/contrib/yolox.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace detection {
+
+struct YOLOXAnchor {
+  int grid0;
+  int grid1;
+  int stride;
+};
+
+void GenerateYOLOXAnchors(const std::vector<int>& size,
+                          const std::vector<int>& downsample_strides,
+                          std::vector<YOLOXAnchor>* anchors) {
+  // size: tuple of input (width, height)
+  // downsample_strides: downsample strides in YOLOX, e.g. (8, 16, 32)
+  const int width = size[0];
+  const int height = size[1];
+  for (const auto& ds : downsample_strides) {
+    int num_grid_w = width / ds;
+    int num_grid_h = height / ds;
+    for (int g1 = 0; g1 < num_grid_h; ++g1) {
+      for (int g0 = 0; g0 < num_grid_w; ++g0) {
+        anchors->emplace_back(YOLOXAnchor{g0, g1, ds});
+      }
+    }
+  }
+}
+
+void LetterBoxWithRightBottomPad(Mat* mat, std::vector<int> size,
+                                 std::vector<float> color) {
+  // preprocess specific to YOLOX, not the same as YOLOv5
+  // reference: YOLOX/yolox/data/data_augment.py#L142
+  float r = std::min(size[1] * 1.0f / static_cast<float>(mat->Height()),
+                     size[0] * 1.0f / static_cast<float>(mat->Width()));
+
+  int resize_h = int(round(static_cast<float>(mat->Height()) * r));
+  int resize_w = int(round(static_cast<float>(mat->Width()) * r));
+
+  if (resize_h != mat->Height() || resize_w != mat->Width()) {
+    Resize::Run(mat, resize_w, resize_h);
+  }
+
+  int pad_w = size[0] - resize_w;
+  int pad_h = size[1] - resize_h;
+  // right-bottom padding for YOLOX
+  if (pad_h > 0 || pad_w > 0) {
+    int top = 0;
+    int left = 0;
+    int right = pad_w;
+    int bottom = pad_h;
+    Pad::Run(mat, top, bottom, left, right, color);
+  }
+}
+
+YOLOX::YOLOX(const std::string& model_file, const std::string& params_file,
+             const RuntimeOption& custom_option, const Frontend& model_format) {
+  if (model_format == Frontend::ONNX) {
+    valid_cpu_backends = {Backend::ORT};  // supported CPU backends
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};  // supported GPU backends
+  } else {
+    valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
+    valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
+  }
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool YOLOX::Initialize() {
+  // default parameters for preprocessing
+  size = {640, 640};
+  padding_value = {114.0, 114.0, 114.0};
+  downsample_strides = {8, 16, 32};
+  max_wh = 4096.0f;
+  is_decode_exported = false;
+
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // Check if the input shape is dynamic after the Runtime is initialized.
+  is_dynamic_input_ = false;
+  auto shape = InputInfoOfRuntime(0).shape;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    // if height or width is dynamic
+    if (i >= 2 && shape[i] <= 0) {
+      is_dynamic_input_ = true;
+      break;
+    }
+  }
+  return true;
+}
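A quick sanity check on the anchor generation above: with the default 640x640 input and strides (8, 16, 32) it should produce 80*80 + 40*40 + 20*20 = 8400 anchors, matching the 8400 rows of the raw YOLOX output tensor. A standalone sketch of that count:

#include <iostream>
#include <vector>

int main() {
  const std::vector<int> size = {640, 640};  // (width, height)
  const std::vector<int> downsample_strides = {8, 16, 32};
  int total = 0;
  for (int ds : downsample_strides) {
    int num_grid_w = size[0] / ds;
    int num_grid_h = size[1] / ds;
    std::cout << "stride " << ds << ": " << num_grid_w * num_grid_h
              << " anchors\n";  // 6400, 1600, 400
    total += num_grid_w * num_grid_h;
  }
  std::cout << "total: " << total << "\n";  // 8400, one per output row
  return 0;
}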
+
+bool YOLOX::Preprocess(Mat* mat, FDTensor* output,
+                       std::map<std::string, std::array<float, 2>>* im_info) {
+  // YOLOX (>= v0.1.1) preprocess steps:
+  // 1. LetterBox with right-bottom padding
+  // 2. HWC->CHW
+  // Note: no BGR2RGB or Normalize is needed for YOLOX
+  LetterBoxWithRightBottomPad(mat, size, padding_value);
+  // Record output shape of preprocessed image
+  (*im_info)["output_shape"] = {static_cast<float>(mat->Height()),
+                                static_cast<float>(mat->Width())};
+
+  HWC2CHW::Run(mat);
+  Cast::Run(mat, "float");
+  mat->ShareWithTensor(output);
+  output->shape.insert(output->shape.begin(), 1);  // reshape to [1, c, h, w]
+  return true;
+}
+
+bool YOLOX::Postprocess(
+    FDTensor& infer_result, DetectionResult* result,
+    const std::map<std::string, std::array<float, 2>>& im_info,
+    float conf_threshold, float nms_iou_threshold) {
+  FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
+  result->Clear();
+  result->Reserve(infer_result.shape[1]);
+  if (infer_result.dtype != FDDataType::FP32) {
+    FDERROR << "Only support post process with float32 data." << std::endl;
+    return false;
+  }
+  float* data = static_cast<float*>(infer_result.Data());
+  for (size_t i = 0; i < infer_result.shape[1]; ++i) {
+    int s = i * infer_result.shape[2];
+    float confidence = data[s + 4];
+    float* max_class_score =
+        std::max_element(data + s + 5, data + s + infer_result.shape[2]);
+    confidence *= (*max_class_score);
+    // filter boxes by conf_threshold
+    if (confidence <= conf_threshold) {
+      continue;
+    }
+    int32_t label_id = std::distance(data + s + 5, max_class_score);
+    // convert from [x, y, w, h] to [x1, y1, x2, y2]
+    result->boxes.emplace_back(std::array<float, 4>{
+        data[s] - data[s + 2] / 2.0f + label_id * max_wh,
+        data[s + 1] - data[s + 3] / 2.0f + label_id * max_wh,
+        data[s + 0] + data[s + 2] / 2.0f + label_id * max_wh,
+        data[s + 1] + data[s + 3] / 2.0f + label_id * max_wh});
+    result->label_ids.push_back(label_id);
+    result->scores.push_back(confidence);
+  }
+  utils::NMS(result, nms_iou_threshold);
+
+  // scale the boxes to the origin image shape
+  auto iter_out = im_info.find("output_shape");
+  auto iter_ipt = im_info.find("input_shape");
+  FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(),
+           "Cannot find input_shape or output_shape from im_info.");
+  float out_h = iter_out->second[0];
+  float out_w = iter_out->second[1];
+  float ipt_h = iter_ipt->second[0];
+  float ipt_w = iter_ipt->second[1];
+  float r = std::min(out_h / ipt_h, out_w / ipt_w);
+  for (size_t i = 0; i < result->boxes.size(); ++i) {
+    int32_t label_id = (result->label_ids)[i];
+    // undo the per-class offset, then clip box to the original image
+    result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id;
+    result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id;
+    result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id;
+    result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id;
+    result->boxes[i][0] = std::max(result->boxes[i][0] / r, 0.0f);
+    result->boxes[i][1] = std::max(result->boxes[i][1] / r, 0.0f);
+    result->boxes[i][2] = std::max(result->boxes[i][2] / r, 0.0f);
+    result->boxes[i][3] = std::max(result->boxes[i][3] / r, 0.0f);
+    result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f);
+    result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f);
+    result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f);
+    result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f);
+  }
+  return true;
+}
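Note that the rescale above deliberately differs from YOLOv7's: LetterBoxWithRightBottomPad puts all padding on the right/bottom, so the resized content keeps its origin at (0, 0) and boxes only need dividing by r; there is no centered pad_w/pad_h to subtract first. A toy check, assuming a 1280x720 image letterboxed into 640x640 (so r = 0.5):

#include <algorithm>
#include <iostream>

int main() {
  const float ipt_w = 1280.0f, ipt_h = 720.0f;  // original image (example)
  const float out_w = 640.0f, out_h = 640.0f;   // letterboxed canvas
  float r = std::min(out_h / ipt_h, out_w / ipt_w);  // 0.5

  // A box corner detected at x1 = 320 on the 640x640 canvas: with
  // right-bottom padding the mapping back is simply x1 / r.
  float x1_canvas = 320.0f;
  std::cout << "x1 on original image: " << x1_canvas / r << "\n";  // 640
  // With centered padding (YOLOv7's LetterBox) the same mapping would be
  // (x1 - pad_w) / r, which is why that postprocess subtracts pads first.
  return 0;
}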
+
+bool YOLOX::PostprocessWithDecode(
+    FDTensor& infer_result, DetectionResult* result,
+    const std::map<std::string, std::array<float, 2>>& im_info,
+    float conf_threshold, float nms_iou_threshold) {
+  FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
+  result->Clear();
+  result->Reserve(infer_result.shape[1]);
+  if (infer_result.dtype != FDDataType::FP32) {
+    FDERROR << "Only support post process with float32 data." << std::endl;
+    return false;
+  }
+  // generate anchors with downsample strides
+  std::vector<YOLOXAnchor> anchors;
+  GenerateYOLOXAnchors(size, downsample_strides, &anchors);
+
+  // infer_result shape might look like (1, n, 85 = 5 + 80)
+  float* data = static_cast<float*>(infer_result.Data());
+  for (size_t i = 0; i < infer_result.shape[1]; ++i) {
+    int s = i * infer_result.shape[2];
+    float confidence = data[s + 4];
+    float* max_class_score =
+        std::max_element(data + s + 5, data + s + infer_result.shape[2]);
+    confidence *= (*max_class_score);
+    // filter boxes by conf_threshold
+    if (confidence <= conf_threshold) {
+      continue;
+    }
+    int32_t label_id = std::distance(data + s + 5, max_class_score);
+    // fetch i-th anchor
+    float grid0 = static_cast<float>(anchors.at(i).grid0);
+    float grid1 = static_cast<float>(anchors.at(i).grid1);
+    float downsample_stride = static_cast<float>(anchors.at(i).stride);
+    // convert from offsets to [x, y, w, h]
+    float dx = data[s];
+    float dy = data[s + 1];
+    float dw = data[s + 2];
+    float dh = data[s + 3];
+
+    float x = (dx + grid0) * downsample_stride;
+    float y = (dy + grid1) * downsample_stride;
+    float w = std::exp(dw) * downsample_stride;
+    float h = std::exp(dh) * downsample_stride;
+
+    // convert from [x, y, w, h] to [x1, y1, x2, y2];
+    // label_id * max_wh implements multi-class NMS
+    result->boxes.emplace_back(std::array<float, 4>{
+        x - w / 2.0f + label_id * max_wh, y - h / 2.0f + label_id * max_wh,
+        x + w / 2.0f + label_id * max_wh, y + h / 2.0f + label_id * max_wh});
+    result->label_ids.push_back(label_id);
+    result->scores.push_back(confidence);
+  }
+  utils::NMS(result, nms_iou_threshold);
+
+  // scale the boxes to the origin image shape
+  auto iter_out = im_info.find("output_shape");
+  auto iter_ipt = im_info.find("input_shape");
+  FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(),
+           "Cannot find input_shape or output_shape from im_info.");
+  float out_h = iter_out->second[0];
+  float out_w = iter_out->second[1];
+  float ipt_h = iter_ipt->second[0];
+  float ipt_w = iter_ipt->second[1];
+  float r = std::min(out_h / ipt_h, out_w / ipt_w);
+  for (size_t i = 0; i < result->boxes.size(); ++i) {
+    int32_t label_id = (result->label_ids)[i];
+    // undo the per-class offset, then clip box to the original image
+    result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id;
+    result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id;
+    result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id;
+    result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id;
+    result->boxes[i][0] = std::max(result->boxes[i][0] / r, 0.0f);
+    result->boxes[i][1] = std::max(result->boxes[i][1] / r, 0.0f);
+    result->boxes[i][2] = std::max(result->boxes[i][2] / r, 0.0f);
+    result->boxes[i][3] = std::max(result->boxes[i][3] / r, 0.0f);
+    result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f);
+    result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f);
+    result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f);
+    result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f);
+  }
+  return true;
+}
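The decode above maps raw offsets to pixel boxes as x = (dx + grid0) * stride, y = (dy + grid1) * stride, w = exp(dw) * stride, h = exp(dh) * stride. A worked example for a single anchor (all values illustrative):

#include <cmath>
#include <iostream>

int main() {
  // Anchor at grid cell (1, 2) on the stride-8 feature map.
  const float grid0 = 1.0f, grid1 = 2.0f, stride = 8.0f;
  // Raw network offsets for that row (illustrative values).
  const float dx = 0.5f, dy = 0.25f;
  const float dw = std::log(2.0f), dh = std::log(4.0f);

  float x = (dx + grid0) * stride;  // 12: center x in input pixels
  float y = (dy + grid1) * stride;  // 18: center y in input pixels
  float w = std::exp(dw) * stride;  // 16: width in input pixels
  float h = std::exp(dh) * stride;  // 32: height in input pixels

  // Same corner conversion as the postprocess above.
  std::cout << "box: [" << x - w / 2 << ", " << y - h / 2 << ", "
            << x + w / 2 << ", " << y + h / 2 << "]\n";  // [4, 2, 20, 34]
  return 0;
}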
"Failed to preprocess input image." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(0, "Preprocess") + TIMERECORD_START(1) +#endif + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." << std::endl; + return false; + } +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(1, "Inference") + TIMERECORD_START(2) +#endif + + if (is_decode_exported) { + if (!Postprocess(output_tensors[0], result, im_info, conf_threshold, + nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + } else { + if (!PostprocessWithDecode(output_tensors[0], result, im_info, + conf_threshold, nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(2, "Postprocess") +#endif + return true; +} + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolox.h b/csrc/fastdeploy/vision/detection/contrib/yolox.h new file mode 100644 index 000000000..fc27ca1ed --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolox.h @@ -0,0 +1,107 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { + +namespace vision { + +namespace detection { + +class FASTDEPLOY_DECL YOLOX : public FastDeployModel { + public: + // 当model_format为ONNX时,无需指定params_file + // 当model_format为Paddle时,则需同时指定model_file & params_file + YOLOX(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX); + + // 定义模型的名称 + std::string ModelName() const { return "YOLOX"; } + + // 模型预测接口,即用户调用的接口 + // im 为用户的输入数据,目前对于CV均定义为cv::Mat + // result 为模型预测的输出结构体 + // conf_threshold 为后处理的参数 + // nms_iou_threshold 为后处理的参数 + virtual bool Predict(cv::Mat* im, DetectionResult* result, + float conf_threshold = 0.25, + float nms_iou_threshold = 0.5); + + // 以下为模型在预测时的一些参数,基本是前后处理所需 + // 用户在创建模型后,可根据模型的要求,以及自己的需求 + // 对参数进行修改 + // tuple of (width, height) + std::vector size; + // padding value, size should be same with Channels + std::vector padding_value; + // whether the model_file was exported with decode module. The official + // YOLOX/tools/export_onnx.py script will export ONNX file without + // decode module. Please set it 'true' manually if the model file + // was exported with decode module. + bool is_decode_exported; + // downsample strides for YOLOX to generate anchors, will take + // (8,16,32) as default values, might have stride=64. + std::vector downsample_strides; + // for offseting the boxes by classes when using NMS, default 4096. 
+ float max_wh; + + private: + // 初始化函数,包括初始化后端,以及其它模型推理需要涉及的操作 + bool Initialize(); + + // 输入图像预处理操作 + // Mat为FastDeploy定义的数据结构 + // FDTensor为预处理后的Tensor数据,传给后端进行推理 + // im_info为预处理过程保存的数据,在后处理中需要用到 + bool Preprocess(Mat* mat, FDTensor* outputs, + std::map>* im_info); + + // 后端推理结果后处理,输出给用户 + // infer_result 为后端推理后的输出Tensor + // result 为模型预测的结果 + // im_info 为预处理记录的信息,后处理用于还原box + // conf_threshold 后处理时过滤box的置信度阈值 + // nms_iou_threshold 后处理时NMS设定的iou阈值 + bool Postprocess(FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold); + + // YOLOX的官方脚本默认导出不带decode模块的模型文件 需要在后处理进行decode + bool PostprocessWithDecode( + FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold); + + // 查看输入是否为动态维度的 不建议直接使用 不同模型的逻辑可能不一致 + bool IsDynamicInput() const { return is_dynamic_input_; } + + // whether to inference with dynamic shape (e.g ONNX export with dynamic shape + // or not.) + // megvii/YOLOX official 'export_onnx.py' script will export static ONNX by + // default. + // while is_dynamic_shape if 'false', is_mini_pad will force 'false'. This + // value will + // auto check by fastdeploy after the internal Runtime already initialized. + bool is_dynamic_input_; +}; + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/contrib/yolox_pybind.cc b/csrc/fastdeploy/vision/detection/contrib/yolox_pybind.cc new file mode 100644 index 000000000..68cb6a426 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/contrib/yolox_pybind.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindYOLOX(pybind11::module& m) { + pybind11::class_(m, "YOLOX") + .def(pybind11::init()) + .def("predict", + [](vision::detection::YOLOX& self, pybind11::array& data, + float conf_threshold, float nms_iou_threshold) { + auto mat = PyArrayToCvMat(data); + vision::DetectionResult res; + self.Predict(&mat, &res, conf_threshold, nms_iou_threshold); + return res; + }) + .def_readwrite("size", &vision::detection::YOLOX::size) + .def_readwrite("padding_value", &vision::detection::YOLOX::padding_value) + .def_readwrite("is_decode_exported", + &vision::detection::YOLOX::is_decode_exported) + .def_readwrite("downsample_strides", + &vision::detection::YOLOX::downsample_strides) + .def_readwrite("max_wh", &vision::detection::YOLOX::max_wh); +} +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/detection_pybind.cc b/csrc/fastdeploy/vision/detection/detection_pybind.cc new file mode 100644 index 000000000..a865dc11e --- /dev/null +++ b/csrc/fastdeploy/vision/detection/detection_pybind.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { + +void BindYOLOv7(pybind11::module& m); +void BindScaledYOLOv4(pybind11::module& m); +void BindYOLOR(pybind11::module& m); +void BindYOLOv6(pybind11::module& m); +void BindYOLOv5Lite(pybind11::module& m); +void BindYOLOv5(pybind11::module& m); +void BindYOLOX(pybind11::module& m); +void BindNanoDetPlus(pybind11::module& m); +void BindPPDet(pybind11::module& m); + +void BindDetection(pybind11::module& m) { + auto detection_module = + m.def_submodule("detection", "Image object detection models."); + BindPPDet(detection_module); + BindYOLOv7(detection_module); + BindScaledYOLOv4(detection_module); + BindYOLOR(detection_module); + BindYOLOv6(detection_module); + BindYOLOv5Lite(detection_module); + BindYOLOv5(detection_module); + BindYOLOX(detection_module); + BindNanoDetPlus(detection_module); +} +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/ppdet/model.h b/csrc/fastdeploy/vision/detection/ppdet/model.h new file mode 100644 index 000000000..f40c6b7fe --- /dev/null +++ b/csrc/fastdeploy/vision/detection/ppdet/model.h @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/vision/detection/ppdet/picodet.h" +#include "fastdeploy/vision/detection/ppdet/ppyolo.h" +#include "fastdeploy/vision/detection/ppdet/ppyoloe.h" +#include "fastdeploy/vision/detection/ppdet/rcnn.h" +#include "fastdeploy/vision/detection/ppdet/yolov3.h" +#include "fastdeploy/vision/detection/ppdet/yolox.h" diff --git a/csrc/fastdeploy/vision/detection/ppdet/picodet.cc b/csrc/fastdeploy/vision/detection/ppdet/picodet.cc new file mode 100644 index 000000000..d89fab2ae --- /dev/null +++ b/csrc/fastdeploy/vision/detection/ppdet/picodet.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/detection/ppdet/picodet.h"
+#include "yaml-cpp/yaml.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+PicoDet::PicoDet(const std::string& model_file, const std::string& params_file,
+                 const std::string& config_file,
+                 const RuntimeOption& custom_option,
+                 const Frontend& model_format) {
+  config_file_ = config_file;
+  valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
+  valid_gpu_backends = {Backend::PDINFER, Backend::ORT};
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  background_label = -1;
+  keep_top_k = 100;
+  nms_eta = 1;
+  nms_threshold = 0.6;
+  nms_top_k = 1000;
+  normalized = true;
+  score_threshold = 0.025;
+  CheckIfContainDecodeAndNMS();
+  initialized = Initialize();
+}
+
+bool PicoDet::CheckIfContainDecodeAndNMS() {
+  YAML::Node cfg;
+  try {
+    cfg = YAML::LoadFile(config_file_);
+  } catch (YAML::BadFile& e) {
+    FDERROR << "Failed to load yaml file " << config_file_
+            << ", maybe you should check this file." << std::endl;
+    return false;
+  }
+
+  if (cfg["arch"].as<std::string>() == "PicoDet") {
+    FDERROR << "The arch in config file is PicoDet, which means this model "
+               "doesn't contain box decode and nms, please export the model "
+               "with decode and nms."
+            << std::endl;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/ppdet/picodet.h b/csrc/fastdeploy/vision/detection/ppdet/picodet.h
new file mode 100644
index 000000000..984e56222
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/ppdet/picodet.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/vision/detection/ppdet/ppyoloe.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+class FASTDEPLOY_DECL PicoDet : public PPYOLOE {
+ public:
+  PicoDet(const std::string& model_file, const std::string& params_file,
+          const std::string& config_file,
+          const RuntimeOption& custom_option = RuntimeOption(),
+          const Frontend& model_format = Frontend::PADDLE);
+
+  // Only PicoDet models exported with decode and NMS are supported
+  bool CheckIfContainDecodeAndNMS();
+
+  virtual std::string ModelName() const { return "PicoDet"; }
+};
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/ppdet/ppdet_pybind.cc b/csrc/fastdeploy/vision/detection/ppdet/ppdet_pybind.cc
new file mode 100644
index 000000000..2f4b0fefc
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/ppdet/ppdet_pybind.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindPPDet(pybind11::module& m) {
+  pybind11::class_<vision::detection::PPYOLOE, FastDeployModel>(m, "PPYOLOE")
+      .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
+                          Frontend>())
+      .def("predict",
+           [](vision::detection::PPYOLOE& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           });
+
+  pybind11::class_<vision::detection::PPYOLO, vision::detection::PPYOLOE>(
+      m, "PPYOLO")
+      .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
+                          Frontend>())
+      .def("predict",
+           [](vision::detection::PPYOLO& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           });
+
+  pybind11::class_<vision::detection::PPYOLOv2, vision::detection::PPYOLO>(
+      m, "PPYOLOv2")
+      .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
+                          Frontend>())
+      .def("predict",
+           [](vision::detection::PPYOLOv2& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           });
+
+  pybind11::class_<vision::detection::PicoDet, vision::detection::PPYOLOE>(
+      m, "PicoDet")
+      .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
+                          Frontend>())
+      .def("predict",
+           [](vision::detection::PicoDet& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           });
+
+  pybind11::class_<vision::detection::PaddleYOLOX, vision::detection::PPYOLOE>(
+      m, "PaddleYOLOX")
+      .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
+                          Frontend>())
+      .def("predict",
+           [](vision::detection::PaddleYOLOX& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           });
+
+  pybind11::class_<vision::detection::FasterRCNN, vision::detection::PPYOLOE>(
+      m, "FasterRCNN")
+      .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
+                          Frontend>())
+      .def("predict",
+           [](vision::detection::FasterRCNN& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           });
+
+  pybind11::class_<vision::detection::YOLOv3, vision::detection::PPYOLOE>(
+      m, "YOLOv3")
+      .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
+                          Frontend>())
+      .def("predict",
+           [](vision::detection::YOLOv3& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::DetectionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           });
+}
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/ppdet/ppyolo.cc b/csrc/fastdeploy/vision/detection/ppdet/ppyolo.cc
new file mode 100644
index 000000000..6c202f0d0
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/ppdet/ppyolo.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/detection/ppdet/ppyolo.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+PPYOLO::PPYOLO(const std::string& model_file, const std::string& params_file,
+               const std::string& config_file,
+               const RuntimeOption& custom_option,
+               const Frontend& model_format) {
+  config_file_ = config_file;
+  valid_cpu_backends = {Backend::PDINFER};
+  valid_gpu_backends = {Backend::PDINFER};
+  has_nms_ = true;
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool PPYOLO::Initialize() {
+  if (!BuildPreprocessPipelineFromConfig()) {
+    FDERROR << "Failed to build preprocess pipeline from configuration file."
+            << std::endl;
+    return false;
+  }
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  return true;
+}
+
+bool PPYOLO::Preprocess(Mat* mat, std::vector<FDTensor>* outputs) {
+  int origin_w = mat->Width();
+  int origin_h = mat->Height();
+  for (size_t i = 0; i < processors_.size(); ++i) {
+    if (!(*(processors_[i].get()))(mat)) {
+      FDERROR << "Failed to process image data in " << processors_[i]->Name()
+              << "." << std::endl;
+      return false;
+    }
+  }
+
+  outputs->resize(3);
+  (*outputs)[0].Allocate({1, 2}, FDDataType::FP32, "im_shape");
+  (*outputs)[2].Allocate({1, 2}, FDDataType::FP32, "scale_factor");
+  float* ptr0 = static_cast<float*>((*outputs)[0].MutableData());
+  ptr0[0] = mat->Height();
+  ptr0[1] = mat->Width();
+  float* ptr2 = static_cast<float*>((*outputs)[2].MutableData());
+  ptr2[0] = mat->Height() * 1.0 / origin_h;
+  ptr2[1] = mat->Width() * 1.0 / origin_w;
+  (*outputs)[1].name = "image";
+  mat->ShareWithTensor(&((*outputs)[1]));
+  // reshape to [1, c, h, w]
+  (*outputs)[1].shape.insert((*outputs)[1].shape.begin(), 1);
+  return true;
+}
+
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/ppdet/ppyolo.h b/csrc/fastdeploy/vision/detection/ppdet/ppyolo.h
new file mode 100644
index 000000000..1b3b48780
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/ppdet/ppyolo.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/vision/detection/ppdet/ppyoloe.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+class FASTDEPLOY_DECL PPYOLO : public PPYOLOE {
+ public:
+  PPYOLO(const std::string& model_file, const std::string& params_file,
+         const std::string& config_file,
+         const RuntimeOption& custom_option = RuntimeOption(),
+         const Frontend& model_format = Frontend::PADDLE);
+
+  virtual std::string ModelName() const { return "PaddleDetection/PPYOLO"; }
+
+  virtual bool Preprocess(Mat* mat, std::vector<FDTensor>* outputs);
+  virtual bool Initialize();
+
+ protected:
+  PPYOLO() {}
+};
+
+class FASTDEPLOY_DECL PPYOLOv2 : public PPYOLO {
+ public:
+  PPYOLOv2(const std::string& model_file, const std::string& params_file,
+           const std::string& config_file,
+           const RuntimeOption& custom_option = RuntimeOption(),
+           const Frontend& model_format = Frontend::PADDLE)
+      : PPYOLO(model_file, params_file, config_file, custom_option,
+               model_format) {}
+
+  virtual std::string ModelName() const { return "PaddleDetection/PPYOLOv2"; }
+};
+
+}  // namespace detection
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/detection/ppdet/ppyoloe.cc b/csrc/fastdeploy/vision/detection/ppdet/ppyoloe.cc
new file mode 100644
index 000000000..2e4b56ecb
--- /dev/null
+++ b/csrc/fastdeploy/vision/detection/ppdet/ppyoloe.cc
@@ -0,0 +1,258 @@
+#include "fastdeploy/vision/detection/ppdet/ppyoloe.h"
+#include "fastdeploy/vision/utils/utils.h"
+#include "yaml-cpp/yaml.h"
+#ifdef ENABLE_PADDLE_FRONTEND
+#include "paddle2onnx/converter.h"
+#endif
+
+namespace fastdeploy {
+namespace vision {
+namespace detection {
+
+PPYOLOE::PPYOLOE(const std::string& model_file, const std::string& params_file,
+                 const std::string& config_file,
+                 const RuntimeOption& custom_option,
+                 const Frontend& model_format) {
+  config_file_ = config_file;
+  valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
+  valid_gpu_backends = {Backend::PDINFER, Backend::ORT};
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+void PPYOLOE::GetNmsInfo() {
+#ifdef ENABLE_PADDLE_FRONTEND
+  if (runtime_option.model_format == Frontend::PADDLE) {
+    std::string contents;
+    if (!ReadBinaryFromFile(runtime_option.model_file, &contents)) {
+      return;
+    }
+    auto reader = paddle2onnx::PaddleReader(contents.c_str(), contents.size());
+    if (reader.has_nms) {
+      has_nms_ = true;
+      background_label = reader.nms_params.background_label;
+      keep_top_k = reader.nms_params.keep_top_k;
+      nms_eta = reader.nms_params.nms_eta;
+      nms_threshold = reader.nms_params.nms_threshold;
+      score_threshold = reader.nms_params.score_threshold;
+      nms_top_k = reader.nms_params.nms_top_k;
+      normalized = reader.nms_params.normalized;
+    }
+  }
+#endif
+}
+
+bool PPYOLOE::Initialize() {
+#ifdef ENABLE_PADDLE_FRONTEND
+  // remove multiclass_nms3 for now;
+  // this is a trick for PPYOLOE when running inference on TRT
+  GetNmsInfo();
+  runtime_option.remove_multiclass_nms_ = true;
+  runtime_option.custom_op_info_["multiclass_nms3"] = "MultiClassNMS";
+#endif
+  if (!BuildPreprocessPipelineFromConfig()) {
+    FDERROR << "Failed to build preprocess pipeline from configuration file."
+            << std::endl;
+    return false;
+  }
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+
+  if (has_nms_ && runtime_option.backend == Backend::TRT) {
+    FDINFO << "Detected operator multiclass_nms3 in your model, will replace "
+              "it with fastdeploy::backend::MultiClassNMS(background_label="
+           << background_label << ", keep_top_k=" << keep_top_k
+           << ", nms_eta=" << nms_eta << ", nms_threshold=" << nms_threshold
+           << ", score_threshold=" << score_threshold
+           << ", nms_top_k=" << nms_top_k << ", normalized=" << normalized
+           << ")." << std::endl;
+    has_nms_ = false;
+  }
+  return true;
+}
+
+bool PPYOLOE::BuildPreprocessPipelineFromConfig() {
+  processors_.clear();
+  YAML::Node cfg;
+  try {
+    cfg = YAML::LoadFile(config_file_);
+  } catch (YAML::BadFile& e) {
+    FDERROR << "Failed to load yaml file " << config_file_
+            << ", maybe you should check this file." << std::endl;
+    return false;
+  }
+
+  processors_.push_back(std::make_shared<BGR2RGB>());
+
+  for (const auto& op : cfg["Preprocess"]) {
+    std::string op_name = op["type"].as<std::string>();
+    if (op_name == "NormalizeImage") {
+      auto mean = op["mean"].as<std::vector<float>>();
+      auto std = op["std"].as<std::vector<float>>();
+      bool is_scale = op["is_scale"].as<bool>();
+      processors_.push_back(std::make_shared<Normalize>(mean, std, is_scale));
+    } else if (op_name == "Resize") {
+      bool keep_ratio = op["keep_ratio"].as<bool>();
+      auto target_size = op["target_size"].as<std::vector<int>>();
+      int interp = op["interp"].as<int>();
+      FDASSERT(target_size.size() == 2,
+               "Require size of target_size to be 2, but now it's " +
+                   std::to_string(target_size.size()) + ".");
+      if (!keep_ratio) {
+        int width = target_size[1];
+        int height = target_size[0];
+        processors_.push_back(
+            std::make_shared<Resize>(width, height, -1.0, -1.0, interp, false));
+      } else {
+        int min_target_size = std::min(target_size[0], target_size[1]);
+        int max_target_size = std::max(target_size[0], target_size[1]);
+        processors_.push_back(std::make_shared<ResizeByShort>(
+            min_target_size, interp, true, max_target_size));
+      }
+    } else if (op_name == "Permute") {
+      // Do nothing; HWC2CHW is applied as the last operation below
+      continue;
+    } else if (op_name == "Pad") {
+      auto size = op["size"].as<std::vector<int>>();
+      auto value = op["fill_value"].as<std::vector<float>>();
+      processors_.push_back(std::make_shared<Cast>("float"));
+      processors_.push_back(
+          std::make_shared<PadToSize>(size[1], size[0], value));
+    } else if (op_name == "PadStride") {
+      auto stride = op["stride"].as<int>();
+      processors_.push_back(
+          std::make_shared<StridePad>(stride, std::vector<float>(3, 0)));
+    } else {
+      FDERROR << "Unexpected preprocess operator: " << op_name << "."
+              << std::endl;
+      return false;
+    }
+  }
+  processors_.push_back(std::make_shared<HWC2CHW>());
+  return true;
+}
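For reference, a minimal infer_cfg.yml-style Preprocess section that the parser above accepts; the field names follow the ops handled in the loop, but a real file exported by PaddleDetection may contain more entries, so treat this schema as illustrative. The sketch loads it from a string with yaml-cpp just to show the expected shape of the data:

#include <iostream>
#include <string>
#include "yaml-cpp/yaml.h"

int main() {
  // Illustrative config; a real exported infer_cfg.yml may differ.
  const std::string cfg_text = R"(
Preprocess:
- type: Resize
  keep_ratio: false
  interp: 2
  target_size: [640, 640]
- type: NormalizeImage
  is_scale: true
  mean: [0.485, 0.456, 0.406]
  std: [0.229, 0.224, 0.225]
- type: Permute
)";
  YAML::Node cfg = YAML::Load(cfg_text);
  // Walk the list the same way BuildPreprocessPipelineFromConfig does.
  for (const auto& op : cfg["Preprocess"]) {
    std::cout << "op: " << op["type"].as<std::string>() << "\n";
  }
  return 0;
}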
<< std::endl; + return false; + } + } + + outputs->resize(2); + (*outputs)[0].name = InputInfoOfRuntime(0).name; + mat->ShareWithTensor(&((*outputs)[0])); + + // reshape to [1, c, h, w] + (*outputs)[0].shape.insert((*outputs)[0].shape.begin(), 1); + + (*outputs)[1].Allocate({1, 2}, FDDataType::FP32, InputInfoOfRuntime(1).name); + float* ptr = static_cast((*outputs)[1].MutableData()); + ptr[0] = mat->Height() * 1.0 / origin_h; + ptr[1] = mat->Width() * 1.0 / origin_w; + return true; +} + +bool PPYOLOE::Postprocess(std::vector& infer_result, + DetectionResult* result) { + FDASSERT(infer_result[1].shape[0] == 1, + "Only support batch = 1 in FastDeploy now."); + + if (!has_nms_) { + int boxes_index = 0; + int scores_index = 1; + if (infer_result[0].shape[1] == infer_result[1].shape[2]) { + boxes_index = 0; + scores_index = 1; + } else if (infer_result[0].shape[2] == infer_result[1].shape[1]) { + boxes_index = 1; + scores_index = 0; + } else { + FDERROR << "The shape of boxes and scores should be [batch, boxes_num, " + "4], [batch, classes_num, boxes_num]" + << std::endl; + return false; + } + + backend::MultiClassNMS nms; + nms.background_label = background_label; + nms.keep_top_k = keep_top_k; + nms.nms_eta = nms_eta; + nms.nms_threshold = nms_threshold; + nms.score_threshold = score_threshold; + nms.nms_top_k = nms_top_k; + nms.normalized = normalized; + nms.Compute(static_cast(infer_result[boxes_index].Data()), + static_cast(infer_result[scores_index].Data()), + infer_result[boxes_index].shape, + infer_result[scores_index].shape); + if (nms.out_num_rois_data[0] > 0) { + result->Reserve(nms.out_num_rois_data[0]); + } + for (size_t i = 0; i < nms.out_num_rois_data[0]; ++i) { + result->label_ids.push_back(nms.out_box_data[i * 6]); + result->scores.push_back(nms.out_box_data[i * 6 + 1]); + result->boxes.emplace_back(std::array{ + nms.out_box_data[i * 6 + 2], nms.out_box_data[i * 6 + 3], + nms.out_box_data[i * 6 + 4], nms.out_box_data[i * 6 + 5]}); + } + } else { + int box_num = 0; + if (infer_result[1].dtype == FDDataType::INT32) { + box_num = *(static_cast(infer_result[1].Data())); + } else if (infer_result[1].dtype == FDDataType::INT64) { + box_num = *(static_cast(infer_result[1].Data())); + } else { + FDASSERT( + false, + "The output box_num of PPYOLOE model should be type of int32/int64."); + } + result->Reserve(box_num); + float* box_data = static_cast(infer_result[0].Data()); + for (size_t i = 0; i < box_num; ++i) { + result->label_ids.push_back(box_data[i * 6]); + result->scores.push_back(box_data[i * 6 + 1]); + result->boxes.emplace_back( + std::array{box_data[i * 6 + 2], box_data[i * 6 + 3], + box_data[i * 6 + 4], box_data[i * 6 + 5]}); + } + } + return true; +} + +bool PPYOLOE::Predict(cv::Mat* im, DetectionResult* result) { + Mat mat(*im); + std::vector processed_data; + if (!Preprocess(&mat, &processed_data)) { + FDERROR << "Failed to preprocess input data while using model:" + << ModelName() << "." << std::endl; + return false; + } + + float* tmp = static_cast(processed_data[1].Data()); + std::vector infer_result; + if (!Infer(processed_data, &infer_result)) { + FDERROR << "Failed to inference while using model:" << ModelName() << "." + << std::endl; + return false; + } + + if (!Postprocess(infer_result, result)) { + FDERROR << "Failed to postprocess while using model:" << ModelName() << "." 
+ << std::endl; + return false; + } + return true; +} + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/ppdet/ppyoloe.h b/csrc/fastdeploy/vision/detection/ppdet/ppyoloe.h new file mode 100644 index 000000000..2d8cca99f --- /dev/null +++ b/csrc/fastdeploy/vision/detection/ppdet/ppyoloe.h @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +class FASTDEPLOY_DECL PPYOLOE : public FastDeployModel { + public: + PPYOLOE(const std::string& model_file, const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::PADDLE); + + virtual std::string ModelName() const { return "PaddleDetection/PPYOLOE"; } + + virtual bool Initialize(); + + virtual bool BuildPreprocessPipelineFromConfig(); + + virtual bool Preprocess(Mat* mat, std::vector* outputs); + + virtual bool Postprocess(std::vector& infer_result, + DetectionResult* result); + + virtual bool Predict(cv::Mat* im, DetectionResult* result); + + protected: + PPYOLOE() {} + + std::vector> processors_; + std::string config_file_; + // configuration for nms + int64_t background_label = -1; + int64_t keep_top_k = 300; + float nms_eta = 1.0; + float nms_threshold = 0.7; + float score_threshold = 0.01; + int64_t nms_top_k = 10000; + bool normalized = true; + bool has_nms_ = false; + + // This function will used to check if this model contains multiclass_nms + // and get parameters from the operator + void GetNmsInfo(); +}; + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/ppdet/rcnn.cc b/csrc/fastdeploy/vision/detection/ppdet/rcnn.cc new file mode 100644 index 000000000..38ecc3d1c --- /dev/null +++ b/csrc/fastdeploy/vision/detection/ppdet/rcnn.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
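For orientation before the RCNN subclass below, here is a minimal usage sketch of the PPYOLOE class declared in ppyoloe.h above. It is not part of this diff: the umbrella header name and model file paths are illustrative assumptions.

#include "fastdeploy/vision.h"  // assumed aggregate header
#include <opencv2/opencv.hpp>

int main() {
  namespace det = fastdeploy::vision::detection;
  det::PPYOLOE model("ppyoloe/model.pdmodel", "ppyoloe/model.pdiparams",
                     "ppyoloe/infer_cfg.yml");
  cv::Mat im = cv::imread("test.jpg");
  fastdeploy::vision::DetectionResult res;
  if (!model.Predict(&im, &res)) {
    return -1;  // Predict logs the failing stage (pre/infer/post) via FDERROR
  }
  return 0;  // res.boxes / res.scores / res.label_ids now hold detections
}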
+ +#include "fastdeploy/vision/detection/ppdet/rcnn.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +FasterRCNN::FasterRCNN(const std::string& model_file, + const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option, + const Frontend& model_format) { + config_file_ = config_file; + valid_cpu_backends = {Backend::PDINFER}; + valid_gpu_backends = {Backend::PDINFER}; + has_nms_ = true; + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool FasterRCNN::Initialize() { + if (!BuildPreprocessPipelineFromConfig()) { + FDERROR << "Failed to build preprocess pipeline from configuration file." + << std::endl; + return false; + } + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + return true; +} + +bool FasterRCNN::Preprocess(Mat* mat, std::vector* outputs) { + int origin_w = mat->Width(); + int origin_h = mat->Height(); + float scale[2] = {1.0, 1.0}; + for (size_t i = 0; i < processors_.size(); ++i) { + if (!(*(processors_[i].get()))(mat)) { + FDERROR << "Failed to process image data in " << processors_[i]->Name() + << "." << std::endl; + return false; + } + if (processors_[i]->Name().find("Resize") != std::string::npos) { + scale[0] = mat->Height() * 1.0 / origin_h; + scale[1] = mat->Width() * 1.0 / origin_w; + } + } + + outputs->resize(3); + (*outputs)[0].Allocate({1, 2}, FDDataType::FP32, "im_shape"); + (*outputs)[2].Allocate({1, 2}, FDDataType::FP32, "scale_factor"); + float* ptr0 = static_cast((*outputs)[0].MutableData()); + ptr0[0] = mat->Height(); + ptr0[1] = mat->Width(); + float* ptr2 = static_cast((*outputs)[2].MutableData()); + ptr2[0] = scale[0]; + ptr2[1] = scale[1]; + (*outputs)[1].name = "image"; + mat->ShareWithTensor(&((*outputs)[1])); + // reshape to [1, c, h, w] + (*outputs)[1].shape.insert((*outputs)[1].shape.begin(), 1); + return true; +} + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/ppdet/rcnn.h b/csrc/fastdeploy/vision/detection/ppdet/rcnn.h new file mode 100644 index 000000000..d44ca852e --- /dev/null +++ b/csrc/fastdeploy/vision/detection/ppdet/rcnn.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
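A note on the preprocessing contract above: PaddleDetection RCNN-style models consume three named inputs, index 0 "im_shape" (1x2, the {height, width} actually fed to the network), index 1 "image" (1x3xHxW after HWC2CHW), and index 2 "scale_factor" (1x2, {h_resized/h_origin, w_resized/w_origin}, captured at the Resize step). A hypothetical helper mirroring that scale computation, for illustration only:

#include <array>

// Returns {scale_h, scale_w} in the order the "scale_factor" input expects.
static std::array<float, 2> ComputeScaleFactor(int origin_h, int origin_w,
                                               int resized_h, int resized_w) {
  return {static_cast<float>(resized_h) / origin_h,
          static_cast<float>(resized_w) / origin_w};
}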
+ +#pragma once +#include "fastdeploy/vision/detection/ppdet/ppyoloe.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +class FASTDEPLOY_DECL FasterRCNN : public PPYOLOE { + public: + FasterRCNN(const std::string& model_file, const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::PADDLE); + + virtual std::string ModelName() const { return "PaddleDetection/FasterRCNN"; } + + virtual bool Preprocess(Mat* mat, std::vector* outputs); + virtual bool Initialize(); + + protected: + FasterRCNN() {} +}; +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/ppdet/yolov3.cc b/csrc/fastdeploy/vision/detection/ppdet/yolov3.cc new file mode 100644 index 000000000..309d65640 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/ppdet/yolov3.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/detection/ppdet/yolov3.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +YOLOv3::YOLOv3(const std::string& model_file, const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option, + const Frontend& model_format) { + config_file_ = config_file; + valid_cpu_backends = {Backend::PDINFER}; + valid_gpu_backends = {Backend::PDINFER}; + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool YOLOv3::Preprocess(Mat* mat, std::vector* outputs) { + int origin_w = mat->Width(); + int origin_h = mat->Height(); + for (size_t i = 0; i < processors_.size(); ++i) { + if (!(*(processors_[i].get()))(mat)) { + FDERROR << "Failed to process image data in " << processors_[i]->Name() + << "." << std::endl; + return false; + } + } + + outputs->resize(3); + (*outputs)[0].Allocate({1, 2}, FDDataType::FP32, "im_shape"); + (*outputs)[2].Allocate({1, 2}, FDDataType::FP32, "scale_factor"); + float* ptr0 = static_cast((*outputs)[0].MutableData()); + ptr0[0] = mat->Height(); + ptr0[1] = mat->Width(); + float* ptr2 = static_cast((*outputs)[2].MutableData()); + ptr2[0] = mat->Height() * 1.0 / origin_h; + ptr2[1] = mat->Width() * 1.0 / origin_w; + (*outputs)[1].name = "image"; + mat->ShareWithTensor(&((*outputs)[1])); + // reshape to [1, c, h, w] + (*outputs)[1].shape.insert((*outputs)[1].shape.begin(), 1); + return true; +} + +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/ppdet/yolov3.h b/csrc/fastdeploy/vision/detection/ppdet/yolov3.h new file mode 100644 index 000000000..1b65bfca1 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/ppdet/yolov3.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/vision/detection/ppdet/ppyoloe.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +class FASTDEPLOY_DECL YOLOv3 : public PPYOLOE { + public: + YOLOv3(const std::string& model_file, const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::PADDLE); + + virtual std::string ModelName() const { return "PaddleDetection/YOLOv3"; } + + virtual bool Preprocess(Mat* mat, std::vector* outputs); +}; +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/ppdet/yolox.cc b/csrc/fastdeploy/vision/detection/ppdet/yolox.cc new file mode 100644 index 000000000..a60ebfcc4 --- /dev/null +++ b/csrc/fastdeploy/vision/detection/ppdet/yolox.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/detection/ppdet/yolox.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +PaddleYOLOX::PaddleYOLOX(const std::string& model_file, const std::string& params_file, + const std::string& config_file, const RuntimeOption& custom_option, + const Frontend& model_format) { + config_file_ = config_file; + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT}; + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + background_label = -1; + keep_top_k = 1000; + nms_eta = 1; + nms_threshold = 0.65; + nms_top_k = 10000; + normalized = true; + score_threshold = 0.001; + initialized = Initialize(); +} + +bool PaddleYOLOX::Preprocess(Mat* mat, std::vector* outputs) { + int origin_w = mat->Width(); + int origin_h = mat->Height(); + float scale[2] = {1.0, 1.0}; + for (size_t i = 0; i < processors_.size(); ++i) { + if (!(*(processors_[i].get()))(mat)) { + FDERROR << "Failed to process image data in " << processors_[i]->Name() + << "." 
<< std::endl; + return false; + } + if (processors_[i]->Name().find("Resize") != std::string::npos) { + scale[0] = mat->Height() * 1.0 / origin_h; + scale[1] = mat->Width() * 1.0 / origin_w; + } + } + + outputs->resize(2); + (*outputs)[0].name = InputInfoOfRuntime(0).name; + mat->ShareWithTensor(&((*outputs)[0])); + + // reshape to [1, c, h, w] + (*outputs)[0].shape.insert((*outputs)[0].shape.begin(), 1); + + (*outputs)[1].Allocate({1, 2}, FDDataType::FP32, InputInfoOfRuntime(1).name); + float* ptr = static_cast((*outputs)[1].MutableData()); + ptr[0] = scale[0]; + ptr[1] = scale[1]; + return true; +} +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/detection/ppdet/yolox.h b/csrc/fastdeploy/vision/detection/ppdet/yolox.h new file mode 100644 index 000000000..4ffe2f39c --- /dev/null +++ b/csrc/fastdeploy/vision/detection/ppdet/yolox.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/vision/detection/ppdet/ppyoloe.h" + +namespace fastdeploy { +namespace vision { +namespace detection { + +class FASTDEPLOY_DECL PaddleYOLOX : public PPYOLOE { + public: + PaddleYOLOX(const std::string& model_file, const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::PADDLE); + + virtual bool Preprocess(Mat* mat, std::vector* outputs); + + virtual std::string ModelName() const { return "PaddleDetection/YOLOX"; } +}; +} // namespace detection +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/facedet/contrib/retinaface.cc b/csrc/fastdeploy/vision/facedet/contrib/retinaface.cc new file mode 100644 index 000000000..ebb52010e --- /dev/null +++ b/csrc/fastdeploy/vision/facedet/contrib/retinaface.cc @@ -0,0 +1,310 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
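The PaddleYOLOX constructor above overrides the NMS defaults inherited from PPYOLOE (keep_top_k = 1000, score_threshold = 0.001, nms_threshold = 0.65); these members only take effect on the code path where FastDeploy itself runs MultiClassNMS, i.e. when the exported model does not fuse NMS. Because they are protected members of PPYOLOE, tuning them requires a subclass, as in this hypothetical sketch:

class TunedYOLOX : public fastdeploy::vision::detection::PaddleYOLOX {
 public:
  TunedYOLOX(const std::string& model, const std::string& params,
             const std::string& config)
      : PaddleYOLOX(model, params, config) {
    score_threshold = 0.25f;  // drop low-confidence candidates earlier
    keep_top_k = 100;         // cap the number of boxes kept after NMS
  }
};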
+ +#include "fastdeploy/vision/facedet/contrib/retinaface.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +struct RetinaAnchor { + float cx; + float cy; + float s_kx; + float s_ky; +}; + +void GenerateRetinaAnchors(const std::vector& size, + const std::vector& downsample_strides, + const std::vector>& min_sizes, + std::vector* anchors) { + // size: tuple of input (width, height) + // downsample_strides: downsample strides (steps), e.g (8,16,32) + // min_sizes: width and height for each anchor, + // e.g {{16, 32}, {64, 128}, {256, 512}} + int h = size[1]; + int w = size[0]; + std::vector> feature_maps; + for (auto s : downsample_strides) { + feature_maps.push_back( + {static_cast( + std::ceil(static_cast(h) / static_cast(s))), + static_cast( + std::ceil(static_cast(w) / static_cast(s)))}); + } + + (*anchors).clear(); + const size_t num_feature_map = feature_maps.size(); + // reference: layers/functions/prior_box.py#L21 + for (size_t k = 0; k < num_feature_map; ++k) { + auto f_map = feature_maps.at(k); // e.g [640//8,640//8] + auto tmp_min_sizes = min_sizes.at(k); // e.g [8,16] + int f_h = f_map.at(0); + int f_w = f_map.at(1); + for (size_t i = 0; i < f_h; ++i) { + for (size_t j = 0; j < f_w; ++j) { + for (auto min_size : tmp_min_sizes) { + float s_kx = + static_cast(min_size) / static_cast(w); // e.g 16/w + float s_ky = + static_cast(min_size) / static_cast(h); // e.g 16/h + // (x + 0.5) * step / w normalized loc mapping to input width + // (y + 0.5) * step / h normalized loc mapping to input height + float s = static_cast(downsample_strides.at(k)); + float cx = (static_cast(j) + 0.5f) * s / static_cast(w); + float cy = (static_cast(i) + 0.5f) * s / static_cast(h); + (*anchors).emplace_back( + RetinaAnchor{cx, cy, s_kx, s_ky}); // without clip + } + } + } + } +} + +RetinaFace::RetinaFace(const std::string& model_file, + const std::string& params_file, + const RuntimeOption& custom_option, + const Frontend& model_format) { + if (model_format == Frontend::ONNX) { + valid_cpu_backends = {Backend::ORT}; // 指定可用的CPU后端 + valid_gpu_backends = {Backend::ORT, Backend::TRT}; // 指定可用的GPU后端 + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool RetinaFace::Initialize() { + // parameters for preprocess + size = {640, 640}; + variance = {0.1f, 0.2f}; + downsample_strides = {8, 16, 32}; + min_sizes = {{16, 32}, {64, 128}, {256, 512}}; + landmarks_per_face = 5; + + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + // Check if the input shape is dynamic after Runtime already initialized, + is_dynamic_input_ = false; + auto shape = InputInfoOfRuntime(0).shape; + for (int i = 0; i < shape.size(); ++i) { + // if height or width is dynamic + if (i >= 2 && shape[i] <= 0) { + is_dynamic_input_ = true; + break; + } + } + return true; +} + +bool RetinaFace::Preprocess( + Mat* mat, FDTensor* output, + std::map>* im_info) { + // retinaface's preprocess steps + // 1. Resize + // 2. Convert(opencv style) or Normalize + // 3. 
HWC->CHW + int resize_w = size[0]; + int resize_h = size[1]; + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } + + // Compute `result = mat * alpha + beta` directly by channel + // Reference: detect.py#L94 + std::vector alpha = {1.f, 1.f, 1.f}; + std::vector beta = {-104.f, -117.f, -123.f}; // BGR; + Convert::Run(mat, alpha, beta); + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); // reshape to n, h, w, c + return true; +} + +bool RetinaFace::Postprocess( + std::vector& infer_result, FaceDetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold) { + // retinaface has 3 output tensors, boxes & conf & landmarks + FDASSERT( + (infer_result.size() == 3), + "The default number of output tensor must be 3 according to retinaface."); + FDTensor& boxes_tensor = infer_result.at(0); // (1,n,4) + FDTensor& conf_tensor = infer_result.at(1); // (1,n,2) + FDTensor& landmarks_tensor = infer_result.at(2); // (1,n,10) + FDASSERT((boxes_tensor.shape[0] == 1), "Only support batch =1 now."); + if (boxes_tensor.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." << std::endl; + return false; + } + + result->Clear(); + // must be setup landmarks_per_face before reserve + result->landmarks_per_face = landmarks_per_face; + result->Reserve(boxes_tensor.shape[1]); + + float* boxes_ptr = static_cast(boxes_tensor.Data()); + float* conf_ptr = static_cast(conf_tensor.Data()); + float* landmarks_ptr = static_cast(landmarks_tensor.Data()); + const size_t num_bboxes = boxes_tensor.shape[1]; // n + // fetch original image shape + auto iter_ipt = im_info.find("input_shape"); + FDASSERT((iter_ipt != im_info.end()), + "Cannot find input_shape from im_info."); + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + + // generate anchors with dowmsample strides + std::vector anchors; + GenerateRetinaAnchors(size, downsample_strides, min_sizes, &anchors); + + // decode bounding boxes + for (size_t i = 0; i < num_bboxes; ++i) { + float confidence = conf_ptr[2 * i + 1]; + // filter boxes by conf_threshold + if (confidence <= conf_threshold) { + continue; + } + float prior_cx = anchors.at(i).cx; + float prior_cy = anchors.at(i).cy; + float prior_s_kx = anchors.at(i).s_kx; + float prior_s_ky = anchors.at(i).s_ky; + + // fetch offsets (dx,dy,dw,dh) + float dx = boxes_ptr[4 * i + 0]; + float dy = boxes_ptr[4 * i + 1]; + float dw = boxes_ptr[4 * i + 2]; + float dh = boxes_ptr[4 * i + 3]; + // reference: Pytorch_Retinaface/utils/box_utils.py + float x = prior_cx + dx * variance[0] * prior_s_kx; + float y = prior_cy + dy * variance[0] * prior_s_ky; + float w = prior_s_kx * std::exp(dw * variance[1]); + float h = prior_s_ky * std::exp(dh * variance[1]); // (0.~1.) 
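+    // Worked instance of the decode above (illustrative numbers): with a
+    // 640x640 input, stride 8, min_size 16 and the anchor at grid
+    // (j=10, i=20): prior_cx = (10 + 0.5) * 8 / 640 = 0.13125,
+    // prior_cy = (20 + 0.5) * 8 / 640 = 0.25625, and
+    // prior_s_kx = prior_s_ky = 16 / 640 = 0.025. A raw offset dx = 1.2
+    // then shifts the center by dx * variance[0] * prior_s_kx
+    // = 1.2 * 0.1 * 0.025 = 0.003 in normalized units, while dw = 0.8
+    // rescales the width to 0.025 * exp(0.8 * 0.2) ≈ 0.0294.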
+ // from (x,y,w,h) to (x1,y1,x2,y2) + float x1 = (x - w / 2.f) * ipt_w; + float y1 = (y - h / 2.f) * ipt_h; + float x2 = (x + w / 2.f) * ipt_w; + float y2 = (y + h / 2.f) * ipt_h; + result->boxes.emplace_back(std::array{x1, y1, x2, y2}); + result->scores.push_back(confidence); + // decode landmarks (default 5 landmarks) + if (landmarks_per_face > 0) { + // reference: utils/box_utils.py#L241 + for (size_t j = 0; j < landmarks_per_face * 2; j += 2) { + float ldx = landmarks_ptr[i * (landmarks_per_face * 2) + (j + 0)]; + float ldy = landmarks_ptr[i * (landmarks_per_face * 2) + (j + 1)]; + float lx = (prior_cx + ldx * variance[0] * prior_s_kx) * ipt_w; + float ly = (prior_cy + ldy * variance[0] * prior_s_ky) * ipt_h; + result->landmarks.emplace_back(std::array{lx, ly}); + } + } + } + + if (result->boxes.size() == 0) { + return true; + } + + utils::NMS(result, nms_iou_threshold); + + // scale and clip box + for (size_t i = 0; i < result->boxes.size(); ++i) { + result->boxes[i][0] = std::max(result->boxes[i][0], 0.0f); + result->boxes[i][1] = std::max(result->boxes[i][1], 0.0f); + result->boxes[i][2] = std::max(result->boxes[i][2], 0.0f); + result->boxes[i][3] = std::max(result->boxes[i][3], 0.0f); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + // scale and clip landmarks + for (size_t i = 0; i < result->landmarks.size(); ++i) { + result->landmarks[i][0] = std::max(result->landmarks[i][0], 0.0f); + result->landmarks[i][1] = std::max(result->landmarks[i][1], 0.0f); + result->landmarks[i][0] = std::min(result->landmarks[i][0], ipt_w - 1.0f); + result->landmarks[i][1] = std::min(result->landmarks[i][1], ipt_h - 1.0f); + } + return true; +} + +bool RetinaFace::Predict(cv::Mat* im, FaceDetectionResult* result, + float conf_threshold, float nms_iou_threshold) { +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_START(0) +#endif + + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if (!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(0, "Preprocess") + TIMERECORD_START(1) +#endif + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." << std::endl; + return false; + } +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(1, "Inference") + TIMERECORD_START(2) +#endif + + if (!Postprocess(output_tensors, result, im_info, conf_threshold, + nms_iou_threshold)) { + FDERROR << "Failed to post process." 
<< std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(2, "Postprocess") +#endif + return true; +} + +} // namespace facedet +} // namespace vision +} // namespace fastdeploy \ No newline at end of file diff --git a/csrc/fastdeploy/vision/facedet/contrib/retinaface.h b/csrc/fastdeploy/vision/facedet/contrib/retinaface.h new file mode 100644 index 000000000..e1ef50e2e --- /dev/null +++ b/csrc/fastdeploy/vision/facedet/contrib/retinaface.h @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +class FASTDEPLOY_DECL RetinaFace : public FastDeployModel { + public: + // 当model_format为ONNX时,无需指定params_file + // 当model_format为Paddle时,则需同时指定model_file & params_file + RetinaFace(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX); + + // 定义模型的名称 + std::string ModelName() const { return "Pytorch_Retinaface"; } + + // 模型预测接口,即用户调用的接口 + // im 为用户的输入数据,目前对于CV均定义为cv::Mat + // result 为模型预测的输出结构体 + // conf_threshold 为后处理的参数 + // nms_iou_threshold 为后处理的参数 + virtual bool Predict(cv::Mat* im, FaceDetectionResult* result, + float conf_threshold = 0.25f, + float nms_iou_threshold = 0.4f); + + // 以下为模型在预测时的一些参数,基本是前后处理所需 + // 用户在创建模型后,可根据模型的要求,以及自己的需求 + // 对参数进行修改 + // tuple of (width, height), default (640, 640) + std::vector size; + // variance in RetinaFace's prior-box(anchor) generate process, + // default (0.1, 0.2) + std::vector variance; + // downsample strides (namely, steps) for RetinaFace to + // generate anchors, will take (8,16,32) as default values. + std::vector downsample_strides; + // min sizes, width and height for each anchor. 
+ std::vector> min_sizes; + // landmarks_per_face, default 5 in RetinaFace + int landmarks_per_face; + + private: + // 初始化函数,包括初始化后端,以及其它模型推理需要涉及的操作 + bool Initialize(); + + // 输入图像预处理操作 + // Mat为FastDeploy定义的数据结构 + // FDTensor为预处理后的Tensor数据,传给后端进行推理 + // im_info为预处理过程保存的数据,在后处理中需要用到 + bool Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info); + + // 后端推理结果后处理,输出给用户 + // infer_result 为后端推理后的输出Tensor + // result 为模型预测的结果 + // im_info 为预处理记录的信息,后处理用于还原box + // conf_threshold 后处理时过滤box的置信度阈值 + // nms_iou_threshold 后处理时NMS设定的iou阈值 + bool Postprocess(std::vector& infer_result, + FaceDetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold); + + // 查看输入是否为动态维度的 不建议直接使用 不同模型的逻辑可能不一致 + bool IsDynamicInput() const { return is_dynamic_input_; } + + bool is_dynamic_input_; +}; + +} // namespace facedet +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/facedet/contrib/retinaface_pybind.cc b/csrc/fastdeploy/vision/facedet/contrib/retinaface_pybind.cc new file mode 100644 index 000000000..9419327c4 --- /dev/null +++ b/csrc/fastdeploy/vision/facedet/contrib/retinaface_pybind.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindRetinaFace(pybind11::module& m) { + pybind11::class_(m, + "RetinaFace") + .def(pybind11::init()) + .def("predict", + [](vision::facedet::RetinaFace& self, pybind11::array& data, + float conf_threshold, float nms_iou_threshold) { + auto mat = PyArrayToCvMat(data); + vision::FaceDetectionResult res; + self.Predict(&mat, &res, conf_threshold, nms_iou_threshold); + return res; + }) + .def_readwrite("size", &vision::facedet::RetinaFace::size) + .def_readwrite("variance", &vision::facedet::RetinaFace::variance) + .def_readwrite("downsample_strides", + &vision::facedet::RetinaFace::downsample_strides) + .def_readwrite("min_sizes", &vision::facedet::RetinaFace::min_sizes) + .def_readwrite("landmarks_per_face", + &vision::facedet::RetinaFace::landmarks_per_face); +} +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/facedet/contrib/scrfd.cc b/csrc/fastdeploy/vision/facedet/contrib/scrfd.cc new file mode 100644 index 000000000..ffcff65c9 --- /dev/null +++ b/csrc/fastdeploy/vision/facedet/contrib/scrfd.cc @@ -0,0 +1,369 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/facedet/contrib/scrfd.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +void SCRFD::LetterBox(Mat* mat, const std::vector& size, + const std::vector& color, bool _auto, + bool scale_fill, bool scale_up, int stride) { + float scale = + std::min(size[1] * 1.0 / mat->Height(), size[0] * 1.0 / mat->Width()); + if (!scale_up) { + scale = std::min(scale, 1.0f); + } + + int resize_h = int(round(mat->Height() * scale)); + int resize_w = int(round(mat->Width() * scale)); + + int pad_w = size[0] - resize_w; + int pad_h = size[1] - resize_h; + if (_auto) { + pad_h = pad_h % stride; + pad_w = pad_w % stride; + } else if (scale_fill) { + pad_h = 0; + pad_w = 0; + resize_h = size[1]; + resize_w = size[0]; + } + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } + if (pad_h > 0 || pad_w > 0) { + float half_h = pad_h * 1.0 / 2; + int top = int(round(half_h - 0.1)); + int bottom = int(round(half_h + 0.1)); + float half_w = pad_w * 1.0 / 2; + int left = int(round(half_w - 0.1)); + int right = int(round(half_w + 0.1)); + Pad::Run(mat, top, bottom, left, right, color); + } +} + +SCRFD::SCRFD(const std::string& model_file, const std::string& params_file, + const RuntimeOption& custom_option, const Frontend& model_format) { + if (model_format == Frontend::ONNX) { + valid_cpu_backends = {Backend::ORT}; // 指定可用的CPU后端 + valid_gpu_backends = {Backend::ORT, Backend::TRT}; // 指定可用的GPU后端 + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool SCRFD::Initialize() { + // parameters for preprocess + use_kps = true; + size = {640, 640}; + padding_value = {0.0, 0.0, 0.0}; + is_mini_pad = false; + is_no_pad = false; + is_scale_up = false; + stride = 32; + downsample_strides = {8, 16, 32}; + num_anchors = 2; + landmarks_per_face = 5; + center_points_is_update_ = false; + max_nms = 30000; + // num_outputs = use_kps ? 9 : 6; + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + // Check if the input shape is dynamic after Runtime already initialized, + // Note that, We need to force is_mini_pad 'false' to keep static + // shape after padding (LetterBox) when the is_dynamic_shape is 'false'. + is_dynamic_input_ = false; + auto shape = InputInfoOfRuntime(0).shape; + for (int i = 0; i < shape.size(); ++i) { + // if height or width is dynamic + if (i >= 2 && shape[i] <= 0) { + is_dynamic_input_ = true; + break; + } + } + if (!is_dynamic_input_) { + is_mini_pad = false; + } + + return true; +} + +bool SCRFD::Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info) { + float ratio = std::min(size[1] * 1.0f / static_cast(mat->Height()), + size[0] * 1.0f / static_cast(mat->Width())); + if (ratio != 1.0) { + int interp = cv::INTER_AREA; + if (ratio > 1.0) { + interp = cv::INTER_LINEAR; + } + int resize_h = int(mat->Height() * ratio); + int resize_w = int(mat->Width() * ratio); + Resize::Run(mat, resize_w, resize_h, -1, -1, interp); + } + // scrfd's preprocess steps + // 1. 
letterbox + // 2. BGR->RGB + // 3. HWC->CHW + SCRFD::LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad, + is_scale_up, stride); + + BGR2RGB::Run(mat); + // Normalize::Run(mat, std::vector(mat->Channels(), 0.0), + // std::vector(mat->Channels(), 1.0)); + // Compute `result = mat * alpha + beta` directly by channel + // Original Repo/tools/scrfd.py: cv2.dnn.blobFromImage(img, 1.0/128, + // input_size, (127.5, 127.5, 127.5), swapRB=True) + std::vector alpha = {1.f / 128.f, 1.f / 128.f, 1.f / 128.f}; + std::vector beta = {-127.5f / 128.f, -127.5f / 128.f, -127.5f / 128.f}; + Convert::Run(mat, alpha, beta); + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); // reshape to n, h, w, c + return true; +} + +void SCRFD::GeneratePoints() { + if (center_points_is_update_ && !is_dynamic_input_) { + return; + } + // 8, 16, 32 + for (auto local_stride : downsample_strides) { + unsigned int num_grid_w = size[0] / local_stride; + unsigned int num_grid_h = size[1] / local_stride; + // y + for (unsigned int i = 0; i < num_grid_h; ++i) { + // x + for (unsigned int j = 0; j < num_grid_w; ++j) { + // num_anchors, col major + for (unsigned int k = 0; k < num_anchors; ++k) { + SCRFDPoint point; + point.cx = static_cast(j); + point.cy = static_cast(i); + center_points_[local_stride].push_back(point); + } + } + } + } + + center_points_is_update_ = true; +} + +bool SCRFD::Postprocess( + std::vector& infer_result, FaceDetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold) { + // number of downsample_strides + int fmc = downsample_strides.size(); + // scrfd has 6,9,10,15 output tensors + FDASSERT((infer_result.size() == 9 || infer_result.size() == 6 || + infer_result.size() == 10 || infer_result.size() == 15), + "The default number of output tensor must be 6, 9, 10, or 15 " + "according to scrfd."); + FDASSERT((fmc == 3 || fmc == 5), "The fmc must be 3 or 5"); + FDASSERT((infer_result.at(0).shape[0] == 1), "Only support batch =1 now."); + for (int i = 0; i < fmc; ++i) { + if (infer_result.at(i).dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." << std::endl; + return false; + } + } + int total_num_boxes = 0; + // compute the reserve space. 
+ for (int f = 0; f < fmc; ++f) { + total_num_boxes += infer_result.at(f).shape[1]; + }; + GeneratePoints(); + result->Clear(); + // scale the boxes to the origin image shape + auto iter_out = im_info.find("output_shape"); + auto iter_ipt = im_info.find("input_shape"); + FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(), + "Cannot find input_shape or output_shape from im_info."); + float out_h = iter_out->second[0]; + float out_w = iter_out->second[1]; + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + float scale = std::min(out_h / ipt_h, out_w / ipt_w); + float pad_h = (out_h - ipt_h * scale) / 2.0f; + float pad_w = (out_w - ipt_w * scale) / 2.0f; + if (is_mini_pad) { + // 和 LetterBox中_auto=true的处理逻辑对应 + pad_h = static_cast(static_cast(pad_h) % stride); + pad_w = static_cast(static_cast(pad_w) % stride); + } + // must be setup landmarks_per_face before reserve + result->landmarks_per_face = landmarks_per_face; + result->Reserve(total_num_boxes); + unsigned int count = 0; + // loop each stride + for (int f = 0; f < fmc; ++f) { + float* score_ptr = static_cast(infer_result.at(f).Data()); + float* bbox_ptr = static_cast(infer_result.at(f + fmc).Data()); + const unsigned int num_points = infer_result.at(f).shape[1]; + int current_stride = downsample_strides[f]; + auto& stride_points = center_points_[current_stride]; + // loop each anchor + for (unsigned int i = 0; i < num_points; ++i) { + const float cls_conf = score_ptr[i]; + if (cls_conf < conf_threshold) continue; // filter + auto& point = stride_points.at(i); + const float cx = point.cx; // cx + const float cy = point.cy; // cy + // bbox + const float* offsets = bbox_ptr + i * 4; + float l = offsets[0]; // left + float t = offsets[1]; // top + float r = offsets[2]; // right + float b = offsets[3]; // bottom + + float x1 = ((cx - l) * static_cast(current_stride) - + static_cast(pad_w)) / + scale; // cx - l x1 + float y1 = ((cy - t) * static_cast(current_stride) - + static_cast(pad_h)) / + scale; // cy - t y1 + float x2 = ((cx + r) * static_cast(current_stride) - + static_cast(pad_w)) / + scale; // cx + r x2 + float y2 = ((cy + b) * static_cast(current_stride) - + static_cast(pad_h)) / + scale; // cy + b y2 + result->boxes.emplace_back(std::array{x1, y1, x2, y2}); + result->scores.push_back(cls_conf); + if (use_kps) { + float* landmarks_ptr = + static_cast(infer_result.at(f + 2 * fmc).Data()); + // landmarks + const float* kps_offsets = landmarks_ptr + i * (landmarks_per_face * 2); + for (unsigned int j = 0; j < landmarks_per_face * 2; j += 2) { + float kps_l = kps_offsets[j]; + float kps_t = kps_offsets[j + 1]; + float kps_x = ((cx + kps_l) * static_cast(current_stride) - + static_cast(pad_w)) / + scale; // cx + l x + float kps_y = ((cy + kps_t) * static_cast(current_stride) - + static_cast(pad_h)) / + scale; // cy + t y + result->landmarks.emplace_back(std::array{kps_x, kps_y}); + } + } + count += 1; // limit boxes for nms. 
+ if (count > max_nms) { + break; + } + } + } + + // fetch original image shape + FDASSERT((iter_ipt != im_info.end()), + "Cannot find input_shape from im_info."); + + if (result->boxes.size() == 0) { + return true; + } + + utils::NMS(result, nms_iou_threshold); + + // scale and clip box + for (size_t i = 0; i < result->boxes.size(); ++i) { + result->boxes[i][0] = std::max(result->boxes[i][0], 0.0f); + result->boxes[i][1] = std::max(result->boxes[i][1], 0.0f); + result->boxes[i][2] = std::max(result->boxes[i][2], 0.0f); + result->boxes[i][3] = std::max(result->boxes[i][3], 0.0f); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + // scale and clip landmarks + for (size_t i = 0; i < result->landmarks.size(); ++i) { + result->landmarks[i][0] = std::max(result->landmarks[i][0], 0.0f); + result->landmarks[i][1] = std::max(result->landmarks[i][1], 0.0f); + result->landmarks[i][0] = std::min(result->landmarks[i][0], ipt_w - 1.0f); + result->landmarks[i][1] = std::min(result->landmarks[i][1], ipt_h - 1.0f); + } + return true; +} + +bool SCRFD::Predict(cv::Mat* im, FaceDetectionResult* result, + float conf_threshold, float nms_iou_threshold) { +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_START(0) +#endif + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if (!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(0, "Preprocess") + TIMERECORD_START(1) +#endif + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." << std::endl; + return false; + } +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(1, "Inference") + TIMERECORD_START(2) +#endif + + if (!Postprocess(output_tensors, result, im_info, conf_threshold, + nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(2, "Postprocess") +#endif + return true; +} + +} // namespace facedet +} // namespace vision +} // namespace fastdeploy \ No newline at end of file diff --git a/csrc/fastdeploy/vision/facedet/contrib/scrfd.h b/csrc/fastdeploy/vision/facedet/contrib/scrfd.h new file mode 100644 index 000000000..398301363 --- /dev/null +++ b/csrc/fastdeploy/vision/facedet/contrib/scrfd.h @@ -0,0 +1,122 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
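The coordinate mapping in SCRFD::Postprocess above undoes the LetterBox step: points decoded on the padded canvas are shifted by the padding, then divided by the resize scale. A hypothetical helper (not part of this diff) with a worked example:

// Maps one canvas coordinate back to the original image, mirroring
// SCRFD::Postprocess. For a 1280x720 image letterboxed into 640x640:
//   scale = min(640/720, 640/1280) = 0.5  -> resized to 640x360
//   pad_w = 0, pad_h = (640 - 360) / 2 = 140
// so a decoded y of 320 on the canvas maps to (320 - 140) / 0.5 = 360.
static float UnletterboxCoord(float v, float pad, float scale) {
  return (v - pad) / scale;
}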
+ +#pragma once +#include +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +class FASTDEPLOY_DECL SCRFD : public FastDeployModel { + public: + // 当model_format为ONNX时,无需指定params_file + // 当model_format为Paddle时,则需同时指定model_file & params_file + SCRFD(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX); + + // 定义模型的名称 + std::string ModelName() const { return "scrfd"; } + + // 模型预测接口,即用户调用的接口 + // im 为用户的输入数据,目前对于CV均定义为cv::Mat + // result 为模型预测的输出结构体 + // conf_threshold 为后处理的参数 + // nms_iou_threshold 为后处理的参数 + virtual bool Predict(cv::Mat* im, FaceDetectionResult* result, + float conf_threshold = 0.25f, + float nms_iou_threshold = 0.4f); + + // 以下为模型在预测时的一些参数,基本是前后处理所需 + // 用户在创建模型后,可根据模型的要求,以及自己的需求 + // 对参数进行修改 + // tuple of (width, height), default (640, 640) + std::vector size; + // downsample strides (namely, steps) for SCRFD to + // generate anchors, will take (8,16,32) as default values. + // padding value, size should be same with Channels + std::vector padding_value; + // only pad to the minimum rectange which height and width is times of stride + bool is_mini_pad; + // while is_mini_pad = false and is_no_pad = true, will resize the image to + // the set size + bool is_no_pad; + // if is_scale_up is false, the input image only can be zoom out, the maximum + // resize scale cannot exceed 1.0 + bool is_scale_up; + // padding stride, for is_mini_pad + int stride; + // for offseting the boxes by classes when using NMS + std::vector downsample_strides; + // landmarks_per_face, default 5 in SCRFD + int landmarks_per_face; + // are the outputs of onnx file with key points features or not + bool use_kps; + // the upperbond number of boxes processed by nms. + int max_nms; + // number anchors of each stride + unsigned int num_anchors; + + private: + // 初始化函数,包括初始化后端,以及其它模型推理需要涉及的操作 + bool Initialize(); + + // 输入图像预处理操作 + // Mat为FastDeploy定义的数据结构 + // FDTensor为预处理后的Tensor数据,传给后端进行推理 + // im_info为预处理过程保存的数据,在后处理中需要用到 + bool Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info); + + // 后端推理结果后处理,输出给用户 + // infer_result 为后端推理后的输出Tensor + // result 为模型预测的结果 + // im_info 为预处理记录的信息,后处理用于还原box + // conf_threshold 后处理时过滤box的置信度阈值 + // nms_iou_threshold 后处理时NMS设定的iou阈值 + bool Postprocess(std::vector& infer_result, + FaceDetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold); + + void GeneratePoints(); + + // 对图片进行LetterBox处理 + // mat 为读取到的原图 + // size 为输入模型的图像尺寸 + void LetterBox(Mat* mat, const std::vector& size, + const std::vector& color, bool _auto, + bool scale_fill = false, bool scale_up = true, + int stride = 32); + + bool is_dynamic_input_; + + bool center_points_is_update_; + + typedef struct { + float cx; + float cy; + } SCRFDPoint; + + std::unordered_map> center_points_; +}; +} // namespace facedet +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/facedet/contrib/scrfd_pybind.cc b/csrc/fastdeploy/vision/facedet/contrib/scrfd_pybind.cc new file mode 100644 index 000000000..7cfa4d025 --- /dev/null +++ b/csrc/fastdeploy/vision/facedet/contrib/scrfd_pybind.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindSCRFD(pybind11::module& m) { + // Bind SCRFD + pybind11::class_(m, "SCRFD") + .def(pybind11::init()) + .def("predict", + [](vision::facedet::SCRFD& self, pybind11::array& data, + float conf_threshold, float nms_iou_threshold) { + auto mat = PyArrayToCvMat(data); + vision::FaceDetectionResult res; + self.Predict(&mat, &res, conf_threshold, nms_iou_threshold); + return res; + }) + .def_readwrite("size", &vision::facedet::SCRFD::size) + .def_readwrite("padding_value", &vision::facedet::SCRFD::padding_value) + .def_readwrite("is_mini_pad", &vision::facedet::SCRFD::is_mini_pad) + .def_readwrite("is_no_pad", &vision::facedet::SCRFD::is_no_pad) + .def_readwrite("is_scale_up", &vision::facedet::SCRFD::is_scale_up) + .def_readwrite("stride", &vision::facedet::SCRFD::stride) + .def_readwrite("use_kps", &vision::facedet::SCRFD::use_kps) + .def_readwrite("max_nms", &vision::facedet::SCRFD::max_nms) + .def_readwrite("downsample_strides", + &vision::facedet::SCRFD::downsample_strides) + .def_readwrite("num_anchors", &vision::facedet::SCRFD::num_anchors) + .def_readwrite("landmarks_per_face", + &vision::facedet::SCRFD::landmarks_per_face); +} + +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/facedet/contrib/ultraface.cc b/csrc/fastdeploy/vision/facedet/contrib/ultraface.cc new file mode 100644 index 000000000..ed4962306 --- /dev/null +++ b/csrc/fastdeploy/vision/facedet/contrib/ultraface.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
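For completeness, the C++ counterpart of the SCRFD binding above looks like this sketch (the ONNX file name is a placeholder; the thresholds shown are the defaults declared in scrfd.h):

auto det = fastdeploy::vision::facedet::SCRFD("scrfd_500m_bnkps.onnx");
cv::Mat im = cv::imread("face.jpg");
fastdeploy::vision::FaceDetectionResult res;
if (!det.Predict(&im, &res, /*conf_threshold=*/0.25f,
                 /*nms_iou_threshold=*/0.4f)) {
  // Predict reports the failing stage (pre/infer/post) via FDERROR.
}
// res.boxes and res.landmarks (5 per face when use_kps is true) are filled.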
+ +#include "fastdeploy/vision/facedet/contrib/ultraface.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { + +namespace vision { + +namespace facedet { + +UltraFace::UltraFace(const std::string& model_file, + const std::string& params_file, + const RuntimeOption& custom_option, + const Frontend& model_format) { + if (model_format == Frontend::ONNX) { + valid_cpu_backends = {Backend::ORT}; // 指定可用的CPU后端 + valid_gpu_backends = {Backend::ORT, Backend::TRT}; // 指定可用的GPU后端 + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool UltraFace::Initialize() { + // parameters for preprocess + size = {320, 240}; + + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + // Check if the input shape is dynamic after Runtime already initialized, + is_dynamic_input_ = false; + auto shape = InputInfoOfRuntime(0).shape; + for (int i = 0; i < shape.size(); ++i) { + // if height or width is dynamic + if (i >= 2 && shape[i] <= 0) { + is_dynamic_input_ = true; + break; + } + } + return true; +} + +bool UltraFace::Preprocess( + Mat* mat, FDTensor* output, + std::map>* im_info) { + // ultraface's preprocess steps + // 1. resize + // 2. BGR->RGB + // 3. HWC->CHW + int resize_w = size[0]; + int resize_h = size[1]; + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } + + BGR2RGB::Run(mat); + // Compute `result = mat * alpha + beta` directly by channel + // Reference: detect_imgs_onnx.py#L73 + std::vector alpha = {1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f}; + std::vector beta = {-127.0f * (1.0f / 128.0f), + -127.0f * (1.0f / 128.0f), + -127.0f * (1.0f / 128.0f)}; // RGB; + Convert::Run(mat, alpha, beta); + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); // reshape to n, h, w, c + return true; +} + +bool UltraFace::Postprocess( + std::vector& infer_result, FaceDetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold) { + // ultraface has 2 output tensors, scores & boxes + FDASSERT( + (infer_result.size() == 2), + "The default number of output tensor must be 2 according to ultraface."); + FDTensor& scores_tensor = infer_result.at(0); // (1,4420,2) + FDTensor& boxes_tensor = infer_result.at(1); // (1,4420,4) + FDASSERT((scores_tensor.shape[0] == 1), "Only support batch =1 now."); + FDASSERT((boxes_tensor.shape[0] == 1), "Only support batch =1 now."); + if (scores_tensor.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." << std::endl; + return false; + } + if (boxes_tensor.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." << std::endl; + return false; + } + + result->Clear(); + // must be setup landmarks_per_face before reserve. + // ultraface detector does not detect landmarks by default. 
+ result->landmarks_per_face = 0; + result->Reserve(boxes_tensor.shape[1]); + + float* scores_ptr = static_cast(scores_tensor.Data()); + float* boxes_ptr = static_cast(boxes_tensor.Data()); + const size_t num_bboxes = boxes_tensor.shape[1]; // e.g 4420 + // fetch original image shape + auto iter_ipt = im_info.find("input_shape"); + FDASSERT((iter_ipt != im_info.end()), + "Cannot find input_shape from im_info."); + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + + // decode bounding boxes + for (size_t i = 0; i < num_bboxes; ++i) { + float confidence = scores_ptr[2 * i + 1]; + // filter boxes by conf_threshold + if (confidence <= conf_threshold) { + continue; + } + float x1 = boxes_ptr[4 * i + 0] * ipt_w; + float y1 = boxes_ptr[4 * i + 1] * ipt_h; + float x2 = boxes_ptr[4 * i + 2] * ipt_w; + float y2 = boxes_ptr[4 * i + 3] * ipt_h; + result->boxes.emplace_back(std::array{x1, y1, x2, y2}); + result->scores.push_back(confidence); + } + + if (result->boxes.size() == 0) { + return true; + } + + utils::NMS(result, nms_iou_threshold); + + // scale and clip box + for (size_t i = 0; i < result->boxes.size(); ++i) { + result->boxes[i][0] = std::max(result->boxes[i][0], 0.0f); + result->boxes[i][1] = std::max(result->boxes[i][1], 0.0f); + result->boxes[i][2] = std::max(result->boxes[i][2], 0.0f); + result->boxes[i][3] = std::max(result->boxes[i][3], 0.0f); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + return true; +} + +bool UltraFace::Predict(cv::Mat* im, FaceDetectionResult* result, + float conf_threshold, float nms_iou_threshold) { +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_START(0) +#endif + + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if (!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(0, "Preprocess") + TIMERECORD_START(1) +#endif + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." << std::endl; + return false; + } +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(1, "Inference") + TIMERECORD_START(2) +#endif + + if (!Postprocess(output_tensors, result, im_info, conf_threshold, + nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(2, "Postprocess") +#endif + return true; +} + +} // namespace facedet +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/facedet/contrib/ultraface.h b/csrc/fastdeploy/vision/facedet/contrib/ultraface.h new file mode 100644 index 000000000..387bc1f9a --- /dev/null +++ b/csrc/fastdeploy/vision/facedet/contrib/ultraface.h @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/csrc/fastdeploy/vision/facedet/contrib/ultraface.h b/csrc/fastdeploy/vision/facedet/contrib/ultraface.h
new file mode 100644
index 000000000..387bc1f9a
--- /dev/null
+++ b/csrc/fastdeploy/vision/facedet/contrib/ultraface.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace facedet {
+
+class FASTDEPLOY_DECL UltraFace : public FastDeployModel {
+ public:
+  // When model_format is ONNX, params_file is not required.
+  // When model_format is Paddle, both model_file and params_file are required.
+  UltraFace(const std::string& model_file, const std::string& params_file = "",
+            const RuntimeOption& custom_option = RuntimeOption(),
+            const Frontend& model_format = Frontend::ONNX);
+
+  // the name of the model
+  std::string ModelName() const {
+    return "Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB";
+  }
+
+  // The prediction interface called by users.
+  // im: the input data; for CV models this is a cv::Mat
+  // result: the output struct of model prediction
+  // conf_threshold: post-processing confidence threshold
+  // nms_iou_threshold: post-processing NMS IoU threshold
+  virtual bool Predict(cv::Mat* im, FaceDetectionResult* result,
+                       float conf_threshold = 0.7f,
+                       float nms_iou_threshold = 0.3f);
+
+  // The following are parameters used during prediction, mostly for
+  // pre/post-processing. After creating the model, users may adjust them
+  // to match the model's requirements and their own needs.
+  // tuple of (width, height), default (320, 240)
+  std::vector<int> size;
+
+ private:
+  // Initialization, including backend initialization and any other setup
+  // required before inference.
+  bool Initialize();
+
+  // Preprocess the input image.
+  // Mat is the data structure defined by FastDeploy.
+  // FDTensor holds the preprocessed data passed to the backend.
+  // im_info stores data recorded during preprocessing that the
+  // postprocessing step needs.
+  bool Preprocess(Mat* mat, FDTensor* outputs,
+                  std::map<std::string, std::array<float, 2>>* im_info);
+
+  // Postprocess the backend inference result and return it to the user.
+  // infer_result: the output tensors from the backend
+  // result: the prediction result
+  // im_info: information recorded during preprocessing, used to restore boxes
+  // conf_threshold: confidence threshold for filtering boxes
+  // nms_iou_threshold: IoU threshold used by NMS
+  bool Postprocess(std::vector<FDTensor>& infer_result,
+                   FaceDetectionResult* result,
+                   const std::map<std::string, std::array<float, 2>>& im_info,
+                   float conf_threshold, float nms_iou_threshold);
+
+  // Check whether the input has dynamic shape. Not recommended for direct
+  // use, since the logic may differ between models.
+  bool IsDynamicInput() const { return is_dynamic_input_; }
+
+  bool is_dynamic_input_;
+};
+
+} // namespace facedet
+} // namespace vision
+} // namespace fastdeploy
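Since size is a public member, the preprocess resolution can be retuned after construction. A short editorial sketch (only sensible when the loaded model actually accepts the new input shape, e.g. the 640x480 UltraFace export; the file name is a placeholder):

#include "fastdeploy/vision.h"

int main() {
  fastdeploy::vision::facedet::UltraFace model("version-RFB-640.onnx");
  // (width, height) must match what the network expects.
  model.size = {640, 480};
  return model.Initialized() ? 0 : -1;
}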
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindUltraFace(pybind11::module& m) {
+  pybind11::class_<vision::facedet::UltraFace, FastDeployModel>(m, "UltraFace")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::facedet::UltraFace& self, pybind11::array& data,
+              float conf_threshold, float nms_iou_threshold) {
+             auto mat = PyArrayToCvMat(data);
+             vision::FaceDetectionResult res;
+             self.Predict(&mat, &res, conf_threshold, nms_iou_threshold);
+             return res;
+           })
+      .def_readwrite("size", &vision::facedet::UltraFace::size);
+}
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/facedet/contrib/yolov5face.cc b/csrc/fastdeploy/vision/facedet/contrib/yolov5face.cc
new file mode 100644
index 000000000..96af230b0
--- /dev/null
+++ b/csrc/fastdeploy/vision/facedet/contrib/yolov5face.cc
@@ -0,0 +1,294 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/facedet/contrib/yolov5face.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace facedet {
+
+void LetterBox(Mat* mat, std::vector<int> size, std::vector<float> color,
+               bool _auto, bool scale_fill = false, bool scale_up = true,
+               int stride = 32) {
+  float scale =
+      std::min(size[1] * 1.0 / mat->Height(), size[0] * 1.0 / mat->Width());
+  if (!scale_up) {
+    scale = std::min(scale, 1.0f);
+  }
+
+  int resize_h = int(round(mat->Height() * scale));
+  int resize_w = int(round(mat->Width() * scale));
+
+  int pad_w = size[0] - resize_w;
+  int pad_h = size[1] - resize_h;
+  if (_auto) {
+    pad_h = pad_h % stride;
+    pad_w = pad_w % stride;
+  } else if (scale_fill) {
+    pad_h = 0;
+    pad_w = 0;
+    resize_h = size[1];
+    resize_w = size[0];
+  }
+  if (resize_h != mat->Height() || resize_w != mat->Width()) {
+    Resize::Run(mat, resize_w, resize_h);
+  }
+  if (pad_h > 0 || pad_w > 0) {
+    float half_h = pad_h * 1.0 / 2;
+    int top = int(round(half_h - 0.1));
+    int bottom = int(round(half_h + 0.1));
+    float half_w = pad_w * 1.0 / 2;
+    int left = int(round(half_w - 0.1));
+    int right = int(round(half_w + 0.1));
+    Pad::Run(mat, top, bottom, left, right, color);
+  }
+}
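To make the LetterBox arithmetic concrete, here is the same computation traced for a 1280x720 input into a 640x640 target with stride-32 mini-padding (an editorial sketch mirroring the function above, values checked by hand):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const int in_h = 720, in_w = 1280, target = 640, stride = 32;
  float scale = std::min(target * 1.0f / in_h, target * 1.0f / in_w);  // 0.5
  int resize_h = static_cast<int>(std::round(in_h * scale));  // 360
  int resize_w = static_cast<int>(std::round(in_w * scale));  // 640
  int pad_h = (target - resize_h) % stride;  // 280 % 32 = 24 (mini-pad)
  int pad_w = (target - resize_w) % stride;  // 0
  // 12 rows of padding on top, 12 on the bottom -> a 640x384 canvas.
  std::printf("resize to %dx%d, pad to %dx%d\n", resize_w, resize_h,
              resize_w + pad_w, resize_h + pad_h);
  return 0;
}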
+
+YOLOv5Face::YOLOv5Face(const std::string& model_file,
+                       const std::string& params_file,
+                       const RuntimeOption& custom_option,
+                       const Frontend& model_format) {
+  if (model_format == Frontend::ONNX) {
+    valid_cpu_backends = {Backend::ORT};  // the supported CPU backends
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};  // the supported GPU backends
+  } else {
+    valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
+    valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
+  }
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool YOLOv5Face::Initialize() {
+  // parameters for preprocess
+  size = {640, 640};
+  padding_value = {114.0, 114.0, 114.0};
+  is_mini_pad = false;
+  is_no_pad = false;
+  is_scale_up = false;
+  stride = 32;
+  landmarks_per_face = 5;
+
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // Check whether the input shape is dynamic after the runtime is
+  // initialized. Note that we need to force is_mini_pad to 'false' to keep
+  // a static shape after padding (LetterBox) when is_dynamic_input_ is
+  // 'false'.
+  is_dynamic_input_ = false;
+  auto shape = InputInfoOfRuntime(0).shape;
+  for (int i = 0; i < shape.size(); ++i) {
+    // if height or width is dynamic
+    if (i >= 2 && shape[i] <= 0) {
+      is_dynamic_input_ = true;
+      break;
+    }
+  }
+  if (!is_dynamic_input_) {
+    is_mini_pad = false;
+  }
+  return true;
+}
+
+bool YOLOv5Face::Preprocess(
+    Mat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
+  // process after image load
+  float ratio = std::min(size[1] * 1.0f / static_cast<float>(mat->Height()),
+                         size[0] * 1.0f / static_cast<float>(mat->Width()));
+  if (ratio != 1.0) {
+    // Pre-resize here so that down-scaling can use INTER_AREA, which
+    // preserves more detail than the default interpolation.
+    int interp = cv::INTER_AREA;
+    if (ratio > 1.0) {
+      interp = cv::INTER_LINEAR;
+    }
+    int resize_h = int(round(static_cast<float>(mat->Height()) * ratio));
+    int resize_w = int(round(static_cast<float>(mat->Width()) * ratio));
+    Resize::Run(mat, resize_w, resize_h, -1, -1, interp);
+  }
+  // yolov5face's preprocess steps
+  // 1. letterbox
+  // 2. BGR->RGB
+  // 3. HWC->CHW
+  LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad, is_scale_up,
+            stride);
+  BGR2RGB::Run(mat);
+  // Normalize::Run(mat, std::vector<float>(mat->Channels(), 0.0),
+  //                std::vector<float>(mat->Channels(), 1.0));
+  // Compute `result = mat * alpha + beta` directly by channel
+  std::vector<float> alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f};
+  std::vector<float> beta = {0.0f, 0.0f, 0.0f};
+  Convert::Run(mat, alpha, beta);
+
+  // Record output shape of preprocessed image
+  (*im_info)["output_shape"] = {static_cast<float>(mat->Height()),
+                                static_cast<float>(mat->Width())};
+
+  HWC2CHW::Run(mat);
+  Cast::Run(mat, "float");
+  mat->ShareWithTensor(output);
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
+  return true;
+}
+
+bool YOLOv5Face::Postprocess(
+    FDTensor& infer_result, FaceDetectionResult* result,
+    const std::map<std::string, std::array<float, 2>>& im_info,
+    float conf_threshold, float nms_iou_threshold) {
+  // infer_result: (1, n, 16), 16 = 4 (box) + 1 (obj) + 10 (landmarks) + 1 (cls)
+  FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
+  if (infer_result.dtype != FDDataType::FP32) {
+    FDERROR << "Only support post process with float32 data." << std::endl;
+    return false;
+  }
+
+  result->Clear();
+  // landmarks_per_face must be set before calling Reserve
+  result->landmarks_per_face = landmarks_per_face;
+  result->Reserve(infer_result.shape[1]);
+
+  float* data = static_cast<float*>(infer_result.Data());
+  for (size_t i = 0; i < infer_result.shape[1]; ++i) {
+    float* reg_cls_ptr = data + (i * infer_result.shape[2]);
+    float obj_conf = reg_cls_ptr[4];
+    float cls_conf = reg_cls_ptr[15];
+    float confidence = obj_conf * cls_conf;
+    // filter boxes by conf_threshold
+    if (confidence <= conf_threshold) {
+      continue;
+    }
+    float x = reg_cls_ptr[0];
+    float y = reg_cls_ptr[1];
+    float w = reg_cls_ptr[2];
+    float h = reg_cls_ptr[3];
+
+    // convert from [x, y, w, h] to [x1, y1, x2, y2]
+    result->boxes.emplace_back(std::array<float, 4>{
+        (x - w / 2.f), (y - h / 2.f), (x + w / 2.f), (y + h / 2.f)});
+    result->scores.push_back(confidence);
+    // decode landmarks (default 5 landmarks)
+    if (landmarks_per_face > 0) {
+      float* landmarks_ptr = reg_cls_ptr + 5;
+      for (size_t j = 0; j < landmarks_per_face * 2; j += 2) {
+        result->landmarks.emplace_back(
+            std::array<float, 2>{landmarks_ptr[j], landmarks_ptr[j + 1]});
+      }
+    }
+  }
+
+  if (result->boxes.size() == 0) {
+    return true;
+  }
+
+  utils::NMS(result, nms_iou_threshold);
+
+  // scale the boxes back to the original image shape
+  auto iter_out = im_info.find("output_shape");
+  auto iter_ipt = im_info.find("input_shape");
+  FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(),
+           "Cannot find input_shape or output_shape from im_info.");
+  float out_h = iter_out->second[0];
+  float out_w = iter_out->second[1];
+  float ipt_h = iter_ipt->second[0];
+  float ipt_w = iter_ipt->second[1];
+  float scale = std::min(out_h / ipt_h, out_w / ipt_w);
+  float pad_h = (out_h - ipt_h * scale) / 2.f;
+  float pad_w = (out_w - ipt_w * scale) / 2.f;
+  if (is_mini_pad) {
+    pad_h = static_cast<float>(static_cast<int>(pad_h) % stride);
+    pad_w = static_cast<float>(static_cast<int>(pad_w) % stride);
+  }
+  // scale and clip boxes
+  for (size_t i = 0; i < result->boxes.size(); ++i) {
+    result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / scale, 0.0f);
+    result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f);
+    result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f);
+    result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f);
+    result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f);
+    result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f);
+    result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f);
+    result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f);
+  }
+  // scale and clip landmarks
+  for (size_t i = 0; i < result->landmarks.size(); ++i) {
+    result->landmarks[i][0] =
+        std::max((result->landmarks[i][0] - pad_w) / scale, 0.0f);
+    result->landmarks[i][1] =
+        std::max((result->landmarks[i][1] - pad_h) / scale, 0.0f);
+    result->landmarks[i][0] = std::min(result->landmarks[i][0], ipt_w - 1.0f);
+    result->landmarks[i][1] = std::min(result->landmarks[i][1], ipt_h - 1.0f);
+  }
+  return true;
+}
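As a sanity check on the un-letterboxing above, the same 1280x720 example worked through by hand (editorial sketch):

#include <algorithm>
#include <cstdio>

int main() {
  // Network-space canvas from the LetterBox example: 640x384 (w x h).
  const float out_w = 640.f, out_h = 384.f, ipt_w = 1280.f, ipt_h = 720.f;
  float scale = std::min(out_h / ipt_h, out_w / ipt_w);  // 0.5
  float pad_h = (out_h - ipt_h * scale) / 2.f;           // 12 per side
  float pad_w = (out_w - ipt_w * scale) / 2.f;           // 0
  // A letterboxed y of 36 lands at (36 - 12) / 0.5 = 48 in the original.
  std::printf("y=36 -> %.1f\n", (36.f - pad_h) / scale);
  return 0;
}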
+
+bool YOLOv5Face::Predict(cv::Mat* im, FaceDetectionResult* result,
+                         float conf_threshold, float nms_iou_threshold) {
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_START(0)
+#endif
+
+  Mat mat(*im);
+  std::vector<FDTensor> input_tensors(1);
+
+  std::map<std::string, std::array<float, 2>> im_info;
+
+  // Record the shape of the image and the shape of the preprocessed image
+  im_info["input_shape"] = {static_cast<float>(mat.Height()),
+                            static_cast<float>(mat.Width())};
+  im_info["output_shape"] = {static_cast<float>(mat.Height()),
+                             static_cast<float>(mat.Width())};
+
+  if (!Preprocess(&mat, &input_tensors[0], &im_info)) {
+    FDERROR << "Failed to preprocess input image." << std::endl;
+    return false;
+  }
+
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(0, "Preprocess")
+  TIMERECORD_START(1)
+#endif
+
+  input_tensors[0].name = InputInfoOfRuntime(0).name;
+  std::vector<FDTensor> output_tensors;
+  if (!Infer(input_tensors, &output_tensors)) {
+    FDERROR << "Failed to run inference." << std::endl;
+    return false;
+  }
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(1, "Inference")
+  TIMERECORD_START(2)
+#endif
+
+  if (!Postprocess(output_tensors[0], result, im_info, conf_threshold,
+                   nms_iou_threshold)) {
+    FDERROR << "Failed to post process." << std::endl;
+    return false;
+  }
+
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(2, "Postprocess")
+#endif
+  return true;
+}
+
+} // namespace facedet
+} // namespace vision
+} // namespace fastdeploy
\ No newline at end of file
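The three padding flags interact with the dynamic-shape check in Initialize(); a short configuration sketch (editorial, assuming a dynamic-shape ONNX export; the file name is a placeholder):

#include "fastdeploy/vision.h"

int main() {
  fastdeploy::vision::facedet::YOLOv5Face model("yolov5s-face.dynamic.onnx");
  // Mini-padding keeps only stride-aligned padding, which shrinks the input
  // tensor. Note that Initialize() (run by the constructor) clears
  // is_mini_pad for fixed-shape models, so setting it here only has an
  // effect for dynamic-shape exports.
  model.is_mini_pad = true;
  // Allow small images to be enlarged as well as shrunk.
  model.is_scale_up = true;
  return model.Initialized() ? 0 : -1;
}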
diff --git a/csrc/fastdeploy/vision/facedet/contrib/yolov5face.h b/csrc/fastdeploy/vision/facedet/contrib/yolov5face.h
new file mode 100644
index 000000000..017c9681a
--- /dev/null
+++ b/csrc/fastdeploy/vision/facedet/contrib/yolov5face.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace facedet {
+
+class FASTDEPLOY_DECL YOLOv5Face : public FastDeployModel {
+ public:
+  // When model_format is ONNX, params_file is not required.
+  // When model_format is Paddle, both model_file and params_file are required.
+  YOLOv5Face(const std::string& model_file, const std::string& params_file = "",
+             const RuntimeOption& custom_option = RuntimeOption(),
+             const Frontend& model_format = Frontend::ONNX);
+
+  // the name of the model
+  std::string ModelName() const { return "yolov5-face"; }
+
+  // The prediction interface called by users.
+  // im: the input data; for CV models this is a cv::Mat
+  // result: the output struct of model prediction
+  // conf_threshold: post-processing confidence threshold
+  // nms_iou_threshold: post-processing NMS IoU threshold
+  virtual bool Predict(cv::Mat* im, FaceDetectionResult* result,
+                       float conf_threshold = 0.25,
+                       float nms_iou_threshold = 0.5);
+
+  // The following are parameters used during prediction, mostly for
+  // pre/post-processing. After creating the model, users may adjust them
+  // to match the model's requirements and their own needs.
+  // tuple of (width, height)
+  std::vector<int> size;
+  // padding value; its size should equal the number of channels
+  std::vector<float> padding_value;
+  // only pad to the minimum rectangle whose height and width are multiples
+  // of stride
+  bool is_mini_pad;
+  // when is_mini_pad = false and is_no_pad = true, resize the image to the
+  // target size directly instead of padding
+  bool is_no_pad;
+  // if is_scale_up is false, the input image can only be scaled down; the
+  // resize scale cannot exceed 1.0
+  bool is_scale_up;
+  // padding stride, used with is_mini_pad
+  int stride;
+  // the number of landmarks per face (if any), default 5 in the official
+  // yolov5face. Note that the output tensor's shape must be:
+  // (1, n, 4 + 1 + 2 * landmarks_per_face + 1) = box + obj + landmarks + cls
+  int landmarks_per_face;
+
+ private:
+  // Initialization, including backend initialization and any other setup
+  // required before inference.
+  bool Initialize();
+
+  // Preprocess the input image.
+  // Mat is the data structure defined by FastDeploy.
+  // FDTensor holds the preprocessed data passed to the backend.
+  // im_info stores data recorded during preprocessing that the
+  // postprocessing step needs.
+  bool Preprocess(Mat* mat, FDTensor* outputs,
+                  std::map<std::string, std::array<float, 2>>* im_info);
+
+  // Postprocess the backend inference result and return it to the user.
+  // infer_result: the output tensor from the backend
+  // result: the prediction result
+  // im_info: information recorded during preprocessing, used to restore boxes
+  // conf_threshold: confidence threshold for filtering boxes
+  // nms_iou_threshold: IoU threshold used by NMS
+  bool Postprocess(FDTensor& infer_result, FaceDetectionResult* result,
+                   const std::map<std::string, std::array<float, 2>>& im_info,
+                   float conf_threshold, float nms_iou_threshold);
+
+  // Check whether the input has dynamic shape. Not recommended for direct
+  // use, since the logic may differ between models.
+  bool IsDynamicInput() const { return is_dynamic_input_; }
+
+  bool is_dynamic_input_;
+};
+
+} // namespace facedet
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/facedet/contrib/yolov5face_pybind.cc b/csrc/fastdeploy/vision/facedet/contrib/yolov5face_pybind.cc
new file mode 100644
index 000000000..b843d4a9f
--- /dev/null
+++ b/csrc/fastdeploy/vision/facedet/contrib/yolov5face_pybind.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindYOLOv5Face(pybind11::module& m) {
+  pybind11::class_<vision::facedet::YOLOv5Face, FastDeployModel>(m,
+                                                                 "YOLOv5Face")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::facedet::YOLOv5Face& self, pybind11::array& data,
+              float conf_threshold, float nms_iou_threshold) {
+             auto mat = PyArrayToCvMat(data);
+             vision::FaceDetectionResult res;
+             self.Predict(&mat, &res, conf_threshold, nms_iou_threshold);
+             return res;
+           })
+      .def_readwrite("size", &vision::facedet::YOLOv5Face::size)
+      .def_readwrite("padding_value",
+                     &vision::facedet::YOLOv5Face::padding_value)
+      .def_readwrite("is_mini_pad", &vision::facedet::YOLOv5Face::is_mini_pad)
+      .def_readwrite("is_no_pad", &vision::facedet::YOLOv5Face::is_no_pad)
+      .def_readwrite("is_scale_up", &vision::facedet::YOLOv5Face::is_scale_up)
+      .def_readwrite("stride", &vision::facedet::YOLOv5Face::stride)
+      .def_readwrite("landmarks_per_face",
+                     &vision::facedet::YOLOv5Face::landmarks_per_face);
+}
+
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/facedet/facedet_pybind.cc b/csrc/fastdeploy/vision/facedet/facedet_pybind.cc
new file mode 100644
index 000000000..3d9a812af
--- /dev/null
+++ b/csrc/fastdeploy/vision/facedet/facedet_pybind.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+
+void BindRetinaFace(pybind11::module& m);
+void BindUltraFace(pybind11::module& m);
+void BindYOLOv5Face(pybind11::module& m);
+void BindSCRFD(pybind11::module& m);
+
+void BindFaceDet(pybind11::module& m) {
+  auto facedet_module = m.def_submodule("facedet", "Face detection models.");
+  BindRetinaFace(facedet_module);
+  BindUltraFace(facedet_module);
+  BindYOLOv5Face(facedet_module);
+  BindSCRFD(facedet_module);
+}
+} // namespace fastdeploy
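The registration pattern above is uniform: every model contributes one Bind* free function, and the submodule aggregator simply calls each of them. A hypothetical new detector would slot in the same way (sketch only; MyFaceDet is invented for illustration and does not exist in FastDeploy):

// Hypothetical illustration; MyFaceDet is not a real FastDeploy class.
void BindMyFaceDet(pybind11::module& m) {
  pybind11::class_<vision::facedet::MyFaceDet, FastDeployModel>(m, "MyFaceDet")
      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
      .def_readwrite("size", &vision::facedet::MyFaceDet::size);
}
// ...plus one extra call inside BindFaceDet(): BindMyFaceDet(facedet_module);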
diff --git a/csrc/fastdeploy/vision/faceid/contrib/arcface.cc b/csrc/fastdeploy/vision/faceid/contrib/arcface.cc
new file mode 100644
index 000000000..9c2b64763
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/arcface.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/faceid/contrib/arcface.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+ArcFace::ArcFace(const std::string& model_file, const std::string& params_file,
+                 const RuntimeOption& custom_option,
+                 const Frontend& model_format)
+    : InsightFaceRecognitionModel(model_file, params_file, custom_option,
+                                  model_format) {
+  initialized = Initialize();
+}
+
+bool ArcFace::Initialize() {
+  // Override this subclass method if initialization differs.
+  // We must check whether the backend has already been initialized; if so,
+  // InsightFaceRecognitionModel::Initialize() must not be called again,
+  // because it would initialize the backend a second time (the parent
+  // constructor already did so). In that case only the model-specific
+  // parameters are reset here.
+
+  // (1) the parent class has initialized the backend
+  if (initialized) {
+    // (1.1) re-init parameters for specific sub-classes
+    size = {112, 112};
+    alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+    beta = {-1.f, -1.f, -1.f};  // RGB
+    swap_rb = true;
+    l2_normalize = false;
+    return true;
+  }
+  // (2) the parent class has not initialized the backend
+  if (!InsightFaceRecognitionModel::Initialize()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // (2.1) re-init parameters for specific sub-classes
+  size = {112, 112};
+  alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+  beta = {-1.f, -1.f, -1.f};  // RGB
+  swap_rb = true;
+  l2_normalize = false;
+  return true;
+}
+
+bool ArcFace::Preprocess(Mat* mat, FDTensor* output) {
+  // Override this subclass method if preprocessing differs.
+  return InsightFaceRecognitionModel::Preprocess(mat, output);
+}
+
+bool ArcFace::Postprocess(std::vector<FDTensor>& infer_result,
+                          FaceRecognitionResult* result) {
+  // Override this subclass method if postprocessing differs.
+  return InsightFaceRecognitionModel::Postprocess(infer_result, result);
+}
+
+bool ArcFace::Predict(cv::Mat* im, FaceRecognitionResult* result) {
+  // If pre/post-processing differs, override Preprocess and Postprocess in
+  // the subclass; this method should then call the subclass's own
+  // Preprocess and Postprocess.
+  return InsightFaceRecognitionModel::Predict(im, result);
+}
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
\ No newline at end of file
diff --git a/csrc/fastdeploy/vision/faceid/contrib/arcface.h b/csrc/fastdeploy/vision/faceid/contrib/arcface.h
new file mode 100644
index 000000000..698fadceb
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/arcface.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+#include "fastdeploy/vision/faceid/contrib/insightface_rec.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+class FASTDEPLOY_DECL ArcFace : public InsightFaceRecognitionModel {
+ public:
+  // When model_format is ONNX, params_file is not required.
+  // When model_format is Paddle, both model_file and params_file are required.
+  // ArcFace supports the IResNet, IResNet2060, ViT and MobileFaceNet backbones.
+  ArcFace(const std::string& model_file, const std::string& params_file = "",
+          const RuntimeOption& custom_option = RuntimeOption(),
+          const Frontend& model_format = Frontend::ONNX);
+
+  // the name of the model
+  std::string ModelName() const override {
+    return "deepinsight/insightface/recognition/arcface_pytorch";
+  }
+
+  // The prediction interface called by users.
+  // im: the input data; for CV models this is a cv::Mat
+  // result: the output struct of model prediction
+  bool Predict(cv::Mat* im, FaceRecognitionResult* result) override;
+  // The parent class provides the basic configurable attributes:
+  // size, alpha, beta, swap_rb, l2_normalize.
+
+ private:
+  // Initialization, including backend initialization and any other setup
+  // required before inference.
+  bool Initialize() override;
+
+  // Preprocess the input image.
+  // Mat is the data structure defined by FastDeploy.
+  // FDTensor holds the preprocessed data passed to the backend.
+  bool Preprocess(Mat* mat, FDTensor* output) override;
+
+  // Postprocess the backend inference result and return it to the user.
+  // infer_result: the output tensors from the backend
+  // result: the prediction result
+  bool Postprocess(std::vector<FDTensor>& infer_result,
+                   FaceRecognitionResult* result) override;
+};
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/contrib/arcface_pybind.cc b/csrc/fastdeploy/vision/faceid/contrib/arcface_pybind.cc
new file mode 100644
index 000000000..cd9bf7c57
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/arcface_pybind.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindArcFace(pybind11::module& m) {
+  // Bind ArcFace
+  pybind11::class_<vision::faceid::ArcFace,
+                   vision::faceid::InsightFaceRecognitionModel>(m, "ArcFace")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::faceid::ArcFace& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::FaceRecognitionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           })
+      .def_readwrite("size", &vision::faceid::ArcFace::size)
+      .def_readwrite("alpha", &vision::faceid::ArcFace::alpha)
+      .def_readwrite("beta", &vision::faceid::ArcFace::beta)
+      .def_readwrite("swap_rb", &vision::faceid::ArcFace::swap_rb)
+      .def_readwrite("l2_normalize", &vision::faceid::ArcFace::l2_normalize);
+}
+
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/contrib/cosface.cc b/csrc/fastdeploy/vision/faceid/contrib/cosface.cc
new file mode 100644
index 000000000..4a4d6dc55
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/cosface.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/faceid/contrib/cosface.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+CosFace::CosFace(const std::string& model_file, const std::string& params_file,
+                 const RuntimeOption& custom_option,
+                 const Frontend& model_format)
+    : InsightFaceRecognitionModel(model_file, params_file, custom_option,
+                                  model_format) {
+  initialized = Initialize();
+}
+
+bool CosFace::Initialize() {
+  // Override this subclass method if initialization differs.
+  // We must check whether the backend has already been initialized; if so,
+  // InsightFaceRecognitionModel::Initialize() must not be called again,
+  // because it would initialize the backend a second time (the parent
+  // constructor already did so). In that case only the model-specific
+  // parameters are reset here.
+
+  // (1) the parent class has initialized the backend
+  if (initialized) {
+    // (1.1) re-init parameters for specific sub-classes
+    size = {112, 112};
+    alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+    beta = {-1.f, -1.f, -1.f};  // RGB
+    swap_rb = true;
+    l2_normalize = false;
+    return true;
+  }
+  // (2) the parent class has not initialized the backend
+  if (!InsightFaceRecognitionModel::Initialize()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // (2.1) re-init parameters for specific sub-classes
+  size = {112, 112};
+  alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+  beta = {-1.f, -1.f, -1.f};  // RGB
+  swap_rb = true;
+  l2_normalize = false;
+  return true;
+}
+
+bool CosFace::Preprocess(Mat* mat, FDTensor* output) {
+  // Override this subclass method if preprocessing differs.
+  return InsightFaceRecognitionModel::Preprocess(mat, output);
+}
+
+bool CosFace::Postprocess(std::vector<FDTensor>& infer_result,
+                          FaceRecognitionResult* result) {
+  // Override this subclass method if postprocessing differs.
+  return InsightFaceRecognitionModel::Postprocess(infer_result, result);
+}
+
+bool CosFace::Predict(cv::Mat* im, FaceRecognitionResult* result) {
+  // If pre/post-processing differs, override Preprocess and Postprocess in
+  // the subclass; this method should then call the subclass's own
+  // Preprocess and Postprocess.
+  return InsightFaceRecognitionModel::Predict(im, result);
+}
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
\ No newline at end of file
diff --git a/csrc/fastdeploy/vision/faceid/contrib/cosface.h b/csrc/fastdeploy/vision/faceid/contrib/cosface.h
new file mode 100644
index 000000000..92704536c
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/cosface.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+#include "fastdeploy/vision/faceid/contrib/insightface_rec.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+class FASTDEPLOY_DECL CosFace : public InsightFaceRecognitionModel {
+ public:
+  // When model_format is ONNX, params_file is not required.
+  // When model_format is Paddle, both model_file and params_file are required.
+  // CosFace supports the IResNet, IResNet2060, ViT and MobileFaceNet backbones.
+  CosFace(const std::string& model_file, const std::string& params_file = "",
+          const RuntimeOption& custom_option = RuntimeOption(),
+          const Frontend& model_format = Frontend::ONNX);
+
+  // the name of the model
+  // the model files provided by insightface/arcface also cover cosface
+  std::string ModelName() const override {
+    return "deepinsight/insightface/recognition/arcface_pytorch";
+  }
+
+  // The prediction interface called by users.
+  // im: the input data; for CV models this is a cv::Mat
+  // result: the output struct of model prediction
+  bool Predict(cv::Mat* im, FaceRecognitionResult* result) override;
+  // The parent class provides the basic configurable attributes:
+  // size, alpha, beta, swap_rb, l2_normalize.
+
+ private:
+  // Initialization, including backend initialization and any other setup
+  // required before inference.
+  bool Initialize() override;
+
+  // Preprocess the input image.
+  // Mat is the data structure defined by FastDeploy.
+  // FDTensor holds the preprocessed data passed to the backend.
+  bool Preprocess(Mat* mat, FDTensor* output) override;
+
+  // Postprocess the backend inference result and return it to the user.
+  // infer_result: the output tensors from the backend
+  // result: the prediction result
+  bool Postprocess(std::vector<FDTensor>& infer_result,
+                   FaceRecognitionResult* result) override;
+};
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/contrib/cosface_pybind.cc b/csrc/fastdeploy/vision/faceid/contrib/cosface_pybind.cc
new file mode 100644
index 000000000..c09f9e723
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/cosface_pybind.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindCosFace(pybind11::module& m) {
+  // Bind CosFace
+  pybind11::class_<vision::faceid::CosFace,
+                   vision::faceid::InsightFaceRecognitionModel>(m, "CosFace")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::faceid::CosFace& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::FaceRecognitionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           })
+      .def_readwrite("size", &vision::faceid::CosFace::size)
+      .def_readwrite("alpha", &vision::faceid::CosFace::alpha)
+      .def_readwrite("beta", &vision::faceid::CosFace::beta)
+      .def_readwrite("swap_rb", &vision::faceid::CosFace::swap_rb)
+      .def_readwrite("l2_normalize", &vision::faceid::CosFace::l2_normalize);
+}
+
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/contrib/insightface_rec.cc b/csrc/fastdeploy/vision/faceid/contrib/insightface_rec.cc
new file mode 100644
index 000000000..ddd7520d4
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/insightface_rec.cc
@@ -0,0 +1,153 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/faceid/contrib/insightface_rec.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+InsightFaceRecognitionModel::InsightFaceRecognitionModel(
+    const std::string& model_file, const std::string& params_file,
+    const RuntimeOption& custom_option, const Frontend& model_format) {
+  if (model_format == Frontend::ONNX) {
+    valid_cpu_backends = {Backend::ORT};  // the supported CPU backends
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};  // the supported GPU backends
+  } else {
+    valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
+    valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
+  }
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool InsightFaceRecognitionModel::Initialize() {
+  // parameters for preprocess
+  size = {112, 112};
+  alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+  beta = {-1.f, -1.f, -1.f};  // RGB
+  swap_rb = true;
+  l2_normalize = false;
+
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  return true;
+}
+
+bool InsightFaceRecognitionModel::Preprocess(Mat* mat, FDTensor* output) {
+  // face recognition model's preprocess steps in insightface
+  // reference: insightface/recognition/arcface_torch/inference.py
+  // 1. Resize
+  // 2. BGR2RGB
+  // 3. Convert(opencv style) or Normalize
+  // 4. HWC2CHW
+  int resize_w = size[0];
+  int resize_h = size[1];
+  if (resize_h != mat->Height() || resize_w != mat->Width()) {
+    Resize::Run(mat, resize_w, resize_h);
+  }
+  if (swap_rb) {
+    BGR2RGB::Run(mat);
+  }
+
+  Convert::Run(mat, alpha, beta);
+  HWC2CHW::Run(mat);
+  Cast::Run(mat, "float");
+
+  mat->ShareWithTensor(output);
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
+  return true;
+}
+
+bool InsightFaceRecognitionModel::Postprocess(
+    std::vector<FDTensor>& infer_result, FaceRecognitionResult* result) {
+  FDASSERT((infer_result.size() == 1),
+           "The default number of output tensors must be 1 according to "
+           "insightface.");
+  FDTensor& embedding_tensor = infer_result.at(0);
+  FDASSERT((embedding_tensor.shape[0] == 1), "Only support batch = 1 now.");
+  if (embedding_tensor.dtype != FDDataType::FP32) {
+    FDERROR << "Only support post process with float32 data." << std::endl;
+    return false;
+  }
+
+  result->Clear();
+  result->Resize(embedding_tensor.Numel());
+  // Copy the raw embedding vector directly, without L2 normalization in
+  // post-processing; let the user decide whether to normalize. If
+  // l2_normalize is set to 'true', utils::L2Normalize() is called to
+  // perform the normalization.
+  std::memcpy(result->embedding.data(), embedding_tensor.Data(),
+              embedding_tensor.Nbytes());
+  if (l2_normalize) {
+    auto norm_embedding = utils::L2Normalize(result->embedding);
+    std::memcpy(result->embedding.data(), norm_embedding.data(),
+                embedding_tensor.Nbytes());
+  }
+  return true;
+}
+
+bool InsightFaceRecognitionModel::Predict(cv::Mat* im,
+                                          FaceRecognitionResult* result) {
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_START(0)
+#endif
+
+  Mat mat(*im);
+  std::vector<FDTensor> input_tensors(1);
+
+  if (!Preprocess(&mat, &input_tensors[0])) {
+    FDERROR << "Failed to preprocess input image." << std::endl;
+    return false;
+  }
+
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(0, "Preprocess")
+  TIMERECORD_START(1)
+#endif
+
+  input_tensors[0].name = InputInfoOfRuntime(0).name;
+  std::vector<FDTensor> output_tensors;
+  if (!Infer(input_tensors, &output_tensors)) {
+    FDERROR << "Failed to run inference." << std::endl;
+    return false;
+  }
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(1, "Inference")
+  TIMERECORD_START(2)
+#endif
+
+  if (!Postprocess(output_tensors, result)) {
+    FDERROR << "Failed to post process." << std::endl;
+    return false;
+  }
+
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(2, "Postprocess")
+#endif
+  return true;
+}
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
\ No newline at end of file
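Downstream, two embeddings from Predict are usually compared with cosine similarity; the raw copy above deliberately leaves normalization to the caller. A self-contained editorial sketch of that comparison:

#include <cmath>
#include <cstdio>
#include <vector>

// Cosine similarity between two raw (unnormalized) embeddings.
float CosineSimilarity(const std::vector<float>& a,
                       const std::vector<float>& b) {
  float dot = 0.f, norm_a = 0.f, norm_b = 0.f;
  for (size_t i = 0; i < a.size(); ++i) {
    dot += a[i] * b[i];
    norm_a += a[i] * a[i];
    norm_b += b[i] * b[i];
  }
  return dot / (std::sqrt(norm_a) * std::sqrt(norm_b) + 1e-12f);
}

int main() {
  std::vector<float> e1 = {0.1f, 0.9f, 0.2f};
  std::vector<float> e2 = {0.2f, 0.8f, 0.1f};
  std::printf("similarity = %.4f\n", CosineSimilarity(e1, e2));
  return 0;
}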
diff --git a/csrc/fastdeploy/vision/faceid/contrib/insightface_rec.h b/csrc/fastdeploy/vision/faceid/contrib/insightface_rec.h
new file mode 100644
index 000000000..b8eb27262
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/insightface_rec.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+class FASTDEPLOY_DECL InsightFaceRecognitionModel : public FastDeployModel {
+ public:
+  // When model_format is ONNX, params_file is not required.
+  // When model_format is Paddle, both model_file and params_file are required.
+  // Base class for the insightface/recognition face recognition models.
+  InsightFaceRecognitionModel(
+      const std::string& model_file, const std::string& params_file = "",
+      const RuntimeOption& custom_option = RuntimeOption(),
+      const Frontend& model_format = Frontend::ONNX);
+
+  // the name of the model
+  virtual std::string ModelName() const { return "deepinsight/insightface"; }
+
+  // The following attributes may be modified by users.
+  // tuple of (width, height), default (112, 112)
+  std::vector<int> size;
+  // normalization alpha and beta: x' = x * alpha + beta
+  std::vector<float> alpha;
+  std::vector<float> beta;
+  // whether to swap the B and R channels, such as BGR->RGB, default true
+  bool swap_rb;
+  // whether to apply L2 normalization to the embedding values, default false
+  bool l2_normalize;
+
+  // The prediction interface called by users.
+  // im: the input data; for CV models this is a cv::Mat
+  // result: the output struct of model prediction
+  virtual bool Predict(cv::Mat* im, FaceRecognitionResult* result);
+
+  // Initialization, including backend initialization and any other setup
+  // required before inference.
+  virtual bool Initialize();
+
+  // Preprocess the input image.
+  // Mat is the data structure defined by FastDeploy.
+  // FDTensor holds the preprocessed data passed to the backend.
+  virtual bool Preprocess(Mat* mat, FDTensor* output);
+
+  // Postprocess the backend inference result and return it to the user.
+  // infer_result: the output tensors from the backend
+  // result: the prediction result
+  virtual bool Postprocess(std::vector<FDTensor>& infer_result,
+                           FaceRecognitionResult* result);
+};
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/contrib/insightface_rec_pybind.cc b/csrc/fastdeploy/vision/faceid/contrib/insightface_rec_pybind.cc
new file mode 100644
index 000000000..78df369bb
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/insightface_rec_pybind.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindInsightFaceRecognitionModel(pybind11::module& m) {
+  // Bind InsightFaceRecognitionModel
+  pybind11::class_<vision::faceid::InsightFaceRecognitionModel,
+                   FastDeployModel>(m, "InsightFaceRecognitionModel")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::faceid::InsightFaceRecognitionModel& self,
+              pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::FaceRecognitionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           })
+      .def_readwrite("size", &vision::faceid::InsightFaceRecognitionModel::size)
+      .def_readwrite("alpha",
+                     &vision::faceid::InsightFaceRecognitionModel::alpha)
+      .def_readwrite("beta", &vision::faceid::InsightFaceRecognitionModel::beta)
+      .def_readwrite("swap_rb",
+                     &vision::faceid::InsightFaceRecognitionModel::swap_rb)
+      .def_readwrite(
+          "l2_normalize",
+          &vision::faceid::InsightFaceRecognitionModel::l2_normalize);
+}
+
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/contrib/partial_fc.cc b/csrc/fastdeploy/vision/faceid/contrib/partial_fc.cc
new file mode 100644
index 000000000..8f13226cb
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/partial_fc.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/faceid/contrib/partial_fc.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+PartialFC::PartialFC(const std::string& model_file,
+                     const std::string& params_file,
+                     const RuntimeOption& custom_option,
+                     const Frontend& model_format)
+    : InsightFaceRecognitionModel(model_file, params_file, custom_option,
+                                  model_format) {
+  initialized = Initialize();
+}
+
+bool PartialFC::Initialize() {
+  // Override this subclass method if initialization differs.
+  // We must check whether the backend has already been initialized; if so,
+  // InsightFaceRecognitionModel::Initialize() must not be called again,
+  // because it would initialize the backend a second time (the parent
+  // constructor already did so). In that case only the model-specific
+  // parameters are reset here.
+
+  // (1) the parent class has initialized the backend
+  if (initialized) {
+    // (1.1) re-init parameters for specific sub-classes
+    size = {112, 112};
+    alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+    beta = {-1.f, -1.f, -1.f};  // RGB
+    swap_rb = true;
+    l2_normalize = false;
+    return true;
+  }
+  // (2) the parent class has not initialized the backend
+  if (!InsightFaceRecognitionModel::Initialize()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // (2.1) re-init parameters for specific sub-classes
+  size = {112, 112};
+  alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+  beta = {-1.f, -1.f, -1.f};  // RGB
+  swap_rb = true;
+  l2_normalize = false;
+  return true;
+}
+
+bool PartialFC::Preprocess(Mat* mat, FDTensor* output) {
+  // Override this subclass method if preprocessing differs.
+  return InsightFaceRecognitionModel::Preprocess(mat, output);
+}
+
+bool PartialFC::Postprocess(std::vector<FDTensor>& infer_result,
+                            FaceRecognitionResult* result) {
+  // Override this subclass method if postprocessing differs.
+  return InsightFaceRecognitionModel::Postprocess(infer_result, result);
+}
+
+bool PartialFC::Predict(cv::Mat* im, FaceRecognitionResult* result) {
+  // If pre/post-processing differs, override Preprocess and Postprocess in
+  // the subclass; this method should then call the subclass's own
+  // Preprocess and Postprocess.
+  return InsightFaceRecognitionModel::Predict(im, result);
+}
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
\ No newline at end of file
diff --git a/csrc/fastdeploy/vision/faceid/contrib/partial_fc.h b/csrc/fastdeploy/vision/faceid/contrib/partial_fc.h
new file mode 100644
index 000000000..88a1f2a2a
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/partial_fc.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+#include "fastdeploy/vision/faceid/contrib/insightface_rec.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+class FASTDEPLOY_DECL PartialFC : public InsightFaceRecognitionModel {
+ public:
+  // When model_format is ONNX, params_file is not required.
+  // When model_format is Paddle, both model_file and params_file are required.
+  PartialFC(const std::string& model_file, const std::string& params_file = "",
+            const RuntimeOption& custom_option = RuntimeOption(),
+            const Frontend& model_format = Frontend::ONNX);
+
+  // the name of the model
+  std::string ModelName() const override {
+    return "deepinsight/insightface/recognition/partial_fc";
+  }
+
+  // The prediction interface called by users.
+  // im: the input data; for CV models this is a cv::Mat
+  // result: the output struct of model prediction
+  bool Predict(cv::Mat* im, FaceRecognitionResult* result) override;
+  // The parent class provides the basic configurable attributes:
+  // size, alpha, beta, swap_rb, l2_normalize.
+
+ private:
+  // Initialization, including backend initialization and any other setup
+  // required before inference.
+  bool Initialize() override;
+
+  // Preprocess the input image.
+  // Mat is the data structure defined by FastDeploy.
+  // FDTensor holds the preprocessed data passed to the backend.
+  bool Preprocess(Mat* mat, FDTensor* output) override;
+
+  // Postprocess the backend inference result and return it to the user.
+  // infer_result: the output tensors from the backend
+  // result: the prediction result
+  bool Postprocess(std::vector<FDTensor>& infer_result,
+                   FaceRecognitionResult* result) override;
+};
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/contrib/partial_fc_pybind.cc b/csrc/fastdeploy/vision/faceid/contrib/partial_fc_pybind.cc
new file mode 100644
index 000000000..b8cb31358
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/partial_fc_pybind.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindPartialFC(pybind11::module& m) {
+  // Bind Partial FC
+  pybind11::class_<vision::faceid::PartialFC,
+                   vision::faceid::InsightFaceRecognitionModel>(m, "PartialFC")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::faceid::PartialFC& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::FaceRecognitionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           })
+      .def_readwrite("size", &vision::faceid::PartialFC::size)
+      .def_readwrite("alpha", &vision::faceid::PartialFC::alpha)
+      .def_readwrite("beta", &vision::faceid::PartialFC::beta)
+      .def_readwrite("swap_rb", &vision::faceid::PartialFC::swap_rb)
+      .def_readwrite("l2_normalize", &vision::faceid::PartialFC::l2_normalize);
+}
+
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/contrib/vpl.cc b/csrc/fastdeploy/vision/faceid/contrib/vpl.cc
new file mode 100644
index 000000000..bb34d3993
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/vpl.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/faceid/contrib/vpl.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+VPL::VPL(const std::string& model_file, const std::string& params_file,
+         const RuntimeOption& custom_option, const Frontend& model_format)
+    : InsightFaceRecognitionModel(model_file, params_file, custom_option,
+                                  model_format) {
+  initialized = Initialize();
+}
+
+bool VPL::Initialize() {
+  // Override this subclass method if initialization differs.
+  // We must check whether the backend has already been initialized; if so,
+  // InsightFaceRecognitionModel::Initialize() must not be called again,
+  // because it would initialize the backend a second time (the parent
+  // constructor already did so). In that case only the model-specific
+  // parameters are reset here.
+
+  // (1) the parent class has initialized the backend
+  if (initialized) {
+    // (1.1) re-init parameters for specific sub-classes
+    size = {112, 112};
+    alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+    beta = {-1.f, -1.f, -1.f};  // RGB
+    swap_rb = true;
+    l2_normalize = false;
+    return true;
+  }
+  // (2) the parent class has not initialized the backend
+  if (!InsightFaceRecognitionModel::Initialize()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  // (2.1) re-init parameters for specific sub-classes
+  size = {112, 112};
+  alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+  beta = {-1.f, -1.f, -1.f};  // RGB
+  swap_rb = true;
+  l2_normalize = false;
+  return true;
+}
+
+bool VPL::Preprocess(Mat* mat, FDTensor* output) {
+  // Override this subclass method if preprocessing differs.
+  return InsightFaceRecognitionModel::Preprocess(mat, output);
+}
+
+bool VPL::Postprocess(std::vector<FDTensor>& infer_result,
+                      FaceRecognitionResult* result) {
+  // Override this subclass method if postprocessing differs.
+  return InsightFaceRecognitionModel::Postprocess(infer_result, result);
+}
+
+bool VPL::Predict(cv::Mat* im, FaceRecognitionResult* result) {
+  // If pre/post-processing differs, override Preprocess and Postprocess in
+  // the subclass; this method should then call the subclass's own
+  // Preprocess and Postprocess.
+  return InsightFaceRecognitionModel::Predict(im, result);
+}
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
\ No newline at end of file
diff --git a/csrc/fastdeploy/vision/faceid/contrib/vpl.h b/csrc/fastdeploy/vision/faceid/contrib/vpl.h
new file mode 100644
index 000000000..696d13ac3
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/vpl.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+#include "fastdeploy/vision/faceid/contrib/insightface_rec.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace faceid {
+
+class FASTDEPLOY_DECL VPL : public InsightFaceRecognitionModel {
+ public:
+  // When model_format is ONNX, params_file is not required.
+  // When model_format is Paddle, both model_file and params_file are required.
+  // VPL supports the IResNet and IResNet1024 backbones.
+  VPL(const std::string& model_file, const std::string& params_file = "",
+      const RuntimeOption& custom_option = RuntimeOption(),
+      const Frontend& model_format = Frontend::ONNX);
+
+  // the name of the model
+  std::string ModelName() const override {
+    return "deepinsight/insightface/recognition/vpl";
+  }
+
+  // The prediction interface called by users.
+  // im: the input data; for CV models this is a cv::Mat
+  // result: the output struct of model prediction
+  bool Predict(cv::Mat* im, FaceRecognitionResult* result) override;
+  // The parent class provides the basic configurable attributes:
+  // size, alpha, beta, swap_rb, l2_normalize.
+
+ private:
+  // Initialization, including backend initialization and any other setup
+  // required before inference.
+  bool Initialize() override;
+
+  // Preprocess the input image.
+  // Mat is the data structure defined by FastDeploy.
+  // FDTensor holds the preprocessed data passed to the backend.
+  bool Preprocess(Mat* mat, FDTensor* output) override;
+
+  // Postprocess the backend inference result and return it to the user.
+  // infer_result: the output tensors from the backend
+  // result: the prediction result
+  bool Postprocess(std::vector<FDTensor>& infer_result,
+                   FaceRecognitionResult* result) override;
+};
+
+} // namespace faceid
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/contrib/vpl_pybind.cc b/csrc/fastdeploy/vision/faceid/contrib/vpl_pybind.cc
new file mode 100644
index 000000000..448cf3d3b
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/contrib/vpl_pybind.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindVPL(pybind11::module& m) {
+  // Bind VPL
+  pybind11::class_<vision::faceid::VPL,
+                   vision::faceid::InsightFaceRecognitionModel>(m, "VPL")
+      .def(pybind11::init<std::string, std::string, RuntimeOption, Frontend>())
+      .def("predict",
+           [](vision::faceid::VPL& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::FaceRecognitionResult res;
+             self.Predict(&mat, &res);
+             return res;
+           })
+      .def_readwrite("size", &vision::faceid::VPL::size)
+      .def_readwrite("alpha", &vision::faceid::VPL::alpha)
+      .def_readwrite("beta", &vision::faceid::VPL::beta)
+      .def_readwrite("swap_rb", &vision::faceid::VPL::swap_rb)
+      .def_readwrite("l2_normalize", &vision::faceid::VPL::l2_normalize);
+}
+
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/faceid/faceid_pybind.cc b/csrc/fastdeploy/vision/faceid/faceid_pybind.cc
new file mode 100644
index 000000000..40a1c6727
--- /dev/null
+++ b/csrc/fastdeploy/vision/faceid/faceid_pybind.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+
+void BindArcFace(pybind11::module& m);
+void BindInsightFaceRecognitionModel(pybind11::module& m);
+void BindCosFace(pybind11::module& m);
+void BindPartialFC(pybind11::module& m);
+void BindVPL(pybind11::module& m);
+
+void BindFaceId(pybind11::module& m) {
+  auto faceid_module = m.def_submodule("faceid", "Face recognition models.");
+  BindInsightFaceRecognitionModel(faceid_module);
+  BindArcFace(faceid_module);
+  BindCosFace(faceid_module);
+  BindPartialFC(faceid_module);
+  BindVPL(faceid_module);
+}
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/matting/contrib/modnet.cc b/csrc/fastdeploy/vision/matting/contrib/modnet.cc
new file mode 100644
index 000000000..b98d055e3
--- /dev/null
+++ b/csrc/fastdeploy/vision/matting/contrib/modnet.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/matting/contrib/modnet.h"
+#include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace matting {
+
+MODNet::MODNet(const std::string& model_file, const std::string& params_file,
+               const RuntimeOption& custom_option,
+               const Frontend& model_format) {
+  if (model_format == Frontend::ONNX) {
+    valid_cpu_backends = {Backend::ORT};  // the valid CPU backends
+    valid_gpu_backends = {Backend::ORT, Backend::TRT};  // the valid GPU backends
+  } else {
+    valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
+    valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
+  }
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool MODNet::Initialize() {
+  // parameters for preprocess
+  size = {256, 256};
+  alpha = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
+  beta = {-1.f, -1.f, -1.f};  // RGB
+  swap_rb = true;
+
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  return true;
+}
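+
+// Note (illustrative): with the defaults above (alpha = 1/127.5, beta = -1),
+// the Convert step in Preprocess maps uint8 pixels from [0, 255] to [-1, 1]:
+//   y = x * (1.f / 127.5f) + (-1.f)   // 0 -> -1.0, 127.5 -> 0.0, 255 -> 1.0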
+
+bool MODNet::Preprocess(Mat* mat, FDTensor* output,
+                        std::map<std::string, std::array<int, 2>>* im_info) {
+  // 1. Resize
+  // 2. BGR2RGB
+  // 3. Convert(opencv style) or Normalize
+  // 4. HWC2CHW
+  int resize_w = size[0];
+  int resize_h = size[1];
+  if (resize_h != mat->Height() || resize_w != mat->Width()) {
+    Resize::Run(mat, resize_w, resize_h);
+  }
+  if (swap_rb) {
+    BGR2RGB::Run(mat);
+  }
+
+  Convert::Run(mat, alpha, beta);
+  // Record output shape of preprocessed image
+  (*im_info)["output_shape"] = {mat->Height(), mat->Width()};
+
+  HWC2CHW::Run(mat);
+  Cast::Run(mat, "float");
+
+  mat->ShareWithTensor(output);
+  output->shape.insert(output->shape.begin(), 1);  // expand to (n, c, h, w)
+  return true;
+}
+
+bool MODNet::Postprocess(
+    std::vector<FDTensor>& infer_result, MattingResult* result,
+    const std::map<std::string, std::array<int, 2>>& im_info) {
+  FDASSERT((infer_result.size() == 1),
+           "The default number of output tensor must be 1 according to "
+           "modnet.");
+  FDTensor& alpha_tensor = infer_result.at(0);  // (1,h,w,1)
+  FDASSERT((alpha_tensor.shape[0] == 1), "Only support batch = 1 now.");
+  if (alpha_tensor.dtype != FDDataType::FP32) {
+    FDERROR << "Only support post process with float32 data." << std::endl;
+    return false;
+  }
+
+  // Fetch alpha first and resize it (via OpenCV).
+  auto iter_ipt = im_info.find("input_shape");
+  auto iter_out = im_info.find("output_shape");
+  FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(),
+           "Cannot find input_shape or output_shape from im_info.");
+  int out_h = iter_out->second[0];
+  int out_w = iter_out->second[1];
+  int ipt_h = iter_ipt->second[0];
+  int ipt_w = iter_ipt->second[1];
+
+  // TODO: switch this to FDTensor or Mat arithmetic; it currently depends on
+  // cv::Mat.
+  float* alpha_ptr = static_cast<float*>(alpha_tensor.Data());
+  cv::Mat alpha_zero_copy_ref(out_h, out_w, CV_32FC1, alpha_ptr);
+  Mat alpha_resized(alpha_zero_copy_ref);  // ref-only, zero copy.
+  if ((out_h != ipt_h) || (out_w != ipt_w)) {
+    // already allocated a new continuous memory after resize.
+    // cv::resize(alpha_resized, alpha_resized, cv::Size(ipt_w, ipt_h));
+    Resize::Run(&alpha_resized, ipt_w, ipt_h, -1, -1);
+  }
+
+  result->Clear();
+  // note: must set up shape before Resize
+  result->contain_foreground = false;
+  // alpha matte sized to match the original input image
+  result->shape = {static_cast<int64_t>(ipt_h), static_cast<int64_t>(ipt_w)};
+  int numel = ipt_h * ipt_w;
+  int nbytes = numel * sizeof(float);
+  result->Resize(numel);
+  std::memcpy(result->alpha.data(), alpha_resized.GetCpuMat()->data, nbytes);
+  return true;
+}
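+
+// Illustrative only (not part of this diff's API surface): once Predict
+// succeeds, the single-channel alpha matte can be wrapped without copying,
+// e.g. for a MattingResult `res` whose shape is {h, w}:
+//   cv::Mat alpha(res.shape[0], res.shape[1], CV_32FC1, res.alpha.data());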
+
+bool MODNet::Predict(cv::Mat* im, MattingResult* result) {
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_START(0)
+#endif
+
+  Mat mat(*im);
+  std::vector<FDTensor> input_tensors(1);
+
+  std::map<std::string, std::array<int, 2>> im_info;
+  // Record the shape of image and the shape of preprocessed image
+  im_info["input_shape"] = {mat.Height(), mat.Width()};
+  im_info["output_shape"] = {mat.Height(), mat.Width()};
+
+  if (!Preprocess(&mat, &input_tensors[0], &im_info)) {
+    FDERROR << "Failed to preprocess input image." << std::endl;
+    return false;
+  }
+
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(0, "Preprocess")
+  TIMERECORD_START(1)
+#endif
+
+  input_tensors[0].name = InputInfoOfRuntime(0).name;
+  std::vector<FDTensor> output_tensors;
+  if (!Infer(input_tensors, &output_tensors)) {
+    FDERROR << "Failed to inference." << std::endl;
+    return false;
+  }
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(1, "Inference")
+  TIMERECORD_START(2)
+#endif
+
+  if (!Postprocess(output_tensors, result, im_info)) {
+    FDERROR << "Failed to post process." << std::endl;
+    return false;
+  }
+
+#ifdef FASTDEPLOY_DEBUG
+  TIMERECORD_END(2, "Postprocess")
+#endif
+  return true;
+}
+
+} // namespace matting
+} // namespace vision
+} // namespace fastdeploy
\ No newline at end of file
diff --git a/csrc/fastdeploy/vision/matting/contrib/modnet.h b/csrc/fastdeploy/vision/matting/contrib/modnet.h
new file mode 100644
index 000000000..047fd3aea
--- /dev/null
+++ b/csrc/fastdeploy/vision/matting/contrib/modnet.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace matting {
+
+class FASTDEPLOY_DECL MODNet : public FastDeployModel {
+ public:
+  // When model_format is ONNX, params_file is not needed.
+  // When model_format is Paddle, both model_file and params_file are required.
+  MODNet(const std::string& model_file, const std::string& params_file = "",
+         const RuntimeOption& custom_option = RuntimeOption(),
+         const Frontend& model_format = Frontend::ONNX);
+
+  // Name of the model.
+  std::string ModelName() const { return "matting/MODNet"; }
+
+  // The following attributes are user-tunable.
+  // tuple of (width, height), default (256, 256)
+  std::vector<int> size;
+  // normalization parameters, applied as x' = x * alpha + beta
+  std::vector<float> alpha;
+  std::vector<float> beta;
+  // whether to swap the B and R channel, such as BGR->RGB, default true.
+  bool swap_rb;
+
+  // Prediction interface, i.e. the entry point called by users.
+  // im is the user input; for CV models it is a cv::Mat.
+  // result is the output struct filled by the prediction.
+  bool Predict(cv::Mat* im, MattingResult* result);
+
+ private:
+  // Initialization, including backend setup and any other work the model
+  // inference depends on.
+  bool Initialize();
+
+  // Preprocess the input image.
+  // Mat is the image data structure defined by FastDeploy.
+  // FDTensor receives the preprocessed tensor handed to the backend.
+  bool Preprocess(Mat* mat, FDTensor* output,
+                  std::map<std::string, std::array<int, 2>>* im_info);
+
+  // Postprocess the backend inference output for the user.
+  // infer_result holds the output tensors of the backend.
+  // result is the final prediction.
+  bool Postprocess(std::vector<FDTensor>& infer_result, MattingResult* result,
+                   const std::map<std::string, std::array<int, 2>>& im_info);
+};
+
+} // namespace matting
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/matting/contrib/modnet_pybind.cc b/csrc/fastdeploy/vision/matting/contrib/modnet_pybind.cc
new file mode 100644
index 000000000..bfb8b1f88
--- /dev/null
+++ b/csrc/fastdeploy/vision/matting/contrib/modnet_pybind.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
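+
+// Binding note: `predict` below converts the incoming numpy array to a
+// cv::Mat via PyArrayToCvMat and returns the MattingResult by value;
+// size/alpha/beta/swap_rb are exposed read-write so preprocessing can be
+// tuned from Python.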
+ +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindMODNet(pybind11::module& m) { + // Bind MODNet + pybind11::class_(m, "MODNet") + .def(pybind11::init()) + .def("predict", + [](vision::matting::MODNet& self, pybind11::array& data) { + auto mat = PyArrayToCvMat(data); + vision::MattingResult res; + self.Predict(&mat, &res); + return res; + }) + .def_readwrite("size", &vision::matting::MODNet::size) + .def_readwrite("alpha", &vision::matting::MODNet::alpha) + .def_readwrite("beta", &vision::matting::MODNet::beta) + .def_readwrite("swap_rb", &vision::matting::MODNet::swap_rb); +} + +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/matting/matting_pybind.cc b/csrc/fastdeploy/vision/matting/matting_pybind.cc new file mode 100644 index 000000000..e5fd78925 --- /dev/null +++ b/csrc/fastdeploy/vision/matting/matting_pybind.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { + +void BindMODNet(pybind11::module& m); + +void BindMatting(pybind11::module& m) { + auto matting_module = + m.def_submodule("matting", "Image object matting models."); + BindMODNet(matting_module); +} +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/ppcls/model.cc b/csrc/fastdeploy/vision/ppcls/model.cc new file mode 100644 index 000000000..c4e5b767c --- /dev/null +++ b/csrc/fastdeploy/vision/ppcls/model.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/ppcls/model.h" +#include "fastdeploy/vision/utils/utils.h" +#include "yaml-cpp/yaml.h" + +namespace fastdeploy { +namespace vision { +namespace ppcls { + +Model::Model(const std::string& model_file, const std::string& params_file, + const std::string& config_file, const RuntimeOption& custom_option, + const Frontend& model_format) { + config_file_ = config_file; + valid_cpu_backends = {Backend::ORT, Backend::PDINFER}; + valid_gpu_backends = {Backend::ORT, Backend::PDINFER}; + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool Model::Initialize() { + if (!BuildPreprocessPipelineFromConfig()) { + FDERROR << "Failed to build preprocess pipeline from configuration file." 
+            << std::endl;
+    return false;
+  }
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  return true;
+}
+
+bool Model::BuildPreprocessPipelineFromConfig() {
+  processors_.clear();
+  YAML::Node cfg;
+  try {
+    cfg = YAML::LoadFile(config_file_);
+  } catch (YAML::BadFile& e) {
+    FDERROR << "Failed to load yaml file " << config_file_
+            << ", maybe you should check this file." << std::endl;
+    return false;
+  }
+  auto preprocess_cfg = cfg["PreProcess"]["transform_ops"];
+  processors_.push_back(std::make_shared<BGR2RGB>());
+  for (const auto& op : preprocess_cfg) {
+    FDASSERT(op.IsMap(),
+             "Require the transform information in yaml be Map type.");
+    auto op_name = op.begin()->first.as<std::string>();
+    if (op_name == "ResizeImage") {
+      int target_size = op.begin()->second["resize_short"].as<int>();
+      bool use_scale = false;
+      int interp = 1;
+      processors_.push_back(
+          std::make_shared<ResizeByShort>(target_size, 1, use_scale));
+    } else if (op_name == "CropImage") {
+      int width = op.begin()->second["size"].as<int>();
+      int height = op.begin()->second["size"].as<int>();
+      processors_.push_back(std::make_shared<CenterCrop>(width, height));
+    } else if (op_name == "NormalizeImage") {
+      auto mean = op.begin()->second["mean"].as<std::vector<float>>();
+      auto std = op.begin()->second["std"].as<std::vector<float>>();
+      auto scale = op.begin()->second["scale"].as<float>();
+      FDASSERT((scale - 0.00392157) < 1e-06 && (scale - 0.00392157) > -1e-06,
+               "Only support scale 0.00392157 in Normalize, which means the "
+               "pixel values are in range [0, 255].");
+      processors_.push_back(std::make_shared<Normalize>(mean, std));
+    } else if (op_name == "ToCHWImage") {
+      processors_.push_back(std::make_shared<HWC2CHW>());
+    } else {
+      FDERROR << "Unexpected preprocess operator: " << op_name << "."
+              << std::endl;
+      return false;
+    }
+  }
+  return true;
+}
+
+bool Model::Preprocess(Mat* mat, FDTensor* output) {
+  for (size_t i = 0; i < processors_.size(); ++i) {
+    if (!(*(processors_[i].get()))(mat)) {
+      FDERROR << "Failed to process image data in " << processors_[i]->Name()
+              << "." << std::endl;
+      return false;
+    }
+  }
+
+  int channel = mat->Channels();
+  int width = mat->Width();
+  int height = mat->Height();
+  output->name = InputInfoOfRuntime(0).name;
+  output->SetExternalData({1, channel, height, width}, FDDataType::FP32,
+                          mat->GetCpuMat()->ptr());
+  return true;
+}
+
+bool Model::Postprocess(const FDTensor& infer_result, ClassifyResult* result,
+                        int topk) {
+  int num_classes = infer_result.shape[1];
+  const float* infer_result_buffer =
+      reinterpret_cast<const float*>(infer_result.data.data());
+  topk = std::min(num_classes, topk);
+  result->label_ids =
+      utils::TopKIndices(infer_result_buffer, num_classes, topk);
+  result->scores.resize(topk);
+  for (int i = 0; i < topk; ++i) {
+    result->scores[i] = *(infer_result_buffer + result->label_ids[i]);
+  }
+  return true;
+}
+
+bool Model::Predict(cv::Mat* im, ClassifyResult* result, int topk) {
+  Mat mat(*im);
+  std::vector<FDTensor> processed_data(1);
+  if (!Preprocess(&mat, &(processed_data[0]))) {
+    FDERROR << "Failed to preprocess input data while using model:"
+            << ModelName() << "." << std::endl;
+    return false;
+  }
+
+  std::vector<FDTensor> infer_result(1);
+  if (!Infer(processed_data, &infer_result)) {
+    FDERROR << "Failed to inference while using model:" << ModelName() << "."
+            << std::endl;
+    return false;
+  }
+
+  if (!Postprocess(infer_result[0], result, topk)) {
+    FDERROR << "Failed to postprocess while using model:" << ModelName() << "."
+ << std::endl; + return false; + } + return true; +} + +} // namespace ppcls +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/ppcls/model.h b/csrc/fastdeploy/vision/ppcls/model.h new file mode 100644 index 000000000..71800a7d7 --- /dev/null +++ b/csrc/fastdeploy/vision/ppcls/model.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { +namespace vision { +namespace ppcls { + +class FASTDEPLOY_DECL Model : public FastDeployModel { + public: + Model(const std::string& model_file, const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::PADDLE); + + std::string ModelName() const { return "ppclas-classify"; } + + // TODO(jiangjiajun) Batch is on the way + virtual bool Predict(cv::Mat* im, ClassifyResult* result, int topk = 1); + + private: + bool Initialize(); + + bool BuildPreprocessPipelineFromConfig(); + + bool Preprocess(Mat* mat, FDTensor* outputs); + + bool Postprocess(const FDTensor& infer_result, ClassifyResult* result, + int topk = 1); + + std::vector> processors_; + std::string config_file_; +}; +} // namespace ppcls +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/ppcls/ppcls_pybind.cc b/csrc/fastdeploy/vision/ppcls/ppcls_pybind.cc new file mode 100644 index 000000000..10ff5ee10 --- /dev/null +++ b/csrc/fastdeploy/vision/ppcls/ppcls_pybind.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindPPCls(pybind11::module& m) { + auto ppcls_module = m.def_submodule("ppcls", "Module to deploy PaddleClas."); + pybind11::class_(ppcls_module, "Model") + .def(pybind11::init()) + .def("predict", + [](vision::ppcls::Model& self, pybind11::array& data, int topk = 1) { + auto mat = PyArrayToCvMat(data); + vision::ClassifyResult res; + self.Predict(&mat, &res, topk); + return res; + }); +} +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/ppseg/model.cc b/csrc/fastdeploy/vision/ppseg/model.cc new file mode 100644 index 000000000..7f692c6a7 --- /dev/null +++ b/csrc/fastdeploy/vision/ppseg/model.cc @@ -0,0 +1,232 @@ +#include "fastdeploy/vision/ppseg/model.h" +#include "fastdeploy/vision.h" +#include "fastdeploy/vision/utils/utils.h" +#include "yaml-cpp/yaml.h" + +namespace fastdeploy { +namespace vision { +namespace ppseg { + +Model::Model(const std::string& model_file, const std::string& params_file, + const std::string& config_file, const RuntimeOption& custom_option, + const Frontend& model_format) { + config_file_ = config_file; + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT}; + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool Model::Initialize() { + if (!BuildPreprocessPipelineFromConfig()) { + FDERROR << "Failed to build preprocess pipeline from configuration file." + << std::endl; + return false; + } + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + return true; +} + +bool Model::BuildPreprocessPipelineFromConfig() { + processors_.clear(); + YAML::Node cfg; + processors_.push_back(std::make_shared()); + try { + cfg = YAML::LoadFile(config_file_); + } catch (YAML::BadFile& e) { + FDERROR << "Failed to load yaml file " << config_file_ + << ", maybe you should check this file." << std::endl; + return false; + } + + if (cfg["Deploy"]["transforms"]) { + auto preprocess_cfg = cfg["Deploy"]["transforms"]; + for (const auto& op : preprocess_cfg) { + FDASSERT(op.IsMap(), + "Require the transform information in yaml be Map type."); + if (op["type"].as() == "Normalize") { + std::vector mean = {0.5, 0.5, 0.5}; + std::vector std = {0.5, 0.5, 0.5}; + if (op["mean"]) { + mean = op["mean"].as>(); + } + if (op["std"]) { + std = op["std"].as>(); + } + processors_.push_back(std::make_shared(mean, std)); + + } else if (op["type"].as() == "Resize") { + const auto& target_size = op["target_size"]; + int resize_width = target_size[0].as(); + int resize_height = target_size[1].as(); + is_resized = true; + processors_.push_back( + std::make_shared(resize_width, resize_height)); + } + } + processors_.push_back(std::make_shared()); + } + return true; +} + +bool Model::Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info) { + for (size_t i = 0; i < processors_.size(); ++i) { + if (processors_[i]->Name().compare("Resize") == 0) { + auto processor = dynamic_cast(processors_[i].get()); + int resize_width = -1; + int resize_height = -1; + std::tie(resize_width, resize_height) = processor->GetWidthAndHeight(); + if (is_vertical_screen && (resize_width > resize_height)) { + if (processor->SetWidthAndHeight(resize_height, resize_width)) { + FDERROR << "Failed to set Resize processor width and height " + << processors_[i]->Name() << "." 
<< std::endl; + } + } + } + if (!(*(processors_[i].get()))(mat)) { + FDERROR << "Failed to process image data in " << processors_[i]->Name() + << "." << std::endl; + return false; + } + } + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); + output->name = InputInfoOfRuntime(0).name; + return true; +} + +bool Model::Postprocess(FDTensor& infer_result, SegmentationResult* result, + std::map>* im_info) { + // PaddleSeg has three types of inference output: + // 1. output with argmax and without softmax. 3-D matrix CHW, Channel + // always 1, the element in matrix is classified label_id INT64 Type. + // 2. output without argmax and without softmax. 4-D matrix NCHW, N always + // 1, Channel is the num of classes. The element is the logits of classes + // FP32 + // 3. output without argmax and with softmax. 4-D matrix NCHW, the result + // of 2 with softmax layer + // Fastdeploy output: + // 1. label_map + // 2. score_map(optional) + // 3. shape: 2-D HW + FDASSERT(infer_result.dtype == FDDataType::INT64 || + infer_result.dtype == FDDataType::FP32, + "Require the data type of output is int64 or fp32, but now it's " + + Str(infer_result.dtype) + "."); + result->Clear(); + + if (infer_result.shape.size() == 4) { + FDASSERT(infer_result.shape[0] == 1, "Only support batch size = 1."); + // output without argmax + result->contain_score_map = true; + utils::NCHW2NHWC(infer_result); + } + + // for resize mat below + FDTensor new_infer_result; + Mat* mat = nullptr; + if (is_resized) { + cv::Mat temp_mat; + utils::FDTensor2FP32CVMat(temp_mat, infer_result, + result->contain_score_map); + + // original image shape + auto iter_ipt = (*im_info).find("input_shape"); + FDASSERT(iter_ipt != im_info->end(), + "Cannot find input_shape from im_info."); + int ipt_h = iter_ipt->second[0]; + int ipt_w = iter_ipt->second[1]; + + mat = new Mat(temp_mat); + + Resize::Run(mat, ipt_w, ipt_h, -1, -1, 1); + mat->ShareWithTensor(&new_infer_result); + new_infer_result.shape.insert(new_infer_result.shape.begin(), 1); + result->shape = new_infer_result.shape; + } else { + result->shape = infer_result.shape; + } + int out_num = + std::accumulate(result->shape.begin(), result->shape.begin() + 3, 1, + std::multiplies()); + // NCHW remove N or CHW remove C + result->shape.erase(result->shape.begin()); + result->Resize(out_num); + if (result->contain_score_map) { + // output with label_map and score_map + float_t* infer_result_buffer = nullptr; + if (is_resized) { + infer_result_buffer = static_cast(new_infer_result.Data()); + } else { + infer_result_buffer = static_cast(infer_result.Data()); + } + // argmax + utils::ArgmaxScoreMap(infer_result_buffer, result, with_softmax); + result->shape.erase(result->shape.begin() + 2); + } else { + // output only with label_map + if (is_resized) { + float_t* infer_result_buffer = + static_cast(new_infer_result.Data()); + for (int i = 0; i < out_num; i++) { + result->label_map[i] = static_cast(*(infer_result_buffer + i)); + } + } else { + const int64_t* infer_result_buffer = + reinterpret_cast(infer_result.Data()); + for (int i = 0; i < out_num; i++) { + result->label_map[i] = static_cast(*(infer_result_buffer + i)); + } + } + } + delete mat; + mat = nullptr; + return true; +} + +bool Model::Predict(cv::Mat* im, SegmentationResult* result) { + Mat mat(*im); + std::vector processed_data(1); + + std::map> im_info; + + // Record 
the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if (!Preprocess(&mat, &(processed_data[0]), &im_info)) { + FDERROR << "Failed to preprocess input data while using model:" + << ModelName() << "." << std::endl; + return false; + } + std::vector infer_result(1); + if (!Infer(processed_data, &infer_result)) { + FDERROR << "Failed to inference while using model:" << ModelName() << "." + << std::endl; + return false; + } + if (!Postprocess(infer_result[0], result, &im_info)) { + FDERROR << "Failed to postprocess while using model:" << ModelName() << "." + << std::endl; + return false; + } + return true; +} + +} // namespace ppseg +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/ppseg/model.h b/csrc/fastdeploy/vision/ppseg/model.h new file mode 100644 index 000000000..72f8dbc64 --- /dev/null +++ b/csrc/fastdeploy/vision/ppseg/model.h @@ -0,0 +1,43 @@ +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { +namespace vision { +namespace ppseg { + +class FASTDEPLOY_DECL Model : public FastDeployModel { + public: + Model(const std::string& model_file, const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::PADDLE); + + std::string ModelName() const { return "ppseg"; } + + virtual bool Predict(cv::Mat* im, SegmentationResult* result); + + bool with_softmax = false; + + bool is_vertical_screen = false; + + private: + bool Initialize(); + + bool BuildPreprocessPipelineFromConfig(); + + bool Preprocess(Mat* mat, FDTensor* outputs, + std::map>* im_info); + + bool Postprocess(FDTensor& infer_result, SegmentationResult* result, + std::map>* im_info); + + bool is_resized = false; + + std::vector> processors_; + std::string config_file_; +}; +} // namespace ppseg +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/ppseg/ppseg_pybind.cc b/csrc/fastdeploy/vision/ppseg/ppseg_pybind.cc new file mode 100644 index 000000000..949c27487 --- /dev/null +++ b/csrc/fastdeploy/vision/ppseg/ppseg_pybind.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindPPSeg(pybind11::module& m) {
+  auto ppseg_module =
+      m.def_submodule("ppseg", "Module to deploy PaddleSegmentation.");
+  pybind11::class_<vision::ppseg::Model>(ppseg_module, "Model")
+      .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
+                          Frontend>())
+      .def("predict",
+           [](vision::ppseg::Model& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::SegmentationResult res;
+             self.Predict(&mat, &res);
+             return res;
+           })
+      .def_readwrite("with_softmax", &vision::ppseg::Model::with_softmax)
+      .def_readwrite("is_vertical_screen",
+                     &vision::ppseg::Model::is_vertical_screen);
+}
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/utils/FDTensor2CVMat.cc b/csrc/fastdeploy/vision/utils/FDTensor2CVMat.cc
new file mode 100644
index 000000000..fdd110cb8
--- /dev/null
+++ b/csrc/fastdeploy/vision/utils/FDTensor2CVMat.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace utils {
+
+void FDTensor2FP32CVMat(cv::Mat& mat, FDTensor& infer_result,
+                        bool contain_score_map) {
+  // output with argmax channel is 1
+  int channel = 1;
+  int height = infer_result.shape[1];
+  int width = infer_result.shape[2];
+
+  if (contain_score_map) {
+    // output without argmax, converted to NHWC
+    channel = infer_result.shape[3];
+  }
+  // create FP32 cvmat
+  if (infer_result.dtype == FDDataType::INT64) {
+    FDWARNING << "The PaddleSeg model is exported with argmax, so the "
+                 "inference result dtype is " +
+                     Str(infer_result.dtype) +
+                     ". If you want smoother edges in the segmentation "
+                     "result, please export the model with --without_argmax "
+                     "--with_softmax."
+              << std::endl;
+    int64_t* infer_result_buffer = static_cast<int64_t*>(infer_result.Data());
+    mat = cv::Mat(height, width, CV_32FC(channel));
+    int index = 0;
+    for (int i = 0; i < height; i++) {
+      for (int j = 0; j < width; j++) {
+        mat.at<float>(i, j) =
+            static_cast<float>(infer_result_buffer[index++]);
+      }
+    }
+  } else if (infer_result.dtype == FDDataType::FP32) {
+    mat = cv::Mat(height, width, CV_32FC(channel), infer_result.Data());
+  }
+}
+
+} // namespace utils
+} // namespace vision
+} // namespace fastdeploy
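A quick numeric check of the cosine-similarity utility defined next
(illustrative values only): for unit-length embeddings the result is just the
dot product.

#include <vector>
#include "fastdeploy/vision/utils/utils.h"

int main() {
  // both inputs are already unit-length, so the result is the dot product:
  // 1.0f * 0.6f + 0.0f * 0.8f = 0.6f
  std::vector<float> a = {1.f, 0.f};
  std::vector<float> b = {0.6f, 0.8f};
  float sim = fastdeploy::vision::utils::CosineSimilarity(a, b, true);
  return sim > 0.f ? 0 : 1;
}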
diff --git a/csrc/fastdeploy/vision/utils/cosine_similarity.cc b/csrc/fastdeploy/vision/utils/cosine_similarity.cc
new file mode 100644
index 000000000..70d4e31dd
--- /dev/null
+++ b/csrc/fastdeploy/vision/utils/cosine_similarity.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace utils {
+
+float CosineSimilarity(const std::vector<float>& a,
+                       const std::vector<float>& b, bool normalized) {
+  // Compute the cosine similarity of a and b.
+  FDASSERT((a.size() == b.size()) && (a.size() != 0),
+           "The size of a and b must be equal and >= 1.");
+  size_t num_val = a.size();
+  if (normalized) {
+    float mul_a = 0.f, mul_b = 0.f, mul_ab = 0.f;
+    for (size_t i = 0; i < num_val; ++i) {
+      mul_a += (a[i] * a[i]);
+      mul_b += (b[i] * b[i]);
+      mul_ab += (a[i] * b[i]);
+    }
+    return (mul_ab / (std::sqrt(mul_a) * std::sqrt(mul_b)));
+  }
+  auto norm_a = L2Normalize(a);
+  auto norm_b = L2Normalize(b);
+  float mul_a = 0.f, mul_b = 0.f, mul_ab = 0.f;
+  for (size_t i = 0; i < num_val; ++i) {
+    mul_a += (norm_a[i] * norm_a[i]);
+    mul_b += (norm_b[i] * norm_b[i]);
+    mul_ab += (norm_a[i] * norm_b[i]);
+  }
+  return (mul_ab / (std::sqrt(mul_a) * std::sqrt(mul_b)));
+}
+
+} // namespace utils
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/utils/l2_normalize.cc b/csrc/fastdeploy/vision/utils/l2_normalize.cc
new file mode 100644
index 000000000..f5752b848
--- /dev/null
+++ b/csrc/fastdeploy/vision/utils/l2_normalize.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/utils/utils.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace utils {
+
+std::vector<float> L2Normalize(const std::vector<float>& values) {
+  size_t num_val = values.size();
+  if (num_val == 0) {
+    return {};
+  }
+  std::vector<float> norm;
+  float l2_sum_val = 0.f;
+  for (size_t i = 0; i < num_val; ++i) {
+    l2_sum_val += (values[i] * values[i]);
+  }
+  float l2_sum_sqrt = std::sqrt(l2_sum_val);
+  norm.resize(num_val);
+  for (size_t i = 0; i < num_val; ++i) {
+    norm[i] = values[i] / l2_sum_sqrt;
+  }
+  return norm;
+}
+
+} // namespace utils
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/utils/nms.cc b/csrc/fastdeploy/vision/utils/nms.cc
new file mode 100644
index 000000000..900acf84d
--- /dev/null
+++ b/csrc/fastdeploy/vision/utils/nms.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace utils { + +// The implementation refers to +// https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/deploy/cpp/src/utils.cc +void NMS(DetectionResult* result, float iou_threshold) { + utils::SortDetectionResult(result); + + std::vector area_of_boxes(result->boxes.size()); + std::vector suppressed(result->boxes.size(), 0); + for (size_t i = 0; i < result->boxes.size(); ++i) { + area_of_boxes[i] = (result->boxes[i][2] - result->boxes[i][0]) * + (result->boxes[i][3] - result->boxes[i][1]); + } + + for (size_t i = 0; i < result->boxes.size(); ++i) { + if (suppressed[i] == 1) { + continue; + } + for (size_t j = i + 1; j < result->boxes.size(); ++j) { + if (suppressed[j] == 1) { + continue; + } + float xmin = std::max(result->boxes[i][0], result->boxes[j][0]); + float ymin = std::max(result->boxes[i][1], result->boxes[j][1]); + float xmax = std::min(result->boxes[i][2], result->boxes[j][2]); + float ymax = std::min(result->boxes[i][3], result->boxes[j][3]); + float overlap_w = std::max(0.0f, xmax - xmin); + float overlap_h = std::max(0.0f, ymax - ymin); + float overlap_area = overlap_w * overlap_h; + float overlap_ratio = + overlap_area / (area_of_boxes[i] + area_of_boxes[j] - overlap_area); + if (overlap_ratio > iou_threshold) { + suppressed[j] = 1; + } + } + } + DetectionResult backup(*result); + result->Clear(); + result->Reserve(suppressed.size()); + for (size_t i = 0; i < suppressed.size(); ++i) { + if (suppressed[i] == 1) { + continue; + } + result->boxes.emplace_back(backup.boxes[i]); + result->scores.push_back(backup.scores[i]); + result->label_ids.push_back(backup.label_ids[i]); + } +} + +void NMS(FaceDetectionResult* result, float iou_threshold) { + utils::SortDetectionResult(result); + + std::vector area_of_boxes(result->boxes.size()); + std::vector suppressed(result->boxes.size(), 0); + for (size_t i = 0; i < result->boxes.size(); ++i) { + area_of_boxes[i] = (result->boxes[i][2] - result->boxes[i][0]) * + (result->boxes[i][3] - result->boxes[i][1]); + } + + for (size_t i = 0; i < result->boxes.size(); ++i) { + if (suppressed[i] == 1) { + continue; + } + for (size_t j = i + 1; j < result->boxes.size(); ++j) { + if (suppressed[j] == 1) { + continue; + } + float xmin = std::max(result->boxes[i][0], result->boxes[j][0]); + float ymin = std::max(result->boxes[i][1], result->boxes[j][1]); + float xmax = std::min(result->boxes[i][2], result->boxes[j][2]); + float ymax = std::min(result->boxes[i][3], result->boxes[j][3]); + float overlap_w = std::max(0.0f, xmax - xmin); + float overlap_h = std::max(0.0f, ymax - ymin); + float overlap_area = overlap_w * overlap_h; + float overlap_ratio = + overlap_area / (area_of_boxes[i] + area_of_boxes[j] - overlap_area); + if (overlap_ratio > iou_threshold) { + suppressed[j] = 1; + } + } + } + FaceDetectionResult backup(*result); + int landmarks_per_face = result->landmarks_per_face; + + result->Clear(); + // don't forget to reset the landmarks_per_face + // before apply Reserve method. 
+ result->landmarks_per_face = landmarks_per_face; + result->Reserve(suppressed.size()); + for (size_t i = 0; i < suppressed.size(); ++i) { + if (suppressed[i] == 1) { + continue; + } + result->boxes.emplace_back(backup.boxes[i]); + result->scores.push_back(backup.scores[i]); + // landmarks (if have) + if (result->landmarks_per_face > 0) { + for (size_t j = 0; j < result->landmarks_per_face; ++j) { + result->landmarks.emplace_back( + backup.landmarks[i * result->landmarks_per_face + j]); + } + } + } +} + +} // namespace utils +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/utils/sort_det_res.cc b/csrc/fastdeploy/vision/utils/sort_det_res.cc new file mode 100644 index 000000000..93dbb6969 --- /dev/null +++ b/csrc/fastdeploy/vision/utils/sort_det_res.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace utils { + +void Merge(DetectionResult* result, size_t low, size_t mid, size_t high) { + std::vector>& boxes = result->boxes; + std::vector& scores = result->scores; + std::vector& label_ids = result->label_ids; + std::vector> temp_boxes(boxes); + std::vector temp_scores(scores); + std::vector temp_label_ids(label_ids); + size_t i = low; + size_t j = mid + 1; + size_t k = i; + for (; i <= mid && j <= high; k++) { + if (temp_scores[i] >= temp_scores[j]) { + scores[k] = temp_scores[i]; + label_ids[k] = temp_label_ids[i]; + boxes[k] = temp_boxes[i]; + i++; + } else { + scores[k] = temp_scores[j]; + label_ids[k] = temp_label_ids[j]; + boxes[k] = temp_boxes[j]; + j++; + } + } + while (i <= mid) { + scores[k] = temp_scores[i]; + label_ids[k] = temp_label_ids[i]; + boxes[k] = temp_boxes[i]; + k++; + i++; + } + while (j <= high) { + scores[k] = temp_scores[j]; + label_ids[k] = temp_label_ids[j]; + boxes[k] = temp_boxes[j]; + k++; + j++; + } +} + +void MergeSort(DetectionResult* result, size_t low, size_t high) { + if (low < high) { + size_t mid = (high - low) / 2 + low; + MergeSort(result, low, mid); + MergeSort(result, mid + 1, high); + Merge(result, low, mid, high); + } +} + +void SortDetectionResult(DetectionResult* result) { + size_t low = 0; + size_t high = result->scores.size(); + if (high == 0) { + return; + } + high = high - 1; + MergeSort(result, low, high); +} + +} // namespace utils +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/utils/sort_face_det_res.cc b/csrc/fastdeploy/vision/utils/sort_face_det_res.cc new file mode 100644 index 000000000..34150f9ac --- /dev/null +++ b/csrc/fastdeploy/vision/utils/sort_face_det_res.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace utils { + +void SortDetectionResult(FaceDetectionResult* result) { + // sort face detection results with landmarks or not. + if (result->boxes.size() == 0) { + return; + } + int landmarks_per_face = result->landmarks_per_face; + if (landmarks_per_face > 0) { + FDASSERT( + (result->landmarks.size() == result->boxes.size() * landmarks_per_face), + "The size of landmarks != boxes.size * landmarks_per_face."); + } + + // argsort for scores. + std::vector indices; + indices.resize(result->boxes.size()); + for (size_t i = 0; i < result->boxes.size(); ++i) { + indices[i] = i; + } + std::vector& scores = result->scores; + std::sort(indices.begin(), indices.end(), + [&scores](size_t a, size_t b) { return scores[a] > scores[b]; }); + + // reorder boxes, scores, landmarks (if have). + FaceDetectionResult backup(*result); + result->Clear(); + // don't forget to reset the landmarks_per_face + // before apply Reserve method. + result->landmarks_per_face = landmarks_per_face; + result->Reserve(indices.size()); + if (landmarks_per_face > 0) { + for (size_t i = 0; i < indices.size(); ++i) { + result->boxes.emplace_back(backup.boxes[indices[i]]); + result->scores.push_back(backup.scores[indices[i]]); + for (size_t j = 0; j < landmarks_per_face; ++j) { + result->landmarks.emplace_back( + backup.landmarks[indices[i] * landmarks_per_face + j]); + } + } + } else { + for (size_t i = 0; i < indices.size(); ++i) { + result->boxes.emplace_back(backup.boxes[indices[i]]); + result->scores.push_back(backup.scores[indices[i]]); + } + } +} + +} // namespace utils +} // namespace vision +} // namespace fastdeploy diff --git a/csrc/fastdeploy/vision/utils/utils.h b/csrc/fastdeploy/vision/utils/utils.h new file mode 100644 index 000000000..02cf16e9c --- /dev/null +++ b/csrc/fastdeploy/vision/utils/utils.h @@ -0,0 +1,140 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <set>
+#include <vector>
+#include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/utils/utils.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace utils {
+// topk is usually a very small value, so this simple O(topk * n) scan is
+// cheap enough in practice.
+// Caution: the scan seeds its running maximum with the sentinel -99999999,
+// so do not use this function on arrays whose top-k values may be smaller
+// than that sentinel.
+template <typename T>
+std::vector<int32_t> TopKIndices(const T* array, int array_size, int topk) {
+  topk = std::min(array_size, topk);
+  std::vector<int32_t> res(topk);
+  std::set<int32_t> searched;
+  for (int32_t i = 0; i < topk; ++i) {
+    T max_val = -99999999;  // sentinel; see the caution above
+    for (int32_t j = 0; j < array_size; ++j) {
+      if (searched.find(j) != searched.end()) {
+        continue;
+      }
+      if (*(array + j) > max_val) {
+        res[i] = j;
+        max_val = *(array + j);
+      }
+    }
+    searched.insert(res[i]);
+  }
+  return res;
+}
+
+template <typename T>
+void ArgmaxScoreMap(T infer_result_buffer, SegmentationResult* result,
+                    bool with_softmax) {
+  int64_t height = result->shape[0];
+  int64_t width = result->shape[1];
+  int64_t num_classes = result->shape[2];
+  int index = 0;
+  for (size_t i = 0; i < height; ++i) {
+    for (size_t j = 0; j < width; ++j) {
+      int64_t s = (i * width + j) * num_classes;
+      // T is a pointer type here; max_class_score points at the best logit.
+      T max_class_score = std::max_element(
+          infer_result_buffer + s, infer_result_buffer + s + num_classes);
+      int label_id = std::distance(infer_result_buffer + s, max_class_score);
+      if (label_id >= 255) {
+        FDWARNING << "label_id is stored by uint8_t, now the value is bigger "
+                     "than 255, it's "
+                  << static_cast<int>(label_id) << "." << std::endl;
+      }
+      result->label_map[index] = static_cast<uint8_t>(label_id);
+
+      if (with_softmax) {
+        double_t total = 0;
+        for (int k = 0; k < num_classes; k++) {
+          total += exp(*(infer_result_buffer + s + k) - *max_class_score);
+        }
+        double_t softmax_class_score = 1 / total;
+        result->score_map[index] = static_cast<float>(softmax_class_score);
+      } else {
+        result->score_map[index] = static_cast<float>(*max_class_score);
+      }
+      index++;
+    }
+  }
+}
+
+template <typename T>
+void NCHW2NHWC(FDTensor& infer_result) {
+  T* infer_result_buffer = reinterpret_cast<T*>(infer_result.MutableData());
+  int num = infer_result.shape[0];
+  int channel = infer_result.shape[1];
+  int height = infer_result.shape[2];
+  int width = infer_result.shape[3];
+  int chw = channel * height * width;
+  int wc = width * channel;
+  int wh = width * height;
+  std::vector<T> hwc_data(num * chw);
+  int index = 0;
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        for (int w = 0; w < width; w++) {
+          hwc_data[n * chw + h * wc + w * channel + c] =
+              *(infer_result_buffer + index);
+          index++;
+        }
+      }
+    }
+  }
+  std::memcpy(infer_result.MutableData(), hwc_data.data(),
+              num * chw * sizeof(T));
+  infer_result.shape = {num, height, width, channel};
+}
+
+void FDTensor2FP32CVMat(cv::Mat& mat, FDTensor& infer_result,
+                        bool contain_score_map);
+
+void NMS(DetectionResult* output, float iou_threshold = 0.5);
+
+void NMS(FaceDetectionResult* result, float iou_threshold = 0.5);
+
+// MergeSort
+void SortDetectionResult(DetectionResult* output);
+
+void SortDetectionResult(FaceDetectionResult* result);
+
+// L2 Norm / cosine similarity (for face recognition, ...)
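+// e.g. L2Normalize({3.f, 4.f}) -> {0.6f, 0.8f} (divides by sqrt(9 + 16) = 5);
+// with normalized == true, CosineSimilarity assumes a and b are already unit
+// vectors and is then just their dot product.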
+FASTDEPLOY_DECL std::vector<float> L2Normalize(
+    const std::vector<float>& values);
+
+FASTDEPLOY_DECL float CosineSimilarity(const std::vector<float>& a,
+                                       const std::vector<float>& b,
+                                       bool normalized = true);
+
+} // namespace utils
+} // namespace vision
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/vision_pybind.cc b/csrc/fastdeploy/vision/vision_pybind.cc
new file mode 100644
index 000000000..6528dd22b
--- /dev/null
+++ b/csrc/fastdeploy/vision/vision_pybind.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+
+void BindPPCls(pybind11::module& m);
+void BindPPDet(pybind11::module& m);
+void BindPPSeg(pybind11::module& m);
+
+void BindDetection(pybind11::module& m);
+void BindMatting(pybind11::module& m);
+void BindFaceDet(pybind11::module& m);
+void BindFaceId(pybind11::module& m);
+#ifdef ENABLE_VISION_VISUALIZE
+void BindVisualize(pybind11::module& m);
+#endif
+
+void BindVision(pybind11::module& m) {
+  pybind11::class_<vision::ClassifyResult>(m, "ClassifyResult")
+      .def(pybind11::init())
+      .def_readwrite("label_ids", &vision::ClassifyResult::label_ids)
+      .def_readwrite("scores", &vision::ClassifyResult::scores)
+      .def("__repr__", &vision::ClassifyResult::Str)
+      .def("__str__", &vision::ClassifyResult::Str);
+
+  pybind11::class_<vision::DetectionResult>(m, "DetectionResult")
+      .def(pybind11::init())
+      .def_readwrite("boxes", &vision::DetectionResult::boxes)
+      .def_readwrite("scores", &vision::DetectionResult::scores)
+      .def_readwrite("label_ids", &vision::DetectionResult::label_ids)
+      .def("__repr__", &vision::DetectionResult::Str)
+      .def("__str__", &vision::DetectionResult::Str);
+
+  pybind11::class_<vision::FaceDetectionResult>(m, "FaceDetectionResult")
+      .def(pybind11::init())
+      .def_readwrite("boxes", &vision::FaceDetectionResult::boxes)
+      .def_readwrite("scores", &vision::FaceDetectionResult::scores)
+      .def_readwrite("landmarks", &vision::FaceDetectionResult::landmarks)
+      .def_readwrite("landmarks_per_face",
+                     &vision::FaceDetectionResult::landmarks_per_face)
+      .def("__repr__", &vision::FaceDetectionResult::Str)
+      .def("__str__", &vision::FaceDetectionResult::Str);
+
+  pybind11::class_<vision::SegmentationResult>(m, "SegmentationResult")
+      .def(pybind11::init())
+      .def_readwrite("label_map", &vision::SegmentationResult::label_map)
+      .def_readwrite("score_map", &vision::SegmentationResult::score_map)
+      .def_readwrite("shape", &vision::SegmentationResult::shape)
+      .def("__repr__", &vision::SegmentationResult::Str)
+      .def("__str__", &vision::SegmentationResult::Str);
+
+  pybind11::class_<vision::FaceRecognitionResult>(m, "FaceRecognitionResult")
+      .def(pybind11::init())
+      .def_readwrite("embedding", &vision::FaceRecognitionResult::embedding)
+      .def("__repr__", &vision::FaceRecognitionResult::Str)
+      .def("__str__", &vision::FaceRecognitionResult::Str);
+
+  pybind11::class_<vision::MattingResult>(m, "MattingResult")
+      .def(pybind11::init())
+      .def_readwrite("alpha", &vision::MattingResult::alpha)
+      .def_readwrite("foreground",
+                     &vision::MattingResult::foreground)
+      .def_readwrite("shape", &vision::MattingResult::shape)
+      .def_readwrite("contain_foreground",
+                     &vision::MattingResult::contain_foreground)
+      .def("__repr__", &vision::MattingResult::Str)
+      .def("__str__", &vision::MattingResult::Str);
+
+  BindPPCls(m);
+  BindPPDet(m);
+  BindPPSeg(m);
+
+  BindDetection(m);
+  BindFaceDet(m);
+  BindFaceId(m);
+  BindMatting(m);
+#ifdef ENABLE_VISION_VISUALIZE
+  BindVisualize(m);
+#endif
+}
+} // namespace fastdeploy
diff --git a/csrc/fastdeploy/vision/visualize/detection.cc b/csrc/fastdeploy/vision/visualize/detection.cc
new file mode 100644
index 000000000..147ef6556
--- /dev/null
+++ b/csrc/fastdeploy/vision/visualize/detection.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef ENABLE_VISION_VISUALIZE
+
+#include "fastdeploy/vision/visualize/visualize.h"
+#include "opencv2/imgproc/imgproc.hpp"
+
+namespace fastdeploy {
+namespace vision {
+
+// Default only support visualize num_classes <= 1000
+// If need to visualize num_classes > 1000
+// Please call Visualize::GetColorMap(num_classes) first
+cv::Mat Visualize::VisDetection(const cv::Mat& im,
+                                const DetectionResult& result, int line_size,
+                                float font_size) {
+  auto color_map = GetColorMap();
+  int h = im.rows;
+  int w = im.cols;
+  auto vis_im = im.clone();
+  for (size_t i = 0; i < result.boxes.size(); ++i) {
+    cv::Rect rect(result.boxes[i][0], result.boxes[i][1],
+                  result.boxes[i][2] - result.boxes[i][0],
+                  result.boxes[i][3] - result.boxes[i][1]);
+    int c0 = color_map[3 * result.label_ids[i] + 0];
+    int c1 = color_map[3 * result.label_ids[i] + 1];
+    int c2 = color_map[3 * result.label_ids[i] + 2];
+    cv::Scalar rect_color = cv::Scalar(c0, c1, c2);
+    std::string id = std::to_string(result.label_ids[i]);
+    std::string score = std::to_string(result.scores[i]);
+    if (score.size() > 4) {
+      score = score.substr(0, 4);
+    }
+    std::string text = id + "," + score;
+    int font = cv::FONT_HERSHEY_SIMPLEX;
+    cv::Size text_size = cv::getTextSize(text, font, font_size, 1, nullptr);
+    cv::Point origin;
+    origin.x = rect.x;
+    origin.y = rect.y;
+    cv::Rect text_background =
+        cv::Rect(result.boxes[i][0], result.boxes[i][1] - text_size.height,
+                 text_size.width, text_size.height);
+    cv::rectangle(vis_im, rect, rect_color, line_size);
+    cv::putText(vis_im, text, origin, font, font_size,
+                cv::Scalar(255, 255, 255), 1);
+  }
+  return vis_im;
+}
+
+} // namespace vision
+} // namespace fastdeploy
+#endif
diff --git a/csrc/fastdeploy/vision/visualize/face_detection.cc b/csrc/fastdeploy/vision/visualize/face_detection.cc
new file mode 100644
index 000000000..d9da27786
--- /dev/null
+++ b/csrc/fastdeploy/vision/visualize/face_detection.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
diff --git a/csrc/fastdeploy/vision/visualize/face_detection.cc b/csrc/fastdeploy/vision/visualize/face_detection.cc
new file mode 100644
index 000000000..d9da27786
--- /dev/null
+++ b/csrc/fastdeploy/vision/visualize/face_detection.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef ENABLE_VISION_VISUALIZE
+
+#include "fastdeploy/vision/visualize/visualize.h"
+#include "opencv2/imgproc/imgproc.hpp"
+
+namespace fastdeploy {
+
+namespace vision {
+
+// By default, only num_classes <= 1000 is supported for visualization.
+// To visualize more classes, call Visualize::GetColorMap(num_classes) first.
+cv::Mat Visualize::VisFaceDetection(const cv::Mat& im,
+                                    const FaceDetectionResult& result,
+                                    int line_size, float font_size) {
+  auto color_map = GetColorMap();
+  int h = im.rows;
+  int w = im.cols;
+
+  auto vis_im = im.clone();
+  bool vis_landmarks = false;
+  if ((result.landmarks_per_face > 0) &&
+      (result.boxes.size() * result.landmarks_per_face ==
+       result.landmarks.size())) {
+    vis_landmarks = true;
+  }
+  for (size_t i = 0; i < result.boxes.size(); ++i) {
+    cv::Rect rect(result.boxes[i][0], result.boxes[i][1],
+                  result.boxes[i][2] - result.boxes[i][0],
+                  result.boxes[i][3] - result.boxes[i][1]);
+    int color_id = i % 333;
+    int c0 = color_map[3 * color_id + 0];
+    int c1 = color_map[3 * color_id + 1];
+    int c2 = color_map[3 * color_id + 2];
+    cv::Scalar rect_color = cv::Scalar(c0, c1, c2);
+    std::string text = std::to_string(result.scores[i]);
+    if (text.size() > 4) {
+      text = text.substr(0, 4);
+    }
+    int font = cv::FONT_HERSHEY_SIMPLEX;
+    cv::Size text_size = cv::getTextSize(text, font, font_size, 1, nullptr);
+    cv::Point origin;
+    origin.x = rect.x;
+    origin.y = rect.y;
+    cv::Rect text_background =
+        cv::Rect(result.boxes[i][0], result.boxes[i][1] - text_size.height,
+                 text_size.width, text_size.height);
+    cv::rectangle(vis_im, rect, rect_color, line_size);
+    cv::putText(vis_im, text, origin, font, font_size,
+                cv::Scalar(255, 255, 255), 1);
+    // visualize landmarks (if any)
+    if (vis_landmarks) {
+      cv::Scalar landmark_color = rect_color;
+      for (size_t j = 0; j < result.landmarks_per_face; ++j) {
+        cv::Point landmark;
+        landmark.x = static_cast<int>(
+            result.landmarks[i * result.landmarks_per_face + j][0]);
+        landmark.y = static_cast<int>(
+            result.landmarks[i * result.landmarks_per_face + j][1]);
+        cv::circle(vis_im, landmark, line_size, landmark_color, -1);
+      }
+    }
+  }
+  return vis_im;
+}
+
+} // namespace vision
+} // namespace fastdeploy
+
+#endif
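Landmark drawing above is gated on result.landmarks.size() == result.boxes.size() * result.landmarks_per_face, with face i owning the slice [i * landmarks_per_face, (i + 1) * landmarks_per_face). A sketch of filling one face under that contract; the element types (4-float boxes, 2-float landmarks) are assumptions based on the indexing in the code:

#include "fastdeploy/vision/visualize/visualize.h"
#include "opencv2/imgcodecs.hpp"

int main() {
  cv::Mat im = cv::imread("face.jpg");
  fastdeploy::vision::FaceDetectionResult result;
  result.boxes.push_back({50.f, 60.f, 150.f, 200.f});
  result.scores.push_back(0.93f);
  result.landmarks_per_face = 5;  // e.g. eyes, nose tip, mouth corners
  for (int k = 0; k < 5; ++k) {
    // Flattened layout: the k-th landmark of face 0.
    result.landmarks.push_back({60.f + 10.f * k, 80.f + 5.f * k});
  }
  cv::Mat vis =
      fastdeploy::vision::Visualize::VisFaceDetection(im, result, 2, 0.5f);
  cv::imwrite("vis_face.jpg", vis);
  return 0;
}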
diff --git a/csrc/fastdeploy/vision/visualize/matting_alpha.cc b/csrc/fastdeploy/vision/visualize/matting_alpha.cc
new file mode 100644
index 000000000..1018018c6
--- /dev/null
+++ b/csrc/fastdeploy/vision/visualize/matting_alpha.cc
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef ENABLE_VISION_VISUALIZE
+
+#include "fastdeploy/vision/visualize/visualize.h"
+#include "opencv2/highgui.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+
+namespace fastdeploy {
+namespace vision {
+
+static void RemoveSmallConnectedArea(cv::Mat* alpha_pred,
+                                     float threshold = 0.05f) {
+  // Remove small connected areas and noise with morphological processing.
+  // The input is assumed to be the alpha matte, with values in (0., 1.).
+  cv::Mat gray, binary;
+  (*alpha_pred).convertTo(gray, CV_8UC1, 255.f);
+  // 255 * 0.05 ~ 13
+  unsigned int binary_threshold = static_cast<unsigned int>(255.f * threshold);
+  cv::threshold(gray, binary, binary_threshold, 255, cv::THRESH_BINARY);
+  // morphologyEx with OPEN operation to remove noise first.
+  auto kernel = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(3, 3),
+                                          cv::Point(-1, -1));
+  cv::morphologyEx(binary, binary, cv::MORPH_OPEN, kernel);
+  // Compute connected components.
+  cv::Mat labels = cv::Mat::zeros((*alpha_pred).size(), CV_32S);
+  cv::Mat stats, centroids;
+  int num_labels =
+      cv::connectedComponentsWithStats(binary, labels, stats, centroids, 8, 4);
+  if (num_labels <= 1) {
+    // no noise, skip.
+    return;
+  }
+  // find max connected area, 0 is background
+  int max_connected_id = 1;  // 1,2,...
+  int max_connected_area = stats.at<int>(max_connected_id, cv::CC_STAT_AREA);
+  for (int i = 1; i < num_labels; ++i) {
+    int tmp_connected_area = stats.at<int>(i, cv::CC_STAT_AREA);
+    if (tmp_connected_area > max_connected_area) {
+      max_connected_area = tmp_connected_area;
+      max_connected_id = i;
+    }
+  }
+  const int h = (*alpha_pred).rows;
+  const int w = (*alpha_pred).cols;
+  // remove small connected area.
+  for (int i = 0; i < h; ++i) {
+    int* label_row_ptr = labels.ptr<int>(i);
+    float* alpha_row_ptr = (*alpha_pred).ptr<float>(i);
+    for (int j = 0; j < w; ++j) {
+      if (label_row_ptr[j] != max_connected_id) alpha_row_ptr[j] = 0.f;
+    }
+  }
+}
+
+cv::Mat Visualize::VisMattingAlpha(const cv::Mat& im,
+                                   const MattingResult& result,
+                                   bool remove_small_connected_area) {
+  // Only the alpha matte needs visualization; the foreground (fgr) is
+  // already an image in its own right.
+  FDASSERT((!im.empty()), "im can't be empty!");
+  FDASSERT((im.channels() == 3), "Only support 3 channels mat!");
+
+  auto vis_img = im.clone();
+  int out_h = static_cast<int>(result.shape[0]);
+  int out_w = static_cast<int>(result.shape[1]);
+  int height = im.rows;
+  int width = im.cols;
+  // Copy alpha into a cv::Mat so that resize and other operations
+  // cannot modify the caller's data.
+  std::vector<float> alpha_copy;
+  alpha_copy.assign(result.alpha.begin(), result.alpha.end());
+  float* alpha_ptr = static_cast<float*>(alpha_copy.data());
+  cv::Mat alpha(out_h, out_w, CV_32FC1, alpha_ptr);
+  if (remove_small_connected_area) {
+    RemoveSmallConnectedArea(&alpha, 0.05f);
+  }
+  if ((out_h != height) || (out_w != width)) {
+    cv::resize(alpha, alpha, cv::Size(width, height));
+  }
+
+  if ((vis_img).type() != CV_8UC3) {
+    (vis_img).convertTo((vis_img), CV_8UC3);
+  }
+
+  uchar* vis_data = static_cast<uchar*>(vis_img.data);
+  uchar* im_data = static_cast<uchar*>(im.data);
+  float* alpha_data = reinterpret_cast<float*>(alpha.data);
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      float alpha_val = alpha_data[i * width + j];
+      vis_data[i * width * 3 + j * 3 + 0] = cv::saturate_cast<uchar>(
+          static_cast<float>(im_data[i * width * 3 + j * 3 + 0]) * alpha_val +
+          (1.f - alpha_val) * 153.f);
+      vis_data[i * width * 3 + j * 3 + 1] = cv::saturate_cast<uchar>(
+          static_cast<float>(im_data[i * width * 3 + j * 3 + 1]) * alpha_val +
+          (1.f - alpha_val) * 255.f);
+      vis_data[i * width * 3 + j * 3 + 2] = cv::saturate_cast<uchar>(
+          static_cast<float>(im_data[i * width * 3 + j * 3 + 2]) * alpha_val +
+          (1.f - alpha_val) * 120.f);
+    }
+  }
+  return vis_img;
+}
+
+} // namespace vision
+} // namespace fastdeploy
+#endif
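The per-pixel loop above is plain alpha compositing against a fixed background color, BGR (153, 255, 120): out = src * alpha + bg * (1 - alpha), clamped per channel. A minimal standalone check of one channel under that reading:

#include <opencv2/core.hpp>
#include <cstdio>

int main() {
  float alpha_val = 0.25f;
  unsigned char src_b = 200;  // blue channel of an input pixel
  // Same formula as in VisMattingAlpha: src * alpha + bg * (1 - alpha).
  unsigned char out_b =
      cv::saturate_cast<uchar>(static_cast<float>(src_b) * alpha_val +
                               (1.f - alpha_val) * 153.f);
  std::printf("blended blue channel: %d\n", out_b);  // 0.25*200 + 0.75*153 ~ 165
  return 0;
}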
diff --git a/csrc/fastdeploy/vision/visualize/segmentation.cc b/csrc/fastdeploy/vision/visualize/segmentation.cc
new file mode 100644
index 000000000..7d3790328
--- /dev/null
+++ b/csrc/fastdeploy/vision/visualize/segmentation.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef ENABLE_VISION_VISUALIZE
+
+#include "fastdeploy/vision/visualize/visualize.h"
+#include "opencv2/highgui.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+
+namespace fastdeploy {
+namespace vision {
+
+cv::Mat Visualize::VisSegmentation(const cv::Mat& im,
+                                   const SegmentationResult& result) {
+  auto color_map = GetColorMap();
+  int64_t height = result.shape[0];
+  int64_t width = result.shape[1];
+  auto vis_img = cv::Mat(height, width, CV_8UC3);
+
+  int64_t index = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      int category_id = result.label_map[index++];
+      vis_img.at<cv::Vec3b>(i, j)[0] = color_map[3 * category_id + 0];
+      vis_img.at<cv::Vec3b>(i, j)[1] = color_map[3 * category_id + 1];
+      vis_img.at<cv::Vec3b>(i, j)[2] = color_map[3 * category_id + 2];
+    }
+  }
+  cv::addWeighted(im, .5, vis_img, .5, 0, vis_img);
+  return vis_img;
+}
+
+} // namespace vision
+} // namespace fastdeploy
+#endif
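VisSegmentation paints each pixel with the color-map entry of its label, then blends 50/50 with the input via cv::addWeighted, so the input must match the result's spatial size. A tiny sketch, assuming label_map is a flat row-major vector and shape is {height, width} (element types inferred from the indexing above):

#include "fastdeploy/vision/visualize/visualize.h"

int main() {
  fastdeploy::vision::SegmentationResult result;
  result.shape = {2, 2};            // height, width
  result.label_map = {0, 1, 1, 2};  // row-major class ids
  // The image to overlay must have the same 2x2 size.
  cv::Mat im(2, 2, CV_8UC3, cv::Scalar(255, 255, 255));
  cv::Mat vis = fastdeploy::vision::Visualize::VisSegmentation(im, result);
  return 0;
}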
diff --git a/csrc/fastdeploy/vision/visualize/visualize.cc b/csrc/fastdeploy/vision/visualize/visualize.cc
new file mode 100644
index 000000000..4ad6ba124
--- /dev/null
+++ b/csrc/fastdeploy/vision/visualize/visualize.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef ENABLE_VISION_VISUALIZE
+#include "fastdeploy/vision/visualize/visualize.h"
+
+namespace fastdeploy {
+namespace vision {
+
+int Visualize::num_classes_ = 0;
+std::vector<int> Visualize::color_map_ = std::vector<int>();
+
+const std::vector<int>& Visualize::GetColorMap(int num_classes) {
+  if (num_classes < num_classes_) {
+    return color_map_;
+  }
+  num_classes_ = num_classes;
+  std::vector<int>().swap(color_map_);
+  color_map_.resize(3 * num_classes_, 0);
+  for (int i = 0; i < num_classes_; ++i) {
+    int j = 0;
+    int lab = i;
+    while (lab) {
+      color_map_[i * 3] |= (((lab >> 0) & 1) << (7 - j));
+      color_map_[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j));
+      color_map_[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j));
+      ++j;
+      lab >>= 3;
+    }
+  }
+  return color_map_;
+}
+
+} // namespace vision
+} // namespace fastdeploy
+#endif
diff --git a/csrc/fastdeploy/vision/visualize/visualize.h b/csrc/fastdeploy/vision/visualize/visualize.h
new file mode 100644
index 000000000..bee62c301
--- /dev/null
+++ b/csrc/fastdeploy/vision/visualize/visualize.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef ENABLE_VISION_VISUALIZE
+#pragma once
+
+#include "fastdeploy/vision/common/result.h"
+#include "opencv2/imgproc/imgproc.hpp"
+namespace fastdeploy {
+namespace vision {
+
+class FASTDEPLOY_DECL Visualize {
+ public:
+  static int num_classes_;
+  static std::vector<int> color_map_;
+  static const std::vector<int>& GetColorMap(int num_classes = 1000);
+  static cv::Mat VisDetection(const cv::Mat& im, const DetectionResult& result,
+                              int line_size = 1, float font_size = 0.5f);
+  static cv::Mat VisFaceDetection(const cv::Mat& im,
+                                  const FaceDetectionResult& result,
+                                  int line_size = 1, float font_size = 0.5f);
+  static cv::Mat VisSegmentation(const cv::Mat& im,
+                                 const SegmentationResult& result);
+  static cv::Mat VisMattingAlpha(const cv::Mat& im, const MattingResult& result,
+                                 bool remove_small_connected_area = false);
+};
+
+} // namespace vision
+} // namespace fastdeploy
+#endif
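The color map generated above is the PASCAL VOC palette: on iteration j, the low three bits of the running label are scattered into bit (7 - j) of the B, G, and R entries. A standalone re-derivation for spot-checking individual labels (LabelColor is a helper written here for illustration, not part of the API):

#include <array>
#include <cstdio>

// Same bit scheme as Visualize::GetColorMap, for a single label.
std::array<int, 3> LabelColor(int label) {
  std::array<int, 3> c = {0, 0, 0};
  for (int j = 0; label; ++j, label >>= 3) {
    c[0] |= ((label >> 0) & 1) << (7 - j);
    c[1] |= ((label >> 1) & 1) << (7 - j);
    c[2] |= ((label >> 2) & 1) << (7 - j);
  }
  return c;
}

int main() {
  auto c1 = LabelColor(1);  // {128, 0, 0}
  auto c2 = LabelColor(2);  // {0, 128, 0}
  std::printf("label 1 -> (%d, %d, %d)\n", c1[0], c1[1], c1[2]);
  std::printf("label 2 -> (%d, %d, %d)\n", c2[0], c2[1], c2[2]);
  return 0;
}

Per the header comments, visualizing more than the default 1000 classes means calling Visualize::GetColorMap(num_classes) once up front to regenerate a larger map.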
diff --git a/csrc/fastdeploy/vision/visualize/visualize_pybind.cc b/csrc/fastdeploy/vision/visualize/visualize_pybind.cc
new file mode 100644
index 000000000..36010acf1
--- /dev/null
+++ b/csrc/fastdeploy/vision/visualize/visualize_pybind.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindVisualize(pybind11::module& m) {
+  pybind11::class_<vision::Visualize>(m, "Visualize")
+      .def(pybind11::init<>())
+      .def_static("vis_detection",
+                  [](pybind11::array& im_data, vision::DetectionResult& result,
+                     int line_size, float font_size) {
+                    auto im = PyArrayToCvMat(im_data);
+                    auto vis_im = vision::Visualize::VisDetection(
+                        im, result, line_size, font_size);
+                    FDTensor out;
+                    vision::Mat(vis_im).ShareWithTensor(&out);
+                    return TensorToPyArray(out);
+                  })
+      .def_static(
+          "vis_face_detection",
+          [](pybind11::array& im_data, vision::FaceDetectionResult& result,
+             int line_size, float font_size) {
+            auto im = PyArrayToCvMat(im_data);
+            auto vis_im = vision::Visualize::VisFaceDetection(
+                im, result, line_size, font_size);
+            FDTensor out;
+            vision::Mat(vis_im).ShareWithTensor(&out);
+            return TensorToPyArray(out);
+          })
+      .def_static(
+          "vis_segmentation",
+          [](pybind11::array& im_data, vision::SegmentationResult& result) {
+            cv::Mat im = PyArrayToCvMat(im_data);
+            auto vis_im = vision::Visualize::VisSegmentation(im, result);
+            FDTensor out;
+            vision::Mat(vis_im).ShareWithTensor(&out);
+            return TensorToPyArray(out);
+          })
+      .def_static("vis_matting_alpha",
+                  [](pybind11::array& im_data, vision::MattingResult& result,
+                     bool remove_small_connected_area) {
+                    cv::Mat im = PyArrayToCvMat(im_data);
+                    auto vis_im = vision::Visualize::VisMattingAlpha(
+                        im, result, remove_small_connected_area);
+                    FDTensor out;
+                    vision::Mat(vis_im).ShareWithTensor(&out);
+                    return TensorToPyArray(out);
+                  });
+}
+} // namespace fastdeploy
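Every lambda above ends with the same cv::Mat -> FDTensor -> numpy round trip. A sketch of that tail factored into a helper, using only names already present in this file; whether ShareWithTensor shares or TensorToPyArray copies the pixel buffer is not established by this diff, so the lifetime behavior here is an assumption:

#include "fastdeploy/pybind/main.h"

namespace fastdeploy {
// Hypothetical helper, not part of the diff: wrap a visualized cv::Mat and
// hand it back to Python the way each def_static above does inline.
auto CvMatToPyArray(cv::Mat vis_im) {
  FDTensor out;
  vision::Mat(vis_im).ShareWithTensor(&out);  // expose pixels as an FDTensor
  return TensorToPyArray(out);                // convert for return to Python
}
} // namespace fastdeploy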