FastDeploy/fastdeploy/vision/ocr/ppocr/structurev2_layout_postprocessor.cc
DefTruth 6d0261e9e4 [Model] Support PP-StructureV2-Layout model (#1867)
* [Model] init pp-structurev2-layout code

* [Model] add structurev2_layout_preprocessor

* [PP-StructureV2] add postprocessor and layout detector class

* [pybind] add pp-structurev2-layout model pybind

* [Bug Fix] fixed code style

* [examples] add pp-structurev2-layout c++ examples

* [PP-StructureV2] add python example and docs

* [benchmark] add pp-structurev2-layout benchmark support
2023-05-05 13:05:58 +08:00

// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy/vision/ocr/ppocr/structurev2_layout_postprocessor.h"

#include <functional>
#include <numeric>

#include "fastdeploy/vision/ocr/ppocr/utils/ocr_utils.h"
#include "fastdeploy/vision/utils/utils.h"

namespace fastdeploy {
namespace vision {
namespace ocr {

bool StructureV2LayoutPostprocessor::Run(
    const std::vector<FDTensor>& tensors, std::vector<DetectionResult>* results,
    const std::vector<std::array<int, 4>>& batch_layout_img_info) {
  // A StructureV2Layout model has 8 output tensors; each FPN level yields one
  // class-score tensor and one box-distribution tensor, which are decoded by
  // GFL regression (DisPred2Bbox). Reference:
  // PaddleOCR/blob/release/2.6/deploy/cpp_infer/src/postprocess_op.cpp#L511
  int tensor_size = tensors.size();
  FDASSERT(tensor_size == 8,
           "StructureV2Layout should have 8 output tensors, "
           "but got %d now!",
           tensor_size)
  FDASSERT((tensor_size / 2) == fpn_stride_.size(),
           "found (tensor_size / 2) != fpn_stride_.size() !")
  // TODO(qiuyanjun): may need to reorder the tensors according to
  // fpn_stride_ and the shape of output tensors.
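  // Layout assumed by this decoder: tensors[0 .. fpn_stride_.size() - 1] hold
  // per-level class scores with shape [batch, H * W, num_class], and the
  // remaining tensors hold the matching box distributions with shape
  // [batch, H * W, 4 * reg_max].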
  size_t batch = tensors[0].Shape()[0];  // [batch, ...]
  results->resize(batch);
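  // Infer reg_max from the first box tensor: its last dim packs 4 box sides,
  // each with reg_max distribution bins.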
  SetRegMax(tensors[fpn_stride_.size()].Shape()[2] / 4);
  for (int batch_idx = 0; batch_idx < batch; ++batch_idx) {
    std::vector<FDTensor> single_batch_tensors(8);
    SetSingleBatchExternalData(tensors, single_batch_tensors, batch_idx);
    SingleBatchPostprocessor(single_batch_tensors,
                             batch_layout_img_info[batch_idx],
                             &results->at(batch_idx));
  }
  return true;
}
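
// Builds zero-copy views over the batch_idx-th sample of every output tensor:
// each view keeps the original dtype and device and points at the sample's
// slice via SetExternalData, so no data is copied.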
void StructureV2LayoutPostprocessor::SetSingleBatchExternalData(
    const std::vector<FDTensor>& tensors,
    std::vector<FDTensor>& single_batch_tensors, size_t batch_idx) {
  single_batch_tensors.resize(tensors.size());
  for (int j = 0; j < tensors.size(); ++j) {
    auto j_shape = tensors[j].Shape();
    j_shape[0] = 1;  // process one sample (b=1) per loop
    size_t j_step = std::accumulate(j_shape.begin(), j_shape.end(), 1,
                                    std::multiplies<int>());
    const float* j_data_ptr =
        reinterpret_cast<const float*>(tensors[j].Data());
    const float* j_start_ptr = j_data_ptr + j_step * batch_idx;
    single_batch_tensors[j].SetExternalData(
        j_shape, tensors[j].Dtype(),
        const_cast<void*>(reinterpret_cast<const void*>(j_start_ptr)),
        tensors[j].device, tensors[j].device_id);
  }
}
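
// Decodes one sample: for every FPN level, take the per-cell class scores and
// box distributions, keep cells whose best class score exceeds
// score_threshold_, run per-class NMS, then rescale the surviving boxes from
// the resized input back to the original image size.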
bool StructureV2LayoutPostprocessor::SingleBatchPostprocessor(
    const std::vector<FDTensor>& single_batch_tensors,
    const std::array<int, 4>& layout_img_info, DetectionResult* result) {
  FDASSERT(single_batch_tensors.size() == 8,
           "StructureV2Layout should have 8 output tensors, "
           "but got %d now!",
           static_cast<int>(single_batch_tensors.size()))
  // layout_img_info: {image width, image height, resize width, resize height}
  int img_w = layout_img_info[0];
  int img_h = layout_img_info[1];
  int in_w = layout_img_info[2];
  int in_h = layout_img_info[3];
  float scale_factor_w = static_cast<float>(in_w) / static_cast<float>(img_w);
  float scale_factor_h = static_cast<float>(in_h) / static_cast<float>(img_h);
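  // Boxes are decoded in the resized-input coordinate system; dividing by
  // these factors maps them back to the original image.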
  std::vector<DetectionResult> bbox_results;
  bbox_results.resize(num_class_);  // tmp result for each class
  // decode score, label, box
  for (int i = 0; i < fpn_stride_.size(); ++i) {
    int feature_h = std::ceil(static_cast<float>(in_h) / fpn_stride_[i]);
    int feature_w = std::ceil(static_cast<float>(in_w) / fpn_stride_[i]);
    const FDTensor& prob_tensor = single_batch_tensors[i];
    const FDTensor& bbox_tensor = single_batch_tensors[i + fpn_stride_.size()];
    const float* prob_data =
        reinterpret_cast<const float*>(prob_tensor.Data());
    const float* bbox_data =
        reinterpret_cast<const float*>(bbox_tensor.Data());
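    // Each cell of the stride-fpn_stride_[i] feature map predicts num_class_
    // scores and a 4 * reg_max box distribution; idx is the flattened
    // (row, col) position.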
    for (int idx = 0; idx < feature_h * feature_w; ++idx) {
      // score and label
      float score = 0.f;
      int label = 0;
      for (int j = 0; j < num_class_; ++j) {
        if (prob_data[idx * num_class_ + j] > score) {
          score = prob_data[idx * num_class_ + j];
          label = j;
        }
      }
      // bbox
      if (score > score_threshold_) {
        int row = idx / feature_w;
        int col = idx % feature_w;
        std::vector<float> bbox_pred(bbox_data + idx * 4 * reg_max_,
                                     bbox_data + (idx + 1) * 4 * reg_max_);
        bbox_results[label].boxes.push_back(DisPred2Bbox(
            bbox_pred, col, row, fpn_stride_[i], in_w, in_h, reg_max_));
        bbox_results[label].scores.push_back(score);
        bbox_results[label].label_ids.push_back(label);
      }
    }
  }
  result->Clear();
  // per-class NMS, i in [0, num_class_ - 1]
  for (int i = 0; i < bbox_results.size(); ++i) {
    if (bbox_results[i].boxes.empty()) {
      continue;
    }
    vision::utils::NMS(&bbox_results[i], nms_threshold_);
    // fill output results
    for (int j = 0; j < bbox_results[i].boxes.size(); ++j) {
      result->scores.push_back(bbox_results[i].scores[j]);
      result->label_ids.push_back(bbox_results[i].label_ids[j]);
      result->boxes.push_back({
          bbox_results[i].boxes[j][0] / scale_factor_w,
          bbox_results[i].boxes[j][1] / scale_factor_h,
          bbox_results[i].boxes[j][2] / scale_factor_w,
          bbox_results[i].boxes[j][3] / scale_factor_h,
      });
    }
  }
  return true;
}
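
// GFL box decoding: for each of the 4 box sides, softmax over the reg_max
// distribution bins gives a discrete probability, and the expected bin index
// (times the FPN stride) is the predicted distance from the cell center.
// Illustrative example (values assumed, not from the model): with reg_max = 8,
// stride = 8, and all softmax mass on bin 3, the decoded distance is
// 3 * 8 = 24 pixels. The box is then clipped to the resized input.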
std::array<float, 4> StructureV2LayoutPostprocessor::DisPred2Bbox(
    const std::vector<float>& bbox_pred, int x, int y, int stride, int resize_w,
    int resize_h, int reg_max) {
  float ct_x = (static_cast<float>(x) + 0.5f) * static_cast<float>(stride);
  float ct_y = (static_cast<float>(y) + 0.5f) * static_cast<float>(stride);
  std::vector<float> dis_pred;
  dis_pred.resize(4);
  for (int i = 0; i < 4; i++) {
    std::vector<float> bbox_pred_i(bbox_pred.begin() + i * reg_max,
                                   bbox_pred.begin() + (i + 1) * reg_max);
    std::vector<float> dis_after_sm = ocr::Softmax(bbox_pred_i);
    float dis = 0.0f;
    for (int j = 0; j < reg_max; j++) {
      dis += static_cast<float>(j) * dis_after_sm[j];
    }
    dis *= static_cast<float>(stride);
    dis_pred[i] = dis;
  }
  float xmin = std::max(ct_x - dis_pred[0], 0.0f);
  float ymin = std::max(ct_y - dis_pred[1], 0.0f);
  float xmax = std::min(ct_x + dis_pred[2], static_cast<float>(resize_w));
  float ymax = std::min(ct_y + dis_pred[3], static_cast<float>(resize_h));
  return {xmin, ymin, xmax, ymax};
}

} // namespace ocr
} // namespace vision
} // namespace fastdeploy