mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-07 17:41:52 +08:00

* [Model] init pp-structurev2-layout code * [Model] init pp-structurev2-layout code * [Model] init pp-structurev2-layout code * [Model] add structurev2_layout_preprocessor * [PP-StructureV2] add postprocessor and layout detector class * [PP-StructureV2] add postprocessor and layout detector class * [PP-StructureV2] add postprocessor and layout detector class * [PP-StructureV2] add postprocessor and layout detector class * [PP-StructureV2] add postprocessor and layout detector class * [pybind] add pp-structurev2-layout model pybind * [pybind] add pp-structurev2-layout model pybind * [Bug Fix] fixed code style * [examples] add pp-structurev2-layout c++ examples * [PP-StructureV2] add python example and docs * [benchmark] add pp-structurev2-layout benchmark support
172 lines
7.0 KiB
C++
172 lines
7.0 KiB
C++
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "fastdeploy/vision/ocr/ppocr/structurev2_layout_postprocessor.h"
|
|
#include "fastdeploy/vision/ocr/ppocr/utils/ocr_utils.h"
|
|
#include "fastdeploy/vision/utils/utils.h"
|
|
|
|
namespace fastdeploy {
|
|
namespace vision {
|
|
namespace ocr {
|
|
|
|
bool StructureV2LayoutPostprocessor::Run(
|
|
const std::vector<FDTensor>& tensors, std::vector<DetectionResult>* results,
|
|
const std::vector<std::array<int, 4>>& batch_layout_img_info) {
|
|
// A StructureV2Layout has 8 output tensors on which it then runs
|
|
// a GFL regression (namely, DisPred2Box), reference:
|
|
// PaddleOCR/blob/release/2.6/deploy/cpp_infer/src/postprocess_op.cpp#L511
|
|
int tensor_size = tensors.size();
|
|
FDASSERT(tensor_size == 8,
|
|
"StructureV2Layout should has 8 output tensors,"
|
|
"but got %d now!",
|
|
tensor_size)
|
|
FDASSERT((tensor_size / 2) == fpn_stride_.size(),
|
|
"found (tensor_size / 2) != fpn_stride_.size() !")
|
|
// TODO(qiuyanjun): may need to reorder the tensors according to
|
|
// fpn_stride_ and the shape of output tensors.
|
|
size_t batch = tensors[0].Shape()[0]; // [batch, ...]
|
|
|
|
results->resize(batch);
|
|
SetRegMax(tensors[fpn_stride_.size()].Shape()[2] / 4);
|
|
for (int batch_idx = 0; batch_idx < batch; ++batch_idx) {
|
|
std::vector<FDTensor> single_batch_tensors(8);
|
|
SetSingleBatchExternalData(tensors, single_batch_tensors, batch_idx);
|
|
SingleBatchPostprocessor(single_batch_tensors,
|
|
batch_layout_img_info[batch_idx],
|
|
&results->at(batch_idx));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void StructureV2LayoutPostprocessor::SetSingleBatchExternalData(
|
|
const std::vector<FDTensor>& tensors,
|
|
std::vector<FDTensor>& single_batch_tensors, size_t batch_idx) {
|
|
single_batch_tensors.resize(tensors.size());
|
|
for (int j = 0; j < tensors.size(); ++j) {
|
|
auto j_shape = tensors[j].Shape();
|
|
j_shape[0] = 1; // process b=1 per loop
|
|
size_t j_step =
|
|
accumulate(j_shape.begin(), j_shape.end(), 1, std::multiplies<int>());
|
|
const float* j_data_ptr = reinterpret_cast<const float*>(tensors[j].Data());
|
|
const float* j_start_ptr = j_data_ptr + j_step * batch_idx;
|
|
single_batch_tensors[j].SetExternalData(
|
|
j_shape, tensors[j].Dtype(),
|
|
const_cast<void*>(reinterpret_cast<const void*>(j_start_ptr)),
|
|
tensors[j].device, tensors[j].device_id);
|
|
}
|
|
}
|
|
|
|
bool StructureV2LayoutPostprocessor::SingleBatchPostprocessor(
|
|
const std::vector<FDTensor>& single_batch_tensors,
|
|
const std::array<int, 4>& layout_img_info, DetectionResult* result) {
|
|
FDASSERT(single_batch_tensors.size() == 8,
|
|
"StructureV2Layout should has 8 output tensors,"
|
|
"but got %d now!",
|
|
static_cast<int>(single_batch_tensors.size()))
|
|
// layout_img_info: {image width, image height, resize width, resize height}
|
|
int img_w = layout_img_info[0];
|
|
int img_h = layout_img_info[1];
|
|
int in_w = layout_img_info[2];
|
|
int in_h = layout_img_info[3];
|
|
float scale_factor_w = static_cast<float>(in_w) / static_cast<float>(img_w);
|
|
float scale_factor_h = static_cast<float>(in_h) / static_cast<float>(img_h);
|
|
|
|
std::vector<DetectionResult> bbox_results;
|
|
bbox_results.resize(num_class_); // tmp result for each class
|
|
|
|
// decode score, label, box
|
|
for (int i = 0; i < fpn_stride_.size(); ++i) {
|
|
int feature_h = std::ceil(static_cast<float>(in_h) / fpn_stride_[i]);
|
|
int feature_w = std::ceil(static_cast<float>(in_w) / fpn_stride_[i]);
|
|
const FDTensor& prob_tensor = single_batch_tensors[i];
|
|
const FDTensor& bbox_tensor = single_batch_tensors[i + fpn_stride_.size()];
|
|
const float* prob_data = reinterpret_cast<const float*>(prob_tensor.Data());
|
|
const float* bbox_data = reinterpret_cast<const float*>(bbox_tensor.Data());
|
|
for (int idx = 0; idx < feature_h * feature_w; ++idx) {
|
|
// score and label
|
|
float score = 0.f;
|
|
int label = 0;
|
|
for (int j = 0; j < num_class_; ++j) {
|
|
if (prob_data[idx * num_class_ + j] > score) {
|
|
score = prob_data[idx * num_class_ + j];
|
|
label = j;
|
|
}
|
|
}
|
|
// bbox
|
|
if (score > score_threshold_) {
|
|
int row = idx / feature_w;
|
|
int col = idx % feature_w;
|
|
std::vector<float> bbox_pred(bbox_data + idx * 4 * reg_max_,
|
|
bbox_data + (idx + 1) * 4 * reg_max_);
|
|
bbox_results[label].boxes.push_back(DisPred2Bbox(
|
|
bbox_pred, col, row, fpn_stride_[i], in_w, in_h, reg_max_));
|
|
bbox_results[label].scores.push_back(score);
|
|
bbox_results[label].label_ids.push_back(label);
|
|
}
|
|
}
|
|
}
|
|
|
|
result->Clear();
|
|
// nms for per class, i in [0~num_class-1]
|
|
for (int i = 0; i < bbox_results.size(); ++i) {
|
|
if (bbox_results[i].boxes.size() <= 0) {
|
|
continue;
|
|
}
|
|
vision::utils::NMS(&bbox_results[i], nms_threshold_);
|
|
// fill output results
|
|
for (int j = 0; j < bbox_results[i].boxes.size(); ++j) {
|
|
result->scores.push_back(bbox_results[i].scores[j]);
|
|
result->label_ids.push_back(bbox_results[i].label_ids[j]);
|
|
result->boxes.push_back({
|
|
bbox_results[i].boxes[j][0] / scale_factor_w,
|
|
bbox_results[i].boxes[j][1] / scale_factor_h,
|
|
bbox_results[i].boxes[j][2] / scale_factor_w,
|
|
bbox_results[i].boxes[j][3] / scale_factor_h,
|
|
});
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
std::array<float, 4> StructureV2LayoutPostprocessor::DisPred2Bbox(
|
|
const std::vector<float>& bbox_pred, int x, int y, int stride, int resize_w,
|
|
int resize_h, int reg_max) {
|
|
float ct_x = (static_cast<float>(x) + 0.5f) * static_cast<float>(stride);
|
|
float ct_y = (static_cast<float>(y) + 0.5f) * static_cast<float>(stride);
|
|
std::vector<float> dis_pred;
|
|
dis_pred.resize(4);
|
|
for (int i = 0; i < 4; i++) {
|
|
std::vector<float> bbox_pred_i(bbox_pred.begin() + i * reg_max,
|
|
bbox_pred.begin() + (i + 1) * reg_max);
|
|
std::vector<float> dis_after_sm = ocr::Softmax(bbox_pred_i);
|
|
float dis = 0.0f;
|
|
for (int j = 0; j < reg_max; j++) {
|
|
dis += static_cast<float>(j) * dis_after_sm[j];
|
|
}
|
|
dis *= static_cast<float>(stride);
|
|
dis_pred[i] = dis;
|
|
}
|
|
|
|
float xmin = std::max(ct_x - dis_pred[0], 0.0f);
|
|
float ymin = std::max(ct_y - dis_pred[1], 0.0f);
|
|
float xmax = std::min(ct_x + dis_pred[2], static_cast<float>(resize_w));
|
|
float ymax = std::min(ct_y + dis_pred[3], static_cast<float>(resize_h));
|
|
|
|
return {xmin, ymin, xmax, ymax};
|
|
}
|
|
|
|
} // namespace ocr
|
|
} // namespace vision
|
|
} // namespace fastdeploy
|