[Benchmark]Compare diff for OCR (#1415)

* avoid mem copy for cpp benchmark * set CMAKE_BUILD_TYPE to Release * Add SegmentationDiff * change pointer to reference * fixed bug * cast uint8 to int32 * Add diff compare for OCR * Add diff compare for OCR * rm ppocr pipeline * Add yolov5 diff compare * Add yolov5 diff compare * deal with comments * deal with comments * fixed bug * fixed bug
2025-10-05 16:48:03 +08:00 · 2023-02-23 18:57:39 +08:00
parent 0c664fd006
commit d3845eb4e1
38 changed files with 513 additions and 255 deletions
--- a/benchmark/cpp/CMakeLists.txt
+++ b/benchmark/cpp/CMakeLists.txt
@@ -13,7 +13,9 @@ add_executable(benchmark_ppyolov8 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov8.cc)
 add_executable(benchmark_ppcls ${PROJECT_SOURCE_DIR}/benchmark_ppcls.cc)
 add_executable(benchmark_precision_ppyolov8 ${PROJECT_SOURCE_DIR}/benchmark_precision_ppyolov8.cc)
 add_executable(benchmark_ppseg ${PROJECT_SOURCE_DIR}/benchmark_ppseg.cc)
-add_executable(benchmark_ppocr ${PROJECT_SOURCE_DIR}/benchmark_ppocr.cc)
+add_executable(benchmark_ppocr_det ${PROJECT_SOURCE_DIR}/benchmark_ppocr_det.cc)
+add_executable(benchmark_ppocr_cls ${PROJECT_SOURCE_DIR}/benchmark_ppocr_cls.cc)
+add_executable(benchmark_ppocr_rec ${PROJECT_SOURCE_DIR}/benchmark_ppocr_rec.cc)

 if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
  target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags pthread)
@@ -21,12 +23,16 @@ if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
  target_link_libraries(benchmark_ppcls ${FASTDEPLOY_LIBS} gflags pthread)
  target_link_libraries(benchmark_precision_ppyolov8 ${FASTDEPLOY_LIBS} gflags pthread)
  target_link_libraries(benchmark_ppseg ${FASTDEPLOY_LIBS} gflags pthread)
-  target_link_libraries(benchmark_ppocr ${FASTDEPLOY_LIBS} gflags pthread)
+  target_link_libraries(benchmark_ppocr_det ${FASTDEPLOY_LIBS} gflags pthread)
+  target_link_libraries(benchmark_ppocr_cls ${FASTDEPLOY_LIBS} gflags pthread)
+  target_link_libraries(benchmark_ppocr_rec ${FASTDEPLOY_LIBS} gflags pthread)
 else()
  target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags)
  target_link_libraries(benchmark_ppyolov8 ${FASTDEPLOY_LIBS} gflags)
  target_link_libraries(benchmark_ppcls ${FASTDEPLOY_LIBS} gflags)
  target_link_libraries(benchmark_precision_ppyolov8 ${FASTDEPLOY_LIBS} gflags)
  target_link_libraries(benchmark_ppseg ${FASTDEPLOY_LIBS} gflags)
-  target_link_libraries(benchmark_ppocr ${FASTDEPLOY_LIBS} gflags)
+  target_link_libraries(benchmark_ppocr_det ${FASTDEPLOY_LIBS} gflags)
+  target_link_libraries(benchmark_ppocr_cls ${FASTDEPLOY_LIBS} gflags)
+  target_link_libraries(benchmark_ppocr_rec ${FASTDEPLOY_LIBS} gflags)
 endif()
--- a/benchmark/cpp/benchmark_ppocr.cc
+++ b/benchmark/cpp/benchmark_ppocr.cc
@@ -1,97 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "flags.h"
-#include "macros.h"
-#include "option.h"
-
-// Only for ppocr
-DEFINE_string(det_model, "", "Path of Detection model of PPOCR.");
-DEFINE_string(cls_model, "", "Path of Classification model of PPOCR.");
-DEFINE_string(rec_model, "", "Path of Recognization model of PPOCR.");
-DEFINE_string(rec_label_file, "", "Path of Recognization label file of PPOCR.");
-DEFINE_string(image_rec, "", "Path of Recognization img file of PPOCR.");
-
-int main(int argc, char* argv[]) {
-#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
-  // Initialization
-  auto option = fastdeploy::RuntimeOption();
-  if (!CreateRuntimeOption(&option, argc, argv, true)) {
-    return -1;
-  }
-  auto im = cv::imread(FLAGS_image);
-  auto im_rec = cv::imread(FLAGS_image_rec);
-  // Detection Model
-  auto det_model_file =
-      FLAGS_model + sep + FLAGS_det_model + sep + "inference.pdmodel";
-  auto det_params_file =
-      FLAGS_model + sep + FLAGS_det_model + sep + "inference.pdiparams";
-  // Classification Model
-  auto cls_model_file =
-      FLAGS_model + sep + FLAGS_cls_model + sep + "inference.pdmodel";
-  auto cls_params_file =
-      FLAGS_model + sep + FLAGS_cls_model + sep + "inference.pdiparams";
-  // Recognition Model
-  auto rec_model_file =
-      FLAGS_model + sep + FLAGS_rec_model + sep + "inference.pdmodel";
-  auto rec_params_file =
-      FLAGS_model + sep + FLAGS_rec_model + sep + "inference.pdiparams";
-  auto rec_label_file = FLAGS_rec_label_file;
-  if (FLAGS_backend == "paddle_trt") {
-    option.paddle_infer_option.collect_trt_shape = true;
-  }
-  auto det_option = option;
-  auto cls_option = option;
-  auto rec_option = option;
-  if (FLAGS_backend == "paddle_trt" || FLAGS_backend == "trt") {
-    det_option.trt_option.SetShape("x", {1, 3, 64, 64}, {1, 3, 640, 640},
-                                   {1, 3, 960, 960});
-    cls_option.trt_option.SetShape("x", {1, 3, 48, 10}, {4, 3, 48, 320},
-                                   {8, 3, 48, 1024});
-    rec_option.trt_option.SetShape("x", {1, 3, 48, 10}, {4, 3, 48, 320},
-                                   {8, 3, 48, 2304});
-  }
-  auto det_model = fastdeploy::vision::ocr::DBDetector(
-      det_model_file, det_params_file, det_option);
-  auto cls_model = fastdeploy::vision::ocr::Classifier(
-      cls_model_file, cls_params_file, cls_option);
-  auto rec_model = fastdeploy::vision::ocr::Recognizer(
-      rec_model_file, rec_params_file, rec_label_file, rec_option);
-  // Only for runtime
-  if (FLAGS_profile_mode == "runtime") {
-    std::vector<std::array<int, 8>> boxes_result;
-    std::cout << "====Detection model====" << std::endl;
-    BENCHMARK_MODEL(det_model, det_model.Predict(im, &boxes_result));
-    int32_t cls_label;
-    float cls_score;
-    std::cout << "====Classification model====" << std::endl;
-    BENCHMARK_MODEL(cls_model,
-                    cls_model.Predict(im_rec, &cls_label, &cls_score));
-    std::string text;
-    float rec_score;
-    std::cout << "====Recognization model====" << std::endl;
-    BENCHMARK_MODEL(rec_model, rec_model.Predict(im_rec, &text, &rec_score));
-  }
-  auto model_ppocrv3 =
-      fastdeploy::pipeline::PPOCRv3(&det_model, &cls_model, &rec_model);
-  fastdeploy::vision::OCRResult res;
-  if (FLAGS_profile_mode == "end2end") {
-    BENCHMARK_MODEL(model_ppocrv3, model_ppocrv3.Predict(im, &res))
-  }
-  auto vis_im = fastdeploy::vision::VisOcr(im, res);
-  cv::imwrite("vis_result.jpg", vis_im);
-  std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
-#endif
-  return 0;
-}
--- a/benchmark/cpp/benchmark_ppocr_cls.cc
+++ b/benchmark/cpp/benchmark_ppocr_cls.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "flags.h"
+#include "macros.h"
+#include "option.h"
+
+/// Benchmark entry point for the PP-OCR classification model.
+///
+/// Builds the runtime option from the command-line flags, runs one warm-up
+/// prediction, prints the label/score difference against hard-coded expected
+/// values and finally hands the model to BENCHMARK_MODEL.
+///
+/// @return -1 if the runtime option cannot be created or the warm-up
+///         prediction fails, 0 otherwise.
+int main(int argc, char* argv[]) {
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
+  // Initialization
+  auto option = fastdeploy::RuntimeOption();
+  if (!CreateRuntimeOption(&option, argc, argv, true)) {
+    return -1;
+  }
+  auto im = cv::imread(FLAGS_image);
+  // Classification Model
+  auto cls_model_file = FLAGS_model + sep + "inference.pdmodel";
+  auto cls_params_file = FLAGS_model + sep + "inference.pdiparams";
+  if (FLAGS_backend == "paddle_trt") {
+    option.paddle_infer_option.collect_trt_shape = true;
+  }
+  if (FLAGS_backend == "paddle_trt" || FLAGS_backend == "trt") {
+    option.trt_option.SetShape("x", {1, 3, 48, 10}, {4, 3, 48, 320},
+                               {8, 3, 48, 1024});
+  }
+  auto model_ppocr_cls = fastdeploy::vision::ocr::Classifier(
+      cls_model_file, cls_params_file, option);
+  // Give the outputs defined values so nothing indeterminate is ever read.
+  int32_t res_label = 0;
+  float res_score = 0.0f;
+  // Run once at least; a diff against a failed prediction is meaningless.
+  if (!model_ppocr_cls.Predict(im, &res_label, &res_score)) {
+    std::cerr << "Failed to predict." << std::endl;
+    return -1;
+  }
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  const int32_t res_label_expect = 0;
+  const float res_score_expect = 1.0f;
+  // Calculate diff between two results.
+  const auto ppocr_cls_label_diff = res_label - res_label_expect;
+  const float ppocr_cls_score_diff = res_score - res_score_expect;
+  // Take the magnitude by hand: an unqualified abs() may resolve to the
+  // integer overload and truncate the float difference to 0.
+  const float ppocr_cls_score_abs_diff = ppocr_cls_score_diff < 0.0f
+                                             ? -ppocr_cls_score_diff
+                                             : ppocr_cls_score_diff;
+  std::cout << "PPOCR Cls label diff: " << ppocr_cls_label_diff << std::endl;
+  std::cout << "PPOCR Cls score diff: " << ppocr_cls_score_abs_diff
+            << std::endl;
+  BENCHMARK_MODEL(model_ppocr_cls,
+                  model_ppocr_cls.Predict(im, &res_label, &res_score));
+#endif
+  return 0;
+}
--- a/benchmark/cpp/benchmark_ppocr_det.cc
+++ b/benchmark/cpp/benchmark_ppocr_det.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "flags.h"
+#include "macros.h"
+#include "option.h"
+
+namespace vision = fastdeploy::vision;
+namespace benchmark = fastdeploy::benchmark;
+
+/// Benchmark entry point for the PP-OCR detection model.
+///
+/// Builds the runtime option from the command-line flags, runs one warm-up
+/// prediction, round-trips the detected boxes through a text file to report
+/// the box diff statistics and finally hands the model to BENCHMARK_MODEL.
+///
+/// @return -1 if the runtime option cannot be created or the warm-up
+///         prediction fails, 0 otherwise.
+int main(int argc, char* argv[]) {
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
+  // Initialization
+  auto option = fastdeploy::RuntimeOption();
+  if (!CreateRuntimeOption(&option, argc, argv, true)) {
+    return -1;
+  }
+  auto im = cv::imread(FLAGS_image);
+  // Detection Model
+  auto det_model_file = FLAGS_model + sep + "inference.pdmodel";
+  auto det_params_file = FLAGS_model + sep + "inference.pdiparams";
+  if (FLAGS_backend == "paddle_trt") {
+    option.paddle_infer_option.collect_trt_shape = true;
+  }
+  if (FLAGS_backend == "paddle_trt" || FLAGS_backend == "trt") {
+    option.trt_option.SetShape("x", {1, 3, 64, 64}, {1, 3, 640, 640},
+                               {1, 3, 960, 960});
+  }
+  auto model_ppocr_det =
+      vision::ocr::DBDetector(det_model_file, det_params_file, option);
+  std::vector<std::array<int, 8>> res;
+  // Run once at least; a diff against a failed prediction is meaningless.
+  if (!model_ppocr_det.Predict(im, &res)) {
+    std::cerr << "Failed to predict." << std::endl;
+    return -1;
+  }
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  const std::string ppocr_det_result_path = "ppocr_det_result.txt";
+  std::vector<std::array<int, 8>> res_loaded;
+  // Save result to -> disk, then load it back <- from disk. Only report the
+  // diff when both steps succeed; otherwise the comparison would run against
+  // an empty result or a stale file left over from an earlier run.
+  const bool round_trip_ok =
+      benchmark::ResultManager::SaveOCRDetResult(res,
+                                                 ppocr_det_result_path) &&
+      benchmark::ResultManager::LoadOCRDetResult(&res_loaded,
+                                                 ppocr_det_result_path);
+  if (round_trip_ok) {
+    // Calculate diff between two results.
+    const auto ppocr_det_diff =
+        benchmark::ResultManager::CalculateDiffStatis(res, res_loaded);
+    std::cout << "PPOCR Boxes diff: mean=" << ppocr_det_diff.boxes.mean
+              << ", max=" << ppocr_det_diff.boxes.max
+              << ", min=" << ppocr_det_diff.boxes.min << std::endl;
+  } else {
+    std::cerr << "Skip result diff: failed to save/load "
+              << ppocr_det_result_path << std::endl;
+  }
+  BENCHMARK_MODEL(model_ppocr_det, model_ppocr_det.Predict(im, &res));
+#endif
+  return 0;
+}
--- a/benchmark/cpp/benchmark_ppocr_rec.cc
+++ b/benchmark/cpp/benchmark_ppocr_rec.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "flags.h"
+#include "macros.h"
+#include "option.h"
+
+DEFINE_string(rec_label_file, "", "Path of Recognization label file of PPOCR.");
+
+/// Benchmark entry point for the PP-OCR recognition model.
+///
+/// Builds the runtime option from the command-line flags, runs one warm-up
+/// prediction, prints the text/score difference against hard-coded expected
+/// values and finally hands the model to BENCHMARK_MODEL.
+///
+/// @return -1 if the runtime option cannot be created or the warm-up
+///         prediction fails, 0 otherwise.
+int main(int argc, char* argv[]) {
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
+  // Initialization
+  auto option = fastdeploy::RuntimeOption();
+  if (!CreateRuntimeOption(&option, argc, argv, true)) {
+    return -1;
+  }
+  auto im = cv::imread(FLAGS_image);
+  // Recognition Model
+  auto rec_model_file = FLAGS_model + sep + "inference.pdmodel";
+  auto rec_params_file = FLAGS_model + sep + "inference.pdiparams";
+  if (FLAGS_backend == "paddle_trt") {
+    option.paddle_infer_option.collect_trt_shape = true;
+  }
+  if (FLAGS_backend == "paddle_trt" || FLAGS_backend == "trt") {
+    option.trt_option.SetShape("x", {1, 3, 48, 10}, {4, 3, 48, 320},
+                               {8, 3, 48, 2304});
+  }
+  auto model_ppocr_rec = fastdeploy::vision::ocr::Recognizer(
+      rec_model_file, rec_params_file, FLAGS_rec_label_file, option);
+  std::string text;
+  // Give the score a defined value so nothing indeterminate is ever read.
+  float rec_score = 0.0f;
+  // Run once at least; a diff against a failed prediction is meaningless.
+  if (!model_ppocr_rec.Predict(im, &text, &rec_score)) {
+    std::cerr << "Failed to predict." << std::endl;
+    return -1;
+  }
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  const std::string text_expect = "上海斯格威铂尔大酒店";
+  const float res_score_expect = 0.993308;
+  // Calculate diff between two results.
+  // std::string::compare yields 0 when the two texts are identical.
+  const auto ppocr_rec_text_diff = text.compare(text_expect);
+  const float ppocr_rec_score_diff = rec_score - res_score_expect;
+  // Take the magnitude by hand: an unqualified abs() may resolve to the
+  // integer overload and truncate the float difference to 0.
+  const float ppocr_rec_score_abs_diff = ppocr_rec_score_diff < 0.0f
+                                             ? -ppocr_rec_score_diff
+                                             : ppocr_rec_score_diff;
+  std::cout << "PPOCR Rec text diff: " << ppocr_rec_text_diff << std::endl;
+  std::cout << "PPOCR Rec score diff: " << ppocr_rec_score_abs_diff
+            << std::endl;
+  BENCHMARK_MODEL(model_ppocr_rec,
+                  model_ppocr_rec.Predict(im, &text, &rec_score));
+#endif
+  return 0;
+}
--- a/benchmark/cpp/benchmark_yolov5.cc
+++ b/benchmark/cpp/benchmark_yolov5.cc
@@ -16,6 +16,9 @@
 #include "macros.h"
 #include "option.h"

+namespace vision = fastdeploy::vision;
+namespace benchmark = fastdeploy::benchmark;
+
 int main(int argc, char* argv[]) {
 #if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
  // Initialization
@@ -24,11 +27,29 @@ int main(int argc, char* argv[]) {
    return -1;
  }
  auto im = cv::imread(FLAGS_image);
-  auto model_yolov5 =
-      fastdeploy::vision::detection::YOLOv5(FLAGS_model, "", option);
-  fastdeploy::vision::DetectionResult res;
+  auto model_yolov5 = vision::detection::YOLOv5(FLAGS_model, "", option);
+  vision::DetectionResult res;
+  // Run once at least
+  model_yolov5.Predict(im, &res);
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  // Save result to -> disk.
+  std::string det_result_path = "yolov5_result.txt";
+  benchmark::ResultManager::SaveDetectionResult(res, det_result_path);
+  // Load result from <- disk.
+  vision::DetectionResult res_loaded;
+  benchmark::ResultManager::LoadDetectionResult(&res_loaded, det_result_path);
+  // Calculate diff between two results.
+  auto det_diff =
+      benchmark::ResultManager::CalculateDiffStatis(res, res_loaded);
+  std::cout << "Boxes diff: mean=" << det_diff.boxes.mean
+            << ", max=" << det_diff.boxes.max << ", min=" << det_diff.boxes.min
+            << std::endl;
+  std::cout << "Label_ids diff: mean=" << det_diff.labels.mean
+            << ", max=" << det_diff.labels.max
+            << ", min=" << det_diff.labels.min << std::endl;
  BENCHMARK_MODEL(model_yolov5, model_yolov5.Predict(im, &res))
-  auto vis_im = fastdeploy::vision::VisDetection(im, res);
+  auto vis_im = vision::VisDetection(im, res);
  cv::imwrite("vis_result.jpg", vis_im);
  std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
 #endif
--- a/benchmark/cpp/run_benchmark_ppyolov8.sh
+++ b/benchmark/cpp/run_benchmark_ppyolov8.sh
--- a/fastdeploy/benchmark/utils.cc
+++ b/fastdeploy/benchmark/utils.cc
@@ -474,6 +474,34 @@ bool ResultManager::SaveSegmentationResult(
  return true;
 }

+/// Write OCR detection boxes to a text file as a single line.
+///
+/// The line is "boxes" + KEY_VALUE_SEP followed by every box coordinate
+/// (8 per box, in box order) joined with VALUE_SEP, then a newline.
+///
+/// @param res  Boxes to save; must not be empty.
+/// @param path Destination file, overwritten if it already exists.
+/// @return false if `res` is empty, the file cannot be opened, or writing
+///         fails; true otherwise.
+bool ResultManager::SaveOCRDetResult(const std::vector<std::array<int, 8>>& res,
+                                     const std::string& path) {
+  if (res.empty()) {
+    FDERROR << "OCRDetResult can not be empty!" << std::endl;
+    return false;
+  }
+  std::ofstream fs(path, std::ios::out);
+  if (!fs.is_open()) {
+    FDERROR << "Fail to open file:" << path << std::endl;
+    return false;
+  }
+  // boxes
+  fs << "boxes" << KEY_VALUE_SEP;
+  // Emit the separator before every value except the first, so the line
+  // never ends with a dangling separator.
+  bool is_first_value = true;
+  for (const auto& box : res) {
+    for (const int coord : box) {
+      if (!is_first_value) {
+        fs << VALUE_SEP;
+      }
+      fs << coord;
+      is_first_value = false;
+    }
+  }
+  fs << "\n";
+  fs.close();
+  // Stream error flags are sticky, so this catches a failed write as well as
+  // a failed flush on close.
+  if (fs.fail()) {
+    FDERROR << "Fail to write file:" << path << std::endl;
+    return false;
+  }
+  return true;
+}
+
 bool ResultManager::LoadDetectionResult(vision::DetectionResult* res,
                                        const std::string& path) {
  if (!CheckFileExists(path)) {
@@ -556,6 +584,26 @@ bool ResultManager::LoadSegmentationResult(vision::SegmentationResult* res,
  return true;
 }

+/// Load OCR detection boxes from a text file.
+///
+/// Only the first line of the file is parsed. The values of its first
+/// key/value entry are regrouped into boxes of 8 ints; trailing values that
+/// do not fill a whole box are ignored.
+///
+/// @param res  Output boxes, resized to the number of complete boxes found.
+/// @param path File to read.
+/// @return false if the file does not exist, contains no lines, or its first
+///         line yields no data; true otherwise.
+/// @note std::stoi exceptions propagate if a value is not a valid integer.
+bool ResultManager::LoadOCRDetResult(std::vector<std::array<int, 8>>* res,
+                                     const std::string& path) {
+  if (!CheckFileExists(path)) {
+    FDERROR << "Can't found file from" << path << std::endl;
+    return false;
+  }
+  auto lines = ReadLines(path);
+  // Guard the lines[0] access below against an empty file.
+  if (lines.empty()) {
+    FDERROR << "No data found in file:" << path << std::endl;
+    return false;
+  }
+  // boxes
+  const std::map<std::string, std::vector<std::string>> data =
+      SplitDataLine(lines[0]);
+  // Guard the data.begin() dereference below against an unparsable line.
+  if (data.empty()) {
+    FDERROR << "No boxes found in file:" << path << std::endl;
+    return false;
+  }
+  const std::vector<std::string>& values = data.begin()->second;
+  const size_t boxes_num = values.size() / 8;
+  res->resize(boxes_num);
+  for (size_t i = 0; i < boxes_num; ++i) {
+    for (size_t j = 0; j < 8; ++j) {
+      (*res)[i][j] = std::stoi(values[i * 8 + j]);
+    }
+  }
+  return true;
+}
+
 DetectionDiff ResultManager::CalculateDiffStatis(
    const vision::DetectionResult& lhs, const vision::DetectionResult& rhs,
    const float& score_threshold) {
@@ -643,6 +691,31 @@ SegmentationDiff ResultManager::CalculateDiffStatis(
  return diff;
 }

+/// Compute diff statistics between two sets of OCR detection boxes.
+///
+/// Both inputs are copied and sorted with LexSortOCRDetResultByXY so that
+/// boxes are paired by position rather than by detection order. The signed
+/// coordinate differences (lhs - rhs) of the first min(lhs.size(),
+/// rhs.size()) pairs are then summarised; surplus boxes on the longer side
+/// are ignored.
+///
+/// @param lhs First box set.
+/// @param rhs Second box set.
+/// @return OCRDetDiff whose `boxes` statistics are filled in by
+///         CalculateStatisInfo.
+OCRDetDiff ResultManager::CalculateDiffStatis(
+    const std::vector<std::array<int, 8>>& lhs,
+    const std::vector<std::array<int, 8>>& rhs) {
+  std::vector<std::array<int, 8>> lhs_sort = lhs;
+  std::vector<std::array<int, 8>> rhs_sort = rhs;
+  // lex sort by x(w) & y(h)
+  vision::utils::LexSortOCRDetResultByXY(&lhs_sort);
+  vision::utils::LexSortOCRDetResultByXY(&rhs_sort);
+  // get value diff
+  const size_t boxes_num = std::min(lhs_sort.size(), rhs_sort.size());
+  std::vector<float> boxes_diff;
+  boxes_diff.reserve(boxes_num * 8);
+  for (size_t i = 0; i < boxes_num; ++i) {
+    for (size_t j = 0; j < 8; ++j) {
+      boxes_diff.push_back(
+          static_cast<float>(lhs_sort[i][j] - rhs_sort[i][j]));
+    }
+  }
+
+  OCRDetDiff diff;
+  // NOTE(review): boxes_diff is empty when either input has no boxes --
+  // confirm CalculateStatisInfo handles a zero-length buffer.
+  CalculateStatisInfo<float>(boxes_diff.data(), boxes_diff.size(),
+                             &(diff.boxes.mean), &(diff.boxes.max),
+                             &(diff.boxes.min));
+  return diff;
+}
+
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK

--- a/fastdeploy/benchmark/utils.h
+++ b/fastdeploy/benchmark/utils.h
@@ -122,6 +122,10 @@ struct FASTDEPLOY_DECL SegmentationDiff: public BaseDiff {
  EvalStatis labels;
 };

+struct FASTDEPLOY_DECL OCRDetDiff: public BaseDiff {  // Diff statistics between two OCR detection box sets.
+  EvalStatis boxes;  // mean/max/min of the per-coordinate box differences
+};
+
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK

@@ -148,6 +152,10 @@ struct FASTDEPLOY_DECL ResultManager {
                                     const std::string& path);
  static bool LoadSegmentationResult(vision::SegmentationResult* res,
                                     const std::string& path);
+  static bool SaveOCRDetResult(const std::vector<std::array<int, 8>>& res,
+                               const std::string& path);
+  static bool LoadOCRDetResult(std::vector<std::array<int, 8>>* res,
+                               const std::string& path);
  /// Calculate diff value between two basic results.
  static DetectionDiff CalculateDiffStatis(const vision::DetectionResult& lhs,
                                           const vision::DetectionResult& rhs,
@@ -157,6 +165,9 @@ struct FASTDEPLOY_DECL ResultManager {
  static SegmentationDiff CalculateDiffStatis(
      const vision::SegmentationResult& lhs,
      const vision::SegmentationResult& rhs);
+  static OCRDetDiff CalculateDiffStatis(
+      const std::vector<std::array<int, 8>>& lhs,
+      const std::vector<std::array<int, 8>>& rhs);
 #endif  // ENABLE_VISION
 #endif  // ENABLE_BENCHMARK
 };
--- a/fastdeploy/vision/classification/contrib/resnet.cc
+++ b/fastdeploy/vision/classification/contrib/resnet.cc
@@ -13,15 +13,14 @@
 // limitations under the License.

 #include "fastdeploy/vision/classification/contrib/resnet.h"
-#include "fastdeploy/vision/utils/utils.h"
 #include "fastdeploy/utils/perf.h"
+#include "fastdeploy/vision/utils/utils.h"

 namespace fastdeploy {
 namespace vision {
 namespace classification {

-ResNet::ResNet(const std::string& model_file,
-               const std::string& params_file,
+ResNet::ResNet(const std::string& model_file, const std::string& params_file,
               const RuntimeOption& custom_option,
               const ModelFormat& model_format) {
  // In constructor, the 3 steps below are necessary.
@@ -42,7 +41,6 @@ ResNet::ResNet(const std::string& model_file,
 }

 bool ResNet::Initialize() {
-
  // In this function, the 3 steps below are necessary.
  // 1. assign values to the global variables 2. call InitRuntime()

@@ -57,14 +55,15 @@ bool ResNet::Initialize() {
  return true;
 }

-
 bool ResNet::Preprocess(Mat* mat, FDTensor* output) {
+  // In this function, the preprocess need be implemented according to the
+  // original Repos,
+  // The result of preprocess has to be saved in FDTensor variable, because the
+  // input of Infer() need to be std::vector<FDTensor>.
+  // 1. Resize 2. BGR2RGB 3. Normalize 4. HWC2CHW 5. Put the result into
+  // FDTensor variable.

-// In this function, the preprocess need be implemented according to the original Repos,
-// The result of preprocess has to be saved in FDTensor variable, because the input of Infer() need to be std::vector<FDTensor>.
-// 1. Resize 2. BGR2RGB 3. Normalize 4. HWC2CHW 5. Put the result into FDTensor variable.
-        
-  if (mat->Height()!=size[0] || mat->Width()!=size[1]){
+  if (mat->Height() != size[0] || mat->Width() != size[1]) {
    int interp = cv::INTER_LINEAR;
    Resize::Run(mat, size[1], size[0], -1, -1, interp);
  }
@@ -75,20 +74,23 @@ bool ResNet::Preprocess(Mat* mat, FDTensor* output) {
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

-bool ResNet::Postprocess(FDTensor& infer_result,
-                                  ClassifyResult* result, int topk) {
-
-  // In this function, the postprocess need be implemented according to the original Repos,
-  // Finally the reslut of postprocess should be saved in ClassifyResult variable.
-  // 1. Softmax 2. Choose topk labels 3. Put the result into ClassifyResult variable.
+bool ResNet::Postprocess(FDTensor& infer_result, ClassifyResult* result,
+                         int topk) {
+  // In this function, the postprocess need be implemented according to the
+  // original Repos,
+  // Finally the reslut of postprocess should be saved in ClassifyResult
+  // variable.
+  // 1. Softmax 2. Choose topk labels 3. Put the result into ClassifyResult
+  // variable.

  int num_classes = infer_result.shape[1];
  function::Softmax(infer_result, &infer_result);
-  const float* infer_result_buffer = reinterpret_cast<float*>(infer_result.Data());
+  const float* infer_result_buffer =
+      reinterpret_cast<float*>(infer_result.Data());
  topk = std::min(num_classes, topk);
  result->label_ids =
      utils::TopKIndices(infer_result_buffer, num_classes, topk);
@@ -100,8 +102,8 @@ bool ResNet::Postprocess(FDTensor& infer_result,
 }

 bool ResNet::Predict(cv::Mat* im, ClassifyResult* result, int topk) {
-
-  // In this function, the Preprocess(), Infer(), and Postprocess() are called sequentially.
+  // In this function, the Preprocess(), Infer(), and Postprocess() are called
+  // sequentially.

  Mat mat(*im);
  std::vector<FDTensor> processed_data(1);
@@ -128,7 +130,6 @@ bool ResNet::Predict(cv::Mat* im, ClassifyResult* result, int topk) {
  return true;
 }

-
 }  // namespace classification
 }  // namespace vision
 }  // namespace fastdeploy
--- a/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.cc
+++ b/fastdeploy/vision/classification/contrib/yolov5cls/preprocessor.cc
@@ -23,7 +23,8 @@ YOLOv5ClsPreprocessor::YOLOv5ClsPreprocessor() {
  size_ = {224, 224};  //{h,w}
 }

-bool YOLOv5ClsPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
+bool YOLOv5ClsPreprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
    std::map<std::string, std::array<float, 2>>* im_info) {
  // Record the shape of image and the shape of preprocessed image
  (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
@@ -54,14 +55,16 @@ bool YOLOv5ClsPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                static_cast<float>(mat->Width())};

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

-bool YOLOv5ClsPreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+bool YOLOv5ClsPreprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
  if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
    return false;
  }
  ims_info->resize(images->size());
--- a/fastdeploy/vision/detection/contrib/fastestdet/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/fastestdet/preprocessor.cc
@@ -23,7 +23,8 @@ FastestDetPreprocessor::FastestDetPreprocessor() {
  size_ = {352, 352};  //{h,w}
 }

-bool FastestDetPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
+bool FastestDetPreprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
    std::map<std::string, std::array<float, 2>>* im_info) {
  // Record the shape of image and the shape of preprocessed image
  (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
@@ -36,10 +37,10 @@ bool FastestDetPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
  // fastestdet's preprocess steps
  // 1. resize
  // 2. convert_and_permute(swap_rb=false)
-  Resize::Run(mat, size_[0], size_[1]); //resize
+  Resize::Run(mat, size_[0], size_[1]);  // resize
  std::vector<float> alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f};
  std::vector<float> beta = {0.0f, 0.0f, 0.0f};
-  //convert to float and HWC2CHW
+  // convert to float and HWC2CHW
  ConvertAndPermute::Run(mat, alpha, beta, false);

  // Record output shape of preprocessed image
@@ -47,14 +48,16 @@ bool FastestDetPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                static_cast<float>(mat->Width())};

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

-bool FastestDetPreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+bool FastestDetPreprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
  if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
    return false;
  }
  ims_info->resize(images->size());
--- a/fastdeploy/vision/detection/contrib/nanodet_plus.cc
+++ b/fastdeploy/vision/detection/contrib/nanodet_plus.cc
@@ -182,7 +182,7 @@ bool NanoDetPlus::Preprocess(
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/detection/contrib/scaledyolov4.cc
+++ b/fastdeploy/vision/detection/contrib/scaledyolov4.cc
@@ -144,7 +144,7 @@ bool ScaledYOLOv4::Preprocess(
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/detection/contrib/yolor.cc
+++ b/fastdeploy/vision/detection/contrib/yolor.cc
@@ -142,7 +142,7 @@ bool YOLOR::Preprocess(Mat* mat, FDTensor* output,
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/detection/contrib/yolov5/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/yolov5/preprocessor.cc
@@ -64,7 +64,8 @@ void YOLOv5Preprocessor::LetterBox(FDMat* mat) {
  }
 }

-bool YOLOv5Preprocessor::Preprocess(FDMat* mat, FDTensor* output,
+bool YOLOv5Preprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
    std::map<std::string, std::array<float, 2>>* im_info) {
  // Record the shape of image and the shape of preprocessed image
  (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
@@ -82,14 +83,16 @@ bool YOLOv5Preprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                static_cast<float>(mat->Width())};

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

-bool YOLOv5Preprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+bool YOLOv5Preprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
  if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
    return false;
  }
  ims_info->resize(images->size());
--- a/fastdeploy/vision/detection/contrib/yolov5lite.cc
+++ b/fastdeploy/vision/detection/contrib/yolov5lite.cc
@@ -195,7 +195,7 @@ bool YOLOv5Lite::Preprocess(
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

@@ -253,7 +253,7 @@ bool YOLOv5Lite::CudaPreprocess(
  output->SetExternalData({mat->Channels(), size[0], size[1]}, FDDataType::FP32,
                          input_tensor_cuda_buffer_device_);
  output->device = Device::GPU;
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 #else
  FDERROR << "CUDA src code was not enabled." << std::endl;
--- a/fastdeploy/vision/detection/contrib/yolov5seg/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/yolov5seg/preprocessor.cc
@@ -64,7 +64,8 @@ void YOLOv5SegPreprocessor::LetterBox(FDMat* mat) {
  }
 }

-bool YOLOv5SegPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
+bool YOLOv5SegPreprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
    std::map<std::string, std::array<float, 2>>* im_info) {
  // Record the shape of image and the shape of preprocessed image
  (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
@@ -82,14 +83,16 @@ bool YOLOv5SegPreprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                static_cast<float>(mat->Width())};

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

-bool YOLOv5SegPreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+bool YOLOv5SegPreprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
  if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
    return false;
  }
  ims_info->resize(images->size());
--- a/fastdeploy/vision/detection/contrib/yolov6.cc
+++ b/fastdeploy/vision/detection/contrib/yolov6.cc
@@ -168,7 +168,7 @@ bool YOLOv6::Preprocess(Mat* mat, FDTensor* output,
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

@@ -226,7 +226,7 @@ bool YOLOv6::CudaPreprocess(
  output->SetExternalData({mat->Channels(), size[0], size[1]}, FDDataType::FP32,
                          input_tensor_cuda_buffer_device_);
  output->device = Device::GPU;
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 #else
  FDERROR << "CUDA src code was not enabled." << std::endl;
--- a/fastdeploy/vision/detection/contrib/yolov7/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/yolov7/preprocessor.cc
@@ -64,7 +64,8 @@ void YOLOv7Preprocessor::LetterBox(FDMat* mat) {
  }
 }

-bool YOLOv7Preprocessor::Preprocess(FDMat* mat, FDTensor* output,
+bool YOLOv7Preprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
    std::map<std::string, std::array<float, 2>>* im_info) {
  // Record the shape of image and the shape of preprocessed image
  (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
@@ -82,14 +83,16 @@ bool YOLOv7Preprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                static_cast<float>(mat->Width())};

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

-bool YOLOv7Preprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+bool YOLOv7Preprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
  if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
    return false;
  }
  ims_info->resize(images->size());
--- a/fastdeploy/vision/detection/contrib/yolov7end2end_ort.cc
+++ b/fastdeploy/vision/detection/contrib/yolov7end2end_ort.cc
@@ -137,7 +137,7 @@ bool YOLOv7End2EndORT::Preprocess(
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

@@ -235,7 +235,8 @@ bool YOLOv7End2EndORT::Predict(cv::Mat* im, DetectionResult* result,
    return false;
  }

-  if (!Postprocess(reused_output_tensors_[0], result, im_info, conf_threshold)) {
+  if (!Postprocess(reused_output_tensors_[0], result, im_info,
+                   conf_threshold)) {
    FDERROR << "Failed to post process." << std::endl;
    return false;
  }
--- a/fastdeploy/vision/detection/contrib/yolov7end2end_trt.cc
+++ b/fastdeploy/vision/detection/contrib/yolov7end2end_trt.cc
@@ -169,7 +169,7 @@ bool YOLOv7End2EndTRT::Preprocess(
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

@@ -227,7 +227,7 @@ bool YOLOv7End2EndTRT::CudaPreprocess(
  output->SetExternalData({mat->Channels(), size[0], size[1]}, FDDataType::FP32,
                          input_tensor_cuda_buffer_device_);
  output->device = Device::GPU;
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 #else
  FDERROR << "CUDA src code was not enabled." << std::endl;
--- a/fastdeploy/vision/detection/contrib/yolov8/preprocessor.cc
+++ b/fastdeploy/vision/detection/contrib/yolov8/preprocessor.cc
@@ -83,7 +83,7 @@ bool YOLOv8Preprocessor::Preprocess(
                                static_cast<float>(mat->Width())};

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/detection/contrib/yolox.cc
+++ b/fastdeploy/vision/detection/contrib/yolox.cc
@@ -129,7 +129,7 @@ bool YOLOX::Preprocess(Mat* mat, FDTensor* output,
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/facealign/contrib/face_landmark_1000.cc
+++ b/fastdeploy/vision/facealign/contrib/face_landmark_1000.cc
@@ -70,7 +70,7 @@ bool FaceLandmark1000::Preprocess(
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/facealign/contrib/pfld.cc
+++ b/fastdeploy/vision/facealign/contrib/pfld.cc
@@ -22,8 +22,7 @@ namespace vision {

 namespace facealign {

-PFLD::PFLD(const std::string& model_file,
-           const std::string& params_file,
+PFLD::PFLD(const std::string& model_file, const std::string& params_file,
           const RuntimeOption& custom_option,
           const ModelFormat& model_format) {
  if (model_format == ModelFormat::ONNX) {
@@ -71,11 +70,12 @@ bool PFLD::Preprocess(Mat* mat, FDTensor* output,
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

-bool PFLD::Postprocess(FDTensor& infer_result, FaceAlignmentResult* result,
+bool PFLD::Postprocess(
+    FDTensor& infer_result, FaceAlignmentResult* result,
    const std::map<std::string, std::array<int, 2>>& im_info) {
  FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
  if (infer_result.dtype != FDDataType::FP32) {
@@ -84,8 +84,7 @@ bool PFLD::Postprocess(FDTensor& infer_result, FaceAlignmentResult* result,
  }

  auto iter_in = im_info.find("input_shape");
-  FDASSERT(iter_in != im_info.end(),
-           "Cannot find input_shape from im_info.");
+  FDASSERT(iter_in != im_info.end(), "Cannot find input_shape from im_info.");
  int in_h = iter_in->second[0];
  int in_w = iter_in->second[1];

@@ -97,8 +96,7 @@ bool PFLD::Postprocess(FDTensor& infer_result, FaceAlignmentResult* result,
    x = std::min(std::max(0.f, x), 1.0f);
    y = std::min(std::max(0.f, y), 1.0f);
    // decode landmarks (default 106 landmarks)
-    result->landmarks.emplace_back(
-        std::array<float, 2>{x * in_w, y * in_h});
+    result->landmarks.emplace_back(std::array<float, 2>{x * in_w, y * in_h});
  }

  return true;
--- a/fastdeploy/vision/facealign/contrib/pipnet.cc
+++ b/fastdeploy/vision/facealign/contrib/pipnet.cc
@@ -632,7 +632,7 @@ bool PIPNet::Preprocess(Mat* mat, FDTensor* output,
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/facedet/contrib/retinaface.cc
+++ b/fastdeploy/vision/facedet/contrib/retinaface.cc
@@ -145,7 +145,7 @@ bool RetinaFace::Preprocess(
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/facedet/contrib/ultraface.cc
+++ b/fastdeploy/vision/facedet/contrib/ultraface.cc
@@ -90,7 +90,7 @@ bool UltraFace::Preprocess(
  HWC2CHW::Run(mat);
  Cast::Run(mat, "float");
  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/facedet/contrib/yolov5face.cc
+++ b/fastdeploy/vision/facedet/contrib/yolov5face.cc
@@ -151,7 +151,7 @@ bool YOLOv5Face::Preprocess(
  Cast::Run(mat, "float");

  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/facedet/contrib/yolov7face/preprocessor.cc
+++ b/fastdeploy/vision/facedet/contrib/yolov7face/preprocessor.cc
@@ -32,10 +32,12 @@ Yolov7FacePreprocessor::Yolov7FacePreprocessor() {
  max_wh_ = 7680.0;
 }

-bool Yolov7FacePreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
+bool Yolov7FacePreprocessor::Run(
+    std::vector<FDMat>* images, std::vector<FDTensor>* outputs,
    std::vector<std::map<std::string, std::array<float, 2>>>* ims_info) {
  if (images->size() == 0) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
    return false;
  }
  ims_info->resize(images->size());
@@ -56,8 +58,9 @@ bool Yolov7FacePreprocessor::Run(std::vector<FDMat>* images, std::vector<FDTenso
  return true;
 }

-bool Yolov7FacePreprocessor::Preprocess(FDMat* mat, FDTensor* output,
-                                        std::map<std::string, std::array<float, 2>>* im_info){
+bool Yolov7FacePreprocessor::Preprocess(
+    FDMat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
  // Record the shape of image and the shape of preprocessed image
  (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
                               static_cast<float>(mat->Width())};
@@ -75,7 +78,7 @@ bool Yolov7FacePreprocessor::Preprocess(FDMat* mat, FDTensor* output,
                                static_cast<float>(mat->Width())};

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/faceid/contrib/adaface/preprocessor.cc
+++ b/fastdeploy/vision/faceid/contrib/adaface/preprocessor.cc
@@ -26,8 +26,7 @@ AdaFacePreprocessor::AdaFacePreprocessor() {
  permute_ = true;
 }

-bool AdaFacePreprocessor::Preprocess(FDMat * mat, FDTensor* output) {
-
+bool AdaFacePreprocessor::Preprocess(FDMat* mat, FDTensor* output) {
  // face recognition model's preprocess steps in insightface
  // reference: insightface/recognition/arcface_torch/inference.py
  // 1. Resize
@@ -48,14 +47,15 @@ bool AdaFacePreprocessor::Preprocess(FDMat * mat, FDTensor* output) {
  Cast::Run(mat, "float");

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

 bool AdaFacePreprocessor::Run(std::vector<FDMat>* images,
                              std::vector<FDTensor>* outputs) {
  if (images->empty()) {
-    FDERROR << "The size of input images should be greater than 0." << std::endl;
+    FDERROR << "The size of input images should be greater than 0."
+            << std::endl;
    return false;
  }
  FDASSERT(images->size() == 1, "Only support batch = 1 now.");
--- a/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc
+++ b/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc
@@ -50,7 +50,7 @@ bool InsightFaceRecognitionPreprocessor::Preprocess(FDMat* mat,
  }

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/headpose/contrib/fsanet.cc
+++ b/fastdeploy/vision/headpose/contrib/fsanet.cc
@@ -22,8 +22,7 @@ namespace vision {

 namespace headpose {

-FSANet::FSANet(const std::string& model_file,
-               const std::string& params_file,
+FSANet::FSANet(const std::string& model_file, const std::string& params_file,
               const RuntimeOption& custom_option,
               const ModelFormat& model_format) {
  if (model_format == ModelFormat::ONNX) {
@@ -62,7 +61,8 @@ bool FSANet::Preprocess(Mat* mat, FDTensor* output,

  // Normalize
  std::vector<float> alpha = {1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f};
-  std::vector<float> beta = {-127.5f / 128.0f, -127.5f / 128.0f, -127.5f / 128.0f};
+  std::vector<float> beta = {-127.5f / 128.0f, -127.5f / 128.0f,
+                             -127.5f / 128.0f};
  Convert::Run(mat, alpha, beta);

  // Record output shape of preprocessed image
@@ -72,11 +72,12 @@ bool FSANet::Preprocess(Mat* mat, FDTensor* output,
  Cast::Run(mat, "float");

  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

-bool FSANet::Postprocess(FDTensor& infer_result, HeadPoseResult* result,
+bool FSANet::Postprocess(
+    FDTensor& infer_result, HeadPoseResult* result,
    const std::map<std::string, std::array<int, 2>>& im_info) {
  FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now.");
  if (infer_result.dtype != FDDataType::FP32) {
@@ -85,8 +86,7 @@ bool FSANet::Postprocess(FDTensor& infer_result, HeadPoseResult* result,
  }

  auto iter_in = im_info.find("input_shape");
-  FDASSERT(iter_in != im_info.end(),
-           "Cannot find input_shape from im_info.");
+  FDASSERT(iter_in != im_info.end(), "Cannot find input_shape from im_info.");
  int in_h = iter_in->second[0];
  int in_w = iter_in->second[1];

--- a/fastdeploy/vision/matting/contrib/modnet.cc
+++ b/fastdeploy/vision/matting/contrib/modnet.cc
@@ -77,7 +77,7 @@ bool MODNet::Preprocess(Mat* mat, FDTensor* output,
  Cast::Run(mat, "float");

  mat->ShareWithTensor(output);
-  output->shape.insert(output->shape.begin(), 1);  // reshape to n, h, w, c
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/matting/contrib/rvm.cc
+++ b/fastdeploy/vision/matting/contrib/rvm.cc
@@ -74,7 +74,7 @@ bool RobustVideoMatting::Preprocess(
  (*im_info)["output_shape"] = {mat->Height(), mat->Width()};

  mat->ShareWithTensor(output);
-  output->ExpandDim(0);  // reshape to n, h, w, c
+  output->ExpandDim(0);  // reshape to n, c, h, w
  return true;
 }

--- a/fastdeploy/vision/utils/sort_det_res.cc
+++ b/fastdeploy/vision/utils/sort_det_res.cc
@@ -77,27 +77,42 @@ void SortDetectionResult(DetectionResult* result) {
  MergeSort(result, low, high);
 }

-bool LexSortByXYCompare(const std::array<float, 4>& box_a,
-                        const std::array<float, 4>& box_b) {
+template <typename T>
+bool LexSortByXYCompare(const std::array<T, 4>& box_a,
+                        const std::array<T, 4>& box_b) {
  // WARN: The status shoule be false if (a==b).
  // https://blog.csdn.net/xxxwrq/article/details/83080640
-  auto is_equal = [](const float& a, const float& b) -> bool {
+  auto is_equal = [](const T& a, const T& b) -> bool {
    return std::abs(a - b) < 1e-6f;
  };
-  const float& x0_a = box_a[0];
-  const float& y0_a = box_a[1];
-  const float& x0_b = box_b[0];
-  const float& y0_b = box_b[1];
+  const T& x0_a = box_a[0];
+  const T& y0_a = box_a[1];
+  const T& x0_b = box_b[0];
+  const T& y0_b = box_b[1];
  if (is_equal(x0_a, x0_b)) {
    return is_equal(y0_a, y0_b) ? false : y0_a > y0_b;
  }
  return x0_a > x0_b;
 }

+// Only for int dtype
+template <>
+bool LexSortByXYCompare(const std::array<int, 4>& box_a,
+                        const std::array<int, 4>& box_b) {
+  const int& x0_a = box_a[0];
+  const int& y0_a = box_a[1];
+  const int& x0_b = box_b[0];
+  const int& y0_b = box_b[1];
+  if (x0_a == x0_b) {
+    return y0_a == y0_b ? false : y0_a > y0_b;
+  }
+  return x0_a > x0_b;
+}
+
 void ReorderDetectionResultByIndices(DetectionResult* result,
                                     const std::vector<size_t>& indices) {
  // reorder boxes, scores, label_ids, masks
-  DetectionResult backup = (*result);  // move
+  DetectionResult backup = (*result);
  const bool contain_masks = backup.contain_masks;
  const int boxes_num = backup.boxes.size();
  result->Clear();
@@ -122,7 +137,7 @@ void ReorderDetectionResultByIndices(DetectionResult* result,
 }

 void LexSortDetectionResultByXY(DetectionResult* result) {
-  if (result->boxes.size() == 0) {
+  if (result->boxes.empty()) {
    return;
  }
  std::vector<size_t> indices;
@@ -138,6 +153,35 @@ void LexSortDetectionResultByXY(DetectionResult* result) {
  ReorderDetectionResultByIndices(result, indices);
 }

+void LexSortOCRDetResultByXY(std::vector<std::array<int, 8>>* result) {
+  if (result->empty()) {
+    return;
+  }
+  std::vector<size_t> indices;
+  indices.resize(result->size());
+  std::vector<std::array<int, 4>> boxes;
+  boxes.resize(result->size());
+  for (size_t i = 0; i < result->size(); ++i) {
+    indices[i] = i;
+    // 4 points to 2 points for LexSort
+    boxes[i] = {(*result)[i][0], (*result)[i][1], (*result)[i][6],
+                (*result)[i][7]};
+  }
+  // lex sort by x(w) then y(h)
+  std::sort(indices.begin(), indices.end(), [&boxes](size_t a, size_t b) {
+    return LexSortByXYCompare(boxes[a], boxes[b]);
+  });
+  // reorder boxes
+  std::vector<std::array<int, 8>> backup = (*result);
+  const int boxes_num = backup.size();
+  result->clear();
+  result->resize(boxes_num);
+  // boxes
+  for (int i = 0; i < boxes_num; ++i) {
+    (*result)[i] = backup[indices[i]];
+  }
+}
+
 }  // namespace utils
 }  // namespace vision
 }  // namespace fastdeploy
--- a/fastdeploy/vision/utils/utils.h
+++ b/fastdeploy/vision/utils/utils.h
@@ -67,8 +67,11 @@ void NMS(FaceDetectionResult* result, float iou_threshold = 0.5);
 /// Sort DetectionResult/FaceDetectionResult by score
 FASTDEPLOY_DECL void SortDetectionResult(DetectionResult* result);
 FASTDEPLOY_DECL void SortDetectionResult(FaceDetectionResult* result);
-/// Lex Sort DetectionResult/FaceDetectionResult by x(w) & y(h) axis
+/// Lex Sort DetectionResult by x(w) & y(h) axis
 FASTDEPLOY_DECL void LexSortDetectionResultByXY(DetectionResult* result);
+/// Lex Sort OCRDet Result by x(w) & y(h) axis
+FASTDEPLOY_DECL void LexSortOCRDetResultByXY(
+                     std::vector<std::array<int, 8>>* result);

 /// L2 Norm / cosine similarity  (for face recognition, ...)
 FASTDEPLOY_DECL std::vector<float>