[Benchmark] Add precision evaluation api from benchmark (#1310)

* [Benchmark] Init benchmark precision api * [Benchmark] Init benchmark precision api * [Benchmark] Add benchmark precision api * [Benchmark] Calculate the statis of diff * [Benchmark] Calculate the statis of diff * [Benchmark] Calculate the statis of diff * [Benchmark] Calculate the statis of diff * [Benchmark] Calculate the statis of diff * [Benchmark] Add SplitDataLine utils * [Benchmark] Add LexSortByXY func * [Benchmark] Add LexSortByXY func * [Benchmark] Add LexSortDetectionResultByXY func * [Benchmark] Add LexSortDetectionResultByXY func * [Benchmark] Add tensor diff precision test * [Benchmark] fixed conflicts * [Benchmark] fixed calc tensor diff * fixed build bugs * fixed ci bugs when WITH_TESTING=ON
2025-10-06 09:07:10 +08:00 · 2023-02-16 17:16:14 +08:00
parent bdfb7b0008
commit ee85a3cade
14 changed files with 575 additions and 29 deletions
--- a/benchmark/cpp/CMakeLists.txt
+++ b/benchmark/cpp/CMakeLists.txt
@@ -11,13 +11,16 @@ include_directories(${FASTDEPLOY_INCS})
 add_executable(benchmark_yolov5 ${PROJECT_SOURCE_DIR}/benchmark_yolov5.cc)
 add_executable(benchmark_ppyolov8 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov8.cc)
 add_executable(benchmark_ppcls ${PROJECT_SOURCE_DIR}/benchmark_ppcls.cc)
+add_executable(benchmark_precision_ppyolov8 ${PROJECT_SOURCE_DIR}/benchmark_precision_ppyolov8.cc)

 if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
+  # UNIX platforms other than APPLE/ANDROID: link pthread explicitly.
  target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags pthread)
  target_link_libraries(benchmark_ppyolov8 ${FASTDEPLOY_LIBS} gflags pthread)
  target_link_libraries(benchmark_ppcls ${FASTDEPLOY_LIBS} gflags pthread)
+  target_link_libraries(benchmark_precision_ppyolov8 ${FASTDEPLOY_LIBS} gflags pthread)
 else()
+  # Every other platform (APPLE, ANDROID, non-UNIX): no explicit pthread.
  target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags)
  target_link_libraries(benchmark_ppyolov8 ${FASTDEPLOY_LIBS} gflags)
-  target_link_libraries(benchmark_ppcls ${FASTDEPLOY_LIBS} gflags pthread)
+  target_link_libraries(benchmark_ppcls ${FASTDEPLOY_LIBS} gflags)
+  target_link_libraries(benchmark_precision_ppyolov8 ${FASTDEPLOY_LIBS} gflags)
 endif()
--- a/benchmark/cpp/benchmark_ppyolov8.cc
+++ b/benchmark/cpp/benchmark_ppyolov8.cc
--- a/benchmark/cpp/benchmark_precision_ppyolov8.cc
+++ b/benchmark/cpp/benchmark_precision_ppyolov8.cc
@@ -0,0 +1,87 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "flags.h"
+#include "macros.h"
+#include "option.h"
+
+namespace vision = fastdeploy::vision;
+namespace benchmark = fastdeploy::benchmark;
+
+// Precision self-check for PP-YOLOv8.
+//   1. Runs one prediction, dumps the DetectionResult to disk, reloads it and
+//      prints the diff statistics between the in-memory and reloaded result.
+//   2. Runs preprocess / infer / postprocess manually, dumps the first output
+//      tensor to disk, reloads it and prints the tensor diff statistics.
+//   3. Runs the regular benchmark profiling and writes a visualization.
+// Returns 0 on success and -1 as soon as any checked step fails. When
+// ENABLE_BENCHMARK or ENABLE_VISION is not defined the program does nothing.
+int main(int argc, char* argv[]) {
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
+  // Initialization
+  auto option = fastdeploy::RuntimeOption();
+  if (!CreateRuntimeOption(&option, argc, argv, true)) {
+    return -1;
+  }
+  auto im = cv::imread(FLAGS_image);
+  // cv::imread yields an empty Mat when the image can not be read.
+  if (im.empty()) {
+    std::cerr << "Failed to read image: " << FLAGS_image << std::endl;
+    return -1;
+  }
+  auto model_file = FLAGS_model + sep + "model.pdmodel";
+  auto params_file = FLAGS_model + sep + "model.pdiparams";
+  auto config_file = FLAGS_model + sep + "infer_cfg.yml";
+  auto model_ppyolov8 = vision::detection::PaddleYOLOv8(model_file, params_file,
+                                                        config_file, option);
+  vision::DetectionResult res;
+  // Run once at least
+  model_ppyolov8.Predict(im, &res);
+  // 1. Test result diff
+  std::cout << "=============== Test result diff =================\n";
+  // Save result to -> disk.
+  std::string det_result_path = "ppyolov8_result.txt";
+  // SaveDetectionResult refuses empty results; do not go on to load a file
+  // that was never (re)written.
+  if (!benchmark::ResultManager::SaveDetectionResult(res, det_result_path)) {
+    std::cerr << "Failed to save detection result to " << det_result_path
+              << std::endl;
+    return -1;
+  }
+  // Load result from <- disk.
+  vision::DetectionResult res_loaded;
+  if (!benchmark::ResultManager::LoadDetectionResult(&res_loaded,
+                                                     det_result_path)) {
+    std::cerr << "Failed to load detection result from " << det_result_path
+              << std::endl;
+    return -1;
+  }
+  // Calculate diff between two results.
+  auto det_diff =
+      benchmark::ResultManager::CalculateDiffStatis(&res, &res_loaded);
+  std::cout << "diff: mean=" << det_diff.mean << ",max=" << det_diff.max
+            << ",min=" << det_diff.min << std::endl;
+  // 2. Test tensor diff
+  std::cout << "=============== Test tensor diff =================\n";
+  std::vector<vision::DetectionResult> batch_res;
+  std::vector<fastdeploy::FDTensor> input_tensors, output_tensors;
+  std::vector<cv::Mat> imgs;
+  imgs.push_back(im);
+  std::vector<vision::FDMat> fd_images = vision::WrapMat(imgs);
+
+  model_ppyolov8.GetPreprocessor().Run(&fd_images, &input_tensors);
+  // The three tensors are named by position below, so make sure they exist.
+  if (input_tensors.size() < 3) {
+    std::cerr << "Expect at least 3 input tensors from the preprocessor, "
+              << "but got " << input_tensors.size() << std::endl;
+    return -1;
+  }
+  input_tensors[0].name = "image";
+  input_tensors[1].name = "scale_factor";
+  input_tensors[2].name = "im_shape";
+  input_tensors.pop_back();
+  model_ppyolov8.Infer(input_tensors, &output_tensors);
+  if (output_tensors.empty()) {
+    std::cerr << "Inference produced no output tensor." << std::endl;
+    return -1;
+  }
+  model_ppyolov8.GetPostprocessor().Run(output_tensors, &batch_res);
+  // Save tensor to -> disk.
+  auto& tensor_dump = output_tensors[0];
+  std::string det_tensor_path = "ppyolov8_tensor.txt";
+  if (!benchmark::ResultManager::SaveFDTensor(tensor_dump, det_tensor_path)) {
+    std::cerr << "Failed to save tensor to " << det_tensor_path << std::endl;
+    return -1;
+  }
+  // Load tensor from <- disk.
+  fastdeploy::FDTensor tensor_loaded;
+  if (!benchmark::ResultManager::LoadFDTensor(&tensor_loaded,
+                                              det_tensor_path)) {
+    std::cerr << "Failed to load tensor from " << det_tensor_path << std::endl;
+    return -1;
+  }
+  // Calculate diff between two tensors.
+  auto det_tensor_diff = benchmark::ResultManager::CalculateDiffStatis(
+      &tensor_dump, &tensor_loaded);
+  std::cout << "diff: mean=" << det_tensor_diff.mean
+            << ",max=" << det_tensor_diff.max << ",min=" << det_tensor_diff.min
+            << std::endl;
+  // 3. Run profiling
+  BENCHMARK_MODEL(model_ppyolov8, model_ppyolov8.Predict(im, &res))
+  auto vis_im = vision::VisDetection(im, res);
+  cv::imwrite("vis_result.jpg", vis_im);
+  std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
+#endif
+
+  return 0;
+}
--- a/benchmark/cpp/benchmark_yolov5.cc
+++ b/benchmark/cpp/benchmark_yolov5.cc
--- a/cmake/summary.cmake
+++ b/cmake/summary.cmake
@@ -41,6 +41,7 @@ function(fastdeploy_summary)
  message(STATUS "  ENABLE_OPENVINO_BACKEND   : ${ENABLE_OPENVINO_BACKEND}")
  message(STATUS "  ENABLE_BENCHMARK          : ${ENABLE_BENCHMARK}")
  message(STATUS "  WITH_GPU                  : ${WITH_GPU}")
+  message(STATUS "  WITH_TESTING              : ${WITH_TESTING}")
  message(STATUS "  WITH_ASCEND               : ${WITH_ASCEND}")
  message(STATUS "  WITH_TIMVX                : ${WITH_TIMVX}")
  message(STATUS "  WITH_KUNLUNXIN            : ${WITH_KUNLUNXIN}")
--- a/fastdeploy/benchmark/utils.cc
+++ b/fastdeploy/benchmark/utils.cc
@@ -19,10 +19,15 @@
 #include <cmath>

 #include "fastdeploy/benchmark/utils.h"
+#include "fastdeploy/utils/path.h"
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
+#include "fastdeploy/vision/utils/utils.h"
+#endif

 namespace fastdeploy {
 namespace benchmark {

+#if defined(ENABLE_BENCHMARK)
 std::string Strip(const std::string& str, char ch) {
  int i = 0;
  while (str[i] == ch) {
@@ -35,8 +40,7 @@ std::string Strip(const std::string& str, char ch) {
  return str.substr(i, j + 1 - i);
 }

-void Split(const std::string& s, std::vector<std::string>& tokens,
-           char delim) {
+void Split(const std::string& s, std::vector<std::string>& tokens, char delim) {
  tokens.clear();
  size_t lastPos = s.find_first_not_of(delim, 0);
  size_t pos = s.find(delim, lastPos);
@@ -146,6 +150,332 @@ std::string ResourceUsageMonitor::GetCurrentGpuMemoryInfo(int device_id) {
 #endif
  return result;
 }
+#endif  // ENABLE_BENCHMARK
+
+/// Utils for precision evaluation
+#if defined(ENABLE_BENCHMARK)
+static const char KEY_VALUE_SEP = '#';
+static const char VALUE_SEP = ',';
+
+/// Read the text file at `path` and return its lines in file order.
+/// Logs an error and aborts the process when the file can not be opened.
+std::vector<std::string> ReadLines(const std::string& path) {
+  std::ifstream stream(path);
+  if (!stream.is_open()) {
+    FDERROR << "Failed to open file " << path << std::endl;
+    std::abort();
+  }
+  std::vector<std::string> collected;
+  for (std::string current; getline(stream, current);) {
+    collected.push_back(current);
+  }
+  stream.close();
+  return collected;
+}
+
+/// Parse one "key#v0,v1,..." data line into a single-entry map
+/// {key -> [v0, v1, ...]}.
+///
+/// The key is the text before KEY_VALUE_SEP, the values are the text after it
+/// split by VALUE_SEP. A line that carries a key but no value part yields an
+/// empty value list. A line without any key is rejected through FDASSERT,
+/// because callers read `begin()` of the returned map unconditionally.
+std::map<std::string, std::vector<std::string>> SplitDataLine(
+    const std::string& data_line) {
+  std::map<std::string, std::vector<std::string>> dict;
+  std::vector<std::string> tokens, value_tokens;
+  Split(data_line, tokens, KEY_VALUE_SEP);
+  FDASSERT(!tokens.empty(), "Invalid data line without a key: '%s'",
+           data_line.c_str())
+  // Only split the value part when it is actually present; indexing
+  // tokens[1] on a key-only line would read past the end of the vector.
+  if (tokens.size() > 1) {
+    Split(tokens[1], value_tokens, VALUE_SEP);
+  }
+  dict[tokens[0]] = value_tokens;
+  return dict;
+}
+
+/// Stream `count` values of type T read from `data` into `fs`, joined by
+/// VALUE_SEP with no trailing separator.
+template <typename T>
+static void WriteTensorValues(std::ofstream& fs, const void* data, int count) {
+  const T* values = static_cast<const T*>(data);
+  for (int i = 0; i < count; ++i) {
+    if (i > 0) {
+      fs << VALUE_SEP;
+    }
+    fs << values[i];
+  }
+}
+
+/// Dump `tensor` to the text file `path` as four "key#values" lines:
+/// name, shape, dtype and data.
+///
+/// Only FP32 / INT32 / INT64 tensors are supported. Returns false (after
+/// logging) when the tensor is empty, the dtype is unsupported or the file can
+/// not be opened; returns true once everything has been written.
+bool ResultManager::SaveFDTensor(const FDTensor& tensor,
+                                 const std::string& path) {
+  if (tensor.CpuData() == nullptr || tensor.Numel() <= 0) {
+    FDERROR << "Input tensor is empty!" << std::endl;
+    return false;
+  }
+  // Validate the dtype before opening the file, so an unsupported tensor
+  // neither creates nor truncates the output file.
+  const FDDataType dtype = tensor.Dtype();
+  if (dtype != FDDataType::FP32 && dtype != FDDataType::INT32 &&
+      dtype != FDDataType::INT64) {
+    FDERROR << "Only support FP32/INT32/INT64 now, but got "
+            << Str(tensor.dtype) << std::endl;
+    return false;
+  }
+  std::ofstream fs(path, std::ios::out);
+  if (!fs.is_open()) {
+    FDERROR << "Fail to open file:" << path << std::endl;
+    return false;
+  }
+  fs.precision(20);
+  // name
+  fs << "name" << KEY_VALUE_SEP << tensor.name << "\n";
+  // shape
+  fs << "shape" << KEY_VALUE_SEP;
+  for (size_t i = 0; i < tensor.shape.size(); ++i) {
+    if (i > 0) {
+      fs << VALUE_SEP;
+    }
+    fs << tensor.shape[i];
+  }
+  fs << "\n";
+  // dtype
+  fs << "dtype" << KEY_VALUE_SEP << Str(tensor.dtype) << "\n";
+  // data: pick the element type once instead of once per element.
+  fs << "data" << KEY_VALUE_SEP;
+  const void* data_ptr = tensor.CpuData();
+  const int numel = tensor.Numel();
+  if (dtype == FDDataType::INT64) {
+    WriteTensorValues<int64_t>(fs, data_ptr, numel);
+  } else if (dtype == FDDataType::INT32) {
+    WriteTensorValues<int32_t>(fs, data_ptr, numel);
+  } else {  // FP32
+    WriteTensorValues<float>(fs, data_ptr, numel);
+  }
+  fs << "\n";
+  fs.close();
+  return true;
+}
+
+/// Load a tensor from a text file in the four-line "key#values" layout
+/// (name, shape, dtype, data) into `tensor`.
+///
+/// Returns false (after logging) when the file is missing, has fewer than
+/// four lines, declares an unsupported dtype, or carries a number of data
+/// values that differs from the element count of the declared shape.
+/// The std::sto* conversions throw on non-numeric text.
+bool ResultManager::LoadFDTensor(FDTensor* tensor, const std::string& path) {
+  if (!CheckFileExists(path)) {
+    FDERROR << "Can't found file from" << path << std::endl;
+    return false;
+  }
+  auto lines = ReadLines(path);
+  if (lines.size() < 4) {
+    FDERROR << "Invalid tensor file " << path
+            << ", expect 4 lines (name/shape/dtype/data) but got "
+            << lines.size() << std::endl;
+    return false;
+  }
+  std::map<std::string, std::vector<std::string>> data;
+  // name: the tensor name is the text after the first KEY_VALUE_SEP. It is
+  // taken from the raw line so that the literal key "name" is not mistaken
+  // for the value, and so an empty name or a name containing VALUE_SEP
+  // survives the round trip.
+  const std::string& name_line = lines[0];
+  const size_t name_pos = name_line.find(KEY_VALUE_SEP);
+  const std::string name = (name_pos == std::string::npos)
+                               ? std::string()
+                               : name_line.substr(name_pos + 1);
+  // shape
+  data = SplitDataLine(lines[1]);
+  std::vector<int64_t> shape;
+  for (const auto& s : data.begin()->second) {
+    // stoll keeps the full 64-bit range even where `long` is 32 bits wide.
+    shape.push_back(std::stoll(s));
+  }
+  // dtype
+  data = SplitDataLine(lines[2]);
+  const std::string dtype_str = data.begin()->second.at(0);
+  FDDataType dtype = FDDataType::FP32;
+  if (dtype_str == Str(FDDataType::INT64)) {
+    dtype = FDDataType::INT64;
+  } else if (dtype_str == Str(FDDataType::INT32)) {
+    dtype = FDDataType::INT32;
+  } else if (dtype_str == Str(FDDataType::FP32)) {
+    dtype = FDDataType::FP32;
+  } else {
+    FDERROR << "Only support FP32/INT64/INT32 now, but got " << dtype_str
+            << std::endl;
+    return false;
+  }
+  // data
+  data = SplitDataLine(lines[3]);
+  const std::vector<std::string>& values = data.begin()->second;
+  tensor->name = name;
+  tensor->shape = shape;
+  tensor->dtype = dtype;
+  // Local copies are passed so Allocate never receives references to the
+  // members it is about to update.
+  tensor->Allocate(shape, dtype, name);
+  // Never write more (or fewer) elements than the tensor holds.
+  if (tensor->Numel() < 0 ||
+      values.size() != static_cast<size_t>(tensor->Numel())) {
+    FDERROR << "The number of data values (" << values.size()
+            << ") does not match the tensor size (" << tensor->Numel()
+            << ") in file " << path << std::endl;
+    return false;
+  }
+  if (dtype == FDDataType::INT64) {
+    int64_t* mutable_data_ptr = static_cast<int64_t*>(tensor->MutableData());
+    for (size_t i = 0; i < values.size(); ++i) {
+      mutable_data_ptr[i] = std::stoll(values[i]);
+    }
+  } else if (dtype == FDDataType::INT32) {
+    int32_t* mutable_data_ptr = static_cast<int32_t*>(tensor->MutableData());
+    for (size_t i = 0; i < values.size(); ++i) {
+      mutable_data_ptr[i] = std::stoi(values[i]);
+    }
+  } else {  // FP32
+    float* mutable_data_ptr = static_cast<float*>(tensor->MutableData());
+    for (size_t i = 0; i < values.size(); ++i) {
+      mutable_data_ptr[i] = std::stof(values[i]);
+    }
+  }
+  return true;
+}
+
+/// Compute the element-wise difference lhs - rhs over `numel` values of type
+/// T and reduce it to mean / max / min with CalculateStatisInfo<T>.
+template <typename T>
+static TensorDiff CalculateTensorDiff(const void* lhs_data,
+                                      const void* rhs_data, int numel) {
+  const T* lhs_data_ptr = static_cast<const T*>(lhs_data);
+  const T* rhs_data_ptr = static_cast<const T*>(rhs_data);
+  std::vector<T> tensor_diff(numel);
+  for (int i = 0; i < numel; ++i) {
+    tensor_diff[i] = lhs_data_ptr[i] - rhs_data_ptr[i];
+  }
+  TensorDiff diff;
+  // The statistics must be computed with the same element type as the
+  // buffer; reading an int32 buffer as float would reinterpret the bits.
+  CalculateStatisInfo<T>(tensor_diff.data(), numel, &(diff.mean), &(diff.max),
+                         &(diff.min));
+  return diff;
+}
+
+/// Return mean / max / min of the element-wise difference (lhs - rhs).
+///
+/// Both tensors must have the same element count and dtype, and the dtype
+/// must be FP32, INT64 or INT32; anything else fails an FDASSERT.
+TensorDiff ResultManager::CalculateDiffStatis(FDTensor* lhs, FDTensor* rhs) {
+  if (lhs->Numel() != rhs->Numel() || lhs->Dtype() != rhs->Dtype()) {
+    FDASSERT(false,
+             "The size and dtype of input FDTensor must be equal!"
+             " But got size %d, %d, dtype %s, %s",
+             lhs->Numel(), rhs->Numel(), Str(lhs->Dtype()).c_str(),
+             Str(rhs->Dtype()).c_str())
+  }
+  FDDataType dtype = lhs->Dtype();
+  int numel = lhs->Numel();
+  if (dtype != FDDataType::FP32 && dtype != FDDataType::INT64 &&
+      dtype != FDDataType::INT32) {
+    FDASSERT(false, "Only support FP32/INT64/INT32 now, but got %s",
+             Str(dtype).c_str())
+  }
+  if (dtype == FDDataType::INT64) {
+    return CalculateTensorDiff<int64_t>(lhs->CpuData(), rhs->CpuData(), numel);
+  } else if (dtype == FDDataType::INT32) {
+    return CalculateTensorDiff<int32_t>(lhs->CpuData(), rhs->CpuData(), numel);
+  }
+  // FP32
+  return CalculateTensorDiff<float>(lhs->CpuData(), rhs->CpuData(), numel);
+}
+
+#if defined(ENABLE_VISION)
+/// Dump a DetectionResult to the text file `path` as three "key#values"
+/// lines: boxes (four floats per box, flattened), scores and label_ids.
+/// Masks are not written. Returns false (after logging) when the result has
+/// no boxes or the file can not be opened.
+bool ResultManager::SaveDetectionResult(const vision::DetectionResult& res,
+                                        const std::string& path) {
+  if (res.boxes.empty()) {
+    FDERROR << "DetectionResult can not be empty!" << std::endl;
+    return false;
+  }
+  std::ofstream out(path, std::ios::out);
+  if (!out.is_open()) {
+    FDERROR << "Fail to open file:" << path << std::endl;
+    return false;
+  }
+  out.precision(20);
+  // boxes: a separator goes in front of every value except the very first.
+  out << "boxes" << KEY_VALUE_SEP;
+  for (size_t box = 0; box < res.boxes.size(); ++box) {
+    for (size_t coord = 0; coord < 4; ++coord) {
+      if (box != 0 || coord != 0) {
+        out << VALUE_SEP;
+      }
+      out << res.boxes[box][coord];
+    }
+  }
+  out << "\n";
+  // scores
+  out << "scores" << KEY_VALUE_SEP;
+  for (size_t idx = 0; idx < res.scores.size(); ++idx) {
+    if (idx != 0) {
+      out << VALUE_SEP;
+    }
+    out << res.scores[idx];
+  }
+  out << "\n";
+  // label_ids
+  out << "label_ids" << KEY_VALUE_SEP;
+  for (size_t idx = 0; idx < res.label_ids.size(); ++idx) {
+    if (idx != 0) {
+      out << VALUE_SEP;
+    }
+    out << res.label_ids[idx];
+  }
+  out << "\n";
+  // TODO(qiuyanjun): dump masks
+  out.close();
+  return true;
+}
+
+/// Load a DetectionResult from a text file holding three "key#values" lines
+/// (boxes, scores, label_ids) into `res`. Masks are not loaded.
+///
+/// Returns false (after logging) when the file is missing, has fewer than
+/// three lines, or lists more scores / label_ids than boxes.
+/// The std::sto* conversions throw on non-numeric text.
+bool ResultManager::LoadDetectionResult(vision::DetectionResult* res,
+                                        const std::string& path) {
+  if (!CheckFileExists(path)) {
+    FDERROR << "Can't found file from" << path << std::endl;
+    return false;
+  }
+  auto lines = ReadLines(path);
+  if (lines.size() < 3) {
+    FDERROR << "Invalid DetectionResult file " << path
+            << ", expect 3 lines (boxes/scores/label_ids) but got "
+            << lines.size() << std::endl;
+    return false;
+  }
+  const auto boxes_data = SplitDataLine(lines[0]);
+  const auto scores_data = SplitDataLine(lines[1]);
+  const auto labels_data = SplitDataLine(lines[2]);
+  const std::vector<std::string>& box_values = boxes_data.begin()->second;
+  const std::vector<std::string>& score_values = scores_data.begin()->second;
+  const std::vector<std::string>& label_values = labels_data.begin()->second;
+
+  // Four values per box; a trailing partial box is ignored.
+  const size_t boxes_num = box_values.size() / 4;
+  // scores / label_ids are written at index i after Resize(boxes_num), so
+  // more entries than boxes would write out of bounds.
+  // NOTE(review): assumes Resize() sizes scores and label_ids to boxes_num.
+  if (score_values.size() > boxes_num || label_values.size() > boxes_num) {
+    FDERROR << "Invalid DetectionResult file " << path << ", got "
+            << boxes_num << " boxes but " << score_values.size()
+            << " scores and " << label_values.size() << " label_ids"
+            << std::endl;
+    return false;
+  }
+  res->Resize(static_cast<int>(boxes_num));
+  // boxes
+  for (size_t i = 0; i < boxes_num; ++i) {
+    for (size_t j = 0; j < 4; ++j) {
+      res->boxes[i][j] = std::stof(box_values[i * 4 + j]);
+    }
+  }
+  // scores
+  for (size_t i = 0; i < score_values.size(); ++i) {
+    res->scores[i] = std::stof(score_values[i]);
+  }
+  // label_ids
+  for (size_t i = 0; i < label_values.size(); ++i) {
+    res->label_ids[i] = std::stoi(label_values[i]);
+  }
+  // TODO(qiuyanjun): load masks
+  return true;
+}
+
+/// Compare two detection results and return mean / max / min statistics of
+/// the differences (lhs - rhs) of boxes, scores and label ids.
+///
+/// Both results are sorted IN PLACE by LexSortDetectionResultByXY, then
+/// compared index by index over the shorter of the two. A pair contributes
+/// only when both scores exceed `score_threshold`. The top-level
+/// mean / max / min of the returned diff mirror the box statistics.
+/// Fails an FDASSERT when no pair passes the threshold.
+DetectionDiff ResultManager::CalculateDiffStatis(vision::DetectionResult* lhs,
+                                                 vision::DetectionResult* rhs,
+                                                 float score_threshold) {
+  // lex sort by x(w) & y(h)
+  vision::utils::LexSortDetectionResultByXY(lhs);
+  vision::utils::LexSortDetectionResultByXY(rhs);
+  // get value diff & trunc it by score_threshold
+  const int boxes_num =
+      static_cast<int>(std::min(lhs->boxes.size(), rhs->boxes.size()));
+  std::vector<float> boxes_diff;
+  std::vector<float> scores_diff;
+  std::vector<int32_t> labels_diff;
+  // TODO(qiuyanjun): process the diff of masks.
+  for (int i = 0; i < boxes_num; ++i) {
+    if (lhs->scores[i] > score_threshold && rhs->scores[i] > score_threshold) {
+      scores_diff.push_back(lhs->scores[i] - rhs->scores[i]);
+      labels_diff.push_back(lhs->label_ids[i] - rhs->label_ids[i]);
+      for (int j = 0; j < 4; ++j) {
+        boxes_diff.push_back(lhs->boxes[i][j] - rhs->boxes[i][j]);
+      }
+    }
+  }
+  // The sizes are cast to int so the arguments match the %d conversions
+  // (presumably FDASSERT formats printf-style -- confirm against its
+  // definition).
+  FDASSERT(boxes_diff.size() > 0,
+           "Can't get any valid boxes while score_threshold is %f, "
+           "The boxes.size of lhs is %d, the boxes.size of rhs is %d",
+           score_threshold, static_cast<int>(lhs->boxes.size()),
+           static_cast<int>(rhs->boxes.size()))
+
+  DetectionDiff diff;
+  CalculateStatisInfo<float>(boxes_diff.data(), boxes_diff.size(),
+                             &(diff.boxes.mean), &(diff.boxes.max),
+                             &(diff.boxes.min));
+  CalculateStatisInfo<float>(scores_diff.data(), scores_diff.size(),
+                             &(diff.scores.mean), &(diff.scores.max),
+                             &(diff.scores.min));
+  CalculateStatisInfo<int32_t>(labels_diff.data(), labels_diff.size(),
+                               &(diff.labels.mean), &(diff.labels.max),
+                               &(diff.labels.min));
+  diff.mean = diff.boxes.mean;
+  diff.max = diff.boxes.max;
+  diff.min = diff.boxes.min;
+  return diff;
+}
+#endif  // ENABLE_VISION
+#endif  // ENABLE_BENCHMARK

 }  // namespace benchmark
 }  // namespace fastdeploy
--- a/fastdeploy/benchmark/utils.h
+++ b/fastdeploy/benchmark/utils.h
@@ -16,9 +16,15 @@
 #include <memory>
 #include <thread>  // NOLINT
 #include "fastdeploy/utils/utils.h"
+#include "fastdeploy/core/fd_tensor.h"
+#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
+#include "fastdeploy/vision/common/result.h"
+#endif

 namespace fastdeploy {
 namespace benchmark {
+
+#if defined(ENABLE_BENCHMARK)
 /*! @brief ResourceUsageMonitor object used when to collect memory info.
 */
 class FASTDEPLOY_DECL ResourceUsageMonitor {
@@ -86,5 +92,48 @@ FASTDEPLOY_DECL void Split(const std::string& s,
                           std::vector<std::string>& tokens,
                           char delim = ' ');

+/// Diff values for precision evaluation
+/// Empty common base of the diff result types below.
+struct FASTDEPLOY_DECL BaseDiff {};
+
+/// Mean / min / max of a set of values. Every field defaults to -1.0 until
+/// it is filled in.
+struct FASTDEPLOY_DECL EvalStatis {
+  double mean = -1.0;
+  double min = -1.0;
+  double max = -1.0;
+};
+
+/// Statistics of the element-wise difference between two FDTensor objects.
+struct FASTDEPLOY_DECL TensorDiff: public BaseDiff, public EvalStatis {};
+
+#if defined(ENABLE_VISION)
+/// Statistics of the difference between two DetectionResult objects, kept
+/// separately for boxes, scores and labels in addition to the inherited
+/// top-level mean / min / max.
+struct FASTDEPLOY_DECL DetectionDiff: public BaseDiff, public EvalStatis {
+  EvalStatis boxes;
+  EvalStatis scores;
+  EvalStatis labels;
+};
+#endif  // ENABLE_VISION
+#endif  // ENABLE_BENCHMARK
+
+/// Utils for precision evaluation
+/// Static helpers that dump results to text files, load them back and
+/// compute difference statistics. All members are compiled only when
+/// ENABLE_BENCHMARK is defined; the DetectionResult overloads additionally
+/// require ENABLE_VISION.
+struct FASTDEPLOY_DECL ResultManager {
+#if defined(ENABLE_BENCHMARK)
+  /// Save & Load functions for FDTensor result.
+  /// Both return true on success and false on failure.
+  static bool SaveFDTensor(const FDTensor& tensor, const std::string& path);
+  static bool LoadFDTensor(FDTensor* tensor, const std::string& path);
+  /// Calculate diff value between two FDTensor results.
+  /// The result holds mean / max / min of the element-wise lhs - rhs.
+  static TensorDiff CalculateDiffStatis(FDTensor* lhs,
+                                        FDTensor* rhs);
+#if defined(ENABLE_VISION)
+  /// Save & Load functions for basic results.
+  /// Both return true on success and false on failure.
+  static bool SaveDetectionResult(const vision::DetectionResult& res,
+                                  const std::string& path);
+  static bool LoadDetectionResult(vision::DetectionResult* res,
+                                  const std::string& path);
+  /// Calculate diff value between two basic results.
+  /// Both inputs are re-sorted in place before the comparison; only pairs
+  /// whose scores both exceed score_threshold contribute.
+  static DetectionDiff CalculateDiffStatis(vision::DetectionResult* lhs,
+                                           vision::DetectionResult* rhs,
+                                           float score_threshold = 0.3f);
+#endif  // ENABLE_VISION
+#endif  // ENABLE_BENCHMARK
+};
+
 }  // namespace benchmark
 }  // namespace fastdeploy
--- a/fastdeploy/core/fd_tensor.cc
+++ b/fastdeploy/core/fd_tensor.cc
@@ -211,25 +211,6 @@ bool FDTensor::Reshape(const std::vector<int64_t>& new_shape) {
  return true;
 }

-template <typename T>
-void CalculateStatisInfo(const void* src_ptr, int size, double* mean,
-                         double* max, double* min) {
-  const T* ptr = static_cast<const T*>(src_ptr);
-  *mean = 0;
-  *max = -99999999;
-  *min = 99999999;
-  for (int i = 0; i < size; ++i) {
-    if (*(ptr + i) > *max) {
-      *max = *(ptr + i);
-    }
-    if (*(ptr + i) < *min) {
-      *min = *(ptr + i);
-    }
-    *mean += *(ptr + i);
-  }
-  *mean = *mean / size;
-}
-
 void FDTensor::PrintInfo(const std::string& prefix) const {
  double mean = 0;
  double max = -99999999;
--- a/fastdeploy/utils/utils.h
+++ b/fastdeploy/utils/utils.h
@@ -214,4 +214,24 @@ std::string Str(const std::vector<T>& shape) {
  return oss.str();
 }

+/// Compute mean, max and min of `size` values of type T stored at `src_ptr`
+/// and write them to `mean`, `max` and `min` as doubles.
+///
+/// For non-empty input, max and min start from the first element, so values
+/// of any magnitude are reported correctly. For empty input (size <= 0) the
+/// legacy results are kept: max = -99999999, min = 99999999 and mean is the
+/// result of 0.0 / size.
+template <typename T>
+void CalculateStatisInfo(const void* src_ptr, int size, double* mean,
+                         double* max, double* min) {
+  const T* ptr = static_cast<const T*>(src_ptr);
+  *mean = static_cast<double>(0);
+  *max = static_cast<double>(-99999999);
+  *min = static_cast<double>(99999999);
+  if (size > 0) {
+    // Fixed sentinels would win against data lying entirely beyond them.
+    *max = static_cast<double>(ptr[0]);
+    *min = static_cast<double>(ptr[0]);
+  }
+  for (int i = 0; i < size; ++i) {
+    const double value = static_cast<double>(ptr[i]);
+    if (value > *max) {
+      *max = value;
+    }
+    if (value < *min) {
+      *min = value;
+    }
+    *mean += value;
+  }
+  *mean = *mean / size;
+}
+
+
 }  // namespace fastdeploy
--- a/fastdeploy/vision/detection/ppdet/postprocessor.cc
+++ b/fastdeploy/vision/detection/ppdet/postprocessor.cc
@@ -82,6 +82,7 @@ bool PaddleDetPostprocessor::Run(const std::vector<FDTensor>& tensors,
    const auto* data = static_cast<const int64_t*>(tensors[1].CpuData());
    for (size_t i = 0; i < tensors[1].shape[0]; ++i) {
      num_boxes[i] = static_cast<int>(data[i]);
+      total_num_boxes += num_boxes[i];
    }
  }

--- a/fastdeploy/vision/utils/sort_det_res.cc
+++ b/fastdeploy/vision/utils/sort_det_res.cc
@@ -28,6 +28,7 @@ void Merge(DetectionResult* result, size_t low, size_t mid, size_t high) {
  size_t i = low;
  size_t j = mid + 1;
  size_t k = i;
+  // TODO(qiuyanjun): add masks process
  for (; i <= mid && j <= high; k++) {
    if (temp_scores[i] >= temp_scores[j]) {
      scores[k] = temp_scores[i];
@@ -70,12 +71,73 @@ void SortDetectionResult(DetectionResult* result) {
  size_t low = 0;
  size_t high = result->scores.size();
  if (high == 0) {
-      return;
+    return;
  }
  high = high - 1;
  MergeSort(result, low, high);
 }

+/// Comparator for ordering boxes by descending x0 (box[0]) and, when the x0
+/// values are within 1e-6 of each other, by descending y0 (box[1]).
+/// Returns false when both coordinates are within that tolerance, because a
+/// sort comparator must not report "a before b" for equal elements.
+bool LexSortByXYCompare(const std::array<float, 4>& box_a,
+                        const std::array<float, 4>& box_b) {
+  const float tolerance = 1e-6f;
+  const float delta_x = box_a[0] - box_b[0];
+  if (!(std::abs(delta_x) < tolerance)) {
+    // x0 differs: the larger x0 comes first.
+    return box_a[0] > box_b[0];
+  }
+  const float delta_y = box_a[1] - box_b[1];
+  if (!(std::abs(delta_y) < tolerance)) {
+    // Same x0, different y0: the larger y0 comes first.
+    return box_a[1] > box_b[1];
+  }
+  return false;
+}
+
+/// Rearrange `result` so that its element i becomes the former element
+/// indices[i], for boxes, scores, label_ids and (when present) masks.
+/// `indices` must provide a valid source index for every box in `result`.
+void ReorderDetectionResultByIndices(DetectionResult* result,
+                                     const std::vector<size_t>& indices) {
+  // reorder boxes, scores, label_ids, masks
+  DetectionResult backup = (*result);  // copy, the source is rebuilt below
+  const bool contain_masks = backup.contain_masks;
+  const int boxes_num = backup.boxes.size();
+  result->Clear();
+  result->Resize(boxes_num);
+  // boxes, scores, labels_ids
+  for (int i = 0; i < boxes_num; ++i) {
+    result->boxes[i] = backup.boxes[indices[i]];
+    result->scores[i] = backup.scores[indices[i]];
+    result->label_ids[i] = backup.label_ids[indices[i]];
+  }
+  if (contain_masks) {
+    result->contain_masks = true;
+    // NOTE(review): Clear()/Resize() are not visible here; if Resize() only
+    // sizes masks while contain_masks is set, the slots indexed below would
+    // not exist after Clear(). Size the container explicitly to be safe.
+    result->masks.resize(boxes_num);
+    for (int i = 0; i < boxes_num; ++i) {
+      const auto& shape = backup.masks[indices[i]].shape;
+      const int mask_numel = shape[0] * shape[1];
+      result->masks[i].shape = shape;
+      result->masks[i].Resize(mask_numel);
+      std::memcpy(result->masks[i].Data(), backup.masks[indices[i]].Data(),
+                  mask_numel * sizeof(uint8_t));
+    }
+  }
+}
+
+/// Sort a DetectionResult in place with LexSortByXYCompare as the ordering
+/// of its boxes. The sort runs on an index permutation, which is then
+/// applied through ReorderDetectionResultByIndices. Empty results are left
+/// untouched.
+void LexSortDetectionResultByXY(DetectionResult* result) {
+  const size_t count = result->boxes.size();
+  if (count == 0) {
+    return;
+  }
+  // Start from the identity permutation 0, 1, ..., count - 1.
+  std::vector<size_t> order(count);
+  for (size_t pos = 0; pos < count; ++pos) {
+    order[pos] = pos;
+  }
+  // lex sort by x(w) then y(h)
+  const auto& all_boxes = result->boxes;
+  std::sort(order.begin(), order.end(),
+            [&all_boxes](size_t first, size_t second) {
+              return LexSortByXYCompare(all_boxes[first], all_boxes[second]);
+            });
+  ReorderDetectionResultByIndices(result, order);
+}
+
 }  // namespace utils
 }  // namespace vision
 }  // namespace fastdeploy
--- a/fastdeploy/vision/utils/utils.h
+++ b/fastdeploy/vision/utils/utils.h
@@ -64,12 +64,13 @@ void NMS(DetectionResult* output, float iou_threshold = 0.5,

 void NMS(FaceDetectionResult* result, float iou_threshold = 0.5);

-// MergeSort
-void SortDetectionResult(DetectionResult* output);
+/// Sort DetectionResult/FaceDetectionResult by score
+FASTDEPLOY_DECL void SortDetectionResult(DetectionResult* result);
+FASTDEPLOY_DECL void SortDetectionResult(FaceDetectionResult* result);
+/// Lex Sort DetectionResult/FaceDetectionResult by x(w) & y(h) axis
+FASTDEPLOY_DECL void LexSortDetectionResultByXY(DetectionResult* result);

-void SortDetectionResult(FaceDetectionResult* result);
-
-// L2 Norm / cosine similarity  (for face recognition, ...)
+/// L2 Norm / cosine similarity  (for face recognition, ...)
 FASTDEPLOY_DECL std::vector<float>
 L2Normalize(const std::vector<float>& values);

--- a/scripts/android/build_android_cpp_with_benchmark.sh
+++ b/scripts/android/build_android_cpp_with_benchmark.sh
@@ -92,11 +92,12 @@ __build_fastdeploy_android_shared() {
        -DENABLE_FLYCV=ON \
        -DENABLE_TEXT=OFF \
        -DENABLE_VISION=ON \
-        -DBUILD_EXAMPLES=ON \
+        -DBUILD_EXAMPLES=OFF \
        -DENABLE_BENCHMARK=ON \
        -DWITH_OPENCV_STATIC=OFF \
        -DWITH_LITE_STATIC=OFF \
        -DWITH_OPENMP=OFF \
+        -DWITH_TESTING=OFF \
        -DCMAKE_INSTALL_PREFIX=${FASDEPLOY_INSTALL_DIR} \
        -Wno-dev ../../.. && make -j8 && make install

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -62,12 +62,22 @@ function(add_fastdeploy_unittest CC_FILE)
 endfunction()

 if(WITH_TESTING)
+  if(ANDROID OR IOS)
+    # gtest in FastDeploy is not support for cross compiling now.
+    message(FATAL_ERROR "Not support unittest for Android and IOS now.")
+  endif()
  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
  add_library(fastdeploy_gtest_main STATIC gtest_main)
  target_link_libraries(fastdeploy_gtest_main PUBLIC gtest gflags)
  message(STATUS "")
  message(STATUS "*************FastDeploy Unittest Summary**********")
  file(GLOB_RECURSE ALL_TEST_SRCS ${PROJECT_SOURCE_DIR}/tests/*/test_*.cc)
+  if(NOT ENABLE_VISION)
+    # vision_preprocess and release_task need vision
+    file(GLOB_RECURSE VISION_TEST_SRCS ${PROJECT_SOURCE_DIR}/tests/vision_preprocess/test_*.cc)
+    file(GLOB_RECURSE RELEASE_TEST_SRCS ${PROJECT_SOURCE_DIR}/tests/release_task/test_*.cc)
+    # list(REMOVE_ITEM) fails at configure time when it is given no value to
+    # remove, so only call it when at least one of the globs matched a file.
+    set(VISION_DEPENDENT_TEST_SRCS ${VISION_TEST_SRCS} ${RELEASE_TEST_SRCS})
+    if(ALL_TEST_SRCS AND VISION_DEPENDENT_TEST_SRCS)
+      list(REMOVE_ITEM ALL_TEST_SRCS ${VISION_DEPENDENT_TEST_SRCS})
+    endif()
+  endif()
  foreach(_CC_FILE ${ALL_TEST_SRCS})
    add_fastdeploy_unittest(${_CC_FILE})
  endforeach()