Add reduce functions for FDTensor (#81)

* Add eigen tensor structure

* Add Reduce function

* Add cpp unittest framework

* Add reduce function unittest

* Add pr template

* Add comments and docs for reduce function

* Fix typo

* Add ENABLE_FDTENSOR_FUNC macro

* Add Todo comment

* Add CheckData overload

* Fix CheckData overload operator()

Co-authored-by: Jason <jiangjiajun@baidu.com>
Authored by Jack Zhou on 2022-08-10 10:31:04 +08:00, committed by GitHub
parent 9918374d74, commit c7d37b6732
17 changed files with 1568 additions and 277 deletions

.github/PULL_REQUEST_TEMPLATE.md (new file, +9)

@@ -0,0 +1,9 @@
<!-- Demo: https://github.com/PaddlePaddle/Paddle/pull/24810 -->
### PR types
<!-- One of [ New features | Bug fixes | Function optimization | Performance optimization | Breaking changes | Others ] -->
### PR changes
<!-- One of [ OPs | APIs | Docs | Others ] -->
### Describe
<!-- Describe what this PR does -->

CMakeLists.txt

@@ -47,6 +47,9 @@ option(TRT_DIRECTORY "If build tensorrt backend, need to define path of tensorrt
option(ENABLE_VISION "Whether to enable vision models usage." OFF)
option(ENABLE_VISION_VISUALIZE "Whether to enable visualize vision model result toolbox." ON)
option(ENABLE_TEXT "Whether to enable text models usage." OFF)
option(WITH_TESTING "Whether to compile with unittest." OFF)
# TODO(zhoushunjie): Will remove it later.
option(ENABLE_FDTENSOR_FUNC "Whether to compile with function of FDTensor." OFF)
# Please don't enable this flag for now; some bugs exist.
option(ENABLE_OPENCV_CUDA "Whether to enable opencv with cuda, this will allow process image with GPU." OFF)
@@ -56,6 +59,12 @@ option(ENABLE_DEBUG "Whether to enable print debug information, this may reduce
option(WITH_VISION_EXAMPLES "Whether to build fastdeploy with vision examples" OFF)
option(WITH_TEXT_EXAMPLES "Whether to build fastdeploy with text examples" OFF)
# config GIT_URL with github mirrors to speed up dependent repos clone
option(GIT_URL "Git URL to clone dependent repos" ${GIT_URL})
if(NOT GIT_URL)
set(GIT_URL "https://github.com")
endif()
# Check for 32bit system
if(WIN32)
if(NOT CMAKE_CL_64)
@@ -108,13 +117,14 @@ endif()
add_definitions(-DFASTDEPLOY_LIB)
file(GLOB_RECURSE ALL_DEPLOY_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/*.cc)
file(GLOB_RECURSE FDTENSOR_FUNC_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/function/*.cc)
file(GLOB_RECURSE DEPLOY_ORT_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/ort/*.cc)
file(GLOB_RECURSE DEPLOY_PADDLE_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/paddle/*.cc)
file(GLOB_RECURSE DEPLOY_TRT_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/tensorrt/*.cc ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/tensorrt/*.cpp)
file(GLOB_RECURSE DEPLOY_VISION_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/vision/*.cc)
file(GLOB_RECURSE DEPLOY_TEXT_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/text/*.cc)
file(GLOB_RECURSE DEPLOY_PYBIND_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/pybind/*.cc ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/*_pybind.cc)
list(REMOVE_ITEM ALL_DEPLOY_SRCS ${DEPLOY_ORT_SRCS} ${DEPLOY_PADDLE_SRCS} ${DEPLOY_TRT_SRCS} ${DEPLOY_VISION_SRCS} ${DEPLOY_TEXT_SRCS})
list(REMOVE_ITEM ALL_DEPLOY_SRCS ${DEPLOY_ORT_SRCS} ${DEPLOY_PADDLE_SRCS} ${DEPLOY_TRT_SRCS} ${DEPLOY_VISION_SRCS} ${DEPLOY_TEXT_SRCS} ${FDTENSOR_FUNC_SRCS})
set(DEPEND_LIBS "")
@@ -223,6 +233,11 @@ if(ENABLE_TEXT)
include(external/faster_tokenizer.cmake)
endif()
if (ENABLE_FDTENSOR_FUNC)
add_definitions(-DENABLE_FDTENSOR_FUNC)
list(APPEND ALL_DEPLOY_SRCS ${FDTENSOR_FUNC_SRCS})
endif()
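# With ENABLE_FDTENSOR_FUNC=ON the function/ sources (removed from
# ALL_DEPLOY_SRCS above) are added back into the build, and the
# ENABLE_FDTENSOR_FUNC define activates the #ifdef-guarded code in
# fastdeploy/function/reduce.cc and reduce.h.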
configure_file(${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/core/config.h.in ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/core/config.h)
configure_file(${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/pybind/main.cc.in ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/pybind/main.cc)
configure_file(${PROJECT_SOURCE_DIR}/FastDeploy.cmake.in ${PROJECT_SOURCE_DIR}/FastDeploy.cmake @ONLY)
@@ -231,6 +246,8 @@ configure_file(${PROJECT_SOURCE_DIR}/fastdeploy/c_lib_wrap.py.in ${PROJECT_SOURC
list(REMOVE_ITEM ALL_DEPLOY_SRCS ${DEPLOY_PYBIND_SRCS})
add_library(${LIBRARY_NAME} SHARED ${ALL_DEPLOY_SRCS})
add_dependencies(${LIBRARY_NAME} extern_eigen3)
redefine_file_macro(${LIBRARY_NAME})
set_target_properties(${LIBRARY_NAME} PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
if(NOT APPLE)
@@ -276,6 +293,14 @@ if (WITH_TEXT_EXAMPLES AND EXISTS ${PROJECT_SOURCE_DIR}/examples)
endif()
endif()
if (WITH_TESTING AND EXISTS ${PROJECT_SOURCE_DIR}/tests)
add_definitions(-DWITH_TESTING)
include(external/gtest.cmake)
include(external/gflags.cmake)
include(external/glog.cmake)
add_subdirectory(tests)
endif()
include(external/summary.cmake)
fastdeploy_summary()
if(WIN32)

fastdeploy/function/eigen.cc (new file)

@@ -0,0 +1,32 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/function/eigen.h"
namespace fastdeploy {
std::shared_ptr<EigenDeviceWrapper> EigenDeviceWrapper::instance_ = nullptr;
std::shared_ptr<EigenDeviceWrapper> EigenDeviceWrapper::GetInstance() {
if (instance_ == nullptr) {
instance_ = std::make_shared<EigenDeviceWrapper>();
}
return instance_;
}
const Eigen::DefaultDevice* EigenDeviceWrapper::GetDevice() const {
return &device_;
}
} // namespace fastdeploy

fastdeploy/function/eigen.h (new file)

@@ -0,0 +1,109 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <vector>
#include "fastdeploy/core/fd_tensor.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace fastdeploy {
// EigenDim converts shape into Eigen::DSizes.
template <int D>
struct EigenDim {
using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
static Type From(const std::vector<int64_t>& dims) {
Type ret;
for (int64_t d = 0; d < dims.size(); d++) {
ret[d] = dims[d];
}
return ret;
}
};
// Interpret FDTensor as EigenTensor and EigenConstTensor.
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenTensor {
using Type = Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, IndexType>>;
using ConstType =
Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
static Type From(FDTensor& tensor,
const std::vector<int64_t>& dims) { // NOLINT
return Type(reinterpret_cast<T*>(tensor.Data()), EigenDim<D>::From(dims));
}
static Type From(FDTensor& tensor) { // NOLINT
return From(tensor, tensor.shape);
} // NOLINT
static ConstType From(const FDTensor& tensor,
const std::vector<int64_t>& dims) {
return ConstType(reinterpret_cast<const T*>(tensor.Data()),
EigenDim<D>::From(dims));
}
static ConstType From(const FDTensor& tensor) {
return From(tensor, tensor.shape);
}
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenScalar {
// Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
using Type = Eigen::TensorMap<
Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
using ConstType = Eigen::TensorMap<
Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
static Type From(FDTensor& tensor) {
return Type(reinterpret_cast<T*>(tensor.Data()));
} // NOLINT
static ConstType From(const FDTensor& tensor) {
return ConstType(reinterpret_cast<const T*>(tensor.Data()));
}
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
// Flatten reshapes a Tensor into an EigenVector.
static typename EigenVector::Type Flatten(FDTensor& tensor) { // NOLINT
return EigenVector::From(tensor, {tensor.Numel()});
}
static typename EigenVector::ConstType Flatten(
const FDTensor& tensor) { // NOLINT
return EigenVector::From(tensor, {tensor.Numel()});
}
};
class EigenDeviceWrapper {
public:
static std::shared_ptr<EigenDeviceWrapper> GetInstance();
const Eigen::DefaultDevice* GetDevice() const;
private:
Eigen::DefaultDevice device_;
static std::shared_ptr<EigenDeviceWrapper> instance_;
};
} // namespace fastdeploy
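These adaptors are zero-copy views: `From` and `Flatten` wrap the FDTensor buffer in an `Eigen::TensorMap`, so an Eigen expression evaluated through the wrapped default device writes straight back into the tensor's memory. A minimal sketch of the intended usage (the `ScaleInPlace` helper below is hypothetical and assumes an FP32 tensor):

```c++
#include "fastdeploy/function/eigen.h"

// Hypothetical helper: multiply every element of a float tensor in place.
void ScaleInPlace(fastdeploy::FDTensor& t, float factor) {
  // Map the tensor buffer as a flat Eigen vector; no data is copied.
  auto vec = fastdeploy::EigenVector<float>::Flatten(t);
  const auto& dev = *fastdeploy::EigenDeviceWrapper::GetInstance()->GetDevice();
  // Evaluate the Eigen expression on the wrapped default device.
  vec.device(dev) = vec * factor;
}
```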

fastdeploy/function/reduce.cc (new file)

@@ -0,0 +1,246 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <set>
#include "fastdeploy/function/eigen.h"
#include "fastdeploy/function/reduce.h"
#include "fastdeploy/function/reduce_functor.h"
#include "fastdeploy/utils/utils.h"
namespace fastdeploy {
#ifdef ENABLE_FDTENSOR_FUNC
template <typename T, size_t D, size_t R_D, typename Functor>
void ReduceFunctor(const FDTensor& input, FDTensor* output,
const std::vector<int64_t>& dims, bool keep_dim) {
auto x = EigenTensor<T, D>::From(input);
auto x_rank = static_cast<int>(x.dimensions().size());
auto reduce_dim = Eigen::array<int, R_D>();
std::vector<int64_t> dims_ref = dims;
auto out_dims = input.shape;
for (size_t i = 0; i < dims_ref.size(); ++i) {
if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
reduce_dim[i] = dims_ref[i];
out_dims[dims_ref[i]] = 1;
}
auto origin_output_dims = out_dims;
output->Allocate(origin_output_dims, TypeToDataType<T>::dtype);
// construct the squeezed output tensor
if (x_rank > 1) {
const int kDelFlag = -2;
for (size_t i = 0; i < dims_ref.size(); ++i) {
out_dims[dims_ref[i]] = kDelFlag;
}
out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag),
out_dims.end());
}
auto& place = *EigenDeviceWrapper::GetInstance()->GetDevice();
Functor functor;
if (D == 1) {
auto out = EigenScalar<T>::From(*output);
functor(place, &x, &out, reduce_dim);
} else {
auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
functor(place, &x, &out, reduce_dim);
if (!keep_dim) {
output->shape = std::move(out_dims);
}
}
}
#define HANDLE_REDUCE_DIM(NDIM, RDIM) \
if (ndim == NDIM && rdim == RDIM) { \
ReduceFunctor<OutT, NDIM, RDIM, Functor>(input, output, dims, keep_dim); \
}
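// For example, HANDLE_REDUCE_DIM(3, 1) expands to
//   if (ndim == 3 && rdim == 1) { ReduceFunctor<OutT, 3, 1, Functor>(...); }
// so the call sites below cover every (rank, reduced-rank) pair up to rank 4.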
inline void GetShuffledDim(const std::vector<int64_t>& src_dims,
std::vector<int64_t>* dst_dims,
const std::vector<int64_t>& reduced_dims,
std::vector<int>* perm_axis) {
// check if it's a reduced dim
std::vector<bool> src_dims_check(src_dims.size(), false);
size_t src_size = src_dims.size();
size_t reduce_size = reduced_dims.size();
std::vector<int64_t> regular_reduced_dims = reduced_dims;
for (size_t i = 0; i < regular_reduced_dims.size(); i++) {
if (regular_reduced_dims[i] < 0) {
regular_reduced_dims[i] = src_size + regular_reduced_dims[i];
}
}
for (size_t i = 0; i < reduce_size; ++i) {
dst_dims->at(src_size - reduce_size + i) =
src_dims[regular_reduced_dims[i]];
(*perm_axis)[src_size - reduce_size + i] = regular_reduced_dims[i];
src_dims_check[regular_reduced_dims[i]] = true;
}
size_t offset = 0;
for (size_t i = 0; i < src_dims_check.size(); ++i) {
bool is_reduced = src_dims_check[i];
if (!is_reduced) {
(*perm_axis)[offset] = i;
dst_dims->at(offset++) = src_dims[i];
}
}
}
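// Example: src_dims = {2, 3, 4} with reduced_dims = {1} yields
// dst_dims = {2, 4, 3} and perm_axis = {0, 2, 1}: the kept axes come first
// and the reduced axes are moved to the end.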
template <typename OutT>
void GetShuffledInput(const FDTensor& input, FDTensor* shuffled_input,
const std::vector<int64_t>& dims) {
auto shuffled_dims = input.shape;
std::vector<int> perm_axis(input.shape.size());
GetShuffledDim(input.shape, &shuffled_dims, dims, &perm_axis);
shuffled_input->Allocate(shuffled_dims, input.dtype);
// TODO(zhoushunjie) : Need to implement trans function
// phi::funcs::TransposeNormal<DeviceContext, OutT> trans;
// trans(dev_ctx, input, shuffled_input, perm_axis);
}
//////////////// HandleLargeDim
template <typename OutT, typename Functor>
void HandleLargeDim(const FDTensor& input, FDTensor* output,
const std::vector<int64_t>& dims, bool keep_dim) {
// shuffle the reduced dim to the end
FDTensor shuffled_input;
GetShuffledInput<OutT>(input, &shuffled_input, dims);
// transpose to 2D tensor whose shape is {unreduced, reduced}.
const int64_t unreduced = output->Numel();
const int64_t reduced = shuffled_input.Numel() / unreduced;
shuffled_input.Allocate({unreduced, reduced}, TypeToDataType<OutT>::dtype);
auto output_dim = output->shape;
output->Allocate({unreduced}, TypeToDataType<OutT>::dtype);
ReduceFunctor<OutT, 2, 1, Functor>(shuffled_input, output, {1}, keep_dim);
output->shape = output_dim;
}
////////////// ReduceKernel
template <typename OutT, typename Functor>
void ReduceKernelImpl(const FDTensor& input, FDTensor* output,
const std::vector<int64_t>& dims, bool keep_dim,
bool reduce_all) {
output->Allocate({1}, TypeToDataType<OutT>::dtype);
const auto& dev = *EigenDeviceWrapper::GetInstance()->GetDevice();
if (reduce_all) {
// Flatten and reduce 1-D tensor
auto x = EigenVector<OutT>::Flatten(input);
auto out = EigenScalar<OutT>::From(*output);
auto reduce_dim = Eigen::array<int, 1>({{0}});
Functor functor;
functor(dev, &x, &out, reduce_dim);
} else {
int ndim = input.shape.size();
int rdim = dims.size();
if (ndim > 3) {
HandleLargeDim<OutT, Functor>(input, output, dims, keep_dim);
} else {
HANDLE_REDUCE_DIM(4, 3);
HANDLE_REDUCE_DIM(4, 2);
HANDLE_REDUCE_DIM(4, 1);
HANDLE_REDUCE_DIM(3, 2);
HANDLE_REDUCE_DIM(3, 1);
HANDLE_REDUCE_DIM(2, 1);
HANDLE_REDUCE_DIM(1, 1);
}
}
}
template <typename OutT, typename Functor>
void BoolReduceKernel(const FDTensor& input, FDTensor* output,
const std::vector<int64_t>& dims, bool keep_dim,
bool reduce_all) {
// If `dims` covers all dimensions, set reduce_all to true
const auto& input_dim_size = input.shape.size();
std::set<int> dims_set(dims.begin(), dims.end());
bool full_dim = true;
for (auto i = 0; i < input_dim_size; i++) {
if (dims_set.find(i) == dims_set.end()) {
full_dim = false;
break;
}
}
reduce_all = (reduce_all || full_dim);
ReduceKernelImpl<bool, Functor>(input, output, dims, keep_dim, reduce_all);
}
template <typename Functor>
void Reduce(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
bool keep_dim, bool reduce_all) {
// If `dims` covers all dimensions, set reduce_all to true
const int& input_dim_size = x.shape.size();
std::set<int> dims_set(dims.begin(), dims.end());
bool full_dim = true;
for (int i = 0; i < input_dim_size; ++i) {
if (dims_set.find(i) == dims_set.end() &&
dims_set.find(i - input_dim_size) == dims_set.end()) {
full_dim = false;
break;
}
}
reduce_all = (reduce_all || full_dim);
FD_VISIT_ALL_TYPES(x.dtype, "ReduceKernelImpl", ([&] {
ReduceKernelImpl<data_t, Functor>(x, out, dims, keep_dim,
reduce_all);
}));
}
void Max(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
bool keep_dim, bool reduce_all) {
Reduce<MaxFunctor>(x, out, dims, keep_dim, reduce_all);
}
void Min(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
bool keep_dim, bool reduce_all) {
Reduce<MinFunctor>(x, out, dims, keep_dim, reduce_all);
}
void Sum(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
bool keep_dim, bool reduce_all) {
Reduce<SumFunctor>(x, out, dims, keep_dim, reduce_all);
}
void All(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
bool keep_dim, bool reduce_all) {
BoolReduceKernel<bool, AllFunctor>(x, out, dims, keep_dim, reduce_all);
}
void Any(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
bool keep_dim, bool reduce_all) {
BoolReduceKernel<bool, AnyFunctor>(x, out, dims, keep_dim, reduce_all);
}
void Mean(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
bool keep_dim, bool reduce_all) {
Reduce<MeanFunctor>(x, out, dims, keep_dim, reduce_all);
}
void Prod(const FDTensor& x, FDTensor* out, const std::vector<int64_t>& dims,
bool keep_dim, bool reduce_all) {
Reduce<ProdFunctor>(x, out, dims, keep_dim, reduce_all);
}
#endif
} // namespace fastdeploy

fastdeploy/function/reduce.h (new file)

@@ -0,0 +1,100 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "fastdeploy/core/fd_tensor.h"
namespace fastdeploy {
#ifdef ENABLE_FDTENSOR_FUNC
/** Execute the maximum operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
FASTDEPLOY_DECL void Max(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
/** Execute the minimum operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
FASTDEPLOY_DECL void Min(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
/** Execute the sum operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
FASTDEPLOY_DECL void Sum(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
/** Execute the all operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
FASTDEPLOY_DECL void All(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
/** Execute the any operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
FASTDEPLOY_DECL void Any(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
/** Execute the mean operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
FASTDEPLOY_DECL void Mean(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
/** Execute the product operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
FASTDEPLOY_DECL void Prod(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
#endif
} // namespace fastdeploy

fastdeploy/function/reduce_functor.h (new file)

@@ -0,0 +1,76 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "fastdeploy/function/eigen.h"
namespace fastdeploy {
//////// Max Functor ///////
struct MaxFunctor {
template <typename X, typename Y, typename Dim>
void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
y->device(dev) = x->maximum(dim);
}
};
//////// Min Functor ///////
struct MinFunctor {
template <typename X, typename Y, typename Dim>
void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
y->device(dev) = x->minimum(dim);
}
};
//////// Sum Functor ///////
struct SumFunctor {
template <typename X, typename Y, typename Dim>
void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
y->device(dev) = x->sum(dim);
}
};
//////// All Functor ///////
struct AllFunctor {
template <typename X, typename Y, typename Dim>
void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
y->device(dev) = x->all(dim);
}
};
//////// Any Functor ///////
struct AnyFunctor {
template <typename X, typename Y, typename Dim>
void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
y->device(dev) = x->any(dim);
}
};
//////// Mean Functor ///////
struct MeanFunctor {
template <typename X, typename Y, typename Dim>
void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
y->device(dev) = x->mean(dim);
}
};
//////// Prod Functor ///////
struct ProdFunctor {
template <typename X, typename Y, typename Dim>
void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
y->device(dev) = x->prod(dim);
}
};
} // namespace fastdeploy
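Each functor only has to expose this `operator()` shape, which is what lets the `Reduce<Functor>` dispatcher in reduce.cc instantiate it for any rank. As a sketch, a hypothetical extra functor (not part of this commit) could reuse the same pattern, assuming Eigen's tensor `square()` op:

```c++
//////// SquareSum Functor (hypothetical example) ///////
struct SquareSumFunctor {
  template <typename X, typename Y, typename Dim>
  void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
    // Sum of squares along `dim`, useful e.g. for squared L2 norms.
    y->device(dev) = x->square().sum(dim);
  }
};
```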

fastdeploy/utils/utils.h

@@ -72,13 +72,13 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
#define __REL_FILE__ __FILE__
#endif
#define FDERROR \
FDLogger(true, "[ERROR]") \
<< __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
#define FDERROR \
FDLogger(true, "[ERROR]") << __REL_FILE__ << "(" << __LINE__ \
<< ")::" << __FUNCTION__ << "\t"
#define FDWARNING \
FDLogger(true, "[WARNING]") \
<< __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
#define FDWARNING \
FDLogger(true, "[WARNING]") << __REL_FILE__ << "(" << __LINE__ \
<< ")::" << __FUNCTION__ << "\t"
#define FDINFO \
FDLogger(true, "[INFO]") << __REL_FILE__ << "(" << __LINE__ \
@@ -90,4 +90,61 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
std::abort(); \
}
///////// Basic Marco ///////////
#define FD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \
case enum_type: { \
using HINT = type; \
__VA_ARGS__(); \
break; \
}
#define FD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \
FD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, data_t, __VA_ARGS__)
#define FD_VISIT_ALL_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::BOOL, bool, \
__VA_ARGS__) \
FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t, \
__VA_ARGS__) \
FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t, \
__VA_ARGS__) \
FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float, \
__VA_ARGS__) \
FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double, \
__VA_ARGS__) \
default: \
FDASSERT(false, "Invalid enum data type.") \
} \
}()
#define FD_VISIT_FLOAT_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float, \
__VA_ARGS__) \
FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double, \
__VA_ARGS__) \
default: \
FDASSERT(false, "Invalid enum data type.") \
} \
}()
#define FD_VISIT_INT_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t, \
__VA_ARGS__) \
FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t, \
__VA_ARGS__) \
default: \
FDASSERT(false, "Invalid enum data type.") \
} \
}()
} // namespace fastdeploy
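These visitor macros are how reduce.cc dispatches on the runtime dtype: the lambda body is compiled once per supported type, with `data_t` bound to the C++ type matching each FDDataType case. A minimal usage sketch (the `FillWithOnes` helper is hypothetical and assumes `<algorithm>` is included):

```c++
// Hypothetical helper: fill a tensor of any supported dtype with ones.
void FillWithOnes(fastdeploy::FDTensor& t) {
  FD_VISIT_ALL_TYPES(t.dtype, "FillWithOnes", ([&] {
                       // data_t is bool/int32_t/int64_t/float/double,
                       // depending on t.dtype.
                       auto* data = reinterpret_cast<data_t*>(t.Data());
                       std::fill(data, data + t.Numel(),
                                 static_cast<data_t>(1));
                     }));
}
```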

docs/api/function.md (new file, +214)

@@ -0,0 +1,214 @@
# FDTensor C++ Tensor Functions
FDTensor is the struct FastDeploy uses to represent tensors at the C++ level. It is mainly used to manage the input and output data of models during inference deployment, and it works across the different Runtime backends. When developing C++ inference applications, the input and output data usually need some processing to produce the actual model input or the application's final output. Such logic can be written with the native C++ standard library, but it is fairly costly to develop, for example taking the maximum along the second dimension of a 3-D tensor. To lower this development cost and improve efficiency, FastDeploy provides a set of C++ tensor functions built on FDTensor. They currently fall into two categories: reduce functions and elementwise functions.
## Reduce functions
FastDeploy currently supports seven reduce functions: Max, Min, Sum, All, Any, Mean, and Prod.
### Max
#### Function signature
```c++
/** Execute the maximum operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
void Max(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
```
#### Usage example
```c++
FDTensor input, output;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// Calculate the max value for axis 0 of `inputs`
// The output result would be [[7, 4, 5]].
Max(input, &output, {0}, /* keep_dim = */true);
```
### Min
#### Function signature
```c++
/** Execute the minimum operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
void Min(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
```
#### Usage example
```c++
FDTensor input, output;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// Calculate the min value for axis 0 of `inputs`
// The output result would be [[2, 1, 3]].
Min(input, &output, {0}, /* keep_dim = */true);
```
### Sum
#### Function signature
```c++
/** Execute the sum operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
void Sum(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
```
#### Usage example
```c++
FDTensor input, output;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// Calculate the sum value for axis 0 of `inputs`
// The output result would be [[9, 5, 8]].
Sum(input, &output, {0}, /* keep_dim = */true);
```
### Mean
#### Function signature
```c++
/** Execute the mean operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
void Mean(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
```
#### Usage example
```c++
FDTensor input, output;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// Calculate the mean value for axis 0 of `inputs`
// The output result would be [[4, 2, 4]].
Mean(input, &output, {0}, /* keep_dim = */true);
```
### Prod
#### Function signature
```c++
/** Execute the product operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
void Prod(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
```
#### Usage example
```c++
FDTensor input, output;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// Calculate the product value for axis 0 of `inputs`
// The output result would be [[14, 4, 15]].
Prod(input, &output, {0}, /* keep_dim = */true);
```
### Any
#### Function signature
```c++
/** Execute the any operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
void Any(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
```
#### Usage example
```c++
FDTensor input, output;
std::array<bool, 6> bool_inputs = {false, false, true, true, false, true};
input.SetExternalData({2, 3}, FDDataType::BOOL, bool_inputs.data());
// Calculate the any value for axis 0 of `bool_inputs`
// The output result would be [[true, false, true]].
Any(input, &output, {0}, /* keep_dim = */true);
```
### All
#### Function signature
```c++
/** Execute the all operation on the input FDTensor along the given dims.
@param x The input tensor.
@param out The output tensor which stores the result.
@param dims The vector of axes to reduce.
@param keep_dim Whether to keep the reduced dims, default false.
@param reduce_all Whether to reduce all dims, default false.
*/
void All(const FDTensor& x, FDTensor* out,
const std::vector<int64_t>& dims,
bool keep_dim = false, bool reduce_all = false);
```
#### Usage example
```c++
FDTensor input, output;
std::array<bool, 6> bool_inputs = {false, false, true, true, false, true};
input.SetExternalData({2, 3}, FDDataType::BOOL, bool_inputs.data());
// Calculate the all value for axis 0 of `bool_inputs`
// The output result would be [[false, false, true]].
All(input, &output, {0}, /* keep_dim = */true);
```
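Every reduce function also accepts the `keep_dim` and `reduce_all` flags shown in the signatures above. A short sketch, consistent with the unit tests in `tests/test_reduce.cc`, of collapsing all axes at once:

```c++
FDTensor input, output;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// With reduce_all = true every axis is collapsed: the output shape is {1}
// and the result is 2 + 4 + 3 + 7 + 1 + 5 = 22.
Sum(input, &output, {0}, /* keep_dim = */false, /* reduce_all = */true);
```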
## Elementwise functions
Under development; stay tuned.

(deleted file)

@@ -1,270 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <vector>
#include "fastdeploy/core/fd_tensor.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace fastdeploy {
// EigenDim converts shape into Eigen::DSizes.
template <int D>
struct EigenDim {
using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
static Type From(const std::vector<int64_t>& dims) {
Type ret;
for (int64_t d = 0; d < dims.size(); d++) {
ret[d] = dims[d];
}
return ret;
}
};
// Interpret FDTensor as EigenTensor and EigenConstTensor.
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenTensor {
using Type = Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, IndexType>>;
using ConstType =
Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
static Type From(FDTensor& tensor,
const std::vector<int64_t>& dims) { // NOLINT
return Type(reinterpret_cast<T*>(tensor.data.data()),
EigenDim<D>::From(dims));
}
static Type From(FDTensor& tensor) { // NOLINT
return From(tensor, tensor.shape);
} // NOLINT
static ConstType From(const FDTensor& tensor,
const std::vector<int64_t>& dims) {
return ConstType(reinterpret_cast<const T*>(tensor.data.data()),
EigenDim<D>::From(dims));
}
static ConstType From(const FDTensor& tensor) {
return From(tensor, tensor.shape);
}
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenScalar {
// Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
using Type = Eigen::TensorMap<
Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
using ConstType = Eigen::TensorMap<
Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
static Type From(FDTensor& tensor) {
return Type(reinterpret_cast<T*>(tensor.data.data()));
} // NOLINT
static ConstType From(const FDTensor& tensor) {
return ConstType(reinterpret_cast<const T*>(tensor.data.data()));
}
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
// Flatten reshapes a Tensor into an EigenVector.
static typename EigenVector::Type Flatten(FDTensor& tensor) { // NOLINT
return EigenVector::From(tensor, {tensor.Numel()});
}
static typename EigenVector::ConstType Flatten(
const FDTensor& tensor) { // NOLINT
return EigenVector::From(tensor, {tensor.Numel()});
}
};
template <typename T, size_t D, size_t R_D, typename Functor>
void ReduceFunctor(const Eigen::DefaultDevice& dev, const FDTensor& input,
FDTensor* output, const std::vector<int64_t>& dims,
bool keep_dim = true) {
auto x = EigenTensor<T, D>::From(input);
auto x_rank = static_cast<int>(x.dimensions().size());
auto reduce_dim = Eigen::array<int, R_D>();
std::vector<int64_t> dims_ref = dims;
std::vector<int> out_dims(input.shape.size());
std::copy(input.shape.begin(), input.shape.end(), out_dims.begin());
for (size_t i = 0; i < dims_ref.size(); ++i) {
if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
out_dims[dims_ref[i]] = 1;
reduce_dim[i] = dims_ref[i];
}
output->Allocate(out_dims, TypeToDataType<T>::dtype);
if (keep_dim && x_rank > 1) {
const int kDelFlag = -2;
auto dims_vector = out_dims;
for (size_t i = 0; i < dims_ref.size(); ++i) {
dims_vector[dims_ref[i]] = kDelFlag;
}
dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
out_dims = dims_vector;
}
Functor functor;
if (D == 1) {
auto out = EigenScalar<T>::From(*output);
functor(dev, &x, &out, reduce_dim);
} else {
dims_ref.resize(out_dims.size());
std::copy(out_dims.begin(), out_dims.end(), dims_ref.begin());
for (int i = 0; i < dims_ref.size(); ++i) {
std::cerr << dims_ref[i] << ", ";
}
std::cerr << std::endl;
auto out = EigenTensor<T, (D - R_D)>::From(*output, dims_ref);
functor(dev, &x, &out, reduce_dim);
}
}
struct MaxFunctor {
template <typename X, typename Y, typename Dim>
void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
y->device(dev) = x->maximum(dim);
}
};
struct SumFunctor {
template <typename X, typename Y, typename Dim>
void operator()(const Eigen::DefaultDevice& dev, X* x, Y* y, const Dim& dim) {
y->device(dev) = x->sum(dim);
}
};
inline void GetBroadcastDimsArrays(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& y_dims,
int* x_dims_array, int* y_dims_array,
int* out_dims_array, const int max_dim,
const int axis) {
if (x_dims.size() > y_dims.size()) {
std::fill(y_dims_array, y_dims_array + axis, 1);
if (axis + y_dims.size() < max_dim) {
std::fill(y_dims_array + axis + y_dims.size(), y_dims_array + max_dim, 1);
}
std::copy(x_dims.data(), x_dims.data() + x_dims.size(), x_dims_array);
std::copy(y_dims.data(), y_dims.data() + y_dims.size(),
y_dims_array + axis);
} else {
std::fill(x_dims_array, x_dims_array + axis, 1);
if (axis + x_dims.size() < max_dim) {
std::fill(x_dims_array + axis + x_dims.size(), x_dims_array + max_dim, 1);
}
std::copy(x_dims.data(), x_dims.data() + x_dims.size(),
x_dims_array + axis);
std::copy(y_dims.data(), y_dims.data() + y_dims.size(), y_dims_array);
}
for (int i = 0; i < max_dim; i++) {
if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) ||
(x_dims_array[i] == 1 && y_dims_array[i] == 1)) {
out_dims_array[i] = (std::max)(x_dims_array[i], y_dims_array[i]);
} else {
out_dims_array[i] = -1;
}
}
}
inline int GetElementwiseIndex(const int* x_dims_array, const int max_dim,
const int* index_array) {
int index_ = 0;
for (int i = 0; i < max_dim; i++) {
if (x_dims_array[i] > 1) {
index_ = index_ * x_dims_array[i] + index_array[i];
}
}
return index_;
}
inline void UpdateElementwiseIndexArray(const int* out_dims_array,
const int max_dim, int* index_array) {
for (int i = max_dim - 1; i >= 0; --i) {
++index_array[i];
if (index_array[i] >= out_dims_array[i]) {
index_array[i] -= out_dims_array[i];
} else {
break;
}
}
}
template <typename Functor, typename T, typename OutType = T>
void CommonElementwiseBroadcastForward(const FDTensor& x, const FDTensor& y,
FDTensor* z, Functor func, int axis,
const bool is_xsize_larger = true) {
std::vector<int64_t> x_dims = x.shape;
std::vector<int64_t> y_dims = y.shape;
int max_dim = (std::max)(x_dims.size(), y_dims.size());
int diff = x_dims.size() - y_dims.size();
axis = (axis == -1 ? std::abs(diff) : axis);
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
y_dims_array.data(), out_dims_array.data(), max_dim,
axis);
const T* x_data = reinterpret_cast<const T*>(x.Data());
const T* y_data = reinterpret_cast<const T*>(y.Data());
z->Allocate(out_dims_array, TypeToDataType<OutType>::dtype);
OutType* out_data = reinterpret_cast<T*>(z->MutableData());
const int out_size =
std::accumulate(out_dims_array.data(), out_dims_array.data() + max_dim, 1,
std::multiplies<int>());
int x_index, y_index;
std::vector<int> index_array(max_dim, 0);
for (int out_index = 0; out_index < out_size; ++out_index) {
x_index =
GetElementwiseIndex(x_dims_array.data(), max_dim, index_array.data());
y_index =
GetElementwiseIndex(y_dims_array.data(), max_dim, index_array.data());
if (is_xsize_larger) {
out_data[out_index] = func(x_data[x_index], y_data[y_index]);
} else {
out_data[out_index] = func(y_data[y_index], x_data[x_index]);
}
UpdateElementwiseIndexArray(out_dims_array.data(), max_dim,
index_array.data());
}
}
template <typename T>
struct AddFunctor {
T operator()(const T& lhs, const T& rhs) { return lhs + rhs; }
};
template <typename T>
struct SubFunctor {
T operator()(const T& lhs, const T& rhs) { return lhs - rhs; }
};
template <typename T>
struct DivFunctor {
T operator()(const T& lhs, const T& rhs) { return lhs / rhs; }
};
} // namespace fastdeploy

external/gflags.cmake (new file, +73)

@@ -0,0 +1,73 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(GFLAGS_PREFIX_DIR ${THIRD_PARTY_PATH}/gflags)
SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
set(GFLAGS_REPOSITORY ${GIT_URL}/gflags/gflags.git)
set(GFLAGS_TAG "v2.2.2")
IF(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
ELSE(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
set(BUILD_COMMAND $(MAKE) --silent)
set(INSTALL_COMMAND $(MAKE) install)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
ExternalProject_Add(
extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${GFLAGS_REPOSITORY}
GIT_TAG ${GFLAGS_TAG}
PREFIX ${GFLAGS_PREFIX_DIR}
UPDATE_COMMAND ""
BUILD_COMMAND ${BUILD_COMMAND}
INSTALL_COMMAND ${INSTALL_COMMAND}
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES}
)
ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags)
# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32)
include(CheckIncludeFileCXX)
check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
if (HAVE_SHLWAPI)
set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
endif(HAVE_SHLWAPI)
endif (WIN32)

external/glog.cmake (new file, +68)

@@ -0,0 +1,68 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(GLOG_PREFIX_DIR ${THIRD_PARTY_PATH}/glog)
SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
SET(GLOG_REPOSITORY ${GIT_URL}/google/glog.git)
SET(GLOG_TAG v0.4.0)
IF(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
ELSE(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
ExternalProject_Add(
extern_glog
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${GLOG_REPOSITORY}
GIT_TAG ${GLOG_TAG}
DEPENDS gflags
PREFIX ${GLOG_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DWITH_GFLAGS=OFF
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${GLOG_LIBRARIES}
)
ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog)

external/gtest.cmake (new file, +84)

@@ -0,0 +1,84 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(WITH_TESTING)
INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)
SET(GTEST_PREFIX_DIR ${THIRD_PARTY_PATH}/gtest)
SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest)
SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
set(GTEST_REPOSITORY ${GIT_URL}/google/googletest.git)
set(GTEST_TAG release-1.8.1)
INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})
IF(WIN32)
set(GTEST_LIBRARIES
"${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
set(GTEST_MAIN_LIBRARIES
"${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
string(REPLACE "/w " "" GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/w " "" GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W0 " "" GTEST_CMAKE_C_FLAGS "${GTEST_CMAKE_C_FLAGS}")
string(REPLACE "/W0 " "" GTEST_CMAKE_CXX_FLAGS "${GTEST_CMAKE_CXX_FLAGS}")
ELSE(WIN32)
set(GTEST_LIBRARIES
"${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
set(GTEST_MAIN_LIBRARIES
"${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
set(GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
ENDIF(WIN32)
ExternalProject_Add(
extern_gtest
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${GTEST_REPOSITORY}
GIT_TAG ${GTEST_TAG}
PREFIX ${GTEST_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${GTEST_CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_GMOCK=ON
-Dgtest_disable_pthreads=ON
-Dgtest_force_shared_crt=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${GTEST_LIBRARIES}
BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES}
)
ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES})
ADD_DEPENDENCIES(gtest extern_gtest)
ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
ADD_DEPENDENCIES(gtest_main extern_gtest)
ENDIF()

tests/CMakeLists.txt (new file, +71)

@@ -0,0 +1,71 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
function(cc_test_build TARGET_NAME)
if(WITH_TESTING)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
set_target_properties(${TARGET_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/bin)
if(WIN32)
if("${cc_test_DEPS};" MATCHES "python;")
list(REMOVE_ITEM cc_test_DEPS python)
target_link_libraries(${TARGET_NAME} PUBLIC ${PYTHON_LIBRARIES})
endif()
set(EXTERNAL_LIB "")
else(WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
endif(WIN32)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} PUBLIC ${cc_test_DEPS} ${os_dependency_modules} fastdeploy_gtest_main gtest glog ${EXTERNAL_LIB})
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest)
endif()
endfunction()
function(cc_test TARGET_NAME)
if(WITH_TESTING)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_test_build(${TARGET_NAME}
SRCS ${cc_test_SRCS}
DEPS ${cc_test_DEPS})
add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND})
endif()
endfunction(cc_test)
function(add_fastdeploy_unittest CC_FILE)
set(TEMP_TARGET_FILE ${CC_FILE})
string(REGEX MATCHALL "[0-9A-Za-z_]*.cc" FILE_NAME ${CC_FILE})
string(REGEX REPLACE ".cc" "" FILE_PREFIX ${FILE_NAME})
set(TEMP_TARGET_NAME ${FILE_PREFIX})
if (EXISTS ${TEMP_TARGET_FILE} AND TARGET fastdeploy)
cc_test(${TEMP_TARGET_NAME} SRCS ${TEMP_TARGET_FILE} DEPS fastdeploy)
message(STATUS " Added FastDeploy unittest : ${TEMP_TARGET_NAME}")
endif()
unset(TEMP_TARGET_FILE)
unset(TEMP_TARGET_NAME)
endfunction()
if(WITH_TESTING)
add_library(fastdeploy_gtest_main STATIC gtest_main)
target_link_libraries(fastdeploy_gtest_main PUBLIC gtest gflags)
message(STATUS "")
message(STATUS "*************FastDeploy Unittest Summary**********")
file(GLOB ALL_TEST_SRCS ${PROJECT_SOURCE_DIR}/tests/test_*.cc)
foreach(_CC_FILE ${ALL_TEST_SRCS})
add_fastdeploy_unittest(${_CC_FILE})
endforeach()
endif()

tests/gtest_main.cc (new file, +61)

@@ -0,0 +1,61 @@
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
std::vector<char*> new_argv;
for (int i = 0; i < argc; ++i) {
new_argv.push_back(argv[i]);
}
std::vector<std::string> envs;
std::vector<std::string> undefok;
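// Both lists are left empty here; add flag names to `envs` to let gflags read
// them from environment variables (via --tryfromenv), or to `undefok` to
// silence errors about flags that are not defined (via --undefok).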
char* env_str = nullptr;
if (envs.size() > 0) {
std::string env_string = "--tryfromenv=";
for (auto t : envs) {
env_string += t + ",";
}
env_string = env_string.substr(0, env_string.length() - 1);
env_str = strdup(env_string.c_str());
new_argv.push_back(env_str);
VLOG(1) << "gtest env_string:" << env_string;
}
char* undefok_str = nullptr;
if (undefok.size() > 0) {
std::string undefok_string = "--undefok=";
for (auto t : undefok) {
undefok_string += t + ",";
}
undefok_string = undefok_string.substr(0, undefok_string.length() - 1);
undefok_str = strdup(undefok_string.c_str());
new_argv.push_back(undefok_str);
VLOG(1) << "gtest undefok_string:" << undefok_string;
}
int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data();
::GFLAGS_NAMESPACE::ParseCommandLineFlags(&new_argc, &new_argv_address,
false);
int ret = RUN_ALL_TESTS();
if (env_str) free(env_str);
if (undefok_str) free(undefok_str);
return ret;
}

tests/gtest_utils.h (new file, +50)

@@ -0,0 +1,50 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "gtest/gtest.h"
namespace fastdeploy {
struct CheckShape {
template <typename T>
void operator()(const std::vector<T>& lhs, const std::vector<T>& rhs) {
ASSERT_EQ(lhs.size(), rhs.size());
for (size_t i = 0; i < lhs.size(); ++i) {
ASSERT_EQ(lhs[i], rhs[i]);
}
}
};
struct CheckData {
template <typename T>
void operator()(const T* lhs_ptr, const T* rhs_ptr, int num) {
for (int i = 0; i < num; ++i) {
ASSERT_EQ(lhs_ptr[i], rhs_ptr[i]);
}
}
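// The non-template overloads below win overload resolution for float/double,
// so floating-point buffers are compared with gtest's tolerant
// ASSERT_FLOAT_EQ / ASSERT_DOUBLE_EQ instead of exact equality.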
void operator()(const float* lhs_ptr, const float* rhs_ptr, int num) {
for (int i = 0; i < num; ++i) {
ASSERT_FLOAT_EQ(lhs_ptr[i], rhs_ptr[i]);
}
}
void operator()(const double* lhs_ptr, const double* rhs_ptr, int num) {
for (int i = 0; i < num; ++i) {
ASSERT_DOUBLE_EQ(lhs_ptr[i], rhs_ptr[i]);
}
}
};
} // namespace fastdeploy

tests/test_reduce.cc (new file, +286)

@@ -0,0 +1,286 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "fastdeploy/core/fd_tensor.h"
#include "fastdeploy/function/reduce.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "gtest_utils.h"
namespace fastdeploy {
#ifdef ENABLE_FDTENSOR_FUNC
TEST(fastdeploy, reduce_max) {
FDTensor input, output;
CheckShape check_shape;
CheckData check_data;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
std::vector<int> expected_result_axis0 = {7, 4, 5};
std::vector<int> expected_result_axis1 = {4, 7};
std::vector<int> expected_result_noaxis = {7};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// keep_dim = true, reduce_all = false
Max(input, &output, {0}, true);
check_shape(output.shape, {1, 3});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis0.data(), expected_result_axis0.size());
// keep_dim = false, reduce_all = false
Max(input, &output, {1});
check_shape(output.shape, {2});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis1.data(), expected_result_axis1.size());
// keep_dim = false, reduce_all = true
Max(input, &output, {1}, false, true);
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
// test 1-D tensor
input.shape = {6};
Max(input, &output, {0});
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
}
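To make the expected values above easy to verify by hand, here is a naive reference for the axis-0 case on a row-major {rows, cols} buffer. This is an illustration only, not the Eigen-based implementation this PR adds; with keep_dim = true the reduced axis would additionally be retained with extent 1 ({1, 3} instead of {3}).

#include <algorithm>
#include <cstdio>
#include <vector>

// Max over axis 0: out[c] = max over r of x[r * cols + c].
std::vector<int> NaiveMaxAxis0(const std::vector<int>& x, int rows, int cols) {
  std::vector<int> out(x.begin(), x.begin() + cols);  // start from row 0
  for (int r = 1; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      out[c] = std::max(out[c], x[r * cols + c]);
  return out;
}

int main() {
  std::vector<int> in = {2, 4, 3, 7, 1, 5};                     // shape {2, 3}
  for (int v : NaiveMaxAxis0(in, 2, 3)) std::printf("%d ", v);  // 7 4 5
}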
TEST(fastdeploy, reduce_min) {
FDTensor input, output;
CheckShape check_shape;
CheckData check_data;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
std::vector<int> expected_result_axis0 = {2, 1, 3};
std::vector<int> expected_result_axis1 = {2, 1};
std::vector<int> expected_result_noaxis = {1};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// keep_dim = true, reduce_all = false
Min(input, &output, {0}, true);
check_shape(output.shape, {1, 3});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis0.data(), expected_result_axis0.size());
// keep_dim = false, reduce_all = false
Min(input, &output, {1});
check_shape(output.shape, {2});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis1.data(), expected_result_axis1.size());
// keep_dim = false, reduce_all = true
Min(input, &output, {1}, false, true);
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
// test 1-D tensor
input.shape = {6};
Min(input, &output, {0});
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
}
TEST(fastdeploy, reduce_sum) {
FDTensor input, output;
CheckShape check_shape;
CheckData check_data;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
std::vector<int> expected_result_axis0 = {9, 5, 8};
std::vector<int> expected_result_axis1 = {9, 13};
std::vector<int> expected_result_noaxis = {22};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// keep_dim = true, reduce_all = false
Sum(input, &output, {0}, true);
check_shape(output.shape, {1, 3});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis0.data(), expected_result_axis0.size());
// keep_dim = false, reduce_all = false
Sum(input, &output, {1});
check_shape(output.shape, {2});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis1.data(), expected_result_axis1.size());
// keep_dim = false, reduce_all = true
Sum(input, &output, {1}, false, true);
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
// test 1-D tensor
input.shape = {6};
Sum(input, &output, {0});
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
}
TEST(fastdeploy, reduce_prod) {
FDTensor input, output;
CheckShape check_shape;
CheckData check_data;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
std::vector<int> expected_result_axis0 = {14, 4, 15};
std::vector<int> expected_result_axis1 = {24, 35};
std::vector<int> expected_result_noaxis = {840};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// keep_dim = true, reduce_all = false
Prod(input, &output, {0}, true);
check_shape(output.shape, {1, 3});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis0.data(), expected_result_axis0.size());
// keep_dim = false, reduce_all = false
Prod(input, &output, {1});
check_shape(output.shape, {2});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis1.data(), expected_result_axis1.size());
// keep_dim = false, reduce_all = true
Prod(input, &output, {1}, false, true);
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
// test 1-D tensor
input.shape = {6};
Prod(input, &output, {0});
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
}
TEST(fastdeploy, reduce_mean) {
FDTensor input, output;
CheckShape check_shape;
CheckData check_data;
std::vector<int> inputs = {2, 4, 3, 7, 1, 5};
std::vector<int> expected_result_axis0 = {4, 2, 4};
std::vector<int> expected_result_axis1 = {3, 4};
std::vector<int> expected_result_noaxis = {3};
input.SetExternalData({2, 3}, FDDataType::INT32, inputs.data());
// keep_dim = true, reduce_all = false
Mean(input, &output, {0}, true);
check_shape(output.shape, {1, 3});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis0.data(), expected_result_axis0.size());
// keep_dim = false, reduce_all = false
Mean(input, &output, {1});
check_shape(output.shape, {2});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_axis1.data(), expected_result_axis1.size());
// keep_dim = false, reduce_all = true
Mean(input, &output, {1}, false, true);
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
// test 1-D tensor
input.shape = {6};
Mean(input, &output, {0});
check_shape(output.shape, {1});
check_data(reinterpret_cast<const int*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
}
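Note that these expected values only hold if Mean on an INT32 tensor truncates like C++ integer division; that is exactly what the fixtures above encode. A quick arithmetic check (illustration only):

#include <cassert>

int main() {
  assert((2 + 4 + 3) / 3 == 3);              // row 0 mean
  assert((7 + 1 + 5) / 3 == 4);              // row 1 mean: 13/3 truncates
  assert((2 + 4 + 3 + 7 + 1 + 5) / 6 == 3);  // overall mean: 22/6 truncates
  return 0;
}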
TEST(fastdeploy, reduce_all) {
FDTensor input, output;
CheckShape check_shape;
CheckData check_data;
std::array<bool, 6> inputs = {false, false, true, true, false, true};
std::array<bool, 3> expected_result_axis0 = {false, false, true};
std::array<bool, 2> expected_result_axis1 = {false, false};
std::array<bool, 1> expected_result_noaxis = {false};
input.SetExternalData({2, 3}, FDDataType::BOOL, inputs.data());
// keep_dim = true, reduce_all = false
All(input, &output, {0}, true);
check_shape(output.shape, {1, 3});
check_data(reinterpret_cast<const bool*>(output.Data()),
expected_result_axis0.data(), expected_result_axis0.size());
// keep_dim = false, reduce_all = false
All(input, &output, {1});
check_shape(output.shape, {2});
check_data(reinterpret_cast<const bool*>(output.Data()),
expected_result_axis1.data(), expected_result_axis1.size());
// keep_dim = false, reduce_all = true
All(input, &output, {1}, false, true);
check_shape(output.shape, {1});
check_data(reinterpret_cast<const bool*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
// test 1-D tensor
input.shape = {6};
All(input, &output, {0});
check_shape(output.shape, {1});
check_data(reinterpret_cast<const bool*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
}
TEST(fastdeploy, reduce_any) {
FDTensor input, output;
CheckShape check_shape;
CheckData check_data;
std::array<bool, 6> inputs = {false, false, true, true, false, true};
std::array<bool, 3> expected_result_axis0 = {true, false, true};
std::array<bool, 2> expected_result_axis1 = {true, true};
std::array<bool, 1> expected_result_noaxis = {true};
input.SetExternalData({2, 3}, FDDataType::BOOL, inputs.data());
// keep_dim = true, reduce_all = false
Any(input, &output, {0}, true);
check_shape(output.shape, {1, 3});
check_data(reinterpret_cast<const bool*>(output.Data()),
expected_result_axis0.data(), expected_result_axis0.size());
// keep_dim = false, reduce_all = false
Any(input, &output, {1});
check_shape(output.shape, {2});
check_data(reinterpret_cast<const bool*>(output.Data()),
expected_result_axis1.data(), expected_result_axis1.size());
// keep_dim = false, reduce_all = true
Any(input, &output, {1}, false, true);
check_shape(output.shape, {1});
check_data(reinterpret_cast<const bool*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
// test 1-D tensor
input.shape = {6};
Any(input, &output, {0});
check_shape(output.shape, {1});
check_data(reinterpret_cast<const bool*>(output.Data()),
expected_result_noaxis.data(), expected_result_noaxis.size());
}
#endif  // ENABLE_FDTENSOR_FUNC
} // namespace fastdeploy
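From the call sites in these tests, the reduce functions appear to share a declaration along the following lines. This is a reconstruction for the reader's convenience; the parameter names and the dims element type are guesses, and fastdeploy/function/reduce.h remains the authoritative source:

// Hypothetical signature inferred from the tests above.
void Max(const FDTensor& x, FDTensor* out,
         const std::vector<int64_t>& dims,
         bool keep_dim = false, bool reduce_all = false);
// Min, Sum, Prod, Mean, All and Any follow the same pattern.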