[Functions] Add +-*/ operators and reshape for FDTensor (#655)

* Add +-*/ functions * Add same dims test case for operations * add broadcast 0 * Add broadcast dim2 testcase * Add broadcast dim3 and dim4 testcase * Add +-*/ operators * Add mixed operation * refresh code style * Add reshape op * update code style
2025-10-06 17:17:14 +08:00 · 2022-11-23 11:34:02 +08:00
parent c11bfb8341
commit de98163efa
12 changed files with 1163 additions and 126 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -149,7 +149,7 @@
 # SpaceBeforeRangeBasedForLoopColon: true
 # SpaceInEmptyBlock: false
 # SpaceInEmptyParentheses: false
-# SpacesBeforeTrailingComments: 1
+# SpacesBeforeTrailingComments: 2
 # SpacesInAngles:  Never
 # SpacesInConditionalStatement: false
 # SpacesInContainerLiterals: true
--- a/fastdeploy/core/fd_tensor.cc
+++ b/fastdeploy/core/fd_tensor.cc
@@ -11,11 +11,11 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <cstring>
 #include "fastdeploy/core/fd_tensor.h"
 #include "fastdeploy/core/float16.h"
 #include "fastdeploy/utils/utils.h"
 #include <algorithm>
 #include <cstring>
 #ifdef WITH_GPU
 #include <cuda_runtime_api.h>
 #endif
@@ -151,9 +151,63 @@ void FDTensor::Resize(const std::vector<int64_t>& new_shape,
  shape.assign(new_shape.begin(), new_shape.end());
 }
 // Changes the logical shape of the tensor in place. Only `shape` is
 // reassigned; the data buffer is not touched.
 //
 // Each entry of `new_shape` is interpreted as:
 //   -1  the extent is inferred from the element count (allowed once),
 //    0  the extent is copied from the current shape at the same index,
 //   >0  the extent is used as given.
 //
 // Every violated precondition is reported through FDASSERT. The only value
 // this function ever returns is true.
 bool FDTensor::Reshape(const std::vector<int64_t>& new_shape) {
  int numel = Numel();
  const int64_t unk_dim_val = -1;
  const int64_t copy_dim_val = 0;
  std::vector<int64_t> output_shape(new_shape.size(), 0);
  // Product of all resolved extents. The -1 placeholder is multiplied in too,
  // so the product is negative whenever an unknown dimension is present.
  int64_t capacity = 1;
  int unk_dim_idx = -1;
  for (size_t i = 0; i < new_shape.size(); ++i) {
    // The diagnostics print the index with %d, so hand them a real int.
    const int idx = static_cast<int>(i);
    if (new_shape[i] == unk_dim_val) {
      FDASSERT(unk_dim_idx == -1,
               "Only one dimension value of 'shape' in ReshapeOp can "
               "be -1. But received shape = [%s], shape[%d] is also -1.",
               Str(new_shape).c_str(), idx);
      unk_dim_idx = idx;
    } else if (new_shape[i] == copy_dim_val) {
      FDASSERT(i < shape.size(),
               "The index of 0 in `shape` must be less than "
               "the input tensor X's dimensions. "
               "But received shape = [%s], shape[%d] = 0, X's shape = [%s], "
               "X's dimensions = %d.",
               Str(new_shape).c_str(), idx, Str(shape).c_str(),
               static_cast<int>(shape.size()));
    } else {
      FDASSERT(new_shape[i] > 0,
               "Each dimension value of 'shape' in ReshapeOp must not "
               "be negative except one unknown dimension. "
               "But received  shape = [%s], shape[%d] = %lld.",
               Str(new_shape).c_str(), idx,
               static_cast<long long>(new_shape[i]));
    }
    // A 0 placeholder resolves to the current extent at the same position.
    const int64_t resolved = new_shape[i] ? new_shape[i] : shape[i];
    capacity *= resolved;
    output_shape[i] = resolved;
  }
  if (unk_dim_idx != -1) {
    // A copied extent of 0 makes the known capacity 0; dividing by it below
    // would be undefined, so reject that request explicitly.
    FDASSERT(capacity != 0,
             "The 'shape' attribute in ReshapeOp is invalid. "
             "The unknown dimension cannot be inferred because the known "
             "capacity of 'shape' is 0. "
             "But received X's shape = [%s], 'shape' is [%s].",
             Str(shape).c_str(), Str(new_shape).c_str());
    // `capacity` carries the sign of the -1 placeholder, hence `-numel`.
    output_shape[unk_dim_idx] = -numel / capacity;
    FDASSERT(output_shape[unk_dim_idx] * capacity == -numel,
             "The 'shape' attribute in ReshapeOp is invalid. "
             "The input tensor X'size must be divisible by known "
             "capacity of 'shape'. "
             "But received X's shape = [%s], X's size = %d, "
             "'shape' is [%s], known capacity of 'shape' is %lld.",
             Str(shape).c_str(), numel, Str(new_shape).c_str(),
             static_cast<long long>(capacity));
  } else {
    // Report the requested shape (not the current one) as 'shape'.
    FDASSERT(numel == capacity,
             "The 'shape' in ReshapeOp is invalid. "
             "The input tensor X'size must be equal to the capacity of "
             "'shape'. "
             "But received X's shape = [%s], X's size = %d, 'shape' is "
             "[%s], the capacity of 'shape' is %lld.",
             Str(shape).c_str(), numel, Str(new_shape).c_str(),
             static_cast<long long>(capacity));
  }
  shape = output_shape;
  return true;
 }
 template <typename T>
-void CalculateStatisInfo(const void* src_ptr, int size, double* mean, double* max,
+void CalculateStatisInfo(const void* src_ptr, int size, double* mean,
-                         double* min) {
+                         double* max, double* min) {
  const T* ptr = static_cast<const T*>(src_ptr);
  *mean = 0;
  *max = -99999999;
@@ -213,8 +267,7 @@ bool FDTensor::ReallocFn(size_t nbytes) {
    }
    return buffer_ != nullptr;
 #else
-    FDASSERT(false,
+    FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under "
             "The FastDeploy FDTensor allocator didn't compile under "
                    "-DWITH_GPU=ON,"
                    "so this is an unexpected problem happend.");
 #endif
@@ -230,8 +283,7 @@ bool FDTensor::ReallocFn(size_t nbytes) {
      }
      return buffer_ != nullptr;
 #else
-      FDASSERT(false,
+      FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under "
               "The FastDeploy FDTensor allocator didn't compile under "
                      "-DWITH_GPU=ON,"
                      "so this is an unexpected problem happend.");
 #endif
@@ -242,7 +294,8 @@ bool FDTensor::ReallocFn(size_t nbytes) {
 }
 void FDTensor::FreeFn() {
-  if (external_data_ptr != nullptr) external_data_ptr = nullptr;
+  if (external_data_ptr != nullptr)
    external_data_ptr = nullptr;
  if (buffer_ != nullptr) {
    if (device == Device::GPU) {
 #ifdef WITH_GPU
@@ -293,11 +346,8 @@ void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes,
 FDTensor::FDTensor(const std::string& tensor_name) { name = tensor_name; }
 FDTensor::FDTensor(const FDTensor& other)
-    : shape(other.shape),
+    : shape(other.shape), name(other.name), dtype(other.dtype),
-      name(other.name),
+      device(other.device), external_data_ptr(other.external_data_ptr) {
      dtype(other.dtype),
      device(other.device),
      external_data_ptr(other.external_data_ptr) {
  // Copy buffer
  if (other.buffer_ == nullptr) {
    buffer_ = nullptr;
@@ -310,12 +360,9 @@ FDTensor::FDTensor(const FDTensor& other)
 }
 FDTensor::FDTensor(FDTensor&& other)
-    : buffer_(other.buffer_),
+    : buffer_(other.buffer_), shape(std::move(other.shape)),
-      shape(std::move(other.shape)),
+      name(std::move(other.name)), dtype(other.dtype),
-      name(std::move(other.name)),
+      external_data_ptr(other.external_data_ptr), device(other.device) {
      dtype(other.dtype),
      external_data_ptr(other.external_data_ptr),
      device(other.device) {
  other.name = "";
  // Note(zhoushunjie): Avoid double free.
  other.buffer_ = nullptr;
--- a/fastdeploy/core/fd_tensor.h
+++ b/fastdeploy/core/fd_tensor.h
@@ -57,9 +57,7 @@ struct FASTDEPLOY_DECL FDTensor {
  void* Data();
-  bool IsShared() {
+  bool IsShared() { return external_data_ptr != nullptr; }
    return external_data_ptr != nullptr;
  }
  void StopSharing();
@@ -116,6 +114,7 @@ struct FASTDEPLOY_DECL FDTensor {
              const FDDataType& data_type, const std::string& tensor_name = "",
              const Device& new_device = Device::CPU);
  bool Reshape(const std::vector<int64_t>& new_shape);
  // Debug function
  // Use this function to print shape, dtype, mean, max, min
  // prefix will also be printed as tag
--- a/fastdeploy/function/concat.cc
+++ b/fastdeploy/function/concat.cc
@@ -14,26 +14,17 @@
 #include "fastdeploy/function/concat.h"
 #include "fastdeploy/utils/utils.h"
 #include <cstring>
 #include <limits>
 #include <set>
 #include <sstream>
 #include "fastdeploy/utils/utils.h"
 namespace fastdeploy {
 namespace function {
 // Formats a shape as "[ d0 ,d1 ,... ]" for diagnostics.
 // An empty shape yields "[  ]" instead of reading a non-existent first entry.
 std::string Str(const std::vector<int64_t>& shape) {
  std::ostringstream oss;
  oss << "[ ";
  for (size_t i = 0; i < shape.size(); ++i) {
    if (i != 0) {
      oss << " ,";
    }
    oss << shape[i];
  }
  oss << " ]";
  return oss.str();
 }
-std::vector<int64_t> ComputeAndCheckConcatOutputShape(
+std::vector<int64_t>
-    const std::vector<FDTensor>& input, int axis) {
+ComputeAndCheckConcatOutputShape(const std::vector<FDTensor>& input, int axis) {
  const size_t n = input.size();
  auto out_dims = input[0].shape;
  size_t in_zero_dims_size = out_dims.size();
@@ -58,8 +49,7 @@ std::vector<int64_t> ComputeAndCheckConcatOutputShape(
  return out_dims;
 }
-template <typename T>
+template <typename T> struct ConcatFunctor {
 struct ConcatFunctor {
  void operator()(const std::vector<FDTensor>& input, int axis,
                  FDTensor* output) {
    size_t num = input.size();
--- a/fastdeploy/function/elementwise.cc
+++ b/fastdeploy/function/elementwise.cc
@@ -0,0 +1,75 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "fastdeploy/function/elementwise.h"
 #include "fastdeploy/function/eigen.h"
 #include "fastdeploy/function/elementwise_base.h"
 #include "fastdeploy/function/elementwise_functor.h"
 #include "fastdeploy/utils/utils.h"
 #include <algorithm>
 namespace fastdeploy {
 namespace function {
 DEFINE_ELEMENTWISE_OP(Add);
 DEFINE_ELEMENTWISE_OP(Multiply);
 DEFINE_ELEMENTWISE_OP(Subtract);
 DEFINE_ELEMENTWISE_OP(Divide);
 void Add(const FDTensor& x, const FDTensor& y, FDTensor* out) {
  FD_VISIT_ALL_TYPES(x.dtype, "AddRawKernel",
                     ([&] { AddRawKernel<data_t>()(x, y, -1, out); }));
 }
 FDTensor operator+(const FDTensor& x, const FDTensor& y) {
  FDTensor out;
  Add(x, y, &out);
  return out;
 }
 void Subtract(const FDTensor& x, const FDTensor& y, FDTensor* out) {
  FD_VISIT_ALL_TYPES(x.dtype, "SubtractRawKernel",
                     ([&] { SubtractRawKernel<data_t>()(x, y, -1, out); }));
 }
 FDTensor operator-(const FDTensor& x, const FDTensor& y) {
  FDTensor out;
  Subtract(x, y, &out);
  return out;
 }
 void Multiply(const FDTensor& x, const FDTensor& y, FDTensor* out) {
  FD_VISIT_ALL_TYPES(x.dtype, "MultiplyRawKernel",
                     ([&] { MultiplyRawKernel<data_t>()(x, y, -1, out); }));
 }
 FDTensor operator*(const FDTensor& x, const FDTensor& y) {
  FDTensor out;
  Multiply(x, y, &out);
  return out;
 }
 void Divide(const FDTensor& x, const FDTensor& y, FDTensor* out) {
  FD_VISIT_ALL_TYPES(x.dtype, "DivideRawKernel",
                     ([&] { DivideRawKernel<data_t>()(x, y, -1, out); }));
 }
 FDTensor operator/(const FDTensor& x, const FDTensor& y) {
  FDTensor out;
  Divide(x, y, &out);
  return out;
 }
 }  // namespace function
 }  // namespace fastdeploy
--- a/fastdeploy/function/elementwise.h
+++ b/fastdeploy/function/elementwise.h
@@ -0,0 +1,60 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "fastdeploy/core/fd_tensor.h"
 namespace fastdeploy {
 namespace function {
 /** Execute the add operation for input FDTensors. *out = x + y.
    The operands may have different shapes, in which case they are broadcast
    against each other. The output tensor is allocated by the call.
    @param x The input tensor.
    @param y The input tensor.
    @param out The output tensor which stores the result.
 */
 FASTDEPLOY_DECL void Add(const FDTensor& x, const FDTensor& y, FDTensor* out);
 /** Operator form of Add: returns x + y as a new tensor. */
 FASTDEPLOY_DECL FDTensor operator+(const FDTensor& x, const FDTensor& y);
 /** Execute the subtract operation for input FDTensors.  *out = x - y.
    The operands may have different shapes, in which case they are broadcast
    against each other. The output tensor is allocated by the call.
    @param x The input tensor.
    @param y The input tensor.
    @param out The output tensor which stores the result.
 */
 FASTDEPLOY_DECL void Subtract(const FDTensor& x, const FDTensor& y,
                               FDTensor* out);
 /** Operator form of Subtract: returns x - y as a new tensor. */
 FASTDEPLOY_DECL FDTensor operator-(const FDTensor& x, const FDTensor& y);
 /** Execute the multiply operation for input FDTensors.  *out = x * y.
    The operands may have different shapes, in which case they are broadcast
    against each other. The output tensor is allocated by the call.
    @param x The input tensor.
    @param y The input tensor.
    @param out The output tensor which stores the result.
 */
 FASTDEPLOY_DECL void Multiply(const FDTensor& x, const FDTensor& y,
                               FDTensor* out);
 /** Operator form of Multiply: returns x * y as a new tensor. */
 FASTDEPLOY_DECL FDTensor operator*(const FDTensor& x, const FDTensor& y);
 /** Execute the divide operation for input FDTensors.  *out = x / y.
    The operands may have different shapes, in which case they are broadcast
    against each other. The output tensor is allocated by the call.
    @param x The input tensor.
    @param y The input tensor.
    @param out The output tensor which stores the result.
 */
 FASTDEPLOY_DECL void Divide(const FDTensor& x, const FDTensor& y,
                             FDTensor* out);
 /** Operator form of Divide: returns x / y as a new tensor. */
 FASTDEPLOY_DECL FDTensor operator/(const FDTensor& x, const FDTensor& y);
 }  // namespace function
 }  // namespace fastdeploy
--- a/fastdeploy/function/elementwise_base.h
+++ b/fastdeploy/function/elementwise_base.h
@@ -0,0 +1,263 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <algorithm>
 #include "fastdeploy/core/fd_tensor.h"
 #include "fastdeploy/function/eigen.h"
 namespace fastdeploy {
 namespace function {
 // Defines `<name>RawKernel<T>`, a functor that computes `x <name> y` into
 // `out`. Equal shapes go through SameDimsElementwiseCompute; otherwise the
 // broadcasting ElementwiseCompute is used, with the Inverse functor when x has
 // fewer dimensions than y.
 #define DEFINE_ELEMENTWISE_OP(name)                                            \
  template <typename T> struct name##RawKernel {                               \
    void operator()(const FDTensor& x, const FDTensor& y, int axis,            \
                    FDTensor* out) {                                           \
      const auto lhs_shape = x.Shape();                                        \
      const auto rhs_shape = y.Shape();                                        \
      if (lhs_shape == rhs_shape) {                                            \
        SameDimsElementwiseCompute<SameDims##name##Functor<T>>()(x, y, out);   \
        return;                                                                \
      }                                                                        \
      if (lhs_shape.size() < rhs_shape.size()) {                               \
        ElementwiseCompute<Inverse##name##Functor<T>, T>(                      \
            x, y, axis, Inverse##name##Functor<T>(), out);                     \
        return;                                                                \
      }                                                                        \
      ElementwiseCompute<name##Functor<T>, T>(x, y, axis, name##Functor<T>(),  \
                                              out);                            \
    }                                                                          \
  }
 // Splits `x_dims` around the span that `y_dims` covers when aligned at `axis`:
 //   *pre  = product of x_dims[0, axis)
 //   *n    = product of the matching extents of y_dims
 //   *post = product of the x_dims entries after the covered span
 // On the first position where the extents differ, one of the two must be 1
 // (checked with FDASSERT); *is_run_common_broadcast is then set to 1 and the
 // function returns at once, leaving *n partially accumulated and *post at 1.
 inline void GetMidDims(const std::vector<int64_t>& x_dims,
                        const std::vector<int64_t>& y_dims, const int axis,
                        int* pre, int* n, int* post,
                        int* is_run_common_broadcast) {
  *pre = 1;
  *n = 1;
  *post = 1;
  *is_run_common_broadcast = 0;

  int lead = 0;
  while (lead < axis) {
    (*pre) *= x_dims[lead];
    ++lead;
  }

  for (int k = 0; k < y_dims.size(); ++k) {
    if (x_dims[k + axis] == y_dims[k]) {
      (*n) *= y_dims[k];
      continue;
    }
    FDASSERT(y_dims[k] == 1 || x_dims[k + axis] == 1,
             "Broadcast dimension mismatch. Operands "
             "could not be broadcast together with the shape of "
             "X = [%s] and the shape of Y = [%s]. Received [%d] "
             "in X is not equal to [%d] in Y.",
             Str(x_dims).c_str(), Str(y_dims).c_str(), x_dims[k + axis],
             y_dims[k]);
    *is_run_common_broadcast = 1;
    return;
  }

  int tail = axis + y_dims.size();
  while (tail < x_dims.size()) {
    (*post) *= x_dims[tail];
    ++tail;
  }
 }
 inline std::vector<int64_t>
 TrimTrailingSingularDims(const std::vector<int64_t>& dims) {
  // Remove trailing dimensions of size 1 for y
  auto actual_dims_size = dims.size();
  for (; actual_dims_size != 0; --actual_dims_size) {
    if (dims[actual_dims_size - 1] != 1)
      break;
  }
  if (actual_dims_size == dims.size())
    return dims;
  std::vector<int64_t> trim_dims;
  trim_dims.resize(actual_dims_size);
  for (int i = 0; i < actual_dims_size; ++i) {
    trim_dims[i] = dims[i];
  }
  return trim_dims;
 }
 // Converts the multi-dimensional position `index_array` into a flat offset
 // for an operand with extents `x_dims_array`. Axes whose extent is not
 // greater than 1 are skipped, so they contribute nothing to the offset.
 inline int GetElementwiseIndex(const int64_t* x_dims_array, const int max_dim,
                                const int64_t* index_array) {
  int flat = 0;
  for (int d = 0; d < max_dim; ++d) {
    if (x_dims_array[d] <= 1) {
      continue;
    }
    flat = flat * x_dims_array[d] + index_array[d];
  }
  return flat;
 }
 // Advances `index_array` to the next position of an output with extents
 // `out_dims_array`: the last axis is incremented first and a carry moves to
 // the previous axis whenever an axis reaches its extent.
 inline void UpdateElementwiseIndexArray(const int64_t* out_dims_array,
                                         const int max_dim,
                                         int64_t* index_array) {
  int d = max_dim - 1;
  while (d >= 0) {
    ++index_array[d];
    if (index_array[d] < out_dims_array[d]) {
      return;  // no carry, done
    }
    index_array[d] -= out_dims_array[d];
    --d;
  }
 }
 // Expands both operand shapes to `max_dim` entries and derives the broadcast
 // output shape.
 //
 // The operand with more dimensions is copied unchanged. The other one is
 // placed at offset `axis` and every remaining position is filled with 1.
 // `x_dims_array`, `y_dims_array` and `out_dims_array` must each provide room
 // for `max_dim` entries.
 //
 // For every position the two extents must either match or one of them must
 // be <= 1 (checked with FDASSERT). The output extent is the larger of the
 // two, or -1 when neither extent is greater than 1 and they are not both 1.
 inline void GetBroadcastDimsArrays(const std::vector<int64_t>& x_dims,
                                    const std::vector<int64_t>& y_dims,
                                    int64_t* x_dims_array, int64_t* y_dims_array,
                                    int64_t* out_dims_array, const int max_dim,
                                    const int axis) {
  FDASSERT(axis >= 0,
           "Axis should be great than or equal to 0, but received axis is %d.",
           axis);
  FDASSERT(axis < max_dim,
           "Axis should be less than %d, but received axis is %d.", max_dim,
           axis);
  if (x_dims.size() > y_dims.size()) {
    std::fill(y_dims_array, y_dims_array + axis, 1);
    if (axis + y_dims.size() < max_dim) {
      std::fill(y_dims_array + axis + y_dims.size(), y_dims_array + max_dim, 1);
    }
    std::copy(x_dims.data(), x_dims.data() + x_dims.size(), x_dims_array);
    std::copy(y_dims.data(), y_dims.data() + y_dims.size(),
              y_dims_array + axis);
  } else {
    std::fill(x_dims_array, x_dims_array + axis, 1);
    if (axis + x_dims.size() < max_dim) {
      std::fill(x_dims_array + axis + x_dims.size(), x_dims_array + max_dim, 1);
    }
    std::copy(x_dims.data(), x_dims.data() + x_dims.size(),
              x_dims_array + axis);
    std::copy(y_dims.data(), y_dims.data() + y_dims.size(), y_dims_array);
  }
  for (int i = 0; i < max_dim; i++) {
    // Report the padded extents that were actually compared. Indexing the
    // original vectors with `i` (or `i + axis`) can run past their end.
    FDASSERT(x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 ||
                 y_dims_array[i] <= 1,
             "Broadcast dimension mismatch. Operands "
             "could not be broadcast together with the shape of "
             "X = [%s] and the shape of Y = [%s]. Received [%lld] "
             "in X is not equal to [%lld] in Y.",
             Str(x_dims).c_str(), Str(y_dims).c_str(),
             static_cast<long long>(x_dims_array[i]),
             static_cast<long long>(y_dims_array[i]));
    if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) ||
        (x_dims_array[i] == 1 && y_dims_array[i] == 1)) {
      out_dims_array[i] = (std::max)(x_dims_array[i], y_dims_array[i]);
    } else {
      out_dims_array[i] = -1;
    }
  }
 }
 // Evaluates `func` for every element of the broadcast output on the CPU.
 //
 // `x_dims_array`, `y_dims_array` and `out_dims_array` hold `max_dim` extents
 // each. For every output position the matching flat offsets into x and y are
 // computed with GetElementwiseIndex. When `is_xsize_larger` is false the
 // operands are handed to `func` in swapped order (y first, then x).
 // `z` must already provide storage for the whole output.
 template <typename Functor, typename T, typename OutType = T>
 void CommonForwardBroadcastCPU(const FDTensor& x, const FDTensor& y,
                                FDTensor* z, int64_t* x_dims_array,
                                int64_t* y_dims_array, int64_t* out_dims_array,
                                int max_dim, Functor func,
                                const bool is_xsize_larger = true) {
  std::vector<int64_t> index_array(max_dim, 0);
  const T* x_data = reinterpret_cast<const T*>(x.Data());
  const T* y_data = reinterpret_cast<const T*>(y.Data());
  FDASSERT(x_data != nullptr, "The input X should not be empty.");
  FDASSERT(y_data != nullptr, "The input Y should not be empty.");
  OutType* out_data = reinterpret_cast<OutType*>(z->Data());
  // Seed with a 64-bit one so the product is accumulated in int64_t rather
  // than being narrowed to int after every multiplication.
  const int64_t out_size =
      std::accumulate(out_dims_array, out_dims_array + max_dim, int64_t{1},
                      std::multiplies<int64_t>());
  int x_index, y_index;
  for (int64_t out_index = 0; out_index < out_size; ++out_index) {
    x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data());
    y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data());
    if (is_xsize_larger) {
      out_data[out_index] = func(x_data[x_index], y_data[y_index]);
    } else {
      out_data[out_index] = func(y_data[y_index], x_data[x_index]);
    }
    UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data());
  }
 }
 // Broadcasting entry point: resolves `axis`, expands both shapes to a common
 // rank, allocates `z` with the broadcast shape and then runs the CPU kernel.
 // An `axis` of -1 is replaced by the absolute rank difference of the operands.
 template <typename Functor, typename T, typename OutType = T>
 void CommonElementwiseBroadcastForward(const FDTensor& x, const FDTensor& y,
                                        FDTensor* z,
                                        const std::vector<int64_t>& x_dims,
                                        const std::vector<int64_t>& y_dims,
                                        Functor func, int axis,
                                        const bool is_xsize_larger = true) {
  const int x_rank = x_dims.size();
  const int y_rank = y_dims.size();
  const int max_dim = (std::max)(x_rank, y_rank);
  if (axis == -1) {
    axis = std::abs(x_rank - y_rank);
  }
  FDASSERT(axis >= 0,
           "Axis should be great than or equal to 0, but received axis is %d.",
           axis);
  FDASSERT(axis < max_dim,
           "Axis should be less than %d, but received axis is %d.", max_dim,
           axis);

  // Shapes of x, y and the output, all expanded to max_dim entries.
  std::vector<int64_t> x_expanded(max_dim);
  std::vector<int64_t> y_expanded(max_dim);
  std::vector<int64_t> out_expanded(max_dim);
  GetBroadcastDimsArrays(x_dims, y_dims, x_expanded.data(), y_expanded.data(),
                         out_expanded.data(), max_dim, axis);

  z->Allocate(out_expanded, TypeToDataType<OutType>::dtype);
  CommonForwardBroadcastCPU<Functor, T, OutType>(
      x, y, z, x_expanded.data(), y_expanded.data(), out_expanded.data(),
      max_dim, func, is_xsize_larger);
 }
 // Computes `func` over x and y with broadcasting and stores the result in z.
 // The operand with more dimensions is treated as the "larger" one. The
 // smaller one has its trailing 1-extents trimmed and is checked against the
 // larger one through GetMidDims before the common broadcast kernel runs.
 template <typename Functor, typename T, typename OutType = T>
 void ElementwiseCompute(const FDTensor& x, const FDTensor& y, int axis,
                         Functor func, FDTensor* z) {
  auto x_dims = x.Shape();
  auto y_dims = y.Shape();
  const bool is_xsize_larger = !(x_dims.size() < y_dims.size());
  const int max_dim = is_xsize_larger ? x_dims.size() : y_dims.size();
  int diff_size = x_dims.size() - y_dims.size();
  if (axis == -1) {
    axis = std::abs(diff_size);
  }
  FDASSERT(axis >= 0,
           "Axis should be great than or equal to 0, but received axis is %d.",
           axis);
  FDASSERT(axis < max_dim,
           "Axis should be less than %d, but received axis is %d.", max_dim,
           axis);

  const auto& larger_dims = is_xsize_larger ? x_dims : y_dims;
  const auto& smaller_dims = is_xsize_larger ? y_dims : x_dims;
  auto smaller_trimmed = TrimTrailingSingularDims(smaller_dims);
  // When nothing is left after trimming, the smaller operand is aligned
  // behind the last axis of the larger one.
  int axis_trim = axis;
  if (smaller_trimmed.size() == 0) {
    axis_trim = larger_dims.size();
  }
  int pre, n, post, is_run_common_broadcast;
  GetMidDims(larger_dims, smaller_trimmed, axis_trim, &pre, &n, &post,
             &is_run_common_broadcast);

  // special case for common implementation.
  // case 1: x=[2,3,1,5], y=[2,1,4,1]
  // case 2: x=[2,3,4], y=[1,1,4]
  CommonElementwiseBroadcastForward<Functor, T, OutType>(
      x, y, z, x_dims, y_dims, func, axis, is_xsize_larger);
 }
 }  // namespace function
 }  // namespace fastdeploy
--- a/fastdeploy/function/elementwise_functor.h
+++ b/fastdeploy/function/elementwise_functor.h
@@ -0,0 +1,126 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "fastdeploy/function/eigen.h"
 #include "fastdeploy/function/elementwise.h"
 #include "fastdeploy/function/elementwise_base.h"
 #include <algorithm>
 namespace fastdeploy {
 namespace function {
 // Allocates `z` with the shape and dtype of `x`, then lets `Functor` compute
 // the result for operands of identical shape.
 template <typename Functor> struct SameDimsElementwiseCompute {
  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
    z->Allocate(x.Shape(), x.Dtype());
    Functor()(x, y, z);
  }
 };
 // z = x + y for operands of identical shape, evaluated through Eigen on the
 // flattened tensors.
 template <typename T> struct SameDimsAddFunctor {
  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
    const auto& dev = *EigenDeviceWrapper::GetInstance()->GetDevice();
    auto eigen_x = EigenVector<T>::Flatten(x);
    auto eigen_y = EigenVector<T>::Flatten(y);
    auto eigen_z = EigenVector<T>::Flatten(*z);
    eigen_z.device(dev) = eigen_x + eigen_y;
  }
 };
 // z = x - y for operands of identical shape.
 template <typename T> struct SameDimsSubtractFunctor {
  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
    const auto& dev = *EigenDeviceWrapper::GetInstance()->GetDevice();
    auto eigen_x = EigenVector<T>::Flatten(x);
    auto eigen_y = EigenVector<T>::Flatten(y);
    auto eigen_z = EigenVector<T>::Flatten(*z);
    eigen_z.device(dev) = eigen_x - eigen_y;
  }
 };
 // z = x * y for operands of identical shape.
 template <typename T> struct SameDimsMultiplyFunctor {
  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
    const auto& dev = *EigenDeviceWrapper::GetInstance()->GetDevice();
    auto eigen_x = EigenVector<T>::Flatten(x);
    auto eigen_y = EigenVector<T>::Flatten(y);
    auto eigen_z = EigenVector<T>::Flatten(*z);
    eigen_z.device(dev) = eigen_x * eigen_y;
  }
 };
 // z = x / y for operands of identical shape.
 template <typename T> struct SameDimsDivideFunctor {
  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
    const auto& dev = *EigenDeviceWrapper::GetInstance()->GetDevice();
    auto eigen_x = EigenVector<T>::Flatten(x);
    auto eigen_y = EigenVector<T>::Flatten(y);
    auto eigen_z = EigenVector<T>::Flatten(*z);
    eigen_z.device(dev) = eigen_x / eigen_y;
  }
 };
 // Scalar functors used by the broadcasting path. Each Inverse* variant
 // applies the operation with its two arguments swapped.
 // Add
 template <typename T> struct AddFunctor {
  inline T operator()(const T a, const T b) const { return a + b; }
 };
 template <typename T> struct InverseAddFunctor {
  inline T operator()(const T a, const T b) const { return b + a; }
 };
 // Subtract
 template <typename T> struct SubtractFunctor {
  inline T operator()(const T a, const T b) const { return a - b; }
 };
 template <typename T> struct InverseSubtractFunctor {
  inline T operator()(const T a, const T b) const { return b - a; }
 };
 // Multiply
 template <typename T> struct MultiplyFunctor {
  inline T operator()(const T a, const T b) const { return a * b; }
 };
 // For bool the product is the logical AND.
 template <> struct MultiplyFunctor<bool> {
  inline bool operator()(const bool a, const bool b) const { return a && b; }
 };
 template <typename T> struct InverseMultiplyFunctor {
  inline T operator()(const T a, const T b) const { return b * a; }
 };
 template <> struct InverseMultiplyFunctor<bool> {
  inline bool operator()(const bool a, const bool b) const { return b && a; }
 };
 // Divide
 #define DIV_ERROR_INFO                                                         \
  "InvalidArgumentError: Integer division by zero encountered in "             \
  "(floor) divide. Please check the input value."
 template <typename T, typename Enable = void> struct DivideFunctor {
  inline T operator()(const T a, const T b) const { return a / b; }
 };
 template <typename T>
 struct DivideFunctor<
    T, typename std::enable_if<std::is_integral<T>::value>::type> {
  inline T operator()(const T a, const T b) const {
    // For integral types, check that the divisor is not zero.
    FDASSERT(b != 0, DIV_ERROR_INFO);
    return a / b;
  }
 };
 template <typename T, typename Enable = void> struct InverseDivideFunctor {
  inline T operator()(const T a, const T b) const { return b / a; }
 };
 // Integral counterpart of the specialization above. In the inverse form the
 // divisor is `a`, so that is the operand that must not be zero.
 template <typename T>
 struct InverseDivideFunctor<
    T, typename std::enable_if<std::is_integral<T>::value>::type> {
  inline T operator()(const T a, const T b) const {
    FDASSERT(a != 0, DIV_ERROR_INFO);
    return b / a;
  }
 };
 }  // namespace function
 }  // namespace fastdeploy
--- a/fastdeploy/utils/utils.cc
+++ b/fastdeploy/utils/utils.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "fastdeploy/utils/utils.h"
 #include <sstream>
 namespace fastdeploy {
@@ -55,4 +56,14 @@ std::vector<int64_t> GetStride(const std::vector<int64_t>& dims) {
  return result;
 }
 // Formats a shape as "[ d0 ,d1 ,... ]" for log and assertion messages.
 // An empty shape yields "[  ]" instead of reading a non-existent first entry.
 std::string Str(const std::vector<int64_t>& shape) {
  std::ostringstream oss;
  oss << "[ ";
  for (size_t i = 0; i < shape.size(); ++i) {
    if (i != 0) {
      oss << " ,";
    }
    oss << shape[i];
  }
  oss << " ]";
  return oss.str();
 }
 }  // namespace fastdeploy
--- a/fastdeploy/utils/utils.h
+++ b/fastdeploy/utils/utils.h
@@ -14,15 +14,15 @@
 #pragma once
 #include <stdlib.h>
 #include <cstdio>
 #include <stdlib.h>
 #include <fstream>
 #include <iostream>
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
 #include <numeric>
 #if defined(_WIN32)
 #ifdef FASTDEPLOY_LIB
@@ -45,8 +45,7 @@ class FASTDEPLOY_DECL FDLogger {
  }
  explicit FDLogger(bool verbose, const std::string& prefix = "[FastDeploy]");
-  template <typename T>
+  template <typename T> FDLogger& operator<<(const T& val) {
  FDLogger& operator<<(const T& val) {
    if (!verbose_) {
      return *this;
    }
@@ -76,12 +75,12 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
 #endif
 #define FDERROR                                                                \
-  FDLogger(true, "[ERROR]") << __REL_FILE__ << "(" << __LINE__ \
+  FDLogger(true, "[ERROR]")                                                    \
-                            << ")::" << __FUNCTION__ << "\t"
+      << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
 #define FDWARNING                                                              \
-  FDLogger(true, "[WARNING]") << __REL_FILE__ << "(" << __LINE__ \
+  FDLogger(true, "[WARNING]")                                                  \
-                              << ")::" << __FUNCTION__ << "\t"
+      << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
 #define FDINFO                                                                 \
  FDLogger(true, "[INFO]") << __REL_FILE__ << "(" << __LINE__                  \
@@ -124,9 +123,9 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,       \
                           __VA_ARGS__)                                        \
    default:                                                                   \
-        FDASSERT(                                                              \
+      FDASSERT(false,                                                          \
-            false,                                                             \
+               "Invalid enum data type. Expect to accept data "                \
-            "Invalid enum data type. Expect to accept data type BOOL, INT32, " \
+               "type BOOL, INT32, "                                            \
               "INT64, FP32, FP64, but receive type %s.",                      \
               Str(__dtype__).c_str());                                        \
    }                                                                          \
@@ -184,7 +183,9 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
    }                                                                          \
  }()
-FASTDEPLOY_DECL std::vector<int64_t> GetStride(
+FASTDEPLOY_DECL std::vector<int64_t>
-    const std::vector<int64_t>& dims);
+GetStride(const std::vector<int64_t>& dims);
 FASTDEPLOY_DECL std::string Str(const std::vector<int64_t>& shape);
 }  // namespace fastdeploy
--- a/tests/core/test_fd_tensor.cc
+++ b/tests/core/test_fd_tensor.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "fastdeploy/core/fd_tensor.h"
 #include "gtest_utils.h"
 #include "gtest/gtest.h"
 #include <array>
 #include <cstring>
 #include <vector>
 #include "fastdeploy/core/fd_tensor.h"
 #include "gtest/gtest.h"
 #include "gtest_utils.h"
 namespace fastdeploy {
@@ -86,4 +86,18 @@ TEST(fastdeploy, fd_tensor_assignment) {
  ASSERT_EQ(tensor1.Data(), nullptr);
 }
 // Verifies FDTensor::Reshape on a 120-element FP32 tensor, covering the two
 // special dimension values: -1 (size inferred from the remaining element
 // count) and 0 (size copied from the current shape at the same index).
 // The three reshapes are chained: each one starts from the shape produced by
 // the previous call, so their order matters.
 // NOTE(review): Reshape is declared to return bool; the result is not checked
 // here, only the resulting shape is.
 TEST(fastdeploy, fd_tensor_reshape) {
  CheckShape check_shape;
  FDTensor x;
  x.Allocate({2, 3, 4, 5}, FDDataType::FP32);
  // -1 in the leading position: 120 / (3 * 2 * 2 * 5) = 2.
  x.Reshape({-1, 3, 2, 2, 5});
  check_shape(x.Shape(), {2, 3, 2, 2, 5});
  // 0 keeps dim 0 of the current shape (2); -1 becomes 120 / (2 * 5 * 2) = 6.
  x.Reshape({0, -1, 5, 2});
  check_shape(x.Shape(), {2, 6, 5, 2});
  // Two 0 entries copy dims 2 and 3 of the current shape [2, 6, 5, 2].
  x.Reshape({2, 3, 0, 0, 2});
  check_shape(x.Shape(), {2, 3, 5, 2, 2});
 }
 }  // namespace fastdeploy
--- a/tests/function/test_elementwise.cc
+++ b/tests/function/test_elementwise.cc
@@ -0,0 +1,451 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "fastdeploy/core/fd_tensor.h"
 #include "fastdeploy/function/elementwise.h"
 #include "glog/logging.h"
 #include "gtest_utils.h"
 #include "gtest/gtest.h"
 #include <array>
 #include <tuple>
 #include <vector>
 namespace fastdeploy {
 namespace function {
 // Builds the operand pair for the "same shape" element-wise tests.
 // Both buffers hold 24 floats and are meant to be viewed as [2, 3, 4];
 // element 0 of the tuple is the left operand, element 1 the right operand.
 std::tuple<std::vector<float>, std::vector<float>> CreateSameDimeData() {
  using FloatVec = std::vector<float>;
  // Left operand, one row per innermost slice of 4 values.
  const FloatVec lhs{
      0.8428625,  0.6461913,  0.13740455, 0.11430702,
      0.659926,   0.535816,   0.7429162,  0.8456049,
      0.21228176, 0.29970083, 0.8621713,  0.40894133,
      0.12684688, 0.1566195,  0.42884097, 0.8476526,
      0.2458633,  0.669046,   0.87888306, 0.6762589,
      0.666453,   0.32523027, 0.4139388,  0.8341406};
  // Right operand, same layout as the left one.
  const FloatVec rhs{
      0.8345295,  0.551608,   0.77101785, 0.386742,
      0.12658621, 0.41240612, 0.20051356, 0.68455917,
      0.37947154, 0.2953741,  0.97703844, 0.2931625,
      0.2344262,  0.5054064,  0.40617892, 0.16315177,
      0.71458364, 0.3748885,  0.65257984, 0.83870554,
      0.55464447, 0.38836837, 0.472637,   0.5546991};
  return std::tuple<FloatVec, FloatVec>(lhs, rhs);
 }
 // Builds the operand pair for broadcasting along the trailing two axes:
 // a 24-value left operand viewed as [2, 3, 4] and a 2-value right operand
 // viewed as [2, 1, 1].
 std::tuple<std::vector<float>, std::vector<float>> CreateBroadcastDim1Data() {
  using FloatVec = std::vector<float>;
  // Left operand, one row per innermost slice of 4 values.
  const FloatVec lhs{
      0.8428625,  0.6461913,  0.13740455, 0.11430702,
      0.659926,   0.535816,   0.7429162,  0.8456049,
      0.21228176, 0.29970083, 0.8621713,  0.40894133,
      0.12684688, 0.1566195,  0.42884097, 0.8476526,
      0.2458633,  0.669046,   0.87888306, 0.6762589,
      0.666453,   0.32523027, 0.4139388,  0.8341406};
  // Right operand: one scalar per outermost index.
  const FloatVec rhs{0.97375137, 0.11732706};
  return std::tuple<FloatVec, FloatVec>(lhs, rhs);
 }
 // Builds the operand pair for broadcasting along the first and last axes:
 // a 24-value left operand viewed as [2, 3, 4] and a 3-value right operand
 // viewed as [1, 3, 1].
 std::tuple<std::vector<float>, std::vector<float>> CreateBroadcastDim2Data() {
  using FloatVec = std::vector<float>;
  // Left operand, one row per innermost slice of 4 values.
  const FloatVec lhs{
      0.8428625,  0.6461913,  0.13740455, 0.11430702,
      0.659926,   0.535816,   0.7429162,  0.8456049,
      0.21228176, 0.29970083, 0.8621713,  0.40894133,
      0.12684688, 0.1566195,  0.42884097, 0.8476526,
      0.2458633,  0.669046,   0.87888306, 0.6762589,
      0.666453,   0.32523027, 0.4139388,  0.8341406};
  // Right operand: one scalar per middle-axis index.
  const FloatVec rhs{0.30803263, 0.41172066, 0.5588573};
  return std::tuple<FloatVec, FloatVec>(lhs, rhs);
 }
 // Builds the operand pair for broadcasting along the leading two axes:
 // a 24-value left operand viewed as [2, 3, 4] and a 4-value right operand
 // viewed as [1, 1, 4].
 std::tuple<std::vector<float>, std::vector<float>> CreateBroadcastDim3Data() {
  using FloatVec = std::vector<float>;
  // Left operand, one row per innermost slice of 4 values.
  const FloatVec lhs{
      0.8428625,  0.6461913,  0.13740455, 0.11430702,
      0.659926,   0.535816,   0.7429162,  0.8456049,
      0.21228176, 0.29970083, 0.8621713,  0.40894133,
      0.12684688, 0.1566195,  0.42884097, 0.8476526,
      0.2458633,  0.669046,   0.87888306, 0.6762589,
      0.666453,   0.32523027, 0.4139388,  0.8341406};
  // Right operand: one scalar per innermost index.
  const FloatVec rhs{0.62653106, 0.5128424, 0.9891219, 0.32416528};
  return std::tuple<FloatVec, FloatVec>(lhs, rhs);
 }
 // Builds the operand pair for the case where BOTH operands are broadcast:
 // an 8-value left operand viewed as [2, 1, 4] and a 4-value right operand
 // viewed as [2, 2, 1].
 std::tuple<std::vector<float>, std::vector<float>> CreateBroadcastDim4Data() {
  using FloatVec = std::vector<float>;
  // Left operand, one row per innermost slice of 4 values.
  const FloatVec lhs{
      0.8428625, 0.6461913, 0.13740455, 0.11430702,
      0.659926,  0.535816,  0.7429162,  0.8456049};
  // Right operand: two scalars per outermost index.
  const FloatVec rhs{0.62653106, 0.5128424, 0.9891219, 0.32416528};
  return std::tuple<FloatVec, FloatVec>(lhs, rhs);
 }
 // Element-wise Add/Subtract/Multiply/Divide on two [2, 3, 4] FP32 operands
 // (no broadcasting). Every operation is exercised twice: through the named
 // function writing into `z`, and through the matching overloaded operator.
 TEST(fastdeploy, check_same_dim) {
  CheckShape check_shape;
  CheckData check_data;
  FDTensor x, y, z;
  std::vector<float> x_data;
  std::vector<float> y_data;
  std::tie(x_data, y_data) = CreateSameDimeData();
  // The tensors only reference the vectors above; no data is copied here.
  x.SetExternalData({2, 3, 4}, FDDataType::FP32, x_data.data());
  y.SetExternalData({2, 3, 4}, FDDataType::FP32, y_data.data());
  // Hands the float buffer of `actual` and the reference values to check_data.
  const auto expect_values = [&check_data](FDTensor& actual,
                                           const std::vector<float>& expected) {
    check_data(reinterpret_cast<const float*>(actual.Data()), expected.data(),
               expected.size());
  };
  // x + y
  const std::vector<float> sum_expected = {
      1.677392,   1.1977993,  0.9084224, 0.50104904, 0.7865122,  0.94822216,
      0.94342977, 1.530164,   0.5917533, 0.5950749,  1.8392098,  0.70210385,
      0.36127308, 0.66202587, 0.8350199, 1.0108044,  0.96044695, 1.0439345,
      1.5314629,  1.5149645,  1.2210975, 0.7135986,  0.8865758,  1.3888397};
  Add(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, sum_expected);
  z = x + y;
  expect_values(z, sum_expected);
  // x - y
  const std::vector<float> diff_expected = {
      0.008332968, 0.09458327,  -0.6336133,   -0.27243498, 0.5333398,
      0.1234099,   0.5424027,   0.16104573,   -0.16718978, 0.004326731,
      -0.11486715, 0.11577883,  -0.10757932,  -0.3487869,  0.022662044,
      0.6845008,   -0.46872032, 0.29415748,   0.22630322,  -0.16244662,
      0.11180854,  -0.0631381,  -0.058698207, 0.27944148};
  Subtract(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, diff_expected);
  z = x - y;
  expect_values(z, diff_expected);
  // x * y
  const std::vector<float> prod_expected = {
      0.70339364, 0.3564443,  0.105941355, 0.044207327, 0.083537534,
      0.2209738,  0.14896478, 0.5788666,   0.08055489,  0.08852386,
      0.8423745,  0.11988626, 0.029736232, 0.079156496, 0.17418616,
      0.13829602, 0.17568989, 0.25081766,  0.57354134,  0.5671821,
      0.36964446, 0.12630916, 0.19564278,  0.46269706};
  Multiply(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, prod_expected);
  z = x * y;
  expect_values(z, prod_expected);
  // x / y
  const std::vector<float> quot_expected = {
      1.0099852,  1.1714683,  0.17821188, 0.29556403, 5.2132535,  1.2992436,
      3.7050674,  1.2352546,  0.5594142,  1.0146483,  0.88243335, 1.3949306,
      0.54109514, 0.30988827, 1.0557933,  5.195485,   0.34406513, 1.7846532,
      1.3467824,  0.8063127,  1.201586,   0.8374273,  0.875807,   1.5037713};
  Divide(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, quot_expected);
  z = x / y;
  expect_values(z, quot_expected);
 }
 // Element-wise ops where a [2, 1, 1] right operand is broadcast against a
 // [2, 3, 4] left operand; the result is expected to be [2, 3, 4]. Each
 // operation is checked through the named function and through its operator.
 TEST(fastdeploy, check_broadcast_dim1) {
  CheckShape check_shape;
  CheckData check_data;
  FDTensor x, y, z;
  std::vector<float> x_data;
  std::vector<float> y_data;
  std::tie(x_data, y_data) = CreateBroadcastDim1Data();
  // The tensors only reference the vectors above; no data is copied here.
  x.SetExternalData({2, 3, 4}, FDDataType::FP32, x_data.data());
  y.SetExternalData({2, 1, 1}, FDDataType::FP32, y_data.data());
  // Hands the float buffer of `actual` and the reference values to check_data.
  const auto expect_values = [&check_data](FDTensor& actual,
                                           const std::vector<float>& expected) {
    check_data(reinterpret_cast<const float*>(actual.Data()), expected.data(),
               expected.size());
  };
  // x + y
  const std::vector<float> sum_expected = {
      1.816614, 1.619943, 1.111156, 1.088058, 1.633677, 1.509567,
      1.716668, 1.819356, 1.186033, 1.273452, 1.835923, 1.382693,
      0.244174, 0.273947, 0.546168, 0.96498,  0.36319,  0.786373,
      0.99621,  0.793586, 0.78378,  0.442557, 0.531266, 0.951468};
  Add(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, sum_expected);
  z = x + y;
  expect_values(z, sum_expected);
  // x - y
  const std::vector<float> diff_expected = {
      -0.130889, -0.32756,  -0.836347, -0.859444, -0.313825, -0.437935,
      -0.230835, -0.128146, -0.76147,  -0.674051, -0.11158,  -0.56481,
      0.00952,   0.039292,  0.311514,  0.730326,  0.128536,  0.551719,
      0.761556,  0.558932,  0.549126,  0.207903,  0.296612,  0.716814};
  Subtract(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, diff_expected);
  z = x - y;
  expect_values(z, diff_expected);
  // x * y
  const std::vector<float> prod_expected = {
      0.820738, 0.62923,  0.133798, 0.111307, 0.642604, 0.521752,
      0.723416, 0.823409, 0.20671,  0.291834, 0.83954,  0.398207,
      0.014883, 0.018376, 0.050315, 0.099453, 0.028846, 0.078497,
      0.103117, 0.079343, 0.078193, 0.038158, 0.048566, 0.097867};
  Multiply(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, prod_expected);
  z = x * y;
  expect_values(z, prod_expected);
  // x / y
  const std::vector<float> quot_expected = {
      0.865583, 0.66361,  0.141108, 0.117388, 0.677715, 0.55026,
      0.762942, 0.868399, 0.218004, 0.30778,  0.885412, 0.419965,
      1.081139, 1.334897, 3.65509,  7.224699, 2.095538, 5.702402,
      7.490881, 5.763879, 5.680301, 2.771997, 3.528076, 7.109533};
  Divide(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, quot_expected);
  z = x / y;
  expect_values(z, quot_expected);
 }
 // Element-wise ops where a [1, 3, 1] right operand is broadcast against a
 // [2, 3, 4] left operand; the result is expected to be [2, 3, 4].
 // Each operation is checked twice: through the named function writing into
 // `z`, and through the matching overloaded operator. The operator checks for
 // multiply and divide were missing here (every sibling test has them), which
 // left operator* and operator/ untested for this broadcast pattern.
 TEST(fastdeploy, check_broadcast_dim2) {
  CheckShape check_shape;
  CheckData check_data;
  FDTensor x, y, z;
  auto test_data = CreateBroadcastDim2Data();
  auto x_data = std::get<0>(test_data);
  auto y_data = std::get<1>(test_data);
  // The tensors only reference the vectors above; no data is copied here.
  x.SetExternalData({2, 3, 4}, FDDataType::FP32, x_data.data());
  y.SetExternalData({1, 3, 1}, FDDataType::FP32, y_data.data());
  // Test Add functions
  std::vector<float> add_result = {
      1.150895, 0.954224, 0.445437, 0.42234,  1.071647, 0.947537,
      1.154637, 1.257326, 0.771139, 0.858558, 1.421029, 0.967799,
      0.43488,  0.464652, 0.736874, 1.155685, 0.657584, 1.080767,
      1.290604, 1.08798,  1.22531,  0.884088, 0.972796, 1.392998};
  Add(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
             add_result.size());
  z = x + y;
  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
             add_result.size());
  // Test subtract
  std::vector<float> sub_result = {
      0.53483,   0.338159,  -0.170628, -0.193726, 0.248205,  0.124095,
      0.331196,  0.433884,  -0.346576, -0.259156, 0.303314,  -0.149916,
      -0.181186, -0.151413, 0.120808,  0.53962,   -0.165857, 0.257325,
      0.467162,  0.264538,  0.107596,  -0.233627, -0.144919, 0.275283};
  Subtract(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
             sub_result.size());
  z = x - y;
  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
             sub_result.size());
  // Test multiply
  std::vector<float> mul_result = {
      0.259629, 0.199048, 0.042325, 0.03521,  0.271705, 0.220607,
      0.305874, 0.348153, 0.118635, 0.16749,  0.481831, 0.22854,
      0.039073, 0.048244, 0.132097, 0.261105, 0.101227, 0.27546,
      0.361854, 0.27843,  0.372452, 0.181757, 0.231333, 0.466166};
  Multiply(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
             mul_result.size());
  // Operator form, added for parity with the other broadcast tests.
  z = x * y;
  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
             mul_result.size());
  // Test divide
  std::vector<float> div_result = {
      2.736277, 2.097801, 0.446071, 0.371087, 1.602849, 1.301407,
      1.804418, 2.053832, 0.37985,  0.536274, 1.54274,  0.731745,
      0.411797, 0.508451, 1.392193, 2.751827, 0.59716,  1.625,
      2.134659, 1.642519, 1.192528, 0.581956, 0.740688, 1.492582};
  Divide(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
             div_result.size());
  // Operator form, added for parity with the other broadcast tests.
  z = x / y;
  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
             div_result.size());
 }
 // Element-wise ops where the right operand is registered with the rank-1
 // shape [4] and combined with a [2, 3, 4] left operand, i.e. the operands
 // differ in rank. The result is expected to be [2, 3, 4]. Each operation is
 // checked through the named function and through its operator.
 TEST(fastdeploy, check_broadcast_dim3) {
  CheckShape check_shape;
  CheckData check_data;
  FDTensor x, y, z;
  std::vector<float> x_data;
  std::vector<float> y_data;
  std::tie(x_data, y_data) = CreateBroadcastDim3Data();
  // The tensors only reference the vectors above; no data is copied here.
  x.SetExternalData({2, 3, 4}, FDDataType::FP32, x_data.data());
  y.SetExternalData({4}, FDDataType::FP32, y_data.data());
  // Hands the float buffer of `actual` and the reference values to check_data.
  const auto expect_values = [&check_data](FDTensor& actual,
                                           const std::vector<float>& expected) {
    check_data(reinterpret_cast<const float*>(actual.Data()), expected.data(),
               expected.size());
  };
  // x + y
  const std::vector<float> sum_expected = {
      1.469393, 1.159034, 1.126526, 0.438472, 1.286457, 1.048658,
      1.732038, 1.16977,  0.838813, 0.812543, 1.851293, 0.733107,
      0.753378, 0.669462, 1.417963, 1.171818, 0.872394, 1.181888,
      1.868005, 1.000424, 1.292984, 0.838073, 1.403061, 1.158306};
  Add(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, sum_expected);
  z = x + y;
  expect_values(z, sum_expected);
  // x - y
  const std::vector<float> diff_expected = {
      0.216331,  0.133349,  -0.851717, -0.209858, 0.033395,  0.022974,
      -0.246206, 0.52144,   -0.414249, -0.213142, -0.126951, 0.084776,
      -0.499684, -0.356223, -0.560281, 0.523487,  -0.380668, 0.156204,
      -0.110239, 0.352094,  0.039922,  -0.187612, -0.575183, 0.509975};
  Subtract(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, diff_expected);
  z = x - y;
  expect_values(z, diff_expected);
  // x * y
  const std::vector<float> prod_expected = {
      0.52808,  0.331394, 0.13591,  0.037054, 0.413464, 0.274789,
      0.734835, 0.274116, 0.133001, 0.153699, 0.852793, 0.132565,
      0.079474, 0.080321, 0.424176, 0.27478,  0.154041, 0.343115,
      0.869322, 0.21922,  0.417554, 0.166792, 0.409436, 0.270399};
  Multiply(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, prod_expected);
  z = x * y;
  expect_values(z, prod_expected);
  // x / y
  const std::vector<float> quot_expected = {
      1.345284, 1.260019, 0.138916, 0.35262,  1.053301, 1.044797,
      0.751087, 2.608561, 0.338821, 0.584392, 0.871653, 1.261521,
      0.202459, 0.305395, 0.433557, 2.614878, 0.39242,  1.304584,
      0.888549, 2.086155, 1.063719, 0.634172, 0.418491, 2.573195};
  Divide(x, y, &z);
  check_shape(z.shape, {2, 3, 4});
  expect_values(z, quot_expected);
  z = x / y;
  expect_values(z, quot_expected);
 }
 // Element-wise ops where BOTH operands are broadcast: x is [2, 1, 4] and y is
 // [2, 2, 1], and the result is expected to be [2, 2, 4]. Each operation is
 // checked through the named function and through its operator.
 TEST(fastdeploy, check_broadcast_dim4) {
  CheckShape check_shape;
  CheckData check_data;
  FDTensor x, y, z;
  std::vector<float> x_data;
  std::vector<float> y_data;
  std::tie(x_data, y_data) = CreateBroadcastDim4Data();
  // The tensors only reference the vectors above; no data is copied here.
  x.SetExternalData({2, 1, 4}, FDDataType::FP32, x_data.data());
  y.SetExternalData({2, 2, 1}, FDDataType::FP32, y_data.data());
  // Hands the float buffer of `actual` and the reference values to check_data.
  const auto expect_values = [&check_data](FDTensor& actual,
                                           const std::vector<float>& expected) {
    check_data(reinterpret_cast<const float*>(actual.Data()), expected.data(),
               expected.size());
  };
  // x + y
  const std::vector<float> sum_expected = {
      1.469393, 1.272722, 0.763936, 0.740838,
      1.355705, 1.159034, 0.650247, 0.627149,
      1.649048, 1.524938, 1.732038, 1.834727,
      0.984091, 0.859981, 1.067081, 1.16977};
  Add(x, y, &z);
  check_shape(z.shape, {2, 2, 4});
  expect_values(z, sum_expected);
  z = x + y;
  expect_values(z, sum_expected);
  // x - y
  const std::vector<float> diff_expected = {
      0.216331,  0.01966,   -0.489127, -0.512224,
      0.33002,   0.133349,  -0.375438, -0.398535,
      -0.329196, -0.453306, -0.246206, -0.143517,
      0.335761,  0.211651,  0.418751,  0.52144};
  Subtract(x, y, &z);
  check_shape(z.shape, {2, 2, 4});
  expect_values(z, diff_expected);
  z = x - y;
  expect_values(z, diff_expected);
  // x * y
  const std::vector<float> prod_expected = {
      0.52808,  0.404859, 0.086088, 0.071617,
      0.432256, 0.331394, 0.070467, 0.058621,
      0.652747, 0.529987, 0.734835, 0.836406,
      0.213925, 0.173693, 0.240828, 0.274116};
  Multiply(x, y, &z);
  check_shape(z.shape, {2, 2, 4});
  expect_values(z, prod_expected);
  z = x * y;
  expect_values(z, prod_expected);
  // x / y
  const std::vector<float> quot_expected = {
      1.345284, 1.031379, 0.21931,  0.182444,
      1.643512, 1.260019, 0.267927, 0.222889,
      0.667184, 0.541709, 0.751087, 0.854905,
      2.03577,  1.65291,  2.291782, 2.608561};
  Divide(x, y, &z);
  check_shape(z.shape, {2, 2, 4});
  expect_values(z, quot_expected);
  z = x / y;
  expect_values(z, quot_expected);
 }
 // Chains all four overloaded operators in a single expression,
 // a * b + c / d - e, mixing same-shape operands (a, b: [2, 3, 4]) with three
 // differently shaped broadcast operands (c: [2, 1, 1], d: [1, 3, 1],
 // e: [1, 1, 4]). The result is expected to be [2, 3, 4].
 TEST(fastdeploy, mixed_operation) {
  CheckShape check_shape;
  CheckData check_data;
  FDTensor a, b, c, d, e, output;
  std::vector<float> a_data;
  std::vector<float> b_data;
  std::tie(a_data, b_data) = CreateSameDimeData();
  // Only the right-hand operand of each broadcast fixture is reused here.
  std::vector<float> c_data = std::get<1>(CreateBroadcastDim1Data());
  std::vector<float> d_data = std::get<1>(CreateBroadcastDim2Data());
  std::vector<float> e_data = std::get<1>(CreateBroadcastDim3Data());
  // The tensors only reference the vectors above; no data is copied here.
  a.SetExternalData({2, 3, 4}, FDDataType::FP32, a_data.data());
  b.SetExternalData({2, 3, 4}, FDDataType::FP32, b_data.data());
  c.SetExternalData({2, 1, 1}, FDDataType::FP32, c_data.data());
  d.SetExternalData({1, 3, 1}, FDDataType::FP32, d_data.data());
  e.SetExternalData({1, 1, 4}, FDDataType::FP32, e_data.data());
  const std::vector<float> expected = {
      3.238058,  3.004797,  2.278015,  2.881238,  1.822084,  2.073209,
      1.524921,  2.619779,  1.196421,  1.318079,  1.59565,   1.538118,
      -0.215903, -0.052794, -0.434044, 0.195022,  -0.165874, 0.022943,
      -0.130613, 0.527984,  -0.046946, -0.176592, -0.583538, 0.348473};
  output = a * b + c / d - e;
  check_shape(output.shape, {2, 3, 4});
  const float* actual = reinterpret_cast<const float*>(output.Data());
  check_data(actual, expected.data(), expected.size());
 }
 }  // namespace function
 }  // namespace fastdeploy