diff --git a/.clang-format b/.clang-format
index 79aa44660..c91ec19fb 100644
--- a/.clang-format
+++ b/.clang-format
@@ -149,7 +149,7 @@
 # SpaceBeforeRangeBasedForLoopColon: true
 # SpaceInEmptyBlock: false
 # SpaceInEmptyParentheses: false
-# SpacesBeforeTrailingComments: 1
+# SpacesBeforeTrailingComments: 2
 # SpacesInAngles:  Never
 # SpacesInConditionalStatement: false
 # SpacesInContainerLiterals: true
diff --git a/fastdeploy/core/fd_tensor.cc b/fastdeploy/core/fd_tensor.cc
old mode 100755
new mode 100644
index 86ce866f4..200e51ade
--- a/fastdeploy/core/fd_tensor.cc
+++ b/fastdeploy/core/fd_tensor.cc
@@ -11,11 +11,11 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <cstring>
-
 #include "fastdeploy/core/fd_tensor.h"
 #include "fastdeploy/core/float16.h"
 #include "fastdeploy/utils/utils.h"
+#include <algorithm>
+#include <cstring>
 #ifdef WITH_GPU
 #include <cuda_runtime_api.h>
 #endif
@@ -151,9 +151,63 @@ void FDTensor::Resize(const std::vector<int64_t>& new_shape,
   shape.assign(new_shape.begin(), new_shape.end());
 }
 
+// Reshape in place (metadata only): -1 infers one dim, 0 copies shape[i].
+bool FDTensor::Reshape(const std::vector<int64_t>& new_shape) {
+  int numel = Numel();
+  const int64_t unk_dim_val = -1;
+  const int64_t copy_dim_val = 0;
+  std::vector<int64_t> output_shape(new_shape.size(), 0);
+  long long capacity = 1;  // signed dim product (a -1 negates it); %lld-safe
+  int unk_dim_idx = -1;
+  for (size_t i = 0; i < new_shape.size(); ++i) {
+    if (new_shape[i] == unk_dim_val) {
+      FDASSERT(unk_dim_idx == -1,
+               "Only one dimension value of 'shape' in ReshapeOp can "
+               "be -1. But received shape = [%s], shape[%zu] is also -1.",
+               Str(new_shape).c_str(), i);
+      unk_dim_idx = i;
+    } else if (new_shape[i] == copy_dim_val) {
+      FDASSERT(i < shape.size(),
+               "The index of 0 in `shape` must be less than "
+               "the input tensor X's dimensions. "
+               "But received shape = [%s], shape[%zu] = 0, X's shape = [%s], "
+               "X's dimensions = %zu.",
+               Str(new_shape).c_str(), i, Str(shape).c_str(), shape.size());
+    } else {
+      FDASSERT(new_shape[i] > 0,
+               "Each dimension value of 'shape' in ReshapeOp must not "
+               "be negative except one unknown dimension. "
+               "But received  shape = [%s], shape[%zu] = %lld.",
+               Str(new_shape).c_str(), i, static_cast<long long>(new_shape[i]));
+    }
+    capacity *= (new_shape[i] ? new_shape[i] : shape[i]);
+    output_shape[i] = (new_shape[i] ? new_shape[i] : shape[i]);
+  }
+  if (unk_dim_idx != -1) {
+    output_shape[unk_dim_idx] = -numel / capacity;  // capacity < 0 here
+    FDASSERT(output_shape[unk_dim_idx] * capacity == -numel,
+             "The 'shape' attribute in ReshapeOp is invalid. "
+             "The input tensor X'size must be divisible by known "
+             "capacity of 'shape'. "
+             "But received X's shape = [%s], X's size = %d, "
+             "'shape' is [%s], known capacity of 'shape' is %lld.",
+             Str(shape).c_str(), numel, Str(new_shape).c_str(), capacity);
+  } else {
+    FDASSERT(numel == capacity,
+             "The 'shape' in ReshapeOp is invalid. "
+             "The input tensor X'size must be equal to the capacity of "
+             "'shape'. "
+             "But received X's shape = [%s], X's size = %d, 'shape' is "
+             "[%s], the capacity of 'shape' is %lld.",
+             Str(shape).c_str(), numel, Str(new_shape).c_str(), capacity);
+  }
+  shape = output_shape;  // only the shape changes; always returns true
+  return true;
+}
+
 template <typename T>
-void CalculateStatisInfo(const void* src_ptr, int size, double* mean, double* max,
-                         double* min) {
+void CalculateStatisInfo(const void* src_ptr, int size, double* mean,
+                         double* max, double* min) {
   const T* ptr = static_cast<const T*>(src_ptr);
   *mean = 0;
   *max = -99999999;
@@ -213,10 +267,9 @@ bool FDTensor::ReallocFn(size_t nbytes) {
     }
     return buffer_ != nullptr;
 #else
-    FDASSERT(false,
-             "The FastDeploy FDTensor allocator didn't compile under "
-             "-DWITH_GPU=ON,"
-             "so this is an unexpected problem happend.");
+    FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under "
+                    "-DWITH_GPU=ON,"
+                    "so this is an unexpected problem happend.");
 #endif
   } else {
     if (is_pinned_memory) {
@@ -230,10 +283,9 @@ bool FDTensor::ReallocFn(size_t nbytes) {
       }
       return buffer_ != nullptr;
 #else
-      FDASSERT(false,
-               "The FastDeploy FDTensor allocator didn't compile under "
-               "-DWITH_GPU=ON,"
-               "so this is an unexpected problem happend.");
+      FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under "
+                      "-DWITH_GPU=ON,"
+                      "so this is an unexpected problem happend.");
 #endif
     }
     buffer_ = realloc(buffer_, nbytes);
@@ -242,7 +294,8 @@ bool FDTensor::ReallocFn(size_t nbytes) {
 }
 
 void FDTensor::FreeFn() {
-  if (external_data_ptr != nullptr) external_data_ptr = nullptr;
+  if (external_data_ptr != nullptr)
+    external_data_ptr = nullptr;
   if (buffer_ != nullptr) {
     if (device == Device::GPU) {
 #ifdef WITH_GPU
@@ -293,11 +346,8 @@ void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes,
 FDTensor::FDTensor(const std::string& tensor_name) { name = tensor_name; }
 
 FDTensor::FDTensor(const FDTensor& other)
-    : shape(other.shape),
-      name(other.name),
-      dtype(other.dtype),
-      device(other.device),
-      external_data_ptr(other.external_data_ptr) {
+    : shape(other.shape), name(other.name), dtype(other.dtype),
+      device(other.device), external_data_ptr(other.external_data_ptr) {
   // Copy buffer
   if (other.buffer_ == nullptr) {
     buffer_ = nullptr;
@@ -310,12 +360,9 @@ FDTensor::FDTensor(const FDTensor& other)
 }
 
 FDTensor::FDTensor(FDTensor&& other)
-    : buffer_(other.buffer_),
-      shape(std::move(other.shape)),
-      name(std::move(other.name)),
-      dtype(other.dtype),
-      external_data_ptr(other.external_data_ptr),
-      device(other.device) {
+    : buffer_(other.buffer_), shape(std::move(other.shape)),
+      name(std::move(other.name)), dtype(other.dtype),
+      external_data_ptr(other.external_data_ptr), device(other.device) {
   other.name = "";
   // Note(zhoushunjie): Avoid double free.
   other.buffer_ = nullptr;
diff --git a/fastdeploy/core/fd_tensor.h b/fastdeploy/core/fd_tensor.h
old mode 100755
new mode 100644
index 7deb48229..6a86bba1b
--- a/fastdeploy/core/fd_tensor.h
+++ b/fastdeploy/core/fd_tensor.h
@@ -57,9 +57,7 @@ struct FASTDEPLOY_DECL FDTensor {
 
   void* Data();
 
-  bool IsShared() {
-    return external_data_ptr != nullptr;
-  }
+  bool IsShared() { return external_data_ptr != nullptr; }
 
   void StopSharing();
 
@@ -116,6 +114,7 @@ struct FASTDEPLOY_DECL FDTensor {
               const FDDataType& data_type, const std::string& tensor_name = "",
               const Device& new_device = Device::CPU);
 
+  bool Reshape(const std::vector<int64_t>& new_shape);  // -1 infers, 0 copies
   // Debug function
   // Use this function to print shape, dtype, mean, max, min
   // prefix will also be printed as tag
@@ -141,7 +140,7 @@ struct FASTDEPLOY_DECL FDTensor {
 
   static void CopyBuffer(void* dst, const void* src, size_t nbytes,
                          const Device& device = Device::CPU,
-                        bool is_pinned_memory = false);
+                         bool is_pinned_memory = false);
 };
 
 }  // namespace fastdeploy
diff --git a/fastdeploy/function/concat.cc b/fastdeploy/function/concat.cc
index 3a59e7910..295c3c25a 100644
--- a/fastdeploy/function/concat.cc
+++ b/fastdeploy/function/concat.cc
@@ -14,26 +14,17 @@
 
 #include "fastdeploy/function/concat.h"
 
+#include "fastdeploy/utils/utils.h"
 #include <cstring>
 #include <limits>
 #include <set>
 #include <sstream>
-#include "fastdeploy/utils/utils.h"
 
 namespace fastdeploy {
 namespace function {
-std::string Str(const std::vector<int64_t>& shape) {
-  std::ostringstream oss;
-  oss << "[ " << shape[0];
-  for (int i = 1; i < shape.size(); ++i) {
-    oss << " ," << shape[i];
-  }
-  oss << " ]";
-  return oss.str();
-}
 
-std::vector<int64_t> ComputeAndCheckConcatOutputShape(
-    const std::vector<FDTensor>& input, int axis) {
+std::vector<int64_t>
+ComputeAndCheckConcatOutputShape(const std::vector<FDTensor>& input, int axis) {
   const size_t n = input.size();
   auto out_dims = input[0].shape;
   size_t in_zero_dims_size = out_dims.size();
@@ -58,8 +49,7 @@ std::vector<int64_t> ComputeAndCheckConcatOutputShape(
   return out_dims;
 }
 
-template <typename T>
-struct ConcatFunctor {
+template <typename T> struct ConcatFunctor {
   void operator()(const std::vector<FDTensor>& input, int axis,
                   FDTensor* output) {
     size_t num = input.size();
diff --git a/fastdeploy/function/elementwise.cc b/fastdeploy/function/elementwise.cc
new file mode 100644
index 000000000..27dacbcd9
--- /dev/null
+++ b/fastdeploy/function/elementwise.cc
@@ -0,0 +1,75 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/function/elementwise.h"
+#include "fastdeploy/function/eigen.h"
+#include "fastdeploy/function/elementwise_base.h"
+#include "fastdeploy/function/elementwise_functor.h"
+#include "fastdeploy/utils/utils.h"
+#include <algorithm>
+
+namespace fastdeploy {
+namespace function {
+
+DEFINE_ELEMENTWISE_OP(Add);  // each line defines a <Name>RawKernel<T> functor
+DEFINE_ELEMENTWISE_OP(Multiply);
+DEFINE_ELEMENTWISE_OP(Subtract);
+DEFINE_ELEMENTWISE_OP(Divide);
+
+void Add(const FDTensor& x, const FDTensor& y, FDTensor* out) {
+  FD_VISIT_ALL_TYPES(x.dtype, "AddRawKernel",  // y.dtype is not consulted
+                     ([&] { AddRawKernel<data_t>()(x, y, -1, out); }));
+}
+
+FDTensor operator+(const FDTensor& x, const FDTensor& y) {
+  FDTensor sum;  // filled in by Add
+  Add(x, y, &sum);
+  return sum;
+}
+
+void Subtract(const FDTensor& x, const FDTensor& y, FDTensor* out) {
+  FD_VISIT_ALL_TYPES(x.dtype, "SubtractRawKernel",  // y.dtype not consulted
+                     ([&] { SubtractRawKernel<data_t>()(x, y, -1, out); }));
+}
+
+FDTensor operator-(const FDTensor& x, const FDTensor& y) {
+  FDTensor difference;  // filled in by Subtract
+  Subtract(x, y, &difference);
+  return difference;
+}
+
+void Multiply(const FDTensor& x, const FDTensor& y, FDTensor* out) {
+  FD_VISIT_ALL_TYPES(x.dtype, "MultiplyRawKernel",  // y.dtype not consulted
+                     ([&] { MultiplyRawKernel<data_t>()(x, y, -1, out); }));
+}
+
+FDTensor operator*(const FDTensor& x, const FDTensor& y) {
+  FDTensor product;  // filled in by Multiply
+  Multiply(x, y, &product);
+  return product;
+}
+
+void Divide(const FDTensor& x, const FDTensor& y, FDTensor* out) {
+  FD_VISIT_ALL_TYPES(x.dtype, "DivideRawKernel",  // y.dtype is not consulted
+                     ([&] { DivideRawKernel<data_t>()(x, y, -1, out); }));
+}
+
+FDTensor operator/(const FDTensor& x, const FDTensor& y) {
+  FDTensor quotient;  // filled in by Divide
+  Divide(x, y, &quotient);
+  return quotient;
+}
+
+}  // namespace function
+}  // namespace fastdeploy
diff --git a/fastdeploy/function/elementwise.h b/fastdeploy/function/elementwise.h
new file mode 100644
index 000000000..33eb5b762
--- /dev/null
+++ b/fastdeploy/function/elementwise.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fastdeploy/core/fd_tensor.h"
+
+namespace fastdeploy {
+namespace function {
+
+/** Execute the add operation for input FDTensors. *out = x + y.
+    @param x The input tensor.
+    @param y The input tensor.
+    @param out The output tensor which stores the result.
+*/
+FASTDEPLOY_DECL void Add(const FDTensor& x, const FDTensor& y, FDTensor* out);
+
+FASTDEPLOY_DECL FDTensor operator+(const FDTensor& x, const FDTensor& y);
+
+/** Execute the subtract operation for input FDTensors.  *out = x - y.
+    @param x The input tensor.
+    @param y The input tensor.
+    @param out The output tensor which stores the result.
+*/
+FASTDEPLOY_DECL void Subtract(const FDTensor& x, const FDTensor& y,
+                              FDTensor* out);
+
+FASTDEPLOY_DECL FDTensor operator-(const FDTensor& x, const FDTensor& y);
+
+/** Execute the multiply operation for input FDTensors.  *out = x * y.
+    @param x The input tensor.
+    @param y The input tensor.
+    @param out The output tensor which stores the result.
+*/
+FASTDEPLOY_DECL void Multiply(const FDTensor& x, const FDTensor& y,
+                              FDTensor* out);
+
+FASTDEPLOY_DECL FDTensor operator*(const FDTensor& x, const FDTensor& y);
+/** Execute the divide operation for input FDTensors.  *out = x / y.
+    @param x The input tensor.
+    @param y The input tensor.
+    @param out The output tensor which stores the result.
+*/
+FASTDEPLOY_DECL void Divide(const FDTensor& x, const FDTensor& y,
+                            FDTensor* out);
+FASTDEPLOY_DECL FDTensor operator/(const FDTensor& x, const FDTensor& y);
+
+}  // namespace function
+}  // namespace fastdeploy
diff --git a/fastdeploy/function/elementwise_base.h b/fastdeploy/function/elementwise_base.h
new file mode 100644
index 000000000..e2fab684e
--- /dev/null
+++ b/fastdeploy/function/elementwise_base.h
@@ -0,0 +1,263 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+
+#include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/function/eigen.h"
+
+namespace fastdeploy {
+namespace function {
+
+#define DEFINE_ELEMENTWISE_OP(name)                                            \
+  template <typename T> struct name##RawKernel {                               \
+    void operator()(const FDTensor& x, const FDTensor& y, int axis,            \
+                    FDTensor* out) {                                           \
+      if (x.Shape() == y.Shape()) { /* identical shapes: no broadcast */       \
+        SameDimsElementwiseCompute<SameDims##name##Functor<T>>()(x, y, out);   \
+      } else { /* shapes differ: broadcasting path */                          \
+        auto x_dims = x.Shape();                                               \
+        auto y_dims = y.Shape();                                               \
+        if (x_dims.size() >= y_dims.size()) {                                  \
+          ElementwiseCompute<name##Functor<T>, T>(x, y, axis,                  \
+                                                  name##Functor<T>(), out);    \
+        } else { /* x has lower rank: operand-swapped functor */               \
+          ElementwiseCompute<Inverse##name##Functor<T>, T>(                    \
+              x, y, axis, Inverse##name##Functor<T>(), out);                   \
+        }                                                                      \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// Factor x_dims around y_dims placed at `axis` into pre * n * post counts.
+inline void GetMidDims(const std::vector<int64_t>& x_dims,
+                       const std::vector<int64_t>& y_dims, const int axis,
+                       int* pre, int* n, int* post,
+                       int* is_run_common_broadcast) {
+  *pre = *n = *post = 1;
+  *is_run_common_broadcast = 0;
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= x_dims[i];
+  }
+  for (int i = 0; i < y_dims.size(); ++i) {
+    if (x_dims[i + axis] != y_dims[i]) {
+      FDASSERT(y_dims[i] == 1 || x_dims[i + axis] == 1,
+               "Broadcast dimension mismatch. Operands "
+               "could not be broadcast together with the shape of "
+               "X = [%s] and the shape of Y = [%lld] "
+               "in X is not equal to [%lld] in Y.",
+               Str(x_dims).c_str(), Str(y_dims).c_str(),
+               static_cast<long long>(x_dims[i + axis]),
+               static_cast<long long>(y_dims[i]));
+      *is_run_common_broadcast = 1;  // size-1 mismatch: caller must broadcast
+      return;  // *n is partial and *post stays 1 on this path
+    }
+    (*n) *= y_dims[i];
+  }
+  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+    (*post) *= x_dims[i];
+  }
+}
+
+// Returns a copy of `dims` with its trailing run of size-1 entries removed.
+// Examples: [2, 3, 1, 1] -> [2, 3]; [1, 1] -> []; [2, 1, 3] is unchanged.
+inline std::vector<int64_t>
+TrimTrailingSingularDims(const std::vector<int64_t>& dims) {
+  // Walk backwards until an extent other than 1 is found.
+  size_t kept = dims.size();
+  while (kept > 0 && dims[kept - 1] == 1) {
+    --kept;
+  }
+  // Nothing trimmed: hand back the input unchanged.
+  if (kept == dims.size()) {
+    return dims;
+  }
+  // Otherwise copy only the surviving prefix.
+  std::vector<int64_t> prefix(dims.begin(), dims.begin() + kept);
+  return prefix;
+}
+
+// Flat offset of `index_array` in a tensor whose extents <= 1 are broadcast.
+inline int GetElementwiseIndex(const int64_t* x_dims_array, const int max_dim,
+                               const int64_t* index_array) {
+  int offset = 0;
+  for (int d = 0; d < max_dim; ++d) {
+    if (x_dims_array[d] <= 1) continue;  // broadcast axis adds no offset
+    offset = offset * x_dims_array[d] + index_array[d];
+  }
+  return offset;
+}
+
+// Advances `index_array` by one position, odometer-style (last axis fastest).
+inline void UpdateElementwiseIndexArray(const int64_t* out_dims_array,
+                                        const int max_dim,
+                                        int64_t* index_array) {
+  for (int axis = max_dim; axis-- > 0;) {
+    int64_t& digit = index_array[axis];
+    if (++digit < out_dims_array[axis]) {
+      return;  // no carry into the next axis
+    }
+    digit -= out_dims_array[axis];  // wrap around and carry
+  }
+}
+
+// Aligns x/y dims at `axis`, pads the shorter with 1s, derives out dims.
+inline void GetBroadcastDimsArrays(const std::vector<int64_t>& x_dims,
+                                   const std::vector<int64_t>& y_dims,
+                                   int64_t* x_dims_array, int64_t* y_dims_array,
+                                   int64_t* out_dims_array, const int max_dim,
+                                   const int axis) {
+  FDASSERT(axis >= 0,
+           "Axis should be great than or equal to 0, but received axis is %d.",
+           axis);
+  FDASSERT(axis < max_dim,
+           "Axis should be less than %d, but received axis is %d.", max_dim,
+           axis);
+  if (x_dims.size() > y_dims.size()) {
+    std::fill(y_dims_array, y_dims_array + axis, 1);
+    if (axis + y_dims.size() < max_dim) {
+      std::fill(y_dims_array + axis + y_dims.size(), y_dims_array + max_dim, 1);
+    }
+    std::copy(x_dims.data(), x_dims.data() + x_dims.size(), x_dims_array);
+    std::copy(y_dims.data(), y_dims.data() + y_dims.size(),
+              y_dims_array + axis);
+  } else {
+    std::fill(x_dims_array, x_dims_array + axis, 1);
+    if (axis + x_dims.size() < max_dim) {
+      std::fill(x_dims_array + axis + x_dims.size(), x_dims_array + max_dim, 1);
+    }
+    std::copy(x_dims.data(), x_dims.data() + x_dims.size(),
+              x_dims_array + axis);
+    std::copy(y_dims.data(), y_dims.data() + y_dims.size(), y_dims_array);
+  }
+  // The assert reports padded extents; raw x_dims/y_dims may be shorter.
+  for (int i = 0; i < max_dim; i++) {
+    FDASSERT(x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 ||
+                 y_dims_array[i] <= 1,
+             "Broadcast dimension mismatch. Operands "
+             "could not be broadcast together with the shape of "
+             "X = [%s] and the shape of Y = [%s]. Received [%lld] "
+             "in X is not equal to [%lld] in Y.",
+             Str(x_dims).c_str(), Str(y_dims).c_str(),
+             static_cast<long long>(x_dims_array[i]),
+             static_cast<long long>(y_dims_array[i]));
+    const bool keep_max = x_dims_array[i] > 1 || y_dims_array[i] > 1 ||
+                          (x_dims_array[i] == 1 && y_dims_array[i] == 1);
+    out_dims_array[i] =
+        keep_max ? (std::max)(x_dims_array[i], y_dims_array[i]) : -1;
+  }
+}
+
+// CPU broadcast kernel: visits every output element and applies `func`.
+template <typename Functor, typename T, typename OutType = T>
+void CommonForwardBroadcastCPU(const FDTensor& x, const FDTensor& y,
+                               FDTensor* z, int64_t* x_dims_array,
+                               int64_t* y_dims_array, int64_t* out_dims_array,
+                               int max_dim, Functor func,
+                               const bool is_xsize_larger = true) {
+  std::vector<int64_t> index_array(max_dim, 0);
+  const T* x_data = reinterpret_cast<const T*>(x.Data());
+  const T* y_data = reinterpret_cast<const T*>(y.Data());
+  FDASSERT(x_data != nullptr, "The input X should not be empty.");
+  FDASSERT(y_data != nullptr, "The input Y should not be empty.");
+  OutType* out_data = reinterpret_cast<OutType*>(z->Data());
+
+  // Accumulate in int64_t so the product is not narrowed at every step.
+  const int64_t out_size =
+      std::accumulate(out_dims_array, out_dims_array + max_dim, int64_t{1},
+                      std::multiplies<int64_t>());
+  int x_index, y_index;
+  for (int64_t out_index = 0; out_index < out_size; ++out_index) {
+    x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data());
+    y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data());
+    out_data[out_index] = is_xsize_larger
+                              ? func(x_data[x_index], y_data[y_index])
+                              : func(y_data[y_index], x_data[x_index]);
+    UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data());
+  }
+}
+
+// Pads x/y dims to a common rank, allocates `z`, then runs the CPU kernel.
+template <typename Functor, typename T, typename OutType = T>
+void CommonElementwiseBroadcastForward(const FDTensor& x, const FDTensor& y,
+                                       FDTensor* z,
+                                       const std::vector<int64_t>& x_dims,
+                                       const std::vector<int64_t>& y_dims,
+                                       Functor func, int axis,
+                                       const bool is_xsize_larger = true) {
+  const int x_rank = x_dims.size();
+  const int y_rank = y_dims.size();
+  const int max_dim = (std::max)(x_rank, y_rank);
+  // axis == -1 aligns the lower-rank operand with the trailing dimensions.
+  axis = (axis == -1 ? std::abs(x_rank - y_rank) : axis);
+  FDASSERT(axis >= 0,
+           "Axis should be great than or equal to 0, but received axis is %d.",
+           axis);
+  FDASSERT(axis < max_dim,
+           "Axis should be less than %d, but received axis is %d.", max_dim,
+           axis);
+  // Padded extents of x, y and the output, each of length max_dim.
+  std::vector<int64_t> x_padded(max_dim), y_padded(max_dim), out_shape(max_dim);
+  GetBroadcastDimsArrays(x_dims, y_dims, x_padded.data(), y_padded.data(),
+                         out_shape.data(), max_dim, axis);
+  z->Allocate(out_shape, TypeToDataType<OutType>::dtype);
+  CommonForwardBroadcastCPU<Functor, T, OutType>(
+      x, y, z, x_padded.data(), y_padded.data(), out_shape.data(), max_dim,
+      func, is_xsize_larger);
+}
+
+// Broadcast entry point: validates `axis`, then runs the common broadcast.
+template <typename Functor, typename T, typename OutType = T>
+void ElementwiseCompute(const FDTensor& x, const FDTensor& y, int axis,
+                        Functor func, FDTensor* z) {
+  auto x_dims = x.Shape();
+  auto y_dims = y.Shape();
+  bool is_xsize_larger = true;
+  int max_dim = x_dims.size();
+  if (x_dims.size() < y_dims.size()) {
+    is_xsize_larger = false;
+    max_dim = y_dims.size();
+  }
+  // axis == -1 is replaced by the rank difference of the two operands.
+  int diff_size = x_dims.size() - y_dims.size();
+  axis = (axis == -1 ? std::abs(diff_size) : axis);
+  FDASSERT(axis >= 0,
+           "Axis should be great than or equal to 0, but received axis is %d.",
+           axis);
+  FDASSERT(axis < max_dim,
+           "Axis should be less than %d, but received axis is %d.", max_dim,
+           axis);
+  int pre, n, post, is_run_common_broadcast, axis_trim = 0;
+  if (is_xsize_larger) {
+    auto y_dims_trimed = TrimTrailingSingularDims(y_dims);
+    axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
+    GetMidDims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post,
+               &is_run_common_broadcast);
+  } else {
+    auto x_dims_trimed = TrimTrailingSingularDims(x_dims);
+    axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
+    GetMidDims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post,
+               &is_run_common_broadcast);
+  }
+  // NOTE(review): pre/n/post/is_run_common_broadcast are never read; the
+  // GetMidDims calls above only contribute their FDASSERT. All inputs, e.g.
+  // x=[2,3,1,5], y=[2,1,4,1] or x=[2,3,4], y=[1,1,4], take the common path.
+  CommonElementwiseBroadcastForward<Functor, T, OutType>(
+      x, y, z, x_dims, y_dims, func, axis, is_xsize_larger);
+}
+
+}  // namespace function
+}  // namespace fastdeploy
diff --git a/fastdeploy/function/elementwise_functor.h b/fastdeploy/function/elementwise_functor.h
new file mode 100644
index 000000000..6a0c02e71
--- /dev/null
+++ b/fastdeploy/function/elementwise_functor.h
@@ -0,0 +1,126 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fastdeploy/function/eigen.h"
+#include "fastdeploy/function/elementwise.h"
+#include "fastdeploy/function/elementwise_base.h"
+#include <algorithm>
+
+namespace fastdeploy {
+namespace function {
+
+template <typename Functor> struct SameDimsElementwiseCompute {
+  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
+    z->Allocate(x.Shape(), x.Dtype());  // output takes x's shape and dtype
+    Functor()(x, y, z);  // Functor writes the result into the allocated z
+  }
+};
+
+template <typename T> struct SameDimsAddFunctor {
+  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
+    auto lhs = EigenVector<T>::Flatten(x);
+    auto rhs = EigenVector<T>::Flatten(y);
+    auto result = EigenVector<T>::Flatten(*z);
+    const auto& device = *EigenDeviceWrapper::GetInstance()->GetDevice();
+    result.device(device) = lhs + rhs;  // flattened x + y written into *z
+  }
+};
+
+template <typename T> struct SameDimsSubtractFunctor {
+  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
+    auto lhs = EigenVector<T>::Flatten(x);
+    auto rhs = EigenVector<T>::Flatten(y);
+    auto result = EigenVector<T>::Flatten(*z);
+    const auto& device = *EigenDeviceWrapper::GetInstance()->GetDevice();
+    result.device(device) = lhs - rhs;  // flattened x - y written into *z
+  }
+};
+
+template <typename T> struct SameDimsMultiplyFunctor {
+  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
+    auto lhs = EigenVector<T>::Flatten(x);
+    auto rhs = EigenVector<T>::Flatten(y);
+    auto result = EigenVector<T>::Flatten(*z);
+    const auto& device = *EigenDeviceWrapper::GetInstance()->GetDevice();
+    result.device(device) = lhs * rhs;  // flattened x * y written into *z
+  }
+};
+
+template <typename T> struct SameDimsDivideFunctor {
+  void operator()(const FDTensor& x, const FDTensor& y, FDTensor* z) {
+    auto lhs = EigenVector<T>::Flatten(x);
+    auto rhs = EigenVector<T>::Flatten(y);
+    auto result = EigenVector<T>::Flatten(*z);
+    const auto& device = *EigenDeviceWrapper::GetInstance()->GetDevice();
+    result.device(device) = lhs / rhs;  // flattened x / y written into *z
+  }
+};
+
+// Add: per-element functors; the Inverse variant swaps the operand order.
+template <typename T> struct AddFunctor {
+  T operator()(const T lhs, const T rhs) const { return lhs + rhs; }
+};
+template <typename T> struct InverseAddFunctor {
+  T operator()(const T lhs, const T rhs) const { return rhs + lhs; }
+};
+
+// Subtract: per-element functors; the Inverse variant computes rhs - lhs.
+template <typename T> struct SubtractFunctor {
+  T operator()(const T lhs, const T rhs) const { return lhs - rhs; }
+};
+template <typename T> struct InverseSubtractFunctor {
+  T operator()(const T lhs, const T rhs) const { return rhs - lhs; }
+};
+
+// Multiply: per-element functors; bool is specialised to logical AND.
+template <typename T> struct MultiplyFunctor {
+  T operator()(const T lhs, const T rhs) const { return lhs * rhs; }
+};
+template <> struct MultiplyFunctor<bool> {
+  bool operator()(const bool lhs, const bool rhs) const { return lhs && rhs; }
+};
+template <typename T> struct InverseMultiplyFunctor {
+  T operator()(const T lhs, const T rhs) const { return rhs * lhs; }
+};
+template <> struct InverseMultiplyFunctor<bool> {
+  bool operator()(const bool lhs, const bool rhs) const { return rhs && lhs; }
+};
+
+// Divide: per-element functors; integral types assert a non-zero divisor.
+#define DIV_ERROR_INFO                                                         \
+  "InvalidArgumentError: Integer division by zero encountered in "             \
+  "(floor) divide. Please check the input value."
+
+template <typename T, typename Enable = void> struct DivideFunctor {
+  T operator()(const T lhs, const T rhs) const { return lhs / rhs; }
+};
+
+template <typename T>
+struct DivideFunctor<
+    T, typename std::enable_if<std::is_integral<T>::value>::type> {
+  T operator()(const T lhs, const T rhs) const {
+    // Integer division by zero is undefined, so reject a zero divisor first.
+    FDASSERT(rhs != 0, DIV_ERROR_INFO);
+    return lhs / rhs;
+  }
+};
+
+template <typename T, typename Enable = void> struct InverseDivideFunctor {
+  T operator()(const T lhs, const T rhs) const { return rhs / lhs; }
+};  // NOTE(review): no integer zero-divisor check here, unlike DivideFunctor
+
+}  // namespace function
+}  // namespace fastdeploy
diff --git a/fastdeploy/utils/utils.cc b/fastdeploy/utils/utils.cc
index 6e76c7888..d89b1d555 100644
--- a/fastdeploy/utils/utils.cc
+++ b/fastdeploy/utils/utils.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "fastdeploy/utils/utils.h"
+#include <sstream>
 
 namespace fastdeploy {
 
@@ -55,4 +56,14 @@ std::vector<int64_t> GetStride(const std::vector<int64_t>& dims) {
   return result;
 }
 
+// Formats `shape` as "[ d0 ,d1 ,... ]"; an empty shape yields "[  ]".
+std::string Str(const std::vector<int64_t>& shape) {
+  std::ostringstream oss;
+  oss << "[ ";
+  for (size_t i = 0; i < shape.size(); ++i)
+    oss << (i == 0 ? "" : " ,") << shape[i];  // never touches shape[0] if empty
+  oss << " ]";
+  return oss.str();
+}
+
 }  // namespace fastdeploy
diff --git a/fastdeploy/utils/utils.h b/fastdeploy/utils/utils.h
index 994ea9baa..45b0f57cd 100644
--- a/fastdeploy/utils/utils.h
+++ b/fastdeploy/utils/utils.h
@@ -14,15 +14,15 @@
 
 #pragma once
 
-#include <stdlib.h>
 #include <cstdio>
+#include <stdlib.h>
 
 #include <fstream>
 #include <iostream>
+#include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
-#include <numeric>
 
 #if defined(_WIN32)
 #ifdef FASTDEPLOY_LIB
@@ -45,8 +45,7 @@ class FASTDEPLOY_DECL FDLogger {
   }
   explicit FDLogger(bool verbose, const std::string& prefix = "[FastDeploy]");
 
-  template <typename T>
-  FDLogger& operator<<(const T& val) {
+  template <typename T> FDLogger& operator<<(const T& val) {
     if (!verbose_) {
       return *this;
     }
@@ -75,37 +74,37 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
 #define __REL_FILE__ __FILE__
 #endif
 
-#define FDERROR                                                \
-  FDLogger(true, "[ERROR]") << __REL_FILE__ << "(" << __LINE__ \
-                            << ")::" << __FUNCTION__ << "\t"
+#define FDERROR                                                                \
+  FDLogger(true, "[ERROR]")                                                    \
+      << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
 
-#define FDWARNING                                                \
-  FDLogger(true, "[WARNING]") << __REL_FILE__ << "(" << __LINE__ \
-                              << ")::" << __FUNCTION__ << "\t"
+#define FDWARNING                                                              \
+  FDLogger(true, "[WARNING]")                                                  \
+      << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
 
-#define FDINFO                                                \
-  FDLogger(true, "[INFO]") << __REL_FILE__ << "(" << __LINE__ \
+#define FDINFO                                                                 \
+  FDLogger(true, "[INFO]") << __REL_FILE__ << "(" << __LINE__                  \
                            << ")::" << __FUNCTION__ << "\t"
 
-#define FDASSERT(condition, format, ...)                        \
-  if (!(condition)) {                                           \
-    int n = std::snprintf(nullptr, 0, format, ##__VA_ARGS__);   \
-    std::vector<char> buffer(n + 1);                            \
-    std::snprintf(buffer.data(), n + 1, format, ##__VA_ARGS__); \
-    FDERROR << buffer.data() << std::endl;                      \
-    std::abort();                                               \
+#define FDASSERT(condition, format, ...)                                       \
+  if (!(condition)) {                                                          \
+    int n = std::snprintf(nullptr, 0, format, ##__VA_ARGS__);                  \
+    std::vector<char> buffer(n + 1);                                           \
+    std::snprintf(buffer.data(), n + 1, format, ##__VA_ARGS__);                \
+    FDERROR << buffer.data() << std::endl;                                     \
+    std::abort();                                                              \
   }
 
 ///////// Basic Marco ///////////
 
-#define FD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \
-  case enum_type: {                                                       \
-    using HINT = type;                                                    \
-    __VA_ARGS__();                                                        \
-    break;                                                                \
+#define FD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...)      \
+  case enum_type: {                                                            \
+    using HINT = type;                                                         \
+    __VA_ARGS__();                                                             \
+    break;                                                                     \
   }
 
-#define FD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \
+#define FD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...)                       \
   FD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, data_t, __VA_ARGS__)
 
 // Visit different data type to match the corresponding function of FDTensor
@@ -123,68 +122,70 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
                            __VA_ARGS__)                                        \
       FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,       \
                            __VA_ARGS__)                                        \
-      default:                                                                 \
-        FDASSERT(                                                              \
-            false,                                                             \
-            "Invalid enum data type. Expect to accept data type BOOL, INT32, " \
-            "INT64, FP32, FP64, but receive type %s.",                         \
-            Str(__dtype__).c_str());                                           \
+    default:                                                                   \
+      FDASSERT(false,                                                          \
+               "Invalid enum data type. Expect to accept data "                \
+               "type BOOL, INT32, "                                            \
+               "INT64, FP32, FP64, but receive type %s.",                      \
+               Str(__dtype__).c_str());                                        \
     }                                                                          \
   }()
 
-#define FD_VISIT_INT_FLOAT_TYPES(TYPE, NAME, ...)                             \
-  [&] {                                                                       \
-    const auto& __dtype__ = TYPE;                                             \
-    switch (__dtype__) {                                                      \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,    \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t,    \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float,       \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,      \
-                           __VA_ARGS__)                                       \
-      default:                                                                \
-        FDASSERT(false,                                                       \
-                 "Invalid enum data type. Expect to accept data type INT32, " \
-                 "INT64, FP32, FP64, but receive type %s.",                   \
-                 Str(__dtype__).c_str());                                     \
-    }                                                                         \
+#define FD_VISIT_INT_FLOAT_TYPES(TYPE, NAME, ...)                              \
+  [&] {                                                                        \
+    const auto& __dtype__ = TYPE;                                              \
+    switch (__dtype__) {                                                       \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,     \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t,     \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float,        \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,       \
+                           __VA_ARGS__)                                        \
+    default:                                                                   \
+      FDASSERT(false,                                                          \
+               "Invalid enum data type. Expect to accept data type INT32, "    \
+               "INT64, FP32, FP64, but receive type %s.",                      \
+               Str(__dtype__).c_str());                                        \
+    }                                                                          \
   }()
 
-#define FD_VISIT_FLOAT_TYPES(TYPE, NAME, ...)                                \
-  [&] {                                                                      \
-    const auto& __dtype__ = TYPE;                                            \
-    switch (__dtype__) {                                                     \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float,      \
-                           __VA_ARGS__)                                      \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,     \
-                           __VA_ARGS__)                                      \
-      default:                                                               \
-        FDASSERT(false,                                                      \
-                 "Invalid enum data type. Expect to accept data type FP32, " \
-                 "FP64, but receive type %s.",                               \
-                 Str(__dtype__).c_str());                                    \
-    }                                                                        \
+#define FD_VISIT_FLOAT_TYPES(TYPE, NAME, ...)                                  \
+  [&] {                                                                        \
+    const auto& __dtype__ = TYPE;                                              \
+    switch (__dtype__) {                                                       \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float,        \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,       \
+                           __VA_ARGS__)                                        \
+    default:                                                                   \
+      FDASSERT(false,                                                          \
+               "Invalid enum data type. Expect to accept data type FP32, "     \
+               "FP64, but receive type %s.",                                   \
+               Str(__dtype__).c_str());                                        \
+    }                                                                          \
   }()
 
-#define FD_VISIT_INT_TYPES(TYPE, NAME, ...)                                   \
-  [&] {                                                                       \
-    const auto& __dtype__ = TYPE;                                             \
-    switch (__dtype__) {                                                      \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,    \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t,    \
-                           __VA_ARGS__)                                       \
-      default:                                                                \
-        FDASSERT(false,                                                       \
-                 "Invalid enum data type. Expect to accept data type INT32, " \
-                 "INT64, but receive type %s.",                               \
-                 Str(__dtype__).c_str());                                     \
-    }                                                                         \
+#define FD_VISIT_INT_TYPES(TYPE, NAME, ...)                                    \
+  [&] {                                                                        \
+    const auto& __dtype__ = TYPE;                                              \
+    switch (__dtype__) {                                                       \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,     \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t,     \
+                           __VA_ARGS__)                                        \
+    default:                                                                   \
+      FDASSERT(false,                                                          \
+               "Invalid enum data type. Expect to accept data type INT32, "    \
+               "INT64, but receive type %s.",                                  \
+               Str(__dtype__).c_str());                                        \
+    }                                                                          \
   }()
 
-FASTDEPLOY_DECL std::vector<int64_t> GetStride(
-    const std::vector<int64_t>& dims);
+FASTDEPLOY_DECL std::vector<int64_t>
+GetStride(const std::vector<int64_t>& dims);
+
+FASTDEPLOY_DECL std::string Str(const std::vector<int64_t>& shape);
 
 }  // namespace fastdeploy
diff --git a/tests/core/test_fd_tensor.cc b/tests/core/test_fd_tensor.cc
index ad4d639e4..286d14579 100644
--- a/tests/core/test_fd_tensor.cc
+++ b/tests/core/test_fd_tensor.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "fastdeploy/core/fd_tensor.h"
+#include "gtest_utils.h"
+#include "gtest/gtest.h"
 #include <array>
 #include <cstring>
 #include <vector>
-#include "fastdeploy/core/fd_tensor.h"
-#include "gtest/gtest.h"
-#include "gtest_utils.h"
 
 namespace fastdeploy {
 
@@ -86,4 +86,18 @@ TEST(fastdeploy, fd_tensor_assignment) {
   ASSERT_EQ(tensor1.Data(), nullptr);
 }
 
-}  // namespace fastdeploy
\ No newline at end of file
+TEST(fastdeploy, fd_tensor_reshape) {
+  CheckShape check_shape;
+  FDTensor x;
+  x.Allocate({2, 3, 4, 5}, FDDataType::FP32);
+  x.Reshape({-1, 3, 2, 2, 5});  // -1 marks the one unknown dim; expected 2
+  check_shape(x.Shape(), {2, 3, 2, 2, 5});
+
+  x.Reshape({0, -1, 5, 2});  // 0 at index 0 is expected to keep the current 2
+  check_shape(x.Shape(), {2, 6, 5, 2});
+
+  x.Reshape({2, 3, 0, 0, 2});  // 0s at indices 2, 3 expected to keep 5 and 2
+  check_shape(x.Shape(), {2, 3, 5, 2, 2});
+}
+
+}  // namespace fastdeploy
diff --git a/tests/function/test_elementwise.cc b/tests/function/test_elementwise.cc
new file mode 100644
index 000000000..6319f37cc
--- /dev/null
+++ b/tests/function/test_elementwise.cc
@@ -0,0 +1,451 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/function/elementwise.h"
+#include "glog/logging.h"
+#include "gtest_utils.h"
+#include "gtest/gtest.h"
+#include <array>
+#include <tuple>
+#include <vector>
+
+namespace fastdeploy {
+namespace function {
+
+std::tuple<std::vector<float>, std::vector<float>> CreateSameDimeData() {
+  // x operand: 24 values, fed to the tests below as shape [2, 3, 4]
+  std::vector<float> x_data = {
+      0.8428625,  0.6461913, 0.13740455, 0.11430702, 0.659926,  0.535816,
+      0.7429162,  0.8456049, 0.21228176, 0.29970083, 0.8621713, 0.40894133,
+      0.12684688, 0.1566195, 0.42884097, 0.8476526,  0.2458633, 0.669046,
+      0.87888306, 0.6762589, 0.666453,   0.32523027, 0.4139388, 0.8341406};
+  // y operand: 24 values, same shape [2, 3, 4] as x (no broadcasting needed)
+  std::vector<float> y_data = {
+      0.8345295,  0.551608,   0.77101785, 0.386742,   0.12658621, 0.41240612,
+      0.20051356, 0.68455917, 0.37947154, 0.2953741,  0.97703844, 0.2931625,
+      0.2344262,  0.5054064,  0.40617892, 0.16315177, 0.71458364, 0.3748885,
+      0.65257984, 0.83870554, 0.55464447, 0.38836837, 0.472637,   0.5546991};
+  return std::make_tuple(x_data, y_data);  // element 0 is x, element 1 is y
+}
+
+std::tuple<std::vector<float>, std::vector<float>> CreateBroadcastDim1Data() {
+  // x operand: 24 values, used by the tests as shape [2, 3, 4]
+  std::vector<float> x_data = {
+      0.8428625,  0.6461913, 0.13740455, 0.11430702, 0.659926,  0.535816,
+      0.7429162,  0.8456049, 0.21228176, 0.29970083, 0.8621713, 0.40894133,
+      0.12684688, 0.1566195, 0.42884097, 0.8476526,  0.2458633, 0.669046,
+      0.87888306, 0.6762589, 0.666453,   0.32523027, 0.4139388, 0.8341406};
+  // y operand: 2 values, used by the tests as shape [2, 1, 1]
+  std::vector<float> y_data = {0.97375137, 0.11732706};
+  return std::make_tuple(x_data, y_data);  // element 0 is x, element 1 is y
+}
+
+std::tuple<std::vector<float>, std::vector<float>> CreateBroadcastDim2Data() {
+  // x operand: 24 values, used by the tests as shape [2, 3, 4]
+  std::vector<float> x_data = {
+      0.8428625,  0.6461913, 0.13740455, 0.11430702, 0.659926,  0.535816,
+      0.7429162,  0.8456049, 0.21228176, 0.29970083, 0.8621713, 0.40894133,
+      0.12684688, 0.1566195, 0.42884097, 0.8476526,  0.2458633, 0.669046,
+      0.87888306, 0.6762589, 0.666453,   0.32523027, 0.4139388, 0.8341406};
+  // y operand: 3 values, used by the tests as shape [1, 3, 1]
+  std::vector<float> y_data = {0.30803263, 0.41172066, 0.5588573};
+  return std::make_tuple(x_data, y_data);  // element 0 is x, element 1 is y
+}
+
+std::tuple<std::vector<float>, std::vector<float>> CreateBroadcastDim3Data() {
+  // x operand: 24 values, used by the tests as shape [2, 3, 4]
+  std::vector<float> x_data = {
+      0.8428625,  0.6461913, 0.13740455, 0.11430702, 0.659926,  0.535816,
+      0.7429162,  0.8456049, 0.21228176, 0.29970083, 0.8621713, 0.40894133,
+      0.12684688, 0.1566195, 0.42884097, 0.8476526,  0.2458633, 0.669046,
+      0.87888306, 0.6762589, 0.666453,   0.32523027, 0.4139388, 0.8341406};
+  // y: 4 values, passed as shape [4] or [1, 1, 4] by the tests in this file
+  std::vector<float> y_data = {0.62653106, 0.5128424, 0.9891219, 0.32416528};
+  return std::make_tuple(x_data, y_data);  // element 0 is x, element 1 is y
+}
+
+std::tuple<std::vector<float>, std::vector<float>> CreateBroadcastDim4Data() {
+  // x operand: 8 values, used by the test as shape [2, 1, 4]
+  std::vector<float> x_data = {0.8428625, 0.6461913, 0.13740455, 0.11430702,
+                               0.659926,  0.535816,  0.7429162,  0.8456049};
+  // y operand: 4 values, used by the test as shape [2, 2, 1]
+  std::vector<float> y_data = {0.62653106, 0.5128424, 0.9891219, 0.32416528};
+  return std::make_tuple(x_data, y_data);  // element 0 is x, element 1 is y
+}
+
+TEST(fastdeploy, check_same_dim) {  // x and y share shape [2, 3, 4]
+  CheckShape check_shape;
+  CheckData check_data;
+  FDTensor x, y, z;
+
+  auto test_data = CreateSameDimeData();
+  auto x_data = std::get<0>(test_data);
+  auto y_data = std::get<1>(test_data);
+  x.SetExternalData({2, 3, 4}, FDDataType::FP32, x_data.data());
+  y.SetExternalData({2, 3, 4}, FDDataType::FP32, y_data.data());
+
+  // Test add: Add(x, y, &z) first, then the same expectation via operator+
+  std::vector<float> add_result = {
+      1.677392,   1.1977993,  0.9084224, 0.50104904, 0.7865122,  0.94822216,
+      0.94342977, 1.530164,   0.5917533, 0.5950749,  1.8392098,  0.70210385,
+      0.36127308, 0.66202587, 0.8350199, 1.0108044,  0.96044695, 1.0439345,
+      1.5314629,  1.5149645,  1.2210975, 0.7135986,  0.8865758,  1.3888397};
+
+  Add(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+  z = x + y;
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+  // Test subtract: Subtract(x, y, &z), then operator-
+  std::vector<float> sub_result = {
+      0.008332968, 0.09458327,  -0.6336133,   -0.27243498, 0.5333398,
+      0.1234099,   0.5424027,   0.16104573,   -0.16718978, 0.004326731,
+      -0.11486715, 0.11577883,  -0.10757932,  -0.3487869,  0.022662044,
+      0.6845008,   -0.46872032, 0.29415748,   0.22630322,  -0.16244662,
+      0.11180854,  -0.0631381,  -0.058698207, 0.27944148};
+  Subtract(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+  z = x - y;
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+
+  // Test multiply: Multiply(x, y, &z), then operator*
+  std::vector<float> mul_result = {
+      0.70339364, 0.3564443,  0.105941355, 0.044207327, 0.083537534,
+      0.2209738,  0.14896478, 0.5788666,   0.08055489,  0.08852386,
+      0.8423745,  0.11988626, 0.029736232, 0.079156496, 0.17418616,
+      0.13829602, 0.17568989, 0.25081766,  0.57354134,  0.5671821,
+      0.36964446, 0.12630916, 0.19564278,  0.46269706};
+  Multiply(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
+             mul_result.size());
+  z = x * y;
+  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
+             mul_result.size());
+
+  // Test divide: Divide(x, y, &z), then operator/
+  std::vector<float> div_result = {
+      1.0099852,  1.1714683,  0.17821188, 0.29556403, 5.2132535,  1.2992436,
+      3.7050674,  1.2352546,  0.5594142,  1.0146483,  0.88243335, 1.3949306,
+      0.54109514, 0.30988827, 1.0557933,  5.195485,   0.34406513, 1.7846532,
+      1.3467824,  0.8063127,  1.201586,   0.8374273,  0.875807,   1.5037713};
+  Divide(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
+             div_result.size());
+  z = x / y;
+  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
+             div_result.size());
+}
+
+TEST(fastdeploy, check_broadcast_dim1) {  // x [2, 3, 4] with y [2, 1, 1]
+  CheckShape check_shape;
+  CheckData check_data;
+  FDTensor x, y, z;
+
+  auto test_data = CreateBroadcastDim1Data();
+  auto x_data = std::get<0>(test_data);
+  auto y_data = std::get<1>(test_data);
+  x.SetExternalData({2, 3, 4}, FDDataType::FP32, x_data.data());
+  y.SetExternalData({2, 1, 1}, FDDataType::FP32, y_data.data());
+
+  // Test add: Add(x, y, &z) first, then the same expectation via operator+
+  std::vector<float> add_result = {
+      1.816614, 1.619943, 1.111156, 1.088058, 1.633677, 1.509567,
+      1.716668, 1.819356, 1.186033, 1.273452, 1.835923, 1.382693,
+      0.244174, 0.273947, 0.546168, 0.96498,  0.36319,  0.786373,
+      0.99621,  0.793586, 0.78378,  0.442557, 0.531266, 0.951468};
+
+  Add(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+  z = x + y;
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+
+  // Test subtract: Subtract(x, y, &z), then operator-
+  std::vector<float> sub_result = {
+      -0.130889, -0.32756,  -0.836347, -0.859444, -0.313825, -0.437935,
+      -0.230835, -0.128146, -0.76147,  -0.674051, -0.11158,  -0.56481,
+      0.00952,   0.039292,  0.311514,  0.730326,  0.128536,  0.551719,
+      0.761556,  0.558932,  0.549126,  0.207903,  0.296612,  0.716814};
+  Subtract(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+  z = x - y;
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+
+  // Test multiply: Multiply(x, y, &z), then operator*
+  std::vector<float> mul_result = {
+      0.820738, 0.62923,  0.133798, 0.111307, 0.642604, 0.521752,
+      0.723416, 0.823409, 0.20671,  0.291834, 0.83954,  0.398207,
+      0.014883, 0.018376, 0.050315, 0.099453, 0.028846, 0.078497,
+      0.103117, 0.079343, 0.078193, 0.038158, 0.048566, 0.097867};
+  Multiply(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
+             mul_result.size());
+  z = x * y;
+  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
+             mul_result.size());
+
+  // Test divide: Divide(x, y, &z), then operator/
+  std::vector<float> div_result = {
+      0.865583, 0.66361,  0.141108, 0.117388, 0.677715, 0.55026,
+      0.762942, 0.868399, 0.218004, 0.30778,  0.885412, 0.419965,
+      1.081139, 1.334897, 3.65509,  7.224699, 2.095538, 5.702402,
+      7.490881, 5.763879, 5.680301, 2.771997, 3.528076, 7.109533};
+  Divide(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
+             div_result.size());
+  z = x / y;
+  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
+             div_result.size());
+}
+
+TEST(fastdeploy, check_broadcast_dim2) {  // x [2, 3, 4] with y [1, 3, 1]
+  CheckShape check_shape;
+  CheckData check_data;
+  FDTensor x, y, z;
+
+  auto test_data = CreateBroadcastDim2Data();
+  auto x_data = std::get<0>(test_data);
+  auto y_data = std::get<1>(test_data);
+  x.SetExternalData({2, 3, 4}, FDDataType::FP32, x_data.data());
+  y.SetExternalData({1, 3, 1}, FDDataType::FP32, y_data.data());
+
+  // Test add: Add(x, y, &z) first, then the same expectation via operator+
+  std::vector<float> add_result = {
+      1.150895, 0.954224, 0.445437, 0.42234,  1.071647, 0.947537,
+      1.154637, 1.257326, 0.771139, 0.858558, 1.421029, 0.967799,
+      0.43488,  0.464652, 0.736874, 1.155685, 0.657584, 1.080767,
+      1.290604, 1.08798,  1.22531,  0.884088, 0.972796, 1.392998};
+
+  Add(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+  z = x + y;
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+
+  // Test subtract: Subtract(x, y, &z), then operator-
+  std::vector<float> sub_result = {
+      0.53483,   0.338159,  -0.170628, -0.193726, 0.248205,  0.124095,
+      0.331196,  0.433884,  -0.346576, -0.259156, 0.303314,  -0.149916,
+      -0.181186, -0.151413, 0.120808,  0.53962,   -0.165857, 0.257325,
+      0.467162,  0.264538,  0.107596,  -0.233627, -0.144919, 0.275283};
+  Subtract(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+  z = x - y;
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+  // Test multiply (Multiply() only; operator* is not exercised in this test)
+  std::vector<float> mul_result = {
+      0.259629, 0.199048, 0.042325, 0.03521,  0.271705, 0.220607,
+      0.305874, 0.348153, 0.118635, 0.16749,  0.481831, 0.22854,
+      0.039073, 0.048244, 0.132097, 0.261105, 0.101227, 0.27546,
+      0.361854, 0.27843,  0.372452, 0.181757, 0.231333, 0.466166};
+  Multiply(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
+             mul_result.size());
+
+  // Test divide (Divide() only; operator/ is not exercised in this test)
+  std::vector<float> div_result = {
+      2.736277, 2.097801, 0.446071, 0.371087, 1.602849, 1.301407,
+      1.804418, 2.053832, 0.37985,  0.536274, 1.54274,  0.731745,
+      0.411797, 0.508451, 1.392193, 2.751827, 0.59716,  1.625,
+      2.134659, 1.642519, 1.192528, 0.581956, 0.740688, 1.492582};
+  Divide(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
+             div_result.size());
+}
+
+TEST(fastdeploy, check_broadcast_dim3) {  // x [2, 3, 4] with rank-1 y [4]
+  CheckShape check_shape;
+  CheckData check_data;
+  FDTensor x, y, z;
+
+  auto test_data = CreateBroadcastDim3Data();
+  auto x_data = std::get<0>(test_data);
+  auto y_data = std::get<1>(test_data);
+  x.SetExternalData({2, 3, 4}, FDDataType::FP32, x_data.data());
+  y.SetExternalData({4}, FDDataType::FP32, y_data.data());  // rank 1, not 3
+
+  // Test add: Add(x, y, &z) first, then the same expectation via operator+
+  std::vector<float> add_result = {
+      1.469393, 1.159034, 1.126526, 0.438472, 1.286457, 1.048658,
+      1.732038, 1.16977,  0.838813, 0.812543, 1.851293, 0.733107,
+      0.753378, 0.669462, 1.417963, 1.171818, 0.872394, 1.181888,
+      1.868005, 1.000424, 1.292984, 0.838073, 1.403061, 1.158306};
+
+  Add(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+  z = x + y;
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+
+  // Test subtract: Subtract(x, y, &z), then operator-
+  std::vector<float> sub_result = {
+      0.216331,  0.133349,  -0.851717, -0.209858, 0.033395,  0.022974,
+      -0.246206, 0.52144,   -0.414249, -0.213142, -0.126951, 0.084776,
+      -0.499684, -0.356223, -0.560281, 0.523487,  -0.380668, 0.156204,
+      -0.110239, 0.352094,  0.039922,  -0.187612, -0.575183, 0.509975};
+  Subtract(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+  z = x - y;
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+  // Test multiply: Multiply(x, y, &z), then operator*
+  std::vector<float> mul_result = {
+      0.52808,  0.331394, 0.13591,  0.037054, 0.413464, 0.274789,
+      0.734835, 0.274116, 0.133001, 0.153699, 0.852793, 0.132565,
+      0.079474, 0.080321, 0.424176, 0.27478,  0.154041, 0.343115,
+      0.869322, 0.21922,  0.417554, 0.166792, 0.409436, 0.270399};
+  Multiply(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
+             mul_result.size());
+  z = x * y;
+  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
+             mul_result.size());
+  // Test divide: Divide(x, y, &z), then operator/
+  std::vector<float> div_result = {
+      1.345284, 1.260019, 0.138916, 0.35262,  1.053301, 1.044797,
+      0.751087, 2.608561, 0.338821, 0.584392, 0.871653, 1.261521,
+      0.202459, 0.305395, 0.433557, 2.614878, 0.39242,  1.304584,
+      0.888549, 2.086155, 1.063719, 0.634172, 0.418491, 2.573195};
+  Divide(x, y, &z);
+  check_shape(z.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
+             div_result.size());
+  z = x / y;
+  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
+             div_result.size());
+}
+
+TEST(fastdeploy, check_broadcast_dim4) {  // x [2, 1, 4] with y [2, 2, 1]
+  CheckShape check_shape;
+  CheckData check_data;
+  FDTensor x, y, z;
+
+  auto test_data = CreateBroadcastDim4Data();
+  auto x_data = std::get<0>(test_data);
+  auto y_data = std::get<1>(test_data);
+  x.SetExternalData({2, 1, 4}, FDDataType::FP32, x_data.data());
+  y.SetExternalData({2, 2, 1}, FDDataType::FP32, y_data.data());
+
+  // Test add: both operands differ from the expected [2, 2, 4] result shape
+  std::vector<float> add_result = {1.469393, 1.272722, 0.763936, 0.740838,
+                                   1.355705, 1.159034, 0.650247, 0.627149,
+                                   1.649048, 1.524938, 1.732038, 1.834727,
+                                   0.984091, 0.859981, 1.067081, 1.16977};
+
+  Add(x, y, &z);
+  check_shape(z.shape, {2, 2, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+
+  z = x + y;
+  check_data(reinterpret_cast<const float*>(z.Data()), add_result.data(),
+             add_result.size());
+  // Test subtract: Subtract(x, y, &z), then operator-
+  std::vector<float> sub_result = {0.216331,  0.01966,   -0.489127, -0.512224,
+                                   0.33002,   0.133349,  -0.375438, -0.398535,
+                                   -0.329196, -0.453306, -0.246206, -0.143517,
+                                   0.335761,  0.211651,  0.418751,  0.52144};
+  Subtract(x, y, &z);
+  check_shape(z.shape, {2, 2, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+  z = x - y;
+  check_data(reinterpret_cast<const float*>(z.Data()), sub_result.data(),
+             sub_result.size());
+  // Test multiply: Multiply(x, y, &z), then operator*
+  std::vector<float> mul_result = {0.52808,  0.404859, 0.086088, 0.071617,
+                                   0.432256, 0.331394, 0.070467, 0.058621,
+                                   0.652747, 0.529987, 0.734835, 0.836406,
+                                   0.213925, 0.173693, 0.240828, 0.274116};
+  Multiply(x, y, &z);
+  check_shape(z.shape, {2, 2, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
+             mul_result.size());
+  z = x * y;
+  check_data(reinterpret_cast<const float*>(z.Data()), mul_result.data(),
+             mul_result.size());
+
+  // Test divide: Divide(x, y, &z), then operator/
+  std::vector<float> div_result = {1.345284, 1.031379, 0.21931,  0.182444,
+                                   1.643512, 1.260019, 0.267927, 0.222889,
+                                   0.667184, 0.541709, 0.751087, 0.854905,
+                                   2.03577,  1.65291,  2.291782, 2.608561};
+  Divide(x, y, &z);
+  check_shape(z.shape, {2, 2, 4});
+  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
+             div_result.size());
+  z = x / y;
+  check_data(reinterpret_cast<const float*>(z.Data()), div_result.data(),
+             div_result.size());
+}
+
+TEST(fastdeploy, mixed_operation) {  // chains *, +, / and - in one expression
+  CheckShape check_shape;
+  CheckData check_data;
+  FDTensor a, b, c, d, e, output;
+
+  auto test_data = CreateSameDimeData();
+  auto a_data = std::get<0>(test_data);
+  auto b_data = std::get<1>(test_data);
+  auto c_data = std::get<1>(CreateBroadcastDim1Data());
+  auto d_data = std::get<1>(CreateBroadcastDim2Data());
+  auto e_data = std::get<1>(CreateBroadcastDim3Data());
+
+  a.SetExternalData({2, 3, 4}, FDDataType::FP32, a_data.data());
+  b.SetExternalData({2, 3, 4}, FDDataType::FP32, b_data.data());
+  c.SetExternalData({2, 1, 1}, FDDataType::FP32, c_data.data());
+  d.SetExternalData({1, 3, 1}, FDDataType::FP32, d_data.data());
+  e.SetExternalData({1, 1, 4}, FDDataType::FP32, e_data.data());
+
+  std::vector<float> result = {
+      3.238058,  3.004797,  2.278015,  2.881238,  1.822084,  2.073209,
+      1.524921,  2.619779,  1.196421,  1.318079,  1.59565,   1.538118,
+      -0.215903, -0.052794, -0.434044, 0.195022,  -0.165874, 0.022943,
+      -0.130613, 0.527984,  -0.046946, -0.176592, -0.583538, 0.348473};
+
+  output = a * b + c / d - e;  // operands of four different shapes
+  check_shape(output.shape, {2, 3, 4});
+  check_data(reinterpret_cast<const float*>(output.Data()), result.data(),
+             result.size());
+}
+
+}  // namespace function
+}  // namespace fastdeploy
\ No newline at end of file