[Sync][Internal] sync some internal paddle3d codes (#2108)

This commit is contained in:
DefTruth
2023-07-13 22:06:28 +08:00
committed by GitHub
parent 77ee48f9b8
commit 681ccc4c24
30 changed files with 2517 additions and 45 deletions

View File

@@ -462,7 +462,7 @@ endif()
if(ENABLE_ENCRYPTION)
add_definitions(-DENABLE_ENCRYPTION)
list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_ENCRYPTION_SRCS})
include(${PROJECT_SOURCE_DIR}/cmake/gflags.cmake)
# include(${PROJECT_SOURCE_DIR}/cmake/gflags.cmake)
include(${PROJECT_SOURCE_DIR}/cmake/openssl.cmake)
list(APPEND DEPEND_LIBS ${OPENSSL_LIBRARIES})
endif()

View File

@@ -21,6 +21,7 @@ add_executable(benchmark_structurev2_table ${PROJECT_SOURCE_DIR}/benchmark_struc
add_executable(benchmark_structurev2_layout ${PROJECT_SOURCE_DIR}/benchmark_structurev2_layout.cc)
add_executable(benchmark_ppshituv2_rec ${PROJECT_SOURCE_DIR}/benchmark_ppshituv2_rec.cc)
add_executable(benchmark_ppshituv2_det ${PROJECT_SOURCE_DIR}/benchmark_ppshituv2_det.cc)
add_executable(benchmark_pp3d_cadnn ${PROJECT_SOURCE_DIR}/benchmark_pp3d_cadnn.cc)
add_executable(benchmark_pp3d_centerpoint ${PROJECT_SOURCE_DIR}/benchmark_pp3d_centerpoint.cc)
if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
@@ -34,6 +35,7 @@ if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
target_link_libraries(benchmark_structurev2_layout ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_ppshituv2_rec ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_ppshituv2_det ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_pp3d_cadnn ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_pp3d_centerpoint ${FASTDEPLOY_LIBS} gflags pthread)
else()
target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags)
@@ -46,6 +48,7 @@ else()
target_link_libraries(benchmark_structurev2_layout ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_ppshituv2_rec ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_ppshituv2_det ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_pp3d_cadnn ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_pp3d_centerpoint ${FASTDEPLOY_LIBS} gflags)
endif()
# only for Android ADB test

View File

@@ -41,9 +41,17 @@ fi
# PP-ShiTuV2
./benchmark_ppshituv2_rec --model PP-ShiTuv2-rec --image ppshituv2_wangzai.png --config_path $CONFIG_PATH
./benchmark_ppshituv2_det --model PP-ShiTuv2-det --image ppdet_det_img.jpg --config_path $CONFIG_PATH
# PP-StructureV2
./benchmark_structurev2_layout --model PP-Structurev2-layout --image structurev2_layout_val_0002.jpg --config_path $CONFIG_PATH
./benchmark_structurev2_table --model PP-Structurev2-SLANet --image structurev2_table.jpg --table_char_dict_path table_structure_dict_ch.txt --config_path $CONFIG_PATH
./benchmark --model PP-Structurev2-vi-layoutxlm --shapes "1,512:1,512,4:1,512:1,512" --trt_shapes "1,512:1,512:1,512:1,512,4:1,512,4:1,512,4:1,512:1,512:1,512:1,512:1,512:1,512" --names "x_0:x_1:x_2:x_3" --dtypes "INT64:INT64:INT64:INT64" --disable_mkldnn --custom_tensor_value 0.2 --config_path $CONFIG_PATH
# Paddle3D
./benchmark --model PETRv1_v99 --shapes "1,6,3,320,800:1,6,4,4" --names "images:img2lidars" --dtypes "FP32:FP32" --disable_mkldnn --config_path $CONFIG_PATH
./benchmark --model PETRv2_v99 --shapes "1,12,3,320,800:1,12,4,4:1,12" --names "images:img2lidars:timestamps" --dtypes "FP32:FP32:FP32" --disable_mkldnn --config_path $CONFIG_PATH
./benchmark_pp3d_centerpoint --model CenterPoint-Pillars-02Voxel --image paddle3d_centerpoint_n008_LIDAR_TOP__1533151603547590.pcd.bin --config_path $CONFIG_PATH
./benchmark_pp3d_cadnn --model CADNN_OCRNet-HRNetW18 --image paddle3d_cadnn_kitti_000780.png --config_path $CONFIG_PATH
set +x
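Note: the config file behind --config_path is parsed by benchmark::ResultManager::LoadBenchmarkConfig into a string-to-string map; the only key the new Paddle3D binaries branch on is "backend" (values "paddle_trt" and "trt" in benchmark_pp3d_cadnn.cc below). A minimal sketch of such a file, with every key name other than backend assumed for illustration:

# config.gpu.paddle_trt.txt (illustrative; the exact key set is an assumption)
device: gpu
device_id: 0
backend: paddle_trt
warmup: 200
repeat: 1000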

View File

@@ -43,9 +43,17 @@ fi
# PP-ShiTuV2
./benchmark_ppshituv2_rec --model PP-ShiTuv2-rec --image ppshituv2_wangzai.png --config_path $CONFIG_PATH
./benchmark_ppshituv2_det --model PP-ShiTuv2-det --image ppdet_det_img.jpg --config_path $CONFIG_PATH
# PP-StructureV2
./benchmark_structurev2_layout --model PP-Structurev2-layout --image structurev2_layout_val_0002.jpg --config_path $CONFIG_PATH
./benchmark_structurev2_table --model PP-Structurev2-SLANet --image structurev2_table.jpg --table_char_dict_path table_structure_dict_ch.txt --config_path $CONFIG_PATH
./benchmark --model PP-Structurev2-vi-layoutxlm --shapes "1,512:1,512,4:1,512:1,512" --trt_shapes "1,512:1,512:1,512:1,512,4:1,512,4:1,512,4:1,512:1,512:1,512:1,512:1,512:1,512" --names "x_0:x_1:x_2:x_3" --dtypes "INT64:INT64:INT64:INT64" --disable_mkldnn --custom_tensor_value 0.2 --collect_trt_shape_by_custom_tensor_value --collect_trt_shape_by_device --config_path $CONFIG_PATH
# Paddle3D
./benchmark --model PETRv1_v99 --shapes "1,6,3,320,800:1,6,4,4" --trt_shapes "1,6,3,320,800:1,6,3,320,800:1,6,3,320,800:1,6,4,4:1,6,4,4:1,6,4,4" --names "images:img2lidars" --dtypes "FP32:FP32" --disable_mkldnn --config_path $CONFIG_PATH
./benchmark --model PETRv2_v99 --shapes "1,12,3,320,800:1,12,4,4:1,12" --trt_shapes "1,12,3,320,800:1,12,3,320,800:1,12,3,320,800:1,12,4,4:1,12,4,4:1,12,4,4:1,12:1,12:1,12" --names "images:img2lidars:timestamps" --dtypes "FP32:FP32:FP32" --disable_mkldnn --config_path $CONFIG_PATH
./benchmark_pp3d_centerpoint --model CenterPoint-Pillars-02Voxel --image paddle3d_centerpoint_n008_LIDAR_TOP__1533151603547590.pcd.bin --config_path $CONFIG_PATH
./benchmark_pp3d_cadnn --model CADNN_OCRNet-HRNetW18 --image paddle3d_cadnn_kitti_000780.png --config_path $CONFIG_PATH
set +x

View File

@@ -0,0 +1,81 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "flags.h"
#include "macros.h"
#include "option.h"
namespace vision = fastdeploy::vision;
namespace benchmark = fastdeploy::benchmark;
int main(int argc, char* argv[]) {
#if defined(ENABLE_BENCHMARK) && defined(ENABLE_VISION)
// Initialization
auto option = fastdeploy::RuntimeOption();
if (!CreateRuntimeOption(&option, argc, argv, true)) {
return -1;
}
auto im = cv::imread(FLAGS_image);
std::unordered_map<std::string, std::string> config_info;
benchmark::ResultManager::LoadBenchmarkConfig(FLAGS_config_path,
&config_info);
std::string model_name, params_name, config_name;
auto model_format = fastdeploy::ModelFormat::PADDLE;
if (!UpdateModelResourceName(&model_name, &params_name, &config_name,
&model_format, config_info, false)) {
return -1;
}
auto model_file = FLAGS_model + sep + model_name;
auto params_file = FLAGS_model + sep + params_name;
std::vector<float> cam_data{7.183351e+02, 0.000000e+00, 6.003891e+02,
4.450382e+01, 0.000000e+00, 7.183351e+02,
1.815122e+02, -5.951107e-01, 0.000000e+00,
0.000000e+00, 1.000000e+00, 2.616315e-03};
std::vector<float> lidar_data = {
0.0048523, -0.9999298, -0.01081266, -0.00711321,
-0.00302069, 0.01079808, -0.99993706, -0.06176636,
0.99998367, 0.00488465, -0.00296808, -0.26739058,
0., 0., 0., 1.};
if (config_info["backend"] == "paddle_trt") {
option.paddle_infer_option.collect_trt_shape = true;
option.paddle_infer_option.collect_trt_shape_by_device = true;
option.paddle_infer_option.trt_min_subgraph_size = 12;
option.paddle_infer_option.DisableTrtOps({"squeeze2"});
option.trt_option.max_batch_size = 1;
}
if (config_info["backend"] == "paddle_trt" ||
config_info["backend"] == "trt") {
    // Use custom data to collect TensorRT shapes.
option.trt_option.SetShape("images", {1, 3, 375, 1242},
{1, 3, 375, 1242}, {1, 3, 375, 1242});
option.trt_option.SetShape("trans_lidar_to_cam", {1, 4, 4},
{1, 4, 4}, {1, 4, 4});
option.trt_option.SetShape("trans_cam_to_img", {1, 3, 4},
{1, 3, 4}, {1, 3, 4});
    std::vector<float> image_data;
    // Copy the raw uchar pixels (value-converted to float) just to fill the
    // "images" tensor for shape collection; the values themselves are irrelevant.
    image_data.assign(im.data, im.data + 1*3*375*1242);
option.trt_option.SetInputData("trans_lidar_to_cam", lidar_data);
option.trt_option.SetInputData("trans_cam_to_img", cam_data);
option.trt_option.SetInputData("images", image_data);
}
auto model_cadnn = vision::perception::Caddn(
model_file, params_file, "", option, model_format);
vision::PerceptionResult res;
// Run profiling
BENCHMARK_MODEL(model_cadnn, model_cadnn.Predict(im, cam_data, lidar_data, &res))
std::cout << res.Str() << std::endl;
#endif
return 0;
}
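Stripped of the profiling harness, the new binary reduces to a construct-and-predict flow. A minimal sketch using only the APIs exercised above (file paths and the camera/lidar matrices are placeholders, the umbrella header name is an assumption, and Predict is assumed to return a success flag as elsewhere in FastDeploy):

// minimal_cadnn.cc -- hedged sketch, not part of the commit
#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include "fastdeploy/vision.h"  // assumed umbrella header for vision::perception

int main() {
  namespace vision = fastdeploy::vision;
  auto option = fastdeploy::RuntimeOption();  // defaults: Paddle backend on CPU
  // 3x4 camera intrinsics and 4x4 lidar-to-camera extrinsics, row-major
  std::vector<float> cam_data(12, 0.f), lidar_data(16, 0.f);
  auto model = vision::perception::Caddn("model.pdmodel", "model.pdiparams", "",
                                         option, fastdeploy::ModelFormat::PADDLE);
  cv::Mat im = cv::imread("paddle3d_cadnn_kitti_000780.png");
  vision::PerceptionResult res;
  if (!model.Predict(im, cam_data, lidar_data, &res)) return -1;
  std::cout << res.Str() << std::endl;
  return 0;
}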

View File

@@ -93,7 +93,7 @@ int main(int argc, char* argv[]) {
vision::PerceptionResult res;
// Run profiling
BENCHMARK_MODEL(model_centerpoint, model_centerpoint.Predict(point_dir, &res))
// std::cout << res.Str() << std::endl;
std::cout << res.Str() << std::endl;
#endif
return 0;

View File

@@ -41,9 +41,17 @@ fi
# PP-ShiTuV2
./benchmark_ppshituv2_rec --model PP-ShiTuv2-rec --image ppshituv2_wangzai.png --config_path $CONFIG_PATH
./benchmark_ppshituv2_det --model PP-ShiTuv2-det --image ppdet_det_img.jpg --config_path $CONFIG_PATH
# PP-StructureV2
./benchmark_structurev2_layout --model PP-Structurev2-layout --image structurev2_layout_val_0002.jpg --config_path $CONFIG_PATH
./benchmark_structurev2_table --model PP-Structurev2-SLANet --image structurev2_table.jpg --table_char_dict_path table_structure_dict_ch.txt --config_path $CONFIG_PATH
./benchmark --model PP-Structurev2-vi-layoutxlm --shapes "1,512:1,512,4:1,512:1,512" --names "x_0:x_1:x_2:x_3" --dtypes "INT64:INT64:INT64:INT64" --disable_mkldnn --custom_tensor_value 0.2 --config_path $CONFIG_PATH
# Paddle3D
./benchmark --model PETRv1_v99 --config_path $CONFIG_PATH --shapes "1,6,3,320,800:1,6,4,4" --names "images:img2lidars" --dtypes "FP32:FP32" --disable_mkldnn --warmup 5 --repeat 20
./benchmark --model PETRv2_v99 --config_path $CONFIG_PATH --shapes "1,12,3,320,800:1,12,4,4:1,12" --names "images:img2lidars:timestamps" --dtypes "FP32:FP32:FP32" --disable_mkldnn --warmup 5 --repeat 20
./benchmark_pp3d_centerpoint --model CenterPoint-Pillars-02Voxel --image paddle3d_centerpoint_n008_LIDAR_TOP__1533151603547590.pcd.bin --config_path $CONFIG_PATH
./benchmark_pp3d_cadnn --model CADNN_OCRNet-HRNetW18 --image paddle3d_cadnn_kitti_000780.png --config_path $CONFIG_PATH
set +x

View File

@@ -119,3 +119,4 @@ PD_BUILD_OP(centerpoint_postprocess)
.SetInferDtypeFn(PD_INFER_DTYPE(fastdeploy::paddle_custom_ops::PostProcessInferDtype));
#endif // WITH_GPU

View File

@@ -220,7 +220,7 @@ std::vector<paddle::Tensor> postprocess_gpu(
// nms
// in NmsLauncher, rot = - theta - pi / 2
const int col_blocks = DIVUP(num_bboxes_for_nms, THREADS_PER_BLOCK_NMS);
int col_blocks = DIVUP(num_bboxes_for_nms, THREADS_PER_BLOCK_NMS);
auto nms_mask = paddle::empty({num_bboxes_for_nms * col_blocks},
paddle::DataType::INT64, paddle::GPUPlace());
int64_t *nms_mask_data = nms_mask.data<int64_t>();

View File

@@ -0,0 +1,94 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(WITH_GPU)
#include "grid_sample_3d.h"
#include <vector>
#if defined(PADDLEINFERENCE_API_COMPAT_2_4_x)
#include "paddle/include/experimental/ext_all.h"
#elif defined(PADDLEINFERENCE_API_COMPAT_2_5_x)
#include "paddle/include/paddle/extension.h"
#else
#include "paddle/extension.h"
#endif
namespace fastdeploy {
namespace paddle_custom_ops {
std::vector<paddle::Tensor> GridSample3DCUDAForward(
const paddle::Tensor& x, const paddle::Tensor& grid,
const std::string& mode, const std::string& padding_mode,
bool align_corners);
std::vector<paddle::Tensor> GridSample3DForward(const paddle::Tensor& x,
const paddle::Tensor& grid,
const std::string& mode,
const std::string& padding_mode,
bool align_corners) {
return GridSample3DCUDAForward(x, grid, mode, padding_mode, align_corners);
}
std::vector<paddle::Tensor> GridSample3DCUDABackward(
const paddle::Tensor& x, const paddle::Tensor& grid,
const paddle::Tensor& grad_out, const std::string& mode,
const std::string& padding_mode, bool align_corners);
std::vector<paddle::Tensor> GridSample3DBackward(
const paddle::Tensor& x, const paddle::Tensor& grid,
const paddle::Tensor& grad_out, const std::string& mode,
const std::string& padding_mode, bool align_corners) {
return GridSample3DCUDABackward(x, grid, grad_out, mode, padding_mode,
align_corners);
}
std::vector<std::vector<int64_t>> GridSample3DInferShape(
std::vector<int64_t> x_shape, std::vector<int64_t> grid_shape) {
return {
{x_shape[0], x_shape[1], grid_shape[1], grid_shape[2], grid_shape[3]}};
}
std::vector<std::vector<int64_t>> GridSample3DInferBackShape(
std::vector<int64_t> x_shape, std::vector<int64_t> grid_shape) {
return {x_shape};
}
std::vector<paddle::DataType> GridSample3DInferDtype(
paddle::DataType x_dtype, paddle::DataType grid_dtype) {
return {x_dtype};
}
} // namespace paddle_custom_ops
} // namespace fastdeploy
PD_BUILD_OP(grid_sample_3d)
.Inputs({"x", "grid"})
.Attrs({"mode: std::string", "padding_mode: std::string",
"align_corners: bool"})
.Outputs({"out"})
.SetKernelFn(PD_KERNEL(fastdeploy::paddle_custom_ops::GridSample3DForward))
.SetInferShapeFn(PD_INFER_SHAPE(fastdeploy::paddle_custom_ops::GridSample3DInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(fastdeploy::paddle_custom_ops::GridSample3DInferDtype));
PD_BUILD_GRAD_OP(grid_sample_3d)
.Inputs({"x", "grid", paddle::Grad("out")})
.Attrs({"mode: std::string", "padding_mode: std::string",
"align_corners: bool"})
.Outputs({paddle::Grad("x")})
.SetKernelFn(PD_KERNEL(fastdeploy::paddle_custom_ops::GridSample3DBackward))
.SetInferShapeFn(PD_INFER_SHAPE(fastdeploy::paddle_custom_ops::GridSample3DInferBackShape));
#endif
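For orientation, GridSample3DInferShape keeps N and C from x and takes the output D/H/W from the grid (grid is laid out [N, D_out, H_out, W_out, 3]). A standalone sketch of the same arithmetic with concrete numbers:

// shape_check.cc -- illustrative only
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> x_shape{1, 4, 8, 16, 16};     // [N, C, D_in, H_in, W_in]
  std::vector<int64_t> grid_shape{1, 8, 32, 32, 3};  // [N, D_out, H_out, W_out, 3]
  // mirrors GridSample3DInferShape: out = [N, C, D_out, H_out, W_out]
  std::vector<int64_t> out{x_shape[0], x_shape[1], grid_shape[1], grid_shape[2],
                           grid_shape[3]};
  for (auto d : out) std::printf("%lld ", static_cast<long long>(d));  // 1 4 8 32 32
  return 0;
}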

View File

@@ -0,0 +1,657 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda.h>
#include "grid_sample_3d.h"
#if defined(PADDLEINFERENCE_API_COMPAT_2_4_x)
#include "paddle/include/experimental/ext_all.h"
#elif defined(PADDLEINFERENCE_API_COMPAT_2_5_x)
#include "paddle/include/paddle/extension.h"
#else
#include "paddle/extension.h"
#endif
namespace fastdeploy {
namespace paddle_custom_ops {
#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
static __forceinline__ __device__ bool InBounds3D(int64_t d, int64_t h,
int64_t w, int64_t D,
int64_t H, int64_t W) {
return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
}
#define CUDA_KERNEL_LOOP_TYPE(i, n, index_type) \
index_type _i_n_d_e_x = blockIdx.x * blockDim.x + threadIdx.x; \
for (index_type i = _i_n_d_e_x; _i_n_d_e_x < (n); \
_i_n_d_e_x += blockDim.x * gridDim.x, i = _i_n_d_e_x)
#define CUDA_KERNEL_LOOP(i, n) CUDA_KERNEL_LOOP_TYPE(i, n, int)
template <typename T>
static __forceinline__ __device__ T Unnormalize(T coord, int size,
bool align_corners) {
if (align_corners) {
return ((coord + 1.f) / 2) * (size - 1);
} else {
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T ClipIndexes(T in, int max_value) {
return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
}
template <typename T>
static __forceinline__ __device__ T ReflectIndexes(T in, int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<T>(0);
}
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = fabs(in - min);
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T ComputePositions(T coord, int size,
PaddingMode padding_mode,
bool align_corners) {
coord = Unnormalize<T>(coord, size, align_corners);
if (padding_mode == PaddingMode::border) {
coord = ClipIndexes(coord, size - 1);
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = ReflectIndexes(coord, 0, 2 * (size - 1));
} else {
coord = ReflectIndexes(coord, -1, 2 * size - 1);
}
coord = ClipIndexes(coord, size - 1);
}
return coord;
}
template <typename T, typename index_t>
__global__ void GridSample3DCudaKernel(
const index_t nthreads, index_t out_c, index_t out_d, index_t out_h,
index_t out_w, index_t in_d, index_t in_h, index_t in_w, const T* input,
const T* grid, T* output, const Mode interpolation_mode,
const PaddingMode padding_mode, bool align_corners) {
// printf("size: %d, %d, %d, %d, %d, %d \n", out_c, out_d, out_w, out_h, in_d,
// in_w);
index_t inp_sW = 1;
index_t inp_sH = in_w;
index_t inp_sD = in_h * in_w;
index_t inp_sC = in_d * inp_sD;
index_t inp_sN = out_c * inp_sC;
index_t grid_sCoor = 1;
index_t grid_sW = 3;
index_t grid_sH = out_w * grid_sW;
index_t grid_sD = out_h * grid_sH;
index_t grid_sN = out_d * grid_sD;
index_t out_sW = 1;
index_t out_sH = out_w;
index_t out_sD = out_h * out_w;
index_t out_sC = out_d * out_sD;
index_t out_sN = out_c * out_sC;
CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) {
const index_t w = index % out_w;
const index_t h = (index / out_w) % out_h;
const index_t d = (index / (out_h * out_w)) % out_d;
const index_t n = index / (out_d * out_h * out_w);
const index_t grid_offset =
n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z co-ordinates from grid
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
T iz = grid[grid_offset + 2 * grid_sCoor];
ix = ComputePositions(ix, in_w, padding_mode, align_corners);
iy = ComputePositions(iy, in_h, padding_mode, align_corners);
iz = ComputePositions(iz, in_d, padding_mode, align_corners);
// printf("ix: %f, iy: %f, iz: %f \n", ix, iy, iz);
if (interpolation_mode == Mode::bilinear) {
// get corner pixel values from (x, y, z)
      // for 4d, we use north-east-south-west
// for 5d, we add top-bottom
index_t ix_tnw = static_cast<index_t>(std::floor(ix));
index_t iy_tnw = static_cast<index_t>(std::floor(iy));
index_t iz_tnw = static_cast<index_t>(std::floor(iz));
index_t ix_tne = ix_tnw + 1;
index_t iy_tne = iy_tnw;
index_t iz_tne = iz_tnw;
index_t ix_tsw = ix_tnw;
index_t iy_tsw = iy_tnw + 1;
index_t iz_tsw = iz_tnw;
index_t ix_tse = ix_tnw + 1;
index_t iy_tse = iy_tnw + 1;
index_t iz_tse = iz_tnw;
index_t ix_bnw = ix_tnw;
index_t iy_bnw = iy_tnw;
index_t iz_bnw = iz_tnw + 1;
index_t ix_bne = ix_tnw + 1;
index_t iy_bne = iy_tnw;
index_t iz_bne = iz_tnw + 1;
index_t ix_bsw = ix_tnw;
index_t iy_bsw = iy_tnw + 1;
index_t iz_bsw = iz_tnw + 1;
index_t ix_bse = ix_tnw + 1;
index_t iy_bse = iy_tnw + 1;
index_t iz_bse = iz_tnw + 1;
// get surfaces to each neighbor:
T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW =
output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (index_t c = 0; c < out_c;
++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
*out_ptr_NCDHW = static_cast<T>(0);
if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] *
tnw;
}
if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] *
tne;
}
if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] *
tsw;
}
if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] *
tse;
}
if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] *
bnw;
}
if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] *
bne;
}
if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] *
bsw;
}
if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] *
bse;
}
}
} else if (interpolation_mode == Mode::nearest) {
index_t ix_nearest = static_cast<index_t>(std::round(ix));
index_t iy_nearest = static_cast<index_t>(std::round(iy));
index_t iz_nearest = static_cast<index_t>(std::round(iz));
      // assign nearest neighbor pixel value to output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW =
output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (index_t c = 0; c < out_c;
++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
if (InBounds3D(iz_nearest, iy_nearest, ix_nearest, in_d, in_h, in_w)) {
*out_ptr_NCDHW =
inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH +
ix_nearest * inp_sW];
} else {
*out_ptr_NCDHW = static_cast<T>(0);
}
}
}
}
}
std::vector<paddle::Tensor> GridSample3DCUDAForward(
const paddle::Tensor& x, const paddle::Tensor& grid,
const std::string& mode, const std::string& padding_mode,
bool align_corners) {
CHECK_INPUT_GPU(x);
CHECK_INPUT_GPU(grid);
PaddingMode enum_padding_mode;
Mode enum_mode;
if (padding_mode == "border") {
enum_padding_mode = PaddingMode::border;
} else if (padding_mode == "reflection") {
enum_padding_mode = PaddingMode::reflect;
} else {
enum_padding_mode = PaddingMode::zeros;
}
if (mode == "nearest") {
enum_mode = Mode::nearest;
} else {
enum_mode = Mode::bilinear;
}
const int n = grid.shape()[0];
const int out_d = grid.shape()[1];
const int out_h = grid.shape()[2];
const int out_w = grid.shape()[3];
const int c = x.shape()[1];
const int in_d = x.shape()[2];
const int in_h = x.shape()[3];
const int in_w = x.shape()[4];
auto output = paddle::full({n, c, out_d, out_h, out_w}, 0,
paddle::DataType::FLOAT32, paddle::GPUPlace());
const int count = static_cast<int>(n * out_d * out_h * out_w);
int max_threads_per_block = 512;
int block_num = (count - 1) / max_threads_per_block + 1;
// printf("size: %d, %d, %d, %d, %d, %d \n", n, c, out_d, out_h, count,
// block_num);
GridSample3DCudaKernel<float, int>
<<<block_num, max_threads_per_block, 0, x.stream()>>>(
count, c, out_d, out_h, out_w, in_d, in_h, in_w, x.data<float>(),
grid.data<float>(), output.data<float>(), enum_mode,
enum_padding_mode, align_corners);
cudaError_t error_check;
error_check = cudaGetLastError();
if (error_check != cudaSuccess) {
printf("%s\n", cudaGetErrorString(error_check));
}
// printf("size: %d, %d, %d, %d, %d, %d \n", n, c, out_d, out_h, count,
// block_num);
return {output};
}
template <typename T>
static __forceinline__ __device__ T UnnormalizeWithMask(T coord, int size,
bool align_corners,
T* grad_in) {
if (align_corners) {
*grad_in = static_cast<T>(size - 1) / 2;
return ((coord + 1.f) / 2) * (size - 1);
} else {
*grad_in = static_cast<T>(size) / 2;
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T ClipIndexesWithMask(T in, int clip_limit,
T* grad_in) {
if (in <= static_cast<T>(0)) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
} else {
T max = static_cast<T>(clip_limit - 1);
if (in >= max) {
*grad_in = static_cast<T>(0);
return max;
} else {
*grad_in = static_cast<T>(1);
return in;
}
}
}
template <typename T>
static __forceinline__ __device__ T ReflectIndexesWithMask(T in, int twice_low,
int twice_high,
T* grad_in) {
if (twice_low == twice_high) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
}
int grad_in_mult_;
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = in - min;
if (in < static_cast<T>(0)) {
grad_in_mult_ = -1;
in = -in;
} else {
grad_in_mult_ = 1;
}
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
*grad_in = static_cast<T>(grad_in_mult_);
return extra + min;
} else {
*grad_in = static_cast<T>(-grad_in_mult_);
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T
ComputePositionsWithMask(T coord, int size, PaddingMode padding_mode,
bool align_corners, T* grad_in) {
T grad_clip, grad_refl;
coord = UnnormalizeWithMask<T>(coord, size, align_corners, grad_in);
if (padding_mode == PaddingMode::border) {
coord = ClipIndexesWithMask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_clip;
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = ReflectIndexesWithMask(coord, 0, 2 * (size - 1), &grad_refl);
} else {
coord = ReflectIndexesWithMask(coord, -1, 2 * size - 1, &grad_refl);
}
coord = ClipIndexesWithMask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_refl * grad_clip;
}
return coord;
}
template <typename T>
static __forceinline__ __device__ void AtomicAdd3D(
T* data, int64_t d, int64_t h, int64_t w, int64_t sD, int64_t sH,
int64_t sW, int64_t D, int64_t H, int64_t W, T delta) {
if (InBounds3D(d, h, w, D, H, W)) {
atomicAdd(data + d * sD + h * sH + w * sW, delta);
}
}
template <typename T, typename index_t>
__global__ void GridSample3DCudaBackwardKernel(
const index_t nthreads, const T* grad_output, const T* input, const T* grid,
index_t out_c, index_t out_d, index_t out_h, index_t out_w, index_t in_d,
index_t in_h, index_t in_w, T* grad_input, T* grad_grid, const Mode mode,
const PaddingMode padding_mode, bool align_corners) {
index_t inp_sW = 1;
index_t inp_sH = in_w;
index_t inp_sD = in_h * in_w;
index_t inp_sC = in_d * inp_sD;
index_t inp_sN = out_c * inp_sC;
index_t grid_sCoor = 1;
index_t grid_sW = 3;
index_t grid_sH = out_w * grid_sW;
index_t grid_sD = out_h * grid_sH;
index_t grid_sN = out_d * grid_sD;
index_t gOut_sW = 1;
index_t gOut_sH = out_w;
index_t gOut_sD = out_h * out_w;
index_t gOut_sC = out_d * gOut_sD;
index_t gOut_sN = out_c * gOut_sC;
CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) {
const index_t w = index % out_w;
const index_t h = (index / out_w) % out_h;
const index_t d = (index / (out_h * out_w)) % out_d;
const index_t n = index / (out_d * out_h * out_w);
const auto grid_offset =
n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z co-ordinates from grid
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
T iz = grid[grid_offset + 2 * grid_sCoor];
// multipliers for gradients on ix, iy, and iz
T gix_mult, giy_mult, giz_mult;
ix = ComputePositionsWithMask(ix, in_w, padding_mode, align_corners,
&gix_mult);
iy = ComputePositionsWithMask(iy, in_h, padding_mode, align_corners,
&giy_mult);
iz = ComputePositionsWithMask(iz, in_d, padding_mode, align_corners,
&giz_mult);
if (mode == Mode::bilinear) {
// get corner pixel values from (x, y, z)
      // for 4d, we use north-east-south-west
// for 5d, we add top-bottom
index_t ix_tnw = static_cast<index_t>(std::floor(ix));
index_t iy_tnw = static_cast<index_t>(std::floor(iy));
index_t iz_tnw = static_cast<index_t>(std::floor(iz));
index_t ix_tne = ix_tnw + 1;
index_t iy_tne = iy_tnw;
index_t iz_tne = iz_tnw;
index_t ix_tsw = ix_tnw;
index_t iy_tsw = iy_tnw + 1;
index_t iz_tsw = iz_tnw;
index_t ix_tse = ix_tnw + 1;
index_t iy_tse = iy_tnw + 1;
index_t iz_tse = iz_tnw;
index_t ix_bnw = ix_tnw;
index_t iy_bnw = iy_tnw;
index_t iz_bnw = iz_tnw + 1;
index_t ix_bne = ix_tnw + 1;
index_t iy_bne = iy_tnw;
index_t iz_bne = iz_tnw + 1;
index_t ix_bsw = ix_tnw;
index_t iy_bsw = iy_tnw + 1;
index_t iz_bsw = iz_tnw + 1;
index_t ix_bse = ix_tnw + 1;
index_t iy_bse = iy_tnw + 1;
index_t iz_bse = iz_tnw + 1;
// get surfaces to each neighbor:
T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
T gix = static_cast<T>(0), giy = static_cast<T>(0),
giz = static_cast<T>(0);
index_t gOut_offset =
n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
index_t inp_offset_NC = n * inp_sN;
T* gInp_ptr_NC = grad_input + n * inp_sN;
for (index_t c = 0; c < out_c; ++c, gOut_offset += gOut_sC,
gInp_ptr_NC += inp_sC, inp_offset_NC += inp_sC) {
T gOut = grad_output[gOut_offset];
AtomicAdd3D(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, inp_sD, inp_sH, inp_sW,
in_d, in_h, in_w, tnw * gOut);
AtomicAdd3D(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, inp_sD, inp_sH, inp_sW,
in_d, in_h, in_w, tne * gOut);
AtomicAdd3D(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, inp_sD, inp_sH, inp_sW,
in_d, in_h, in_w, tsw * gOut);
AtomicAdd3D(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, inp_sD, inp_sH, inp_sW,
in_d, in_h, in_w, tse * gOut);
AtomicAdd3D(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, inp_sD, inp_sH, inp_sW,
in_d, in_h, in_w, bnw * gOut);
AtomicAdd3D(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, inp_sD, inp_sH, inp_sW,
in_d, in_h, in_w, bne * gOut);
AtomicAdd3D(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, inp_sD, inp_sH, inp_sW,
in_d, in_h, in_w, bsw * gOut);
AtomicAdd3D(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, inp_sD, inp_sH, inp_sW,
in_d, in_h, in_w, bse * gOut);
// calculate grad_grid
if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) {
T tnw_val = input[inp_offset_NC + iz_tnw * inp_sD + iy_tnw * inp_sH +
ix_tnw * inp_sW];
gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut;
giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut;
giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut;
}
if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) {
T tne_val = input[inp_offset_NC + iz_tne * inp_sD + iy_tne * inp_sH +
ix_tne * inp_sW];
gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut;
giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut;
giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut;
}
if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) {
T tsw_val = input[inp_offset_NC + iz_tsw * inp_sD + iy_tsw * inp_sH +
ix_tsw * inp_sW];
gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut;
giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut;
giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut;
}
if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) {
T tse_val = input[inp_offset_NC + iz_tse * inp_sD + iy_tse * inp_sH +
ix_tse * inp_sW];
gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut;
giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut;
giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut;
}
if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) {
T bnw_val = input[inp_offset_NC + iz_bnw * inp_sD + iy_bnw * inp_sH +
ix_bnw * inp_sW];
gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut;
giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut;
giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut;
}
if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) {
T bne_val = input[inp_offset_NC + iz_bne * inp_sD + iy_bne * inp_sH +
ix_bne * inp_sW];
gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut;
giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut;
giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut;
}
if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) {
T bsw_val = input[inp_offset_NC + iz_bsw * inp_sD + iy_bsw * inp_sH +
ix_bsw * inp_sW];
gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut;
giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut;
giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut;
}
if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) {
T bse_val = input[inp_offset_NC + iz_bse * inp_sD + iy_bse * inp_sH +
ix_bse * inp_sW];
gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut;
giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut;
giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut;
}
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NDHW = grad_grid + index * grid_sW;
gGrid_ptr_NDHW[0] = gix_mult * gix;
gGrid_ptr_NDHW[1] = giy_mult * giy;
gGrid_ptr_NDHW[2] = giz_mult * giz;
}
} else if (mode == Mode::nearest) {
auto ix_nearest = static_cast<index_t>(std::round(ix));
auto iy_nearest = static_cast<index_t>(std::round(iy));
auto iz_nearest = static_cast<index_t>(std::round(iz));
      // assign nearest neighbor pixel value to output pixel
index_t gOut_offset =
n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
for (index_t c = 0; c < out_c;
++c, gOut_offset += gOut_sC, gInp_ptr_NC += inp_sC) {
AtomicAdd3D(gInp_ptr_NC, iz_nearest, iy_nearest, ix_nearest, inp_sD,
inp_sH, inp_sW, in_d, in_h, in_w, grad_output[gOut_offset]);
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NDHW = grad_grid + index * grid_sW;
gGrid_ptr_NDHW[0] = static_cast<T>(0);
gGrid_ptr_NDHW[1] = static_cast<T>(0);
gGrid_ptr_NDHW[2] = static_cast<T>(0);
}
}
}
}
std::vector<paddle::Tensor> GridSample3DCUDABackward(
const paddle::Tensor& x, const paddle::Tensor& grid,
const paddle::Tensor& grad_out, const std::string& mode,
const std::string& padding_mode, bool align_corners) {
PaddingMode enum_padding_mode;
Mode enum_mode;
if (padding_mode == "border") {
enum_padding_mode = PaddingMode::border;
} else if (padding_mode == "reflection") {
enum_padding_mode = PaddingMode::reflect;
} else {
enum_padding_mode = PaddingMode::zeros;
}
if (mode == "nearest") {
enum_mode = Mode::nearest;
} else {
enum_mode = Mode::bilinear;
}
const int out_d = grid.shape()[1];
const int out_h = grid.shape()[2];
const int out_w = grid.shape()[3];
const int n = x.shape()[0];
const int c = x.shape()[1];
const int in_d = x.shape()[2];
const int in_h = x.shape()[3];
const int in_w = x.shape()[4];
auto grid_grad_output =
paddle::empty({n, out_d, out_h, out_w, 3}, paddle::DataType::FLOAT32,
paddle::GPUPlace());
auto x_grad_output =
paddle::full({n, c, in_d, in_h, in_w}, 0, paddle::DataType::FLOAT32,
paddle::GPUPlace());
const int count = static_cast<int>(n * out_d * out_h * out_w);
int max_threads_per_block = 512;
int block_num = (count - 1) / max_threads_per_block + 1;
GridSample3DCudaBackwardKernel<float, int>
<<<block_num, max_threads_per_block, 0, x.stream()>>>(
count, grad_out.data<float>(), x.data<float>(), grid.data<float>(), c,
out_d, out_h, out_w, in_d, in_h, in_w, x_grad_output.data<float>(),
grid_grad_output.data<float>(), enum_mode, enum_padding_mode,
align_corners);
return {x_grad_output};
}
} // namespace paddle_custom_ops
} // namespace fastdeploy
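The forward and backward launchers use the same one-thread-per-output-voxel configuration: count = n * out_d * out_h * out_w work items (channels are looped inside the kernel), rounded up to 512-thread blocks. A host-side sketch of that arithmetic:

// launch_math.cc -- illustrative only
#include <cstdio>

int main() {
  const int n = 1, out_d = 8, out_h = 32, out_w = 32;
  const int count = n * out_d * out_h * out_w;  // 8192 (n, d, h, w) locations
  const int max_threads_per_block = 512;
  const int block_num = (count - 1) / max_threads_per_block + 1;  // ceil(8192/512) = 16
  std::printf("count=%d block_num=%d\n", count, block_num);
  return 0;
}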

View File

@@ -0,0 +1,33 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#pragma once
#include <cassert>
#include <cmath>
#include <vector>
namespace fastdeploy {
namespace paddle_custom_ops {
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
enum class Mode { bilinear, nearest };
enum class PaddingMode { zeros, border, reflect };
} // namespace paddle_custom_ops
} // namespace fastdeploy

View File

@@ -0,0 +1,272 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
3D Rotated IoU Calculation (CPU)
Written by Shaoshuai Shi
All Rights Reserved 2020.
*/
#include "iou3d_cpu.h"
#include <math.h>
#include <stdio.h>
#include <vector>
namespace fastdeploy {
namespace paddle_custom_ops {
static inline float min(float a, float b) { return a > b ? b : a; }
static inline float max(float a, float b) { return a > b ? a : b; }
#if defined(_WIN32)
#if defined(EPS)
#undef EPS
#endif
#define EPS 1e-8
#else
static const float EPS = 1e-8;
#endif
struct Point {
float x, y;
Point() {}
Point(double _x, double _y) { x = _x, y = _y; }
void set(float _x, float _y) {
x = _x;
y = _y;
}
Point operator+(const Point &b) const {
return Point(x + b.x, y + b.y);
}
Point operator-(const Point &b) const {
return Point(x - b.x, y - b.y);
}
};
static inline float cross(const Point &a, const Point &b) {
return a.x * b.y - a.y * b.x;
}
static inline float cross(const Point &p1, const Point &p2, const Point &p0) {
return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
}
static inline int check_rect_cross(const Point &p1, const Point &p2, const Point &q1,
const Point &q2) {
int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) &&
min(q1.x, q2.x) <= max(p1.x, p2.x) &&
min(p1.y, p2.y) <= max(q1.y, q2.y) &&
min(q1.y, q2.y) <= max(p1.y, p2.y);
return ret;
}
static inline int check_in_box2d(const float *box, const Point &p) {
// params: (7) [x, y, z, dx, dy, dz, heading]
const float MARGIN = 1e-2;
float center_x = box[0], center_y = box[1];
float angle_cos = cos(-box[6]),
angle_sin =
sin(-box[6]); // rotate the point in the opposite direction of box
float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
return (fabs(rot_x) < box[3] / 2 + MARGIN &&
fabs(rot_y) < box[4] / 2 + MARGIN);
}
static inline int intersection(const Point &p1, const Point &p0, const Point &q1,
const Point &q0, Point &ans) {
// fast exclusion
if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
// check cross standing
float s1 = cross(q0, p1, p0);
float s2 = cross(p1, q1, p0);
float s3 = cross(p0, q1, q0);
float s4 = cross(q1, p1, q0);
if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
// calculate intersection of two lines
float s5 = cross(q1, p1, p0);
if (fabs(s5 - s1) > EPS) {
ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
} else {
float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
float D = a0 * b1 - a1 * b0;
ans.x = (b0 * c1 - b1 * c0) / D;
ans.y = (a1 * c0 - a0 * c1) / D;
}
return 1;
}
static inline void rotate_around_center(const Point &center, const float angle_cos,
const float angle_sin, Point &p) {
float new_x =
(p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
float new_y =
(p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
p.set(new_x, new_y);
}
static inline int point_cmp(const Point &a, const Point &b, const Point &center) {
return atan2(a.y - center.y, a.x - center.x) >
atan2(b.y - center.y, b.x - center.x);
}
static inline float box_overlap(const float *box_a, const float *box_b) {
// params: box_a (7) [x, y, z, dx, dy, dz, heading]
// params: box_b (7) [x, y, z, dx, dy, dz, heading]
// float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 =
// box_a[3], a_angle = box_a[4];
// float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 =
// box_b[3], b_angle = box_b[4];
float a_angle = box_a[6], b_angle = box_b[6];
float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,
a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
Point center_a(box_a[0], box_a[1]);
Point center_b(box_b[0], box_b[1]);
Point box_a_corners[5];
box_a_corners[0].set(a_x1, a_y1);
box_a_corners[1].set(a_x2, a_y1);
box_a_corners[2].set(a_x2, a_y2);
box_a_corners[3].set(a_x1, a_y2);
Point box_b_corners[5];
box_b_corners[0].set(b_x1, b_y1);
box_b_corners[1].set(b_x2, b_y1);
box_b_corners[2].set(b_x2, b_y2);
box_b_corners[3].set(b_x1, b_y2);
// get oriented corners
float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
for (int k = 0; k < 4; k++) {
rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
}
box_a_corners[4] = box_a_corners[0];
box_b_corners[4] = box_b_corners[0];
// get intersection of lines
Point cross_points[16];
Point poly_center;
int cnt = 0, flag = 0;
poly_center.set(0, 0);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
flag = intersection(box_a_corners[i + 1], box_a_corners[i],
box_b_corners[j + 1], box_b_corners[j],
cross_points[cnt]);
if (flag) {
poly_center = poly_center + cross_points[cnt];
cnt++;
}
}
}
// check corners
for (int k = 0; k < 4; k++) {
if (check_in_box2d(box_a, box_b_corners[k])) {
poly_center = poly_center + box_b_corners[k];
cross_points[cnt] = box_b_corners[k];
cnt++;
}
if (check_in_box2d(box_b, box_a_corners[k])) {
poly_center = poly_center + box_a_corners[k];
cross_points[cnt] = box_a_corners[k];
cnt++;
}
}
poly_center.x /= cnt;
poly_center.y /= cnt;
// sort the points of polygon
Point temp;
for (int j = 0; j < cnt - 1; j++) {
for (int i = 0; i < cnt - j - 1; i++) {
if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
temp = cross_points[i];
cross_points[i] = cross_points[i + 1];
cross_points[i + 1] = temp;
}
}
}
// get the overlap areas
float area = 0;
for (int k = 0; k < cnt - 1; k++) {
area += cross(cross_points[k] - cross_points[0],
cross_points[k + 1] - cross_points[0]);
}
return fabs(area) / 2.0;
}
static inline float iou_bev(const float *box_a, const float *box_b) {
// params: box_a (7) [x, y, z, dx, dy, dz, heading]
// params: box_b (7) [x, y, z, dx, dy, dz, heading]
float sa = box_a[3] * box_a[4];
float sb = box_b[3] * box_b[4];
float s_overlap = box_overlap(box_a, box_b);
return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
}
int boxes_iou_bev_cpu(paddle::Tensor boxes_a_tensor,
paddle::Tensor boxes_b_tensor,
paddle::Tensor ans_iou_tensor) {
// params boxes_a_tensor: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b_tensor: (M, 7) [x, y, z, dx, dy, dz, heading]
// params ans_iou_tensor: (N, M)
// CHECK_CONTIGUOUS(boxes_a_tensor);
// CHECK_CONTIGUOUS(boxes_b_tensor);
int num_boxes_a = boxes_a_tensor.shape()[0];
int num_boxes_b = boxes_b_tensor.shape()[0];
const float *boxes_a = boxes_a_tensor.data<float>();
const float *boxes_b = boxes_b_tensor.data<float>();
float *ans_iou = ans_iou_tensor.data<float>();
for (int i = 0; i < num_boxes_a; i++) {
for (int j = 0; j < num_boxes_b; j++) {
ans_iou[i * num_boxes_b + j] = iou_bev(boxes_a + i * 7, boxes_b + j * 7);
}
}
return 1;
}
} // namespace paddle_custom_ops
} // namespace fastdeploy
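A quick sanity check of the BEV IoU arithmetic (iou_bev is file-local, so the numbers are reproduced by hand): two 2x2-footprint boxes with the second shifted by 1 along x overlap on a 1x2 strip, giving IoU 1/3:

// iou_math.cc -- illustrative only
#include <cstdio>

int main() {
  // box format: [x, y, z, dx, dy, dz, heading]; BEV IoU ignores z and dz
  // box_a = {0, 0, 0, 2, 2, 2, 0}, box_b = {1, 0, 0, 2, 2, 2, 0}
  const float sa = 2.f * 2.f;         // footprint of box_a
  const float sb = 2.f * 2.f;         // footprint of box_b
  const float s_overlap = 1.f * 2.f;  // x in [0, 1], y in [-1, 1]
  const float iou = s_overlap / (sa + sb - s_overlap);
  std::printf("iou_bev = %.4f\n", iou);  // 0.3333
  return 0;
}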

View File

@@ -0,0 +1,35 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLEINFERENCE_API_COMPAT_2_4_x)
#include "paddle/include/experimental/ext_all.h"
#elif defined(PADDLEINFERENCE_API_COMPAT_2_5_x)
#include "paddle/include/paddle/extension.h"
#else
#include "paddle/extension.h"
#endif
#include "fastdeploy/utils/utils.h"
namespace fastdeploy {
namespace paddle_custom_ops {
FASTDEPLOY_DECL int boxes_iou_bev_cpu(
paddle::Tensor boxes_a_tensor, paddle::Tensor boxes_b_tensor,
paddle::Tensor ans_iou_tensor);
} // namespace paddle_custom_ops
} // namespace fastdeploy

View File

@@ -0,0 +1,237 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#if defined(WITH_GPU)
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "iou3d_nms.h"
namespace fastdeploy {
namespace paddle_custom_ops {
#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
// #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
static inline int DIVUP(const int m, const int n)
{ return ((m) / (n) + ((m) % (n) > 0)); }
#define CHECK_ERROR(ans) \
{ gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort) exit(code);
}
}
#define D(x) \
PD_THROW('\n', x, \
"\n--------------------------------- where is the error ? " \
"---------------------------------------\n");
static const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
void boxesoverlapLauncher(const int num_a, const float *boxes_a,
const int num_b, const float *boxes_b,
float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
const float *boxes_b, float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh);
int boxes_overlap_bev_gpu(paddle::Tensor boxes_a, paddle::Tensor boxes_b,
paddle::Tensor ans_overlap) {
// params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
// params ans_overlap: (N, M)
CHECK_INPUT(boxes_a);
CHECK_INPUT(boxes_b);
CHECK_INPUT(ans_overlap);
int num_a = boxes_a.shape()[0];
int num_b = boxes_b.shape()[0];
const float *boxes_a_data = boxes_a.data<float>();
const float *boxes_b_data = boxes_b.data<float>();
float *ans_overlap_data = ans_overlap.data<float>();
boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
ans_overlap_data);
return 1;
}
int boxes_iou_bev_gpu(paddle::Tensor boxes_a, paddle::Tensor boxes_b,
paddle::Tensor ans_iou) {
// params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
// params ans_overlap: (N, M)
CHECK_INPUT(boxes_a);
CHECK_INPUT(boxes_b);
CHECK_INPUT(ans_iou);
int num_a = boxes_a.shape()[0];
int num_b = boxes_b.shape()[0];
const float *boxes_a_data = boxes_a.data<float>();
const float *boxes_b_data = boxes_b.data<float>();
float *ans_iou_data = ans_iou.data<float>();
boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);
return 1;
}
std::vector<paddle::Tensor> nms_gpu(const paddle::Tensor &boxes,
float nms_overlap_thresh) {
// params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
CHECK_INPUT(boxes);
// CHECK_CONTIGUOUS(keep);
auto keep = paddle::empty({boxes.shape()[0]}, paddle::DataType::INT32,
paddle::CPUPlace());
auto num_to_keep_tensor =
paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());
int *num_to_keep_data = num_to_keep_tensor.data<int>();
int boxes_num = boxes.shape()[0];
const float *boxes_data = boxes.data<float>();
int *keep_data = keep.data<int>();
int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
unsigned long long *mask_data = NULL;
CHECK_ERROR(cudaMalloc((void **)&mask_data,
boxes_num * col_blocks * sizeof(unsigned long long)));
nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);
// unsigned long long mask_cpu[boxes_num * col_blocks];
// unsigned long long *mask_cpu = new unsigned long long [boxes_num *
// col_blocks];
std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);
// printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
boxes_num * col_blocks * sizeof(unsigned long long),
cudaMemcpyDeviceToHost));
cudaFree(mask_data);
  // WARN(qiuyanjun): the code below throws a compile error on Windows with
  // MSVC. Thus, we chose to use std::vector to store the result instead.
// unsigned long long remv_cpu[col_blocks];
// memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));
std::vector<unsigned long long> remv_cpu(col_blocks, 0);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_cpu[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
}
num_to_keep_data[0] = num_to_keep;
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
return {keep, num_to_keep_tensor};
}
int nms_normal_gpu(paddle::Tensor boxes, paddle::Tensor keep,
float nms_overlap_thresh) {
// params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
CHECK_INPUT(boxes);
// CHECK_CONTIGUOUS(keep);
int boxes_num = boxes.shape()[0];
const float *boxes_data = boxes.data<float>();
  // WARN(qiuyanjun): the long specialization of the Tensor::data() API is not
  // exported by paddle; it raises a link error on Windows with MSVC. Please check:
  // https://github.com/PaddlePaddle/Paddle/blob/release/2.5/paddle/phi/api/lib/tensor.cc
#if defined(_WIN32)
int *keep_data = keep.data<int>();
#else
long *keep_data = keep.data<long>();
#endif
int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
unsigned long long *mask_data = NULL;
CHECK_ERROR(cudaMalloc((void **)&mask_data,
boxes_num * col_blocks * sizeof(unsigned long long)));
nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);
// unsigned long long mask_cpu[boxes_num * col_blocks];
// unsigned long long *mask_cpu = new unsigned long long [boxes_num *
// col_blocks];
std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);
// printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
boxes_num * col_blocks * sizeof(unsigned long long),
cudaMemcpyDeviceToHost));
cudaFree(mask_data);
  // WARN(qiuyanjun): the code below throws a compile error on Windows with
  // MSVC. Thus, we chose to use std::vector to store the result instead.
// unsigned long long remv_cpu[col_blocks];
// memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));
std::vector<unsigned long long> remv_cpu(col_blocks, 0);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_cpu[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
}
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
return num_to_keep;
}
} // namespace paddle_custom_ops
} // namespace fastdeploy
#endif
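The host-side decode loop in nms_gpu / nms_normal_gpu reads mask as a boxes_num x col_blocks bitmap: bit k of mask[i * col_blocks + j] means box i suppresses box j * 64 + k (THREADS_PER_BLOCK_NMS is 64 with an 8-byte unsigned long long). A CPU-only sketch of the same pass over a toy precomputed mask:

// nms_decode.cc -- illustrative only; boxes are assumed pre-sorted by score
#include <cstdio>
#include <vector>

int main() {
  const int kBits = 64;  // THREADS_PER_BLOCK_NMS
  const int boxes_num = 3, col_blocks = 1;
  // mask[i]: boxes suppressed by box i; here box 0 suppresses box 1 only
  std::vector<unsigned long long> mask{0b010ULL, 0ULL, 0ULL};
  std::vector<unsigned long long> remv(col_blocks, 0);
  std::vector<int> keep;
  for (int i = 0; i < boxes_num; i++) {
    if (!(remv[i / kBits] & (1ULL << (i % kBits)))) {
      keep.push_back(i);
      for (int j = i / kBits; j < col_blocks; j++)
        remv[j] |= mask[i * col_blocks + j];
    }
  }
  for (int k : keep) std::printf("keep %d\n", k);  // keeps 0 and 2, drops 1
  return 0;
}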

View File

@@ -0,0 +1,46 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLEINFERENCE_API_COMPAT_2_4_x)
#include "paddle/include/experimental/ext_all.h"
#elif defined(PADDLEINFERENCE_API_COMPAT_2_5_x)
#include "paddle/include/paddle/extension.h"
#else
#include "paddle/extension.h"
#endif
#include "fastdeploy/utils/utils.h"
#if defined(WITH_GPU)
namespace fastdeploy {
namespace paddle_custom_ops {
FASTDEPLOY_DECL int boxes_overlap_bev_gpu(
paddle::Tensor boxes_a, paddle::Tensor boxes_b,
paddle::Tensor ans_overlap);
FASTDEPLOY_DECL int boxes_iou_bev_gpu(paddle::Tensor boxes_a,
paddle::Tensor boxes_b,
paddle::Tensor ans_iou);
FASTDEPLOY_DECL std::vector<paddle::Tensor> nms_gpu(
const paddle::Tensor& boxes, float nms_overlap_thresh);
FASTDEPLOY_DECL int nms_normal_gpu(
paddle::Tensor boxes, paddle::Tensor keep, float nms_overlap_thresh);
} // namespace paddle_custom_ops
} // namespace fastdeploy
#endif

View File

@@ -0,0 +1,55 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLEINFERENCE_API_COMPAT_2_4_x)
#include "paddle/include/experimental/ext_all.h"
#elif defined(PADDLEINFERENCE_API_COMPAT_2_5_x)
#include "paddle/include/paddle/extension.h"
#else
#include "paddle/extension.h"
#endif
#include <vector>
#include "iou3d_cpu.h"
#include "iou3d_nms.h"
namespace fastdeploy {
namespace paddle_custom_ops {
std::vector<std::vector<int64_t>> NMSInferShape(
std::vector<int64_t> boxes_shape) {
int64_t keep_num = 1;
return {{boxes_shape[0]}, {keep_num}};
}
std::vector<paddle::DataType> NMSInferDtype(paddle::DataType boxes_dtype) {
return {paddle::DataType::INT64, paddle::DataType::INT64};
}
} // namespace paddle_custom_ops
} // namespace fastdeploy
#if defined(WITH_GPU)
PD_BUILD_OP(nms_gpu)
.Inputs({"boxes"})
.Outputs({"keep", "num_to_keep"})
.Attrs({"nms_overlap_thresh: float"})
.SetKernelFn(PD_KERNEL(fastdeploy::paddle_custom_ops::nms_gpu))
.SetInferDtypeFn(PD_INFER_DTYPE(fastdeploy::paddle_custom_ops::NMSInferDtype))
.SetInferShapeFn(PD_INFER_SHAPE(fastdeploy::paddle_custom_ops::NMSInferShape));
#endif
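With this registration in place, nms_gpu is addressable by op name from Paddle Inference, and inside the library it can be called directly through the declaration in iou3d_nms.h. A hedged sketch of the direct call (toy all-zero boxes just to show the plumbing; real inputs come from the detector head, and the build must link CUDA and this op library):

// nms_call.cc -- illustrative only
#include <cstdio>
#include "iou3d_nms.h"

int main() {
  // (N, 7) float GPU tensor of [x, y, z, dx, dy, dz, heading] boxes
  auto boxes = paddle::full({4, 7}, 0, paddle::DataType::FLOAT32,
                            paddle::GPUPlace());
  auto outs = fastdeploy::paddle_custom_ops::nms_gpu(boxes, 0.5f);
  // outs[0]: INT32 keep indices on CPU; outs[1]: scalar count of valid entries
  int num_to_keep = outs[1].data<int>()[0];
  std::printf("kept %d of 4 boxes\n", num_to_keep);
  return 0;
}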

View File

@@ -1,23 +1,8 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#include <stdio.h>
namespace fastdeploy {
@@ -78,20 +63,36 @@ __device__ int check_rect_cross(const Point &p1, const Point &p2,
__device__ inline int check_in_box2d(const float *box, const Point &p) {
// params: (7) [x, y, z, dx, dy, dz, heading]
const float MARGIN = 1e-2;
// Align with the setting of mmdet3d
// const float MARGIN = 1e-5;
float center_x = box[0], center_y = box[1];
// rotate the point in the opposite direction of box
float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);
float angle_cos = cos(-box[6]),
angle_sin =
sin(-box[6]); // rotate the point in the opposite direction of box
float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
return (fabs(rot_x) < box[3] / 2 + MARGIN &&
fabs(rot_y) < box[4] / 2 + MARGIN);
// Align with the implementation of mmdet3d
// float rot_x =
// (p.x - center_x) * angle_cos + (p.y - center_y) * angle_sin + center_x;
// float rot_y =
// -(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos +
// center_y;
// float x1 = center_x - box[3] / 2;
// float x2 = center_x + box[3] / 2;
// float y1 = center_y - box[4] / 2;
// float y2 = center_y + box[4] / 2;
// return (rot_x > x1 - MARGIN && rot_x < x2 + MARGIN && rot_y > y1 - MARGIN
// &&
// rot_y < y2 + MARGIN);
}
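For intuition, the point-in-rotated-box test above is easy to sanity-check on the CPU. Below is a minimal numpy sketch of the same math, assuming the [x, y, z, dx, dy, dz, heading] box layout used throughout this file; the helper name is ours:

import numpy as np

def point_in_box2d(box, px, py, margin=1e-2):
    # rotate the point into the box frame, then compare to half-extents
    cx, cy, dx, dy, heading = box[0], box[1], box[3], box[4], box[6]
    c, s = np.cos(-heading), np.sin(-heading)
    rot_x = (px - cx) * c - (py - cy) * s
    rot_y = (px - cx) * s + (py - cy) * c
    return abs(rot_x) < dx / 2 + margin and abs(rot_y) < dy / 2 + margin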
__device__ inline int intersection(const Point &p1, const Point &p0,
const Point &q1, const Point &q0,
Point *ans) {
Point &ans) {
// fast exclusion
if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
@@ -106,16 +107,16 @@ __device__ inline int intersection(const Point &p1, const Point &p0,
// calculate intersection of two lines
float s5 = cross(q1, p1, p0);
if (fabs(s5 - s1) > EPS) {
ans->x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
ans->y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
} else {
float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
float D = a0 * b1 - a1 * b0;
ans->x = (b0 * c1 - b1 * c0) / D;
ans->y = (a1 * c0 - a0 * c1) / D;
ans.x = (b0 * c1 - b1 * c0) / D;
ans.y = (a1 * c0 - a0 * c1) / D;
}
return 1;
@@ -123,12 +124,18 @@ __device__ inline int intersection(const Point &p1, const Point &p0,
__device__ inline void rotate_around_center(const Point &center,
const float angle_cos,
const float angle_sin, Point *p) {
float new_x = (p->x - center.x) * angle_cos +
(p->y - center.y) * (-angle_sin) + center.x;
const float angle_sin, Point &p) {
// float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) *
// (-angle_sin) + center.x;
// float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos +
// center.y;
// p.set(new_x, new_y);
// Align with the implementation of mmdet3d
float new_x =
(p.x - center.x) * angle_cos + (p.y - center.y) * angle_sin + center.x;
float new_y =
(p->x - center.x) * angle_sin + (p->y - center.y) * angle_cos + center.y;
p->set(new_x, new_y);
-(p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
p.set(new_x, new_y);
}
__device__ inline int point_cmp(const Point &a, const Point &b,
@@ -152,6 +159,14 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) {
Point center_a(box_a[0], box_a[1]);
Point center_b(box_b[0], box_b[1]);
#ifdef DEBUG
printf(
"a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n",
a_x1, a_y1, a_x2, a_y2, a_angle, b_x1, b_y1, b_x2, b_y2, b_angle);
printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y,
center_b.x, center_b.y);
#endif
Point box_a_corners[5];
box_a_corners[0].set(a_x1, a_y1);
box_a_corners[1].set(a_x2, a_y1);
@@ -169,8 +184,17 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) {
float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
for (int k = 0; k < 4; k++) {
rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners + k);
rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners + k);
#ifdef DEBUG
printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k,
box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x,
box_b_corners[k].y);
#endif
rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
#ifdef DEBUG
printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x,
box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
#endif
}
box_a_corners[4] = box_a_corners[0];
@@ -186,10 +210,19 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) {
for (int j = 0; j < 4; j++) {
flag = intersection(box_a_corners[i + 1], box_a_corners[i],
box_b_corners[j + 1], box_b_corners[j],
cross_points + cnt);
cross_points[cnt]);
if (flag) {
poly_center = poly_center + cross_points[cnt];
cnt++;
#ifdef DEBUG
printf(
"Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, "
"%.3f)->(%.3f, %.3f) \n",
cross_points[cnt - 1].x, cross_points[cnt - 1].y,
box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x,
box_a_corners[i + 1].y, box_b_corners[j].x, box_b_corners[j].y,
box_b_corners[j + 1].x, box_b_corners[j + 1].y);
#endif
}
}
}
@@ -200,11 +233,19 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) {
poly_center = poly_center + box_b_corners[k];
cross_points[cnt] = box_b_corners[k];
cnt++;
#ifdef DEBUG
printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x,
cross_points[cnt - 1].y);
#endif
}
if (check_in_box2d(box_b, box_a_corners[k])) {
poly_center = poly_center + box_a_corners[k];
cross_points[cnt] = box_a_corners[k];
cnt++;
#ifdef DEBUG
printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x,
cross_points[cnt - 1].y);
#endif
}
}
@@ -223,6 +264,14 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) {
}
}
#ifdef DEBUG
printf("cnt=%d\n", cnt);
for (int i = 0; i < cnt; i++) {
printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x,
cross_points[i].y);
}
#endif
// get the overlap areas
float area = 0;
for (int k = 0; k < cnt - 1; k++) {
@@ -242,10 +291,220 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) {
return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
}
__global__ void nms_kernel(const int num_bboxes, const int num_bboxes_for_nms,
__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a,
const int num_b, const float *boxes_b,
float *ans_overlap) {
// params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
if (a_idx >= num_a || b_idx >= num_b) {
return;
}
const float *cur_box_a = boxes_a + a_idx * 7;
const float *cur_box_b = boxes_b + b_idx * 7;
float s_overlap = box_overlap(cur_box_a, cur_box_b);
ans_overlap[a_idx * num_b + b_idx] = s_overlap;
}
__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a,
const int num_b, const float *boxes_b,
float *ans_iou) {
// params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
if (a_idx >= num_a || b_idx >= num_b) {
return;
}
const float *cur_box_a = boxes_a + a_idx * 7;
const float *cur_box_b = boxes_b + b_idx * 7;
float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
}
__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
const float *boxes, unsigned long long *mask) {
// params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 7 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
block_boxes[threadIdx.x * 7 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
block_boxes[threadIdx.x * 7 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
block_boxes[threadIdx.x * 7 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
block_boxes[threadIdx.x * 7 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
block_boxes[threadIdx.x * 7 + 5] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
block_boxes[threadIdx.x * 7 + 6] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 7;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
mask[cur_box_idx * col_blocks + col_start] = t;
}
}
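nms_kernel only fills a pairwise suppression bitmask; the final keep list comes from a serial reduction on the host. A minimal numpy sketch of that reduction, assuming THREADS_PER_BLOCK_NMS is 64 (one box per bit of an unsigned long long) and that boxes were sorted by descending score before the kernel ran, as callers of these launchers typically arrange:

import numpy as np

def reduce_nms_mask(mask, boxes_num, threads_per_block_nms=64):
    # mask: uint64 array of shape [boxes_num * col_blocks]; bit i of
    # mask[box * col_blocks + block] means box `block*64 + i` overlaps
    # `box` beyond the NMS threshold
    col_blocks = (boxes_num + threads_per_block_nms - 1) // threads_per_block_nms
    remv = np.zeros(col_blocks, dtype=np.uint64)
    keep = []
    for i in range(boxes_num):
        block, bit = divmod(i, threads_per_block_nms)
        if not (remv[block] >> np.uint64(bit)) & np.uint64(1):
            keep.append(i)
            # mark everything this kept box suppresses
            remv |= mask[i * col_blocks:(i + 1) * col_blocks]
    return keep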
__device__ inline float iou_normal(float const *const a, float const *const b) {
// params: a: [x, y, z, dx, dy, dz, heading]
// params: b: [x, y, z, dx, dy, dz, heading]
float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2),
right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2),
bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
float interS = width * height;
float Sa = a[3] * a[4];
float Sb = b[3] * b[4];
return interS / fmaxf(Sa + Sb - interS, EPS);
}
__global__ void nms_normal_kernel(const int boxes_num,
const float nms_overlap_thresh,
const int decode_bboxes_dims, const float *bboxes,
const int *index, const int64_t *sorted_index,
const float *boxes,
unsigned long long *mask) {
// params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 7 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
block_boxes[threadIdx.x * 7 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
block_boxes[threadIdx.x * 7 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
block_boxes[threadIdx.x * 7 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
block_boxes[threadIdx.x * 7 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
block_boxes[threadIdx.x * 7 + 5] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
block_boxes[threadIdx.x * 7 + 6] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 7;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
mask[cur_box_idx * col_blocks + col_start] = t;
}
}
void boxesoverlapLauncher(const int num_a, const float *boxes_a,
const int num_b, const float *boxes_b,
float *ans_overlap) {
dim3 blocks(
DIVUP(num_b, THREADS_PER_BLOCK),
DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
const float *boxes_b, float *ans_iou) {
dim3 blocks(
DIVUP(num_b, THREADS_PER_BLOCK),
DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
ans_iou);
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
float nms_overlap_thresh) {
dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS);
nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
}
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS);
nms_normal_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes,
mask);
}
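The launchers above all size their grids with DIVUP, i.e. ceiling division, so every (box_a, box_b) pair, or every row/column tile in NMS, is covered by exactly one thread. A small sketch of the same launch arithmetic; the block sizes are assumptions, since the actual THREADS_PER_BLOCK and THREADS_PER_BLOCK_NMS constants come from iou3d_nms.h, which is not part of this diff:

import math

def overlap_launch_dims(num_a, num_b, threads_per_block=16):
    # blockIdx.x walks boxes_b (columns), blockIdx.y walks boxes_a (rows)
    blocks = (math.ceil(num_b / threads_per_block),
              math.ceil(num_a / threads_per_block))
    threads = (threads_per_block, threads_per_block)
    return blocks, threads

def nms_launch_dims(boxes_num, threads_per_block_nms=64):
    # square grid of tiles; each block compares one row tile of boxes
    # against one column tile and writes one bitmask word per row box
    tiles = math.ceil(boxes_num / threads_per_block_nms)
    return (tiles, tiles), (threads_per_block_nms,)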
__global__ void nms_kernel_centerpoint(const int num_bboxes,
const int num_bboxes_for_nms,
const float nms_overlap_thresh,
const int decode_bboxes_dims,
const float *bboxes, const int *index,
const int64_t *sorted_index,
int64_t *mask) {
// params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
@@ -304,7 +563,7 @@ __global__ void nms_kernel(const int num_bboxes, const int num_bboxes_for_nms,
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(num_bboxes_for_nms, THREADS_PER_BLOCK_NMS);
int col_blocks = DIVUP(num_bboxes_for_nms, THREADS_PER_BLOCK_NMS);
mask[cur_box_idx * col_blocks + col_start] = t;
}
}
@@ -317,7 +576,7 @@ void NmsLauncher(const cudaStream_t &stream, const float *bboxes,
dim3 blocks(DIVUP(num_bboxes_for_nms, THREADS_PER_BLOCK_NMS),
DIVUP(num_bboxes_for_nms, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS);
nms_kernel<<<blocks, threads, 0, stream>>>(
nms_kernel_centerpoint<<<blocks, threads, 0, stream>>>(
num_bboxes, num_bboxes_for_nms, nms_overlap_thresh, decode_bboxes_dims,
bboxes, index, sorted_index, mask);
}

View File

@@ -37,6 +37,7 @@
#include "fastdeploy/vision/perception/paddle3d/smoke/smoke.h"
#include "fastdeploy/vision/perception/paddle3d/petr/petr.h"
#include "fastdeploy/vision/perception/paddle3d/centerpoint/centerpoint.h"
#include "fastdeploy/vision/perception/paddle3d/caddn/caddn.h"
#include "fastdeploy/vision/detection/ppdet/model.h"
#include "fastdeploy/vision/facealign/contrib/face_landmark_1000.h"
#include "fastdeploy/vision/facealign/contrib/pfld.h"

View File

@@ -0,0 +1,86 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/vision/perception/paddle3d/caddn/caddn.h"
namespace fastdeploy {
namespace vision {
namespace perception {
Caddn::Caddn(const std::string& model_file, const std::string& params_file,
const std::string& config_file, const RuntimeOption& custom_option,
const ModelFormat& model_format)
: preprocessor_(config_file) {
valid_gpu_backends = {Backend::PDINFER};
runtime_option = custom_option;
runtime_option.model_format = model_format;
runtime_option.model_file = model_file;
runtime_option.params_file = params_file;
initialized = Initialize();
}
bool Caddn::Initialize() {
if (!InitRuntime()) {
FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
return false;
}
return true;
}
bool Caddn::Predict(const cv::Mat& im, std::vector<float>& input_cam_data,
std::vector<float>& input_lidar_data,
PerceptionResult* result) {
std::vector<PerceptionResult> results;
if (!BatchPredict({im}, input_cam_data, input_lidar_data, &results)) {
return false;
}
if (results.size()) {
*result = std::move(results[0]);
}
return true;
}
bool Caddn::BatchPredict(const std::vector<cv::Mat>& images,
std::vector<float>& input_cam_data,
std::vector<float>& input_lidar_data,
std::vector<PerceptionResult>* results) {
std::vector<FDMat> fd_images = WrapMat(images);
if (!preprocessor_.Run(&fd_images, input_cam_data, input_lidar_data,
&reused_input_tensors_)) {
FDERROR << "Failed to preprocess the input image." << std::endl;
return false;
}
reused_input_tensors_[0].name = "images";
reused_input_tensors_[1].name = "trans_cam_to_img";
reused_input_tensors_[2].name = "trans_lidar_to_cam";
if (!Infer(reused_input_tensors_, &reused_output_tensors_)) {
FDERROR << "Failed to inference by runtime." << std::endl;
return false;
}
if (!postprocessor_.Run(reused_output_tensors_, results)) {
FDERROR << "Failed to postprocess the inference results by runtime."
<< std::endl;
return false;
}
return true;
}
} // namespace perception
} // namespace vision
} // namespace fastdeploy

View File

@@ -0,0 +1,83 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. //NOLINT
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "fastdeploy/fastdeploy_model.h"
#include "fastdeploy/vision/perception/paddle3d/caddn/preprocessor.h"
#include "fastdeploy/vision/perception/paddle3d/caddn/postprocessor.h"
namespace fastdeploy {
namespace vision {
namespace perception {
/*! @brief Caddn model object, used to load a Caddn model exported by Paddle3D.
*/
class FASTDEPLOY_DECL Caddn : public FastDeployModel {
public:
/** \brief Set path of model file and the configuration of runtime.
*
* \param[in] model_file Path of model file, e.g Caddn/model.pdmodel
* \param[in] params_file Path of parameter file, e.g Caddn/model.pdiparams, if the model format is ONNX, this parameter will be ignored
* \param[in] config_file Path of configuration file for deployment, e.g Caddn/infer_cfg.yml
* \param[in] custom_option RuntimeOption for inference, the default will use cpu, and choose the backend defined in "valid_cpu_backends"
* \param[in] model_format Model format of the loaded model, default is Paddle format
*/
Caddn(const std::string& model_file, const std::string& params_file,
const std::string& config_file,
const RuntimeOption& custom_option = RuntimeOption(),
const ModelFormat& model_format = ModelFormat::PADDLE);
std::string ModelName() const { return "Paddle3D/Caddn"; }
/** \brief Predict the perception result for an input image
*
* \param[in] im The input image data, comes from cv::imread(), is a 3-D array with layout HWC, BGR format
* \param[in] input_cam_data The flattened 3x4 trans_cam_to_img matrix, 12 floats
* \param[in] input_lidar_data The flattened 4x4 trans_lidar_to_cam matrix, 16 floats
* \param[in] result The output perception result will be written to this structure
* \return true if the prediction succeeded, otherwise false
*/
virtual bool Predict(const cv::Mat& im,
std::vector<float>& input_cam_data,
std::vector<float>& input_lidar_data,
PerceptionResult* results);
/** \brief Predict the perception results for a batch of input images
*
* \param[in] images The input image list, each element comes from cv::imread()
* \param[in] input_cam_data The flattened 3x4 trans_cam_to_img matrix, 12 floats
* \param[in] input_lidar_data The flattened 4x4 trans_lidar_to_cam matrix, 16 floats
* \param[in] results The output perception result list
* \return true if the prediction succeeded, otherwise false
*/
virtual bool BatchPredict(const std::vector<cv::Mat>& images,
std::vector<float>& input_cam_data,
std::vector<float>& input_lidar_data,
std::vector<PerceptionResult>* results);
/// Get preprocessor reference of Caddn
virtual CaddnPreprocessor& GetPreprocessor() {
return preprocessor_;
}
/// Get postprocessor reference of Caddn
virtual CaddnPostprocessor& GetPostprocessor() {
return postprocessor_;
}
protected:
bool Initialize();
CaddnPreprocessor preprocessor_;
CaddnPostprocessor postprocessor_;
bool initialized_ = false;
};
} // namespace perception
} // namespace vision
} // namespace fastdeploy

View File

@@ -0,0 +1,96 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/pybind/main.h"
namespace fastdeploy {
void BindCaddn(pybind11::module& m) {
pybind11::class_<vision::perception::CaddnPreprocessor,
vision::ProcessorManager>(m, "CaddnPreprocessor")
.def(pybind11::init<std::string>())
.def("run",
[](vision::perception::CaddnPreprocessor& self,
std::vector<pybind11::array>& im_list,
std::vector<float>& cam_data, std::vector<float>& lidar_data) {
std::vector<vision::FDMat> images;
for (size_t i = 0; i < im_list.size(); ++i) {
images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
}
std::vector<FDTensor> outputs;
if (!self.Run(&images, cam_data, lidar_data, &outputs)) {
throw std::runtime_error(
"Failed to preprocess the input data in CaddnPreprocessor.");
}
for (size_t i = 0; i < outputs.size(); ++i) {
outputs[i].StopSharing();
}
return outputs;
});
pybind11::class_<vision::perception::CaddnPostprocessor>(m,
"CaddnPostprocessor")
.def(pybind11::init<>())
.def("run",
[](vision::perception::CaddnPostprocessor& self,
std::vector<FDTensor>& inputs) {
std::vector<vision::PerceptionResult> results;
if (!self.Run(inputs, &results)) {
throw std::runtime_error(
"Failed to postprocess the runtime result in "
"CaddnPostprocessor.");
}
return results;
})
.def("run", [](vision::perception::CaddnPostprocessor& self,
std::vector<pybind11::array>& input_array) {
std::vector<vision::PerceptionResult> results;
std::vector<FDTensor> inputs;
PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true);
if (!self.Run(inputs, &results)) {
throw std::runtime_error(
"Failed to postprocess the runtime result in "
"CaddnPostprocessor.");
}
return results;
});
pybind11::class_<vision::perception::Caddn, FastDeployModel>(m, "Caddn")
.def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
ModelFormat>())
.def("predict",
[](vision::perception::Caddn& self, pybind11::array& data,
std::vector<float>& cam_data, std::vector<float>& lidar_data) {
auto mat = PyArrayToCvMat(data);
vision::PerceptionResult res;
self.Predict(mat, cam_data, lidar_data, &res);
return res;
})
.def("batch_predict",
[](vision::perception::Caddn& self,
std::vector<pybind11::array>& data, std::vector<float>& cam_data,
std::vector<float>& lidar_data) {
std::vector<cv::Mat> images;
for (size_t i = 0; i < data.size(); ++i) {
images.push_back(PyArrayToCvMat(data[i]));
}
std::vector<vision::PerceptionResult> results;
self.BatchPredict(images, cam_data, lidar_data, &results);
return results;
})
.def_property_readonly("preprocessor",
&vision::perception::Caddn::GetPreprocessor)
.def_property_readonly("postprocessor",
&vision::perception::Caddn::GetPostprocessor);
}
} // namespace fastdeploy

View File

@@ -0,0 +1,70 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/vision/perception/paddle3d/caddn/postprocessor.h"
#include "fastdeploy/vision/utils/utils.h"
namespace fastdeploy {
namespace vision {
namespace perception {
CaddnPostprocessor::CaddnPostprocessor() {}
bool CaddnPostprocessor::Run(const std::vector<FDTensor>& tensors,
std::vector<PerceptionResult>* results) {
results->resize(1);
(*results)[0].Clear();
(*results)[0].Reserve(tensors[0].shape[0]);
if (tensors[0].dtype != FDDataType::FP32) {
FDERROR << "Only support post process with float32 data." << std::endl;
return false;
}
const float* data_0 = reinterpret_cast<const float*>(tensors[0].Data());
auto result = &(*results)[0];
for (int i = 0; i < tensors[0].shape[0] * tensors[0].shape[1]; i += 7) {
// item 1 ~ 3 : box3d bottom center x, y, z
// item 4 ~ 6 : box3d w, h, l
// item 7 : box3d yaw angle
std::vector<float> vec(data_0 + i, data_0 + i + 7);
result->boxes.emplace_back(
std::array<float, 7>{0, 0, 0, 0, vec[3], vec[4], vec[5]});
result->center.emplace_back(std::array<float, 3>{vec[0], vec[1], vec[2]});
result->yaw_angle.push_back(vec[6]);
}
const float* data_1 = reinterpret_cast<const float*>(tensors[2].Data());
for (int i = 0; i < tensors[2].shape[0]; i += 1) {
std::vector<float> vec(data_1 + i, data_1 + i + 1);
result->scores.push_back(vec[0]);
}
const float* data_2 = reinterpret_cast<const float*>(tensors[1].Data());
for (int i = 0; i < tensors[1].shape[0]; i++) {
std::vector<float> vec(data_2 + i, data_2 + i + 1);
result->label_ids.push_back(vec[0]);
}
result->valid.push_back(true); // 0 scores
result->valid.push_back(true); // 1 label_ids
result->valid.push_back(true); // 2 boxes
result->valid.push_back(true); // 3 center
result->valid.push_back(false); // 4 observation_angle
result->valid.push_back(true); // 5 yaw_angle
result->valid.push_back(false); // 6 velocity
return true;
}
} // namespace perception
} // namespace vision
} // namespace fastdeploy
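For debugging, the tensor layout consumed by CaddnPostprocessor::Run can be mirrored in numpy. A minimal sketch under the same assumptions as the C++ code above: tensors[0] is an (N, 7) float box tensor laid out as [x, y, z, w, h, l, yaw], tensors[1] holds N label ids, and tensors[2] holds N scores; the helper name is ours:

import numpy as np

def decode_caddn_outputs(boxes_nx7, labels_n, scores_n):
    # boxes_nx7: (N, 7) float32, columns [x, y, z, w, h, l, yaw]
    # labels_n:  (N,)   class ids
    # scores_n:  (N,)   confidences
    return {
        "center": boxes_nx7[:, 0:3],   # box3d bottom center x, y, z
        "whl": boxes_nx7[:, 3:6],      # box3d w, h, l
        "yaw_angle": boxes_nx7[:, 6],  # box3d yaw angle
        "label_ids": labels_n.astype(np.int64),
        "scores": scores_n.astype(np.float32),
    }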

View File

@@ -0,0 +1,48 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "fastdeploy/vision/common/processors/transform.h"
#include "fastdeploy/vision/common/result.h"
namespace fastdeploy {
namespace vision {
namespace perception {
/*! @brief Postprocessor object for the Caddn series of models.
*/
class FASTDEPLOY_DECL CaddnPostprocessor {
public:
/** \brief Create a postprocessor instance for Caddn serials model
*/
CaddnPostprocessor();
/** \brief Process the inference results from the runtime and fill them into PerceptionResult structures
*
* \param[in] tensors The inference results from the runtime
* \param[in] results The output list of perception results
* \return true if the postprocess succeeded, otherwise false
*/
bool Run(const std::vector<FDTensor>& tensors,
std::vector<PerceptionResult>* results);
protected:
float conf_threshold_;
};
} // namespace perception
} // namespace vision
} // namespace fastdeploy

View File

@@ -0,0 +1,112 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/vision/perception/paddle3d/caddn/preprocessor.h"
#include "fastdeploy/function/concat.h"
#include "yaml-cpp/yaml.h"
namespace fastdeploy {
namespace vision {
namespace perception {
CaddnPreprocessor::CaddnPreprocessor(const std::string& config_file) {
config_file_ = config_file;
FDASSERT(BuildPreprocessPipeline(),
"Failed to create Paddle3DDetPreprocessor.");
initialized_ = true;
}
bool CaddnPreprocessor::BuildPreprocessPipeline() {
processors_.clear();
// preprocess
processors_.push_back(std::make_shared<BGR2RGB>());
std::vector<float> alpha = {1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0};
std::vector<float> beta = {0.0, 0.0, 0.0};
processors_.push_back(std::make_shared<Convert>(alpha, beta));
processors_.push_back(std::make_shared<Cast>("float"));
processors_.push_back(std::make_shared<HWC2CHW>());
// Fusion will improve performance
FuseTransforms(&processors_);
return true;
}
bool CaddnPreprocessor::Apply(FDMatBatch* image_batch,
std::vector<float>& input_cam_data,
std::vector<float>& input_lidar_data,
std::vector<FDTensor>* outputs) {
if (image_batch->mats->empty()) {
FDERROR << "The size of input images should be greater than 0."
<< std::endl;
return false;
}
if (!initialized_) {
FDERROR << "The preprocessor is not initialized." << std::endl;
return false;
}
// There are 3 outputs, image, cam_data, lidar_data
outputs->resize(3);
int batch = static_cast<int>(image_batch->mats->size());
// Allocate memory for cam_data
(*outputs)[1].Resize({batch, 3, 4}, FDDataType::FP32);
// Allocate memory for lidar_data
(*outputs)[2].Resize({batch, 4, 4}, FDDataType::FP32);
auto* cam_data_ptr = reinterpret_cast<float*>((*outputs)[1].MutableData());
auto* lidar_data_ptr = reinterpret_cast<float*>((*outputs)[2].MutableData());
for (size_t i = 0; i < image_batch->mats->size(); ++i) {
FDMat* mat = &(image_batch->mats->at(i));
for (size_t j = 0; j < processors_.size(); ++j) {
if (!(*(processors_[j].get()))(mat)) {
FDERROR << "Failed to processs image:" << i << " in "
<< processors_[j]->Name() << "." << std::endl;
return false;
}
}
memcpy(cam_data_ptr + i * 12, input_cam_data.data(), 12 * sizeof(float));
memcpy(lidar_data_ptr + i * 16, input_lidar_data.data(),
16 * sizeof(float));
}
FDTensor* tensor = image_batch->Tensor();
(*outputs)[0].SetExternalData(tensor->Shape(), tensor->Dtype(),
tensor->Data(), tensor->device,
tensor->device_id);
return true;
}
bool CaddnPreprocessor::Run(std::vector<FDMat>* images,
std::vector<float>& input_cam_data,
std::vector<float>& input_lidar_data,
std::vector<FDTensor>* outputs) {
FDMatBatch image_batch(images);
PreApply(&image_batch);
bool ret = Apply(&image_batch, input_cam_data, input_lidar_data, outputs);
PostApply();
return ret;
}
} // namespace perception
} // namespace vision
} // namespace fastdeploy
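The image branch of the pipeline built above is just BGR→RGB, scale to [0, 1], cast to float, and HWC→CHW, with the two transform matrices copied in alongside. A numpy sketch of the equivalent preprocessing for a single-image batch; the helper name is ours, not part of the library:

import numpy as np

def caddn_preprocess(bgr_image, cam_data, lidar_data):
    # cam_data:   12 floats, the flattened (3, 4) trans_cam_to_img matrix
    # lidar_data: 16 floats, the flattened (4, 4) trans_lidar_to_cam matrix
    rgb = bgr_image[:, :, ::-1].astype(np.float32)  # BGR2RGB + Cast
    rgb *= 1.0 / 255.0                              # Convert(alpha=1/255, beta=0)
    chw = np.transpose(rgb, (2, 0, 1))              # HWC2CHW
    images = chw[np.newaxis, ...]                   # add batch dim
    cam = np.asarray(cam_data, np.float32).reshape(1, 3, 4)
    lidar = np.asarray(lidar_data, np.float32).reshape(1, 4, 4)
    return images, cam, lidar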

View File

@@ -0,0 +1,69 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "fastdeploy/vision/common/processors/manager.h"
#include "fastdeploy/vision/common/processors/transform.h"
#include "fastdeploy/vision/common/result.h"
namespace fastdeploy {
namespace vision {
namespace perception {
/*! @brief Preprocessor object for the Caddn series of models.
*/
class FASTDEPLOY_DECL CaddnPreprocessor : public ProcessorManager {
public:
CaddnPreprocessor() = default;
/** \brief Create a preprocessor instance for Caddn model
*
* \param[in] config_file Path of configuration file for deployment, e.g Caddn/infer_cfg.yml
*/
explicit CaddnPreprocessor(const std::string& config_file);
bool Run(std::vector<FDMat>* images,
std::vector<float>& input_cam_data,
std::vector<float>& input_lidar_data,
std::vector<FDTensor>* outputs);
/** \brief Process the input image and prepare input tensors for runtime
*
* \param[in] images The input image data list, all the elements are returned by cv::imread()
* \param[in] outputs The output tensors which will be fed into the runtime
* \return true if the preprocess succeeded, otherwise false
*/
bool Apply(FDMatBatch* image_batch, std::vector<FDTensor>* outputs) {
FDERROR << "CaddnPreprocessor requires camera and lidar data as inputs."
<< std::endl;
return false;
};
bool Apply(FDMatBatch* image_batch,
std::vector<float>& input_cam_data,
std::vector<float>& input_lidar_data,
std::vector<FDTensor>* outputs);
protected:
bool BuildPreprocessPipeline();
std::vector<std::shared_ptr<Processor>> processors_;
bool disable_permute_ = false;
bool initialized_ = false;
std::string config_file_;
};
} // namespace perception
} // namespace vision
} // namespace fastdeploy

View File

@@ -24,8 +24,7 @@ CenterpointPreprocessor::CenterpointPreprocessor(
bool CenterpointPreprocessor::ReadPoint(const std::string &file_path,
const int64_t num_point_dim,
std::vector<float> &data,
int64_t *num_points) {
std::vector<float> &data, int64_t *num_points) {
std::ifstream file_in(file_path, std::ios::in | std::ios::binary);
if (num_point_dim < 4) {
FDERROR << "Point dimension must not be less than 4, but received "

View File

@@ -19,6 +19,7 @@ namespace fastdeploy {
void BindSmoke(pybind11::module& m);
void BindPetr(pybind11::module& m);
void BindCenterpoint(pybind11::module& m);
void BindCaddn(pybind11::module& m);
void BindPerception(pybind11::module& m) {
auto perception_module =
@@ -26,5 +27,6 @@ void BindPerception(pybind11::module& m) {
BindSmoke(perception_module);
BindPetr(perception_module);
BindCenterpoint(perception_module);
BindCaddn(perception_module);
}
} // namespace fastdeploy

View File

@@ -16,3 +16,4 @@ from __future__ import absolute_import
from .paddle3d.smoke import *
from .paddle3d.petr import *
from .paddle3d.centerpoint import *
from .paddle3d.caddn import *

View File

@@ -0,0 +1,108 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
import logging
from .... import FastDeployModel, ModelFormat
from .... import c_lib_wrap as C
class CaddnPreprocessor:
def __init__(self, config_file):
"""Create a preprocessor for Caddn
"""
self._preprocessor = C.vision.perception.CaddnPreprocessor(config_file)
def run(self, input_ims, cam_data, lidar_data):
"""Preprocess input images for Caddn
:param: input_ims: (list of numpy.ndarray)The input images
:param: cam_data: (list)The input camera data
:param: lidar_data: (list)The input lidar data
:return: list of FDTensor
"""
return self._preprocessor.run(input_ims, cam_data, lidar_data)
class CaddnPostprocessor:
def __init__(self):
"""Create a postprocessor for Caddn
"""
self._postprocessor = C.vision.perception.CaddnPostprocessor()
def run(self, runtime_results):
"""Postprocess the runtime results for Caddn
:param: runtime_results: (list of FDTensor)The output FDTensor results from runtime
:return: list of PerceptionResult(If the runtime_results are predicted from batched samples, the length of this list equals the batch size)
"""
return self._postprocessor.run(runtime_results)
class Caddn(FastDeployModel):
def __init__(self,
model_file,
params_file,
config_file,
runtime_option=None,
model_format=ModelFormat.PADDLE):
"""Load a Caddn model exported by Caddn.
:param model_file: (str)Path of model file, e.g ./Caddn.pdmodel
:param params_file: (str)Path of parameters file, e.g ./Caddn.pdiparams
:param config_file: (str)Path of config file, e.g ./infer_cfg.yaml
:param runtime_option: (fastdeploy.RuntimeOption)RuntimeOption for inference this model, if it's None, will use the default backend on CPU
:param model_format: (fastdeploy.ModelFormat)Model format of the loaded model
"""
super(Caddn, self).__init__(runtime_option)
self._model = C.vision.perception.Caddn(
model_file, params_file, config_file, self._runtime_option,
model_format)
assert self.initialized, "Caddn initialize failed."
def predict(self, input_image, cam_data, lidar_data):
"""Detect an input image
:param input_image: (numpy.ndarray)The input image data, 3-D array with layout HWC, BGR format
:param: cam_data: (list)The input camera data
:param: lidar_data: (list)The input lidar data
:return: PerceptionResult
"""
return self._model.predict(input_image, cam_data, lidar_data)
def batch_predict(self, images, cam_data, lidar_data):
"""Classify a batch of input image
:param im: (list of numpy.ndarray) The input image list, each element is a 3-D array with layout HWC, BGR format
:param: cam_data: (list)The input camera data
:param: lidar_data: (list)The input lidar data
:return list of PerceptionResult
"""
return self._model.batch_predict(images, cam_data, lidar_data)
@property
def preprocessor(self):
"""Get CaddnPreprocessor object of the loaded model
:return CaddnPreprocessor
"""
return self._model.preprocessor
@property
def postprocessor(self):
"""Get CaddnPostprocessor object of the loaded model
:return CaddnPostprocessor
"""
return self._model.postprocessor
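Putting the pieces together, here is a hedged end-to-end sketch of the Python API defined above. The model paths are placeholders for your own Paddle3D CADDN export, and the transform values are dummies standing in for real calibration matrices:

import cv2
import fastdeploy as fd

# placeholder paths - replace with your own Paddle3D CADDN export
model_file = "caddn/caddn.pdmodel"
params_file = "caddn/caddn.pdiparams"
config_file = "caddn/infer_cfg.yml"

option = fd.RuntimeOption()
option.use_gpu()  # Caddn lists Backend::PDINFER as its valid GPU backend

model = fd.vision.perception.Caddn(model_file, params_file, config_file,
                                   runtime_option=option)

im = cv2.imread("test_image.png")
cam_data = [1.0] * 12    # flattened 3x4 trans_cam_to_img (placeholder values)
lidar_data = [1.0] * 16  # flattened 4x4 trans_lidar_to_cam (placeholder values)

result = model.predict(im, cam_data, lidar_data)
print(result)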