Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-06 09:07:10 +08:00
[CVCUDA] Utilize CV-CUDA batch processing function (#1223)
* norm and permute batch processing
* move cache to mat, batch processors
* get batched tensor logic, resize on cpu logic
* fix cpu compile error
* remove vector mat api
* nits
* add comments
* nits
* fix batch size
* move initial resize on cpu option to use_cuda api
* fix pybind
* processor manager pybind
* rename mat and matbatch
* move initial resize on cpu to ppcls preprocessor

---------

Co-authored-by: Jason <jiangjiajun@baidu.com>
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "fastdeploy/core/fd_tensor.h"
-#include "fastdeploy/core/float16.h"
-#include "fastdeploy/utils/utils.h"

 #include <algorithm>
 #include <cstring>

+#include "fastdeploy/core/float16.h"
+#include "fastdeploy/utils/utils.h"
 #ifdef WITH_GPU
 #include <cuda_runtime_api.h>
 #endif
@@ -142,6 +143,9 @@ void FDTensor::Resize(const std::vector<int64_t>& new_shape,
                       const FDDataType& data_type,
                       const std::string& tensor_name,
                       const Device& new_device) {
+  if (device != new_device) {
+    FreeFn();
+  }
   external_data_ptr = nullptr;
   name = tensor_name;
   device = new_device;
@@ -269,9 +273,10 @@ bool FDTensor::ReallocFn(size_t nbytes) {
     }
     return buffer_ != nullptr;
 #else
-    FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under "
-                    "-DWITH_GPU=ON,"
-                    "so this is an unexpected problem happend.");
+    FDASSERT(false,
+             "The FastDeploy FDTensor allocator didn't compile under "
+             "-DWITH_GPU=ON,"
+             "so this is an unexpected problem happend.");
 #endif
   } else {
     if (is_pinned_memory) {
@@ -285,9 +290,10 @@ bool FDTensor::ReallocFn(size_t nbytes) {
       }
       return buffer_ != nullptr;
 #else
-      FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under "
-                      "-DWITH_GPU=ON,"
-                      "so this is an unexpected problem happend.");
+      FDASSERT(false,
+               "The FastDeploy FDTensor allocator didn't compile under "
+               "-DWITH_GPU=ON,"
+               "so this is an unexpected problem happend.");
 #endif
     }
     buffer_ = realloc(buffer_, nbytes);
@@ -296,8 +302,7 @@ bool FDTensor::ReallocFn(size_t nbytes) {
 }

 void FDTensor::FreeFn() {
-  if (external_data_ptr != nullptr)
-    external_data_ptr = nullptr;
+  if (external_data_ptr != nullptr) external_data_ptr = nullptr;
   if (buffer_ != nullptr) {
     if (device == Device::GPU) {
 #ifdef WITH_GPU
@@ -381,13 +386,16 @@ FDTensor::FDTensor(const Scalar& scalar) {
       (reinterpret_cast<double*>(Data()))[0] = scalar.to<double>();
       break;
     default:
       break;
   }
 }

 FDTensor::FDTensor(const FDTensor& other)
-    : shape(other.shape), name(other.name), dtype(other.dtype),
-      device(other.device), external_data_ptr(other.external_data_ptr),
+    : shape(other.shape),
+      name(other.name),
+      dtype(other.dtype),
+      device(other.device),
+      external_data_ptr(other.external_data_ptr),
       device_id(other.device_id) {
   // Copy buffer
   if (other.buffer_ == nullptr) {
@@ -401,9 +409,12 @@ FDTensor::FDTensor(const FDTensor& other)
 }

 FDTensor::FDTensor(FDTensor&& other)
-    : buffer_(other.buffer_), shape(std::move(other.shape)),
-      name(std::move(other.name)), dtype(other.dtype),
-      external_data_ptr(other.external_data_ptr), device(other.device),
+    : buffer_(other.buffer_),
+      shape(std::move(other.shape)),
+      name(std::move(other.name)),
+      dtype(other.dtype),
+      external_data_ptr(other.external_data_ptr),
+      device(other.device),
       device_id(other.device_id) {
   other.name = "";
   // Note(zhoushunjie): Avoid double free.
@@ -15,33 +15,9 @@

 namespace fastdeploy {
 void BindPaddleClas(pybind11::module& m) {
-  pybind11::class_<vision::classification::PaddleClasPreprocessor>(
-      m, "PaddleClasPreprocessor")
+  pybind11::class_<vision::classification::PaddleClasPreprocessor,
+                   vision::ProcessorManager>(m, "PaddleClasPreprocessor")
       .def(pybind11::init<std::string>())
-      .def("run",
-           [](vision::classification::PaddleClasPreprocessor& self,
-              std::vector<pybind11::array>& im_list) {
-             std::vector<vision::FDMat> images;
-             for (size_t i = 0; i < im_list.size(); ++i) {
-               images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
-             }
-             std::vector<FDTensor> outputs;
-             if (!self.Run(&images, &outputs)) {
-               throw std::runtime_error(
-                   "Failed to preprocess the input data in "
-                   "PaddleClasPreprocessor.");
-             }
-             if (!self.CudaUsed()) {
-               for (size_t i = 0; i < outputs.size(); ++i) {
-                 outputs[i].StopSharing();
-               }
-             }
-             return outputs;
-           })
-      .def("use_cuda",
-           [](vision::classification::PaddleClasPreprocessor& self,
-              bool enable_cv_cuda = false,
-              int gpu_id = -1) { self.UseCuda(enable_cv_cuda, gpu_id); })
       .def("disable_normalize",
            [](vision::classification::PaddleClasPreprocessor& self) {
              self.DisableNormalize();
@@ -49,6 +25,10 @@ void BindPaddleClas(pybind11::module& m) {
       .def("disable_permute",
            [](vision::classification::PaddleClasPreprocessor& self) {
              self.DisablePermute();
+           })
+      .def("initial_resize_on_cpu",
+           [](vision::classification::PaddleClasPreprocessor& self, bool v) {
+             self.InitialResizeOnCpu(v);
            });

   pybind11::class_<vision::classification::PaddleClasPostprocessor>(
@@ -100,32 +100,23 @@ void PaddleClasPreprocessor::DisablePermute() {
   }
 }

-bool PaddleClasPreprocessor::Apply(std::vector<FDMat>* images,
+bool PaddleClasPreprocessor::Apply(FDMatBatch* image_batch,
                                    std::vector<FDTensor>* outputs) {
-  for (size_t i = 0; i < images->size(); ++i) {
-    for (size_t j = 0; j < processors_.size(); ++j) {
-      bool ret = false;
-      ret = (*(processors_[j].get()))(&((*images)[i]));
-      if (!ret) {
-        FDERROR << "Failed to processs image:" << i << " in "
-                << processors_[j]->Name() << "." << std::endl;
-        return false;
-      }
+  for (size_t j = 0; j < processors_.size(); ++j) {
+    ProcLib lib = ProcLib::DEFAULT;
+    if (initial_resize_on_cpu_ && j == 0 &&
+        processors_[j]->Name().find("Resize") == 0) {
+      lib = ProcLib::OPENCV;
+    }
+    if (!(*(processors_[j].get()))(image_batch, lib)) {
+      FDERROR << "Failed to processs image in " << processors_[j]->Name() << "."
+              << std::endl;
+      return false;
     }
   }

   outputs->resize(1);
-  // Concat all the preprocessed data to a batch tensor
-  std::vector<FDTensor> tensors(images->size());
-  for (size_t i = 0; i < images->size(); ++i) {
-    (*images)[i].ShareWithTensor(&(tensors[i]));
-    tensors[i].ExpandDim(0);
-  }
-  if (tensors.size() == 1) {
-    (*outputs)[0] = std::move(tensors[0]);
-  } else {
-    function::Concat(tensors, &((*outputs)[0]), 0);
-  }
+  (*outputs)[0] = std::move(*(image_batch->Tensor()));
   (*outputs)[0].device_id = DeviceId();
   return true;
 }
@@ -33,11 +33,11 @@ class FASTDEPLOY_DECL PaddleClasPreprocessor : public ProcessorManager {

   /** \brief Process the input image and prepare input tensors for runtime
    *
-   * \param[in] images The input image data list, all the elements are returned by cv::imread()
+   * \param[in] image_batch The input image batch
    * \param[in] outputs The output tensors which will feed in runtime
    * \return true if the preprocess successed, otherwise false
    */
-  virtual bool Apply(std::vector<FDMat>* images,
+  virtual bool Apply(FDMatBatch* image_batch,
                      std::vector<FDTensor>* outputs);

   /// This function will disable normalize in preprocessing step.
@@ -45,6 +45,14 @@ class FASTDEPLOY_DECL PaddleClasPreprocessor : public ProcessorManager {
   /// This function will disable hwc2chw in preprocessing step.
   void DisablePermute();

+  /** \brief When the initial operator is Resize, and input image size is large,
+   * maybe it's better to run resize on CPU, because the HostToDevice memcpy
+   * is time consuming. Set this true to run the initial resize on CPU.
+   *
+   * \param[in] v ture or false
+   */
+  void InitialResizeOnCpu(bool v) { initial_resize_on_cpu_ = v; }
+
  private:
   bool BuildPreprocessPipelineFromConfig();
   std::vector<std::shared_ptr<Processor>> processors_;
@@ -54,6 +62,7 @@ class FASTDEPLOY_DECL PaddleClasPreprocessor : public ProcessorManager {
   bool disable_normalize_ = false;
   // read config file
   std::string config_file_;
+  bool initial_resize_on_cpu_ = false;
 };

 } // namespace classification
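For orientation, here is a minimal C++ usage sketch of the options introduced above. It is not part of the commit; the config path and image files are placeholders, and the include paths are assumed from the FastDeploy source tree.

// Hypothetical end-to-end usage of UseCuda() and InitialResizeOnCpu().
#include "fastdeploy/vision.h"
#include "opencv2/opencv.hpp"

int main() {
  namespace cls = fastdeploy::vision::classification;
  cls::PaddleClasPreprocessor preprocessor("inference_cls.yaml");  // placeholder config path

  // Run the pipeline with CV-CUDA on GPU 0 (pass false to use plain CUDA kernels).
  preprocessor.UseCuda(/*enable_cv_cuda=*/true, /*gpu_id=*/0);
  // Keep the leading Resize on the CPU to shrink the HostToDevice copy for large inputs.
  preprocessor.InitialResizeOnCpu(true);

  std::vector<cv::Mat> frames = {cv::imread("a.jpg"), cv::imread("b.jpg")};  // placeholders
  std::vector<fastdeploy::vision::FDMat> mats = fastdeploy::vision::WrapMat(frames);
  std::vector<fastdeploy::FDTensor> outputs;
  if (!preprocessor.Run(&mats, &outputs)) {
    return -1;
  }
  // outputs[0] now holds a single batched tensor covering both images.
  return 0;
}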
@@ -20,7 +20,7 @@
 namespace fastdeploy {
 namespace vision {

-bool Processor::operator()(Mat* mat, ProcLib lib) {
+bool Processor::operator()(FDMat* mat, ProcLib lib) {
   ProcLib target = lib;
   if (lib == ProcLib::DEFAULT) {
     target = DefaultProcLib::default_lib;
@@ -52,39 +52,38 @@ bool Processor::operator()(Mat* mat, ProcLib lib) {
   return ImplByOpenCV(mat);
 }

-FDTensor* Processor::UpdateAndGetCachedTensor(
-    const std::vector<int64_t>& new_shape, const FDDataType& data_type,
-    const std::string& tensor_name, const Device& new_device,
-    const bool& use_pinned_memory) {
-  if (cached_tensors_.count(tensor_name) == 0) {
-    cached_tensors_[tensor_name] = FDTensor();
-  }
-  cached_tensors_[tensor_name].is_pinned_memory = use_pinned_memory;
-  cached_tensors_[tensor_name].Resize(new_shape, data_type, tensor_name,
-                                      new_device);
-  return &cached_tensors_[tensor_name];
-}
-
-FDTensor* Processor::CreateCachedGpuInputTensor(
-    Mat* mat, const std::string& tensor_name) {
-#ifdef WITH_GPU
-  FDTensor* src = mat->Tensor();
-  if (src->device == Device::GPU) {
-    return src;
-  } else if (src->device == Device::CPU) {
-    FDTensor* tensor = UpdateAndGetCachedTensor(src->Shape(), src->Dtype(),
-                                                tensor_name, Device::GPU);
-    FDASSERT(cudaMemcpyAsync(tensor->Data(), src->Data(), tensor->Nbytes(),
-                             cudaMemcpyHostToDevice, mat->Stream()) == 0,
-             "[ERROR] Error occurs while copy memory from CPU to GPU.");
-    return tensor;
-  } else {
-    FDASSERT(false, "FDMat is on unsupported device: %d", src->device);
+bool Processor::operator()(FDMatBatch* mat_batch, ProcLib lib) {
+  ProcLib target = lib;
+  if (lib == ProcLib::DEFAULT) {
+    target = DefaultProcLib::default_lib;
   }
+  if (target == ProcLib::FLYCV) {
+#ifdef ENABLE_FLYCV
+    return ImplByFlyCV(mat_batch);
 #else
-  FDASSERT(false, "FastDeploy didn't compile with WITH_GPU.");
+    FDASSERT(false, "FastDeploy didn't compile with FlyCV.");
 #endif
-  return nullptr;
+  } else if (target == ProcLib::CUDA) {
+#ifdef WITH_GPU
+    FDASSERT(
+        mat_batch->Stream() != nullptr,
+        "CUDA processor requires cuda stream, please set stream for mat_batch");
+    return ImplByCuda(mat_batch);
+#else
+    FDASSERT(false, "FastDeploy didn't compile with WITH_GPU.");
+#endif
+  } else if (target == ProcLib::CVCUDA) {
+#ifdef ENABLE_CVCUDA
+    FDASSERT(mat_batch->Stream() != nullptr,
+             "CV-CUDA processor requires cuda stream, please set stream for "
+             "mat_batch");
+    return ImplByCvCuda(mat_batch);
+#else
+    FDASSERT(false, "FastDeploy didn't compile with CV-CUDA.");
+#endif
+  }
+  // DEFAULT & OPENCV
+  return ImplByOpenCV(mat_batch);
 }

 void EnableFlyCV() {
@@ -16,6 +16,7 @@

 #include "fastdeploy/utils/utils.h"
 #include "fastdeploy/vision/common/processors/mat.h"
+#include "fastdeploy/vision/common/processors/mat_batch.h"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include <unordered_map>
@@ -46,46 +47,63 @@ class FASTDEPLOY_DECL Processor {

   virtual std::string Name() = 0;

-  virtual bool ImplByOpenCV(Mat* mat) {
+  virtual bool ImplByOpenCV(FDMat* mat) {
     FDERROR << Name() << " Not Implement Yet." << std::endl;
     return false;
   }

-  virtual bool ImplByFlyCV(Mat* mat) {
+  virtual bool ImplByOpenCV(FDMatBatch* mat_batch) {
+    for (size_t i = 0; i < mat_batch->mats->size(); ++i) {
+      if (ImplByOpenCV(&(*(mat_batch->mats))[i]) != true) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  virtual bool ImplByFlyCV(FDMat* mat) {
     return ImplByOpenCV(mat);
   }

-  virtual bool ImplByCuda(Mat* mat) {
+  virtual bool ImplByFlyCV(FDMatBatch* mat_batch) {
+    for (size_t i = 0; i < mat_batch->mats->size(); ++i) {
+      if (ImplByFlyCV(&(*(mat_batch->mats))[i]) != true) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  virtual bool ImplByCuda(FDMat* mat) {
     return ImplByOpenCV(mat);
   }

-  virtual bool ImplByCvCuda(Mat* mat) {
+  virtual bool ImplByCuda(FDMatBatch* mat_batch) {
+    for (size_t i = 0; i < mat_batch->mats->size(); ++i) {
+      if (ImplByCuda(&(*(mat_batch->mats))[i]) != true) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  virtual bool ImplByCvCuda(FDMat* mat) {
     return ImplByOpenCV(mat);
   }

-  virtual bool operator()(Mat* mat, ProcLib lib = ProcLib::DEFAULT);
+  virtual bool ImplByCvCuda(FDMatBatch* mat_batch) {
+    for (size_t i = 0; i < mat_batch->mats->size(); ++i) {
+      if (ImplByCvCuda(&(*(mat_batch->mats))[i]) != true) {
+        return false;
+      }
+    }
+    return true;
+  }

- protected:
-  // Update and get the cached tensor from the cached_tensors_ map.
-  // The tensor is indexed by a string.
-  // If the tensor doesn't exists in the map, then create a new tensor.
-  // If the tensor exists and shape is getting larger, then realloc the buffer.
-  // If the tensor exists and shape is not getting larger, then return the
-  // cached tensor directly.
-  FDTensor* UpdateAndGetCachedTensor(
-      const std::vector<int64_t>& new_shape, const FDDataType& data_type,
-      const std::string& tensor_name, const Device& new_device = Device::CPU,
-      const bool& use_pinned_memory = false);
+  virtual bool operator()(FDMat* mat, ProcLib lib = ProcLib::DEFAULT);

-  // Create an input tensor on GPU and save into cached_tensors_.
-  // If the Mat is on GPU, return the mat->Tensor() directly.
-  // If the Mat is on CPU, then create a cached GPU tensor and copy the mat's
-  // CPU tensor to this new GPU tensor.
-  FDTensor* CreateCachedGpuInputTensor(Mat* mat,
-                                       const std::string& tensor_name);
-
- private:
-  std::unordered_map<std::string, FDTensor> cached_tensors_;
+  virtual bool operator()(FDMatBatch* mat_batch,
+                          ProcLib lib = ProcLib::DEFAULT);
 };

 } // namespace vision
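The batch overloads added above default to looping over every FDMat, so existing single-image processors keep working unchanged; only a processor with a genuinely batched backend needs to override the FDMatBatch variants. Below is a sketch of that pattern with a hypothetical processor (illustrative only, not from this commit; the include path is assumed from the source tree).

// Illustrative only: relies on the inherited per-mat fallback loops and
// specializes just the CV-CUDA batch path.
#include "fastdeploy/vision/common/processors/base.h"

namespace fd = fastdeploy::vision;

class MyProcessor : public fd::Processor {
 public:
  std::string Name() override { return "MyProcessor"; }

  // Single-image OpenCV path; the inherited FDMatBatch overloads call this
  // once per mat when no batched backend is selected.
  bool ImplByOpenCV(fd::FDMat* mat) override {
    // ... operate on mat->GetOpenCVMat() here ...
    return true;
  }

#ifdef ENABLE_CVCUDA
  // Genuinely batched path: one CV-CUDA launch for the whole FDMatBatch.
  bool ImplByCvCuda(fd::FDMatBatch* mat_batch) override {
    // ... e.g. build an nvcv::ImageBatchVarShape from the mats and run one op ...
    return true;
  }
#endif
};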
@@ -23,7 +23,7 @@
 namespace fastdeploy {
 namespace vision {

-bool CenterCrop::ImplByOpenCV(Mat* mat) {
+bool CenterCrop::ImplByOpenCV(FDMat* mat) {
   cv::Mat* im = mat->GetOpenCVMat();
   int height = static_cast<int>(im->rows);
   int width = static_cast<int>(im->cols);
@@ -42,7 +42,7 @@ bool CenterCrop::ImplByOpenCV(Mat* mat) {
 }

 #ifdef ENABLE_FLYCV
-bool CenterCrop::ImplByFlyCV(Mat* mat) {
+bool CenterCrop::ImplByFlyCV(FDMat* mat) {
   fcv::Mat* im = mat->GetFlyCVMat();
   int height = static_cast<int>(im->height());
   int width = static_cast<int>(im->width());
@@ -63,18 +63,15 @@ bool CenterCrop::ImplByFlyCV(Mat* mat) {
 #endif

 #ifdef ENABLE_CVCUDA
-bool CenterCrop::ImplByCvCuda(Mat* mat) {
+bool CenterCrop::ImplByCvCuda(FDMat* mat) {
   // Prepare input tensor
-  std::string tensor_name = Name() + "_cvcuda_src";
-  FDTensor* src = CreateCachedGpuInputTensor(mat, tensor_name);
+  FDTensor* src = CreateCachedGpuInputTensor(mat);
   auto src_tensor = CreateCvCudaTensorWrapData(*src);

   // Prepare output tensor
-  tensor_name = Name() + "_cvcuda_dst";
-  FDTensor* dst =
-      UpdateAndGetCachedTensor({height_, width_, mat->Channels()}, src->Dtype(),
-                               tensor_name, Device::GPU);
-  auto dst_tensor = CreateCvCudaTensorWrapData(*dst);
+  mat->output_cache->Resize({height_, width_, mat->Channels()}, src->Dtype(),
+                            "output_cache", Device::GPU);
+  auto dst_tensor = CreateCvCudaTensorWrapData(*(mat->output_cache));

   int offset_x = static_cast<int>((mat->Width() - width_) / 2);
   int offset_y = static_cast<int>((mat->Height() - height_) / 2);
@@ -82,16 +79,27 @@ bool CenterCrop::ImplByCvCuda(Mat* mat) {
   NVCVRectI crop_roi = {offset_x, offset_y, width_, height_};
   crop_op(mat->Stream(), src_tensor, dst_tensor, crop_roi);

-  mat->SetTensor(dst);
+  mat->SetTensor(mat->output_cache);
   mat->SetWidth(width_);
   mat->SetHeight(height_);
   mat->device = Device::GPU;
   mat->mat_type = ProcLib::CVCUDA;
   return true;
 }
+
+bool CenterCrop::ImplByCvCuda(FDMatBatch* mat_batch) {
+  for (size_t i = 0; i < mat_batch->mats->size(); ++i) {
+    if (ImplByCvCuda(&((*(mat_batch->mats))[i])) != true) {
+      return false;
+    }
+  }
+  mat_batch->device = Device::GPU;
+  mat_batch->mat_type = ProcLib::CVCUDA;
+  return true;
+}
 #endif

-bool CenterCrop::Run(Mat* mat, const int& width, const int& height,
+bool CenterCrop::Run(FDMat* mat, const int& width, const int& height,
                      ProcLib lib) {
   auto c = CenterCrop(width, height);
   return c(mat, lib);
@@ -22,16 +22,17 @@ namespace vision {
 class FASTDEPLOY_DECL CenterCrop : public Processor {
  public:
   CenterCrop(int width, int height) : height_(height), width_(width) {}
-  bool ImplByOpenCV(Mat* mat);
+  bool ImplByOpenCV(FDMat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFlyCV(Mat* mat);
+  bool ImplByFlyCV(FDMat* mat);
 #endif
 #ifdef ENABLE_CVCUDA
-  bool ImplByCvCuda(Mat* mat);
+  bool ImplByCvCuda(FDMat* mat);
+  bool ImplByCvCuda(FDMatBatch* mat_batch);
 #endif
   std::string Name() { return "CenterCrop"; }

-  static bool Run(Mat* mat, const int& width, const int& height,
+  static bool Run(FDMat* mat, const int& width, const int& height,
                   ProcLib lib = ProcLib::DEFAULT);

  private:
@@ -47,17 +47,19 @@ nvcv::TensorWrapData CreateCvCudaTensorWrapData(const FDTensor& tensor) {
            "When create CVCUDA tensor from FD tensor,"
            "tensor shape should be 3-Dim, HWC layout");
   int batchsize = 1;
+  int h = tensor.Shape()[0];
+  int w = tensor.Shape()[1];
+  int c = tensor.Shape()[2];

   nvcv::TensorDataStridedCuda::Buffer buf;
   buf.strides[3] = FDDataTypeSize(tensor.Dtype());
-  buf.strides[2] = tensor.shape[2] * buf.strides[3];
-  buf.strides[1] = tensor.shape[1] * buf.strides[2];
-  buf.strides[0] = tensor.shape[0] * buf.strides[1];
+  buf.strides[2] = c * buf.strides[3];
+  buf.strides[1] = w * buf.strides[2];
+  buf.strides[0] = h * buf.strides[1];
   buf.basePtr = reinterpret_cast<NVCVByte*>(const_cast<void*>(tensor.Data()));

   nvcv::Tensor::Requirements req = nvcv::Tensor::CalcRequirements(
-      batchsize, {tensor.shape[1], tensor.shape[0]},
-      CreateCvCudaImageFormat(tensor.Dtype(), tensor.shape[2]));
+      batchsize, {w, h}, CreateCvCudaImageFormat(tensor.Dtype(), c));

   nvcv::TensorDataStridedCuda tensor_data(
       nvcv::TensorShape{req.shape, req.rank, req.layout},
@@ -70,6 +72,33 @@ void* GetCvCudaTensorDataPtr(const nvcv::TensorWrapData& tensor) {
       dynamic_cast<const nvcv::ITensorDataStridedCuda*>(tensor.exportData());
   return reinterpret_cast<void*>(data->basePtr());
 }
+
+nvcv::ImageWrapData CreateImageWrapData(const FDTensor& tensor) {
+  FDASSERT(tensor.shape.size() == 3,
+           "When create CVCUDA image from FD tensor,"
+           "tensor shape should be 3-Dim, HWC layout");
+  int h = tensor.Shape()[0];
+  int w = tensor.Shape()[1];
+  int c = tensor.Shape()[2];
+  nvcv::ImageDataStridedCuda::Buffer buf;
+  buf.numPlanes = 1;
+  buf.planes[0].width = w;
+  buf.planes[0].height = h;
+  buf.planes[0].rowStride = w * c * FDDataTypeSize(tensor.Dtype());
+  buf.planes[0].basePtr =
+      reinterpret_cast<NVCVByte*>(const_cast<void*>(tensor.Data()));
+  nvcv::ImageWrapData nvimg{nvcv::ImageDataStridedCuda{
+      nvcv::ImageFormat{CreateCvCudaImageFormat(tensor.Dtype(), c)}, buf}};
+  return nvimg;
+}
+
+void CreateCvCudaImageBatchVarShape(std::vector<FDTensor*>& tensors,
+                                    nvcv::ImageBatchVarShape& img_batch) {
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    FDASSERT(tensors[i]->device == Device::GPU, "Tensor must on GPU.");
+    img_batch.pushBack(CreateImageWrapData(*(tensors[i])));
+  }
+}
 #endif

 } // namespace vision
@@ -18,6 +18,7 @@

 #ifdef ENABLE_CVCUDA
 #include "nvcv/Tensor.hpp"
+#include <nvcv/ImageBatch.hpp>

 namespace fastdeploy {
 namespace vision {
@@ -25,7 +26,10 @@ namespace vision {
 nvcv::ImageFormat CreateCvCudaImageFormat(FDDataType type, int channel);
 nvcv::TensorWrapData CreateCvCudaTensorWrapData(const FDTensor& tensor);
 void* GetCvCudaTensorDataPtr(const nvcv::TensorWrapData& tensor);
+nvcv::ImageWrapData CreateImageWrapData(const FDTensor& tensor);
+void CreateCvCudaImageBatchVarShape(std::vector<FDTensor*>& tensors,
+                                    nvcv::ImageBatchVarShape& img_batch);

-}
-}
+} // namespace vision
+} // namespace fastdeploy
 #endif
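As a quick check of the stride arithmetic above, here is a standalone sketch for an assumed 224x224x3 uint8 HWC tensor (element size 1 byte); the numbers are illustrative only and independent of the nvcv types.

// Standalone illustration of the HWC stride math used above. Not part of the commit.
#include <cstdio>

int main() {
  const int h = 224, w = 224, c = 3;
  const long long elem = 1;      // FDDataTypeSize(UINT8) == 1 byte
  long long strides[4];
  strides[3] = elem;             // stride between elements within a pixel:      1
  strides[2] = c * strides[3];   // stride between pixels in a row:              3
  strides[1] = w * strides[2];   // stride between rows:                       672
  strides[0] = h * strides[1];   // stride between images in the (N=1) batch: 150528
  std::printf("%lld %lld %lld %lld\n", strides[0], strides[1], strides[2], strides[3]);
  return 0;
}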
@@ -62,13 +62,24 @@ bool ProcessorManager::Run(std::vector<FDMat>* images,
     return false;
   }

-  for (size_t i = 0; i < images->size(); ++i) {
-    if (CudaUsed()) {
-      SetStream(&((*images)[i]));
-    }
+  if (images->size() > input_caches_.size()) {
+    input_caches_.resize(images->size());
+    output_caches_.resize(images->size());
   }

-  bool ret = Apply(images, outputs);
+  FDMatBatch image_batch(images);
+  image_batch.input_cache = &batch_input_cache_;
+  image_batch.output_cache = &batch_output_cache_;
+
+  for (size_t i = 0; i < images->size(); ++i) {
+    if (CudaUsed()) {
+      SetStream(&image_batch);
+    }
+    (*images)[i].input_cache = &input_caches_[i];
+    (*images)[i].output_cache = &output_caches_[i];
+  }
+
+  bool ret = Apply(&image_batch, outputs);

   if (CudaUsed()) {
     SyncStream();
@@ -16,6 +16,7 @@

 #include "fastdeploy/utils/utils.h"
 #include "fastdeploy/vision/common/processors/mat.h"
+#include "fastdeploy/vision/common/processors/mat_batch.h"

 namespace fastdeploy {
 namespace vision {
@@ -24,16 +25,28 @@ class FASTDEPLOY_DECL ProcessorManager {
  public:
   ~ProcessorManager();

+  /** \brief Use CUDA to boost the performance of processors
+   *
+   * \param[in] enable_cv_cuda ture: use CV-CUDA, false: use CUDA only
+   * \param[in] gpu_id GPU device id
+   * \return true if the preprocess successed, otherwise false
+   */
   void UseCuda(bool enable_cv_cuda = false, int gpu_id = -1);

   bool CudaUsed();

-  void SetStream(Mat* mat) {
+  void SetStream(FDMat* mat) {
 #ifdef WITH_GPU
     mat->SetStream(stream_);
 #endif
   }

+  void SetStream(FDMatBatch* mat_batch) {
+#ifdef WITH_GPU
+    mat_batch->SetStream(stream_);
+#endif
+  }
+
   void SyncStream() {
 #ifdef WITH_GPU
     FDASSERT(cudaStreamSynchronize(stream_) == cudaSuccess,
@@ -51,13 +64,13 @@ class FASTDEPLOY_DECL ProcessorManager {
    */
   bool Run(std::vector<FDMat>* images, std::vector<FDTensor>* outputs);

-  /** \brief The body of Run() function which needs to be implemented by a derived class
+  /** \brief Apply() is the body of Run() function, it needs to be implemented by a derived class
    *
-   * \param[in] images The input image data list, all the elements are returned by cv::imread()
+   * \param[in] image_batch The input image batch
    * \param[in] outputs The output tensors which will feed in runtime
    * \return true if the preprocess successed, otherwise false
    */
-  virtual bool Apply(std::vector<FDMat>* images,
+  virtual bool Apply(FDMatBatch* image_batch,
                      std::vector<FDTensor>* outputs) = 0;

  protected:
@@ -68,6 +81,11 @@ class FASTDEPLOY_DECL ProcessorManager {
   cudaStream_t stream_ = nullptr;
 #endif
   int device_id_ = -1;
+
+  std::vector<FDTensor> input_caches_;
+  std::vector<FDTensor> output_caches_;
+  FDTensor batch_input_cache_;
+  FDTensor batch_output_cache_;
 };

 } // namespace vision
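With this interface, a derived preprocessor only has to supply Apply(FDMatBatch*, ...); Run() wires up the per-mat and batch caches and the CUDA stream before calling it. A minimal hypothetical subclass is sketched below, assuming the manager.h and base.h headers shown in this commit; the processors_ member and its contents are up to the concrete preprocessor.

// Hypothetical minimal ProcessorManager subclass, for illustration only.
#include <memory>
#include <utility>
#include <vector>

#include "fastdeploy/vision/common/processors/base.h"
#include "fastdeploy/vision/common/processors/manager.h"

class MyPreprocessor : public fastdeploy::vision::ProcessorManager {
 public:
  bool Apply(fastdeploy::vision::FDMatBatch* image_batch,
             std::vector<fastdeploy::FDTensor>* outputs) override {
    // Run every processor once over the whole batch.
    for (auto& processor : processors_) {
      if (!(*processor)(image_batch)) {
        return false;
      }
    }
    // Hand the single batched tensor back to the caller.
    outputs->resize(1);
    (*outputs)[0] = std::move(*(image_batch->Tensor()));
    (*outputs)[0].device_id = DeviceId();
    return true;
  }

 private:
  std::vector<std::shared_ptr<fastdeploy::vision::Processor>> processors_;
};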
fastdeploy/vision/common/processors/manager_pybind.cc (new file, 41 lines)
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindProcessorManager(pybind11::module& m) {
+  pybind11::class_<vision::ProcessorManager>(m, "ProcessorManager")
+      .def("run",
+           [](vision::ProcessorManager& self,
+              std::vector<pybind11::array>& im_list) {
+             std::vector<vision::FDMat> images;
+             for (size_t i = 0; i < im_list.size(); ++i) {
+               images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
+             }
+             std::vector<FDTensor> outputs;
+             if (!self.Run(&images, &outputs)) {
+               throw std::runtime_error("Failed to process the input data");
+             }
+             if (!self.CudaUsed()) {
+               for (size_t i = 0; i < outputs.size(); ++i) {
+                 outputs[i].StopSharing();
+               }
+             }
+             return outputs;
+           })
+      .def("use_cuda",
+           [](vision::ProcessorManager& self, bool enable_cv_cuda = false,
+              int gpu_id = -1) { self.UseCuda(enable_cv_cuda, gpu_id); });
+}
+} // namespace fastdeploy
@@ -247,5 +247,40 @@ std::vector<FDMat> WrapMat(const std::vector<cv::Mat>& images) {
   return mats;
 }

+bool CheckShapeConsistency(std::vector<Mat>* mats) {
+  for (size_t i = 1; i < mats->size(); ++i) {
+    if ((*mats)[i].Channels() != (*mats)[0].Channels() ||
+        (*mats)[i].Width() != (*mats)[0].Width() ||
+        (*mats)[i].Height() != (*mats)[0].Height()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+FDTensor* CreateCachedGpuInputTensor(Mat* mat) {
+#ifdef WITH_GPU
+  FDTensor* src = mat->Tensor();
+  if (src->device == Device::GPU) {
+    return src;
+  } else if (src->device == Device::CPU) {
+    // Mats on CPU, we need copy these tensors from CPU to GPU
+    FDASSERT(src->Shape().size() == 3, "The CPU tensor must has 3 dims.")
+    mat->input_cache->Resize(src->Shape(), src->Dtype(), "input_cache",
+                             Device::GPU);
+    FDASSERT(
+        cudaMemcpyAsync(mat->input_cache->Data(), src->Data(), src->Nbytes(),
+                        cudaMemcpyHostToDevice, mat->Stream()) == 0,
+        "[ERROR] Error occurs while copy memory from CPU to GPU.");
+    return mat->input_cache;
+  } else {
+    FDASSERT(false, "FDMat is on unsupported device: %d", src->device);
+  }
+#else
+  FDASSERT(false, "FastDeploy didn't compile with WITH_GPU.");
+#endif
+  return nullptr;
+}
+
 } // namespace vision
 } // namespace fastdeploy
@@ -119,6 +119,11 @@ struct FASTDEPLOY_DECL Mat {
   void SetChannels(int s) { channels = s; }
   void SetWidth(int w) { width = w; }
   void SetHeight(int h) { height = h; }
+
+  // When using CV-CUDA/CUDA, please set input/output cache,
+  // refer to manager.cc
+  FDTensor* input_cache = nullptr;
+  FDTensor* output_cache = nullptr;
 #ifdef WITH_GPU
   cudaStream_t Stream() const { return stream; }
   void SetStream(cudaStream_t s) { stream = s; }
@@ -165,5 +170,12 @@ FASTDEPLOY_DECL FDMat WrapMat(const cv::Mat& image);
  */
 FASTDEPLOY_DECL std::vector<FDMat> WrapMat(const std::vector<cv::Mat>& images);

+bool CheckShapeConsistency(std::vector<Mat>* mats);
+
+// Create an input tensor on GPU and save into input_cache.
+// If the Mat is on GPU, return the mat->Tensor() directly.
+// If the Mat is on CPU, then update the input cache tensor and copy the mat's
+// CPU tensor to this new GPU input cache tensor.
+FDTensor* CreateCachedGpuInputTensor(Mat* mat);
 } // namespace vision
 } // namespace fastdeploy
fastdeploy/vision/common/processors/mat_batch.cc (new file, 81 lines)
@@ -0,0 +1,81 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "fastdeploy/vision/common/processors/mat_batch.h"
+
+namespace fastdeploy {
+namespace vision {
+
+#ifdef WITH_GPU
+void FDMatBatch::SetStream(cudaStream_t s) {
+  stream = s;
+  for (size_t i = 0; i < mats->size(); ++i) {
+    (*mats)[i].SetStream(s);
+  }
+}
+#endif
+
+FDTensor* FDMatBatch::Tensor() {
+  if (has_batched_tensor) {
+    return &fd_tensor;
+  }
+  FDASSERT(CheckShapeConsistency(mats), "Mats shapes are not consistent.")
+  // Each mat has its own tensor,
+  // to get a batched tensor, we need copy these tensors to a batched tensor
+  FDTensor* src = (*mats)[0].Tensor();
+  auto new_shape = src->Shape();
+  new_shape.insert(new_shape.begin(), mats->size());
+  input_cache->Resize(new_shape, src->Dtype(), "batch_input_cache", device);
+  for (size_t i = 0; i < mats->size(); ++i) {
+    FDASSERT(device == (*mats)[i].Tensor()->device,
+             "Mats and MatBatch are not on the same device");
+    uint8_t* p = reinterpret_cast<uint8_t*>(input_cache->Data());
+    int num_bytes = (*mats)[i].Tensor()->Nbytes();
+    FDTensor::CopyBuffer(p + i * num_bytes, (*mats)[i].Tensor()->Data(),
+                         num_bytes, device, false);
+  }
+  SetTensor(input_cache);
+  return &fd_tensor;
+}
+
+void FDMatBatch::SetTensor(FDTensor* tensor) {
+  fd_tensor.SetExternalData(tensor->Shape(), tensor->Dtype(), tensor->Data(),
+                            tensor->device, tensor->device_id);
+  has_batched_tensor = true;
+}
+
+FDTensor* CreateCachedGpuInputTensor(FDMatBatch* mat_batch) {
+#ifdef WITH_GPU
+  auto mats = mat_batch->mats;
+  FDASSERT(CheckShapeConsistency(mats), "Mats shapes are not consistent.")
+  FDTensor* src = (*mats)[0].Tensor();
+  if (mat_batch->device == Device::GPU) {
+    return mat_batch->Tensor();
+  } else if (mat_batch->device == Device::CPU) {
+    // Mats on CPU, we need copy them to GPU and then get a batched GPU tensor
+    for (size_t i = 0; i < mats->size(); ++i) {
+      FDTensor* tensor = CreateCachedGpuInputTensor(&(*mats)[i]);
+      (*mats)[i].SetTensor(tensor);
+    }
+    return mat_batch->Tensor();
+  } else {
+    FDASSERT(false, "FDMat is on unsupported device: %d", src->device);
+  }
+#else
+  FDASSERT(false, "FastDeploy didn't compile with WITH_GPU.");
+#endif
+  return nullptr;
+}
+
+} // namespace vision
+} // namespace fastdeploy
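For intuition about what FDMatBatch::Tensor() produces, here is a small standalone sketch with assumed shapes: three identical 224x224x3 uint8 mats become one {3, 224, 224, 3} NHWC tensor, with each mat copied at a fixed byte offset. The numbers are illustrative only.

// Illustration of the batching done by FDMatBatch::Tensor(). Not part of the commit.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Assumed shape of each preprocessed mat: 224x224x3, uint8.
  const std::vector<int64_t> mat_shape = {224, 224, 3};
  const size_t batch = 3;

  std::vector<int64_t> batched_shape = mat_shape;
  batched_shape.insert(batched_shape.begin(), static_cast<int64_t>(batch));
  // batched_shape is now {3, 224, 224, 3}: one contiguous NHWC tensor.

  const size_t nbytes_per_mat = 224 * 224 * 3;  // uint8 -> 1 byte per element
  for (size_t i = 0; i < batch; ++i) {
    // Tensor() copies mat i into the batch cache at this byte offset.
    std::printf("mat %zu -> byte offset %zu\n", i, i * nbytes_per_mat);
  }
  return 0;
}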
fastdeploy/vision/common/processors/mat_batch.h (new file, 76 lines)
@@ -0,0 +1,76 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "fastdeploy/vision/common/processors/mat.h"
+
+#ifdef WITH_GPU
+#include <cuda_runtime_api.h>
+#endif
+
+namespace fastdeploy {
+namespace vision {
+
+enum FDMatBatchLayout { NHWC, NCHW };
+
+struct FASTDEPLOY_DECL FDMatBatch {
+  FDMatBatch() = default;
+
+  // MatBatch is intialized with a list of mats,
+  // the data is stored in the mats separately.
+  // Call Tensor() function to get a batched 4-dimension tensor.
+  explicit FDMatBatch(std::vector<Mat>* _mats) {
+    mats = _mats;
+    layout = FDMatBatchLayout::NHWC;
+    mat_type = ProcLib::OPENCV;
+  }
+
+  // Get the batched 4-dimension tensor.
+  FDTensor* Tensor();
+
+  void SetTensor(FDTensor* tensor);
+
+ private:
+#ifdef WITH_GPU
+  cudaStream_t stream = nullptr;
+#endif
+  FDTensor fd_tensor;
+
+ public:
+  // When using CV-CUDA/CUDA, please set input/output cache,
+  // refer to manager.cc
+  FDTensor* input_cache;
+  FDTensor* output_cache;
+#ifdef WITH_GPU
+  cudaStream_t Stream() const { return stream; }
+  void SetStream(cudaStream_t s);
+#endif
+
+  std::vector<FDMat>* mats;
+  ProcLib mat_type = ProcLib::OPENCV;
+  FDMatBatchLayout layout = FDMatBatchLayout::NHWC;
+  Device device = Device::CPU;
+
+  // False: the data is stored in the mats separately
+  // True: the data is stored in the fd_tensor continuously in 4 dimensions
+  bool has_batched_tensor = false;
+};
+
+// Create a batched input tensor on GPU and save into input_cache.
+// If the MatBatch is on GPU, return the Tensor() directly.
+// If the MatBatch is on CPU, then copy the CPU tensors to GPU and get a GPU
+// batched input tensor.
+FDTensor* CreateCachedGpuInputTensor(FDMatBatch* mat_batch);
+
+} // namespace vision
+} // namespace fastdeploy
@@ -56,7 +56,7 @@ NormalizeAndPermute::NormalizeAndPermute(const std::vector<float>& mean,
   swap_rb_ = swap_rb;
 }

-bool NormalizeAndPermute::ImplByOpenCV(Mat* mat) {
+bool NormalizeAndPermute::ImplByOpenCV(FDMat* mat) {
   cv::Mat* im = mat->GetOpenCVMat();
   int origin_w = im->cols;
   int origin_h = im->rows;
@@ -79,7 +79,7 @@ bool NormalizeAndPermute::ImplByOpenCV(Mat* mat) {
 }

 #ifdef ENABLE_FLYCV
-bool NormalizeAndPermute::ImplByFlyCV(Mat* mat) {
+bool NormalizeAndPermute::ImplByFlyCV(FDMat* mat) {
   if (mat->layout != Layout::HWC) {
     FDERROR << "Only supports input with HWC layout." << std::endl;
     return false;
@@ -109,7 +109,7 @@ bool NormalizeAndPermute::ImplByFlyCV(Mat* mat) {
 }
 #endif

-bool NormalizeAndPermute::Run(Mat* mat, const std::vector<float>& mean,
+bool NormalizeAndPermute::Run(FDMat* mat, const std::vector<float>& mean,
                               const std::vector<float>& std, bool is_scale,
                               const std::vector<float>& min,
                               const std::vector<float>& max, ProcLib lib,
@@ -18,63 +18,110 @@
 namespace fastdeploy {
 namespace vision {

-__global__ void NormalizeAndPermuteKernel(uint8_t* src, float* dst,
+__global__ void NormalizeAndPermuteKernel(const uint8_t* src, float* dst,
                                           const float* alpha, const float* beta,
                                           int num_channel, bool swap_rb,
-                                          int edge) {
+                                          int batch_size, int edge) {
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx >= edge) return;

-  if (swap_rb) {
-    uint8_t tmp = src[num_channel * idx];
-    src[num_channel * idx] = src[num_channel * idx + 2];
-    src[num_channel * idx + 2] = tmp;
-  }
+  int img_size = edge / batch_size;
+  int n = idx / img_size;        // batch index
+  int p = idx - (n * img_size);  // pixel index within the image

   for (int i = 0; i < num_channel; ++i) {
-    dst[idx + edge * i] = src[num_channel * idx + i] * alpha[i] + beta[i];
+    int j = i;
+    if (swap_rb) {
+      j = 2 - i;
+    }
+    dst[n * img_size * num_channel + i * img_size + p] =
+        src[num_channel * idx + j] * alpha[i] + beta[i];
   }
 }

-bool NormalizeAndPermute::ImplByCuda(Mat* mat) {
+bool NormalizeAndPermute::ImplByCuda(FDMat* mat) {
   // Prepare input tensor
-  std::string tensor_name = Name() + "_cvcuda_src";
-  FDTensor* src = CreateCachedGpuInputTensor(mat, tensor_name);
+  FDTensor* src = CreateCachedGpuInputTensor(mat);

   // Prepare output tensor
-  tensor_name = Name() + "_dst";
-  FDTensor* dst = UpdateAndGetCachedTensor(src->Shape(), FDDataType::FP32,
-                                           tensor_name, Device::GPU);
+  mat->output_cache->Resize(src->Shape(), FDDataType::FP32, "output_cache",
+                            Device::GPU);

   // Copy alpha and beta to GPU
-  tensor_name = Name() + "_alpha";
-  FDMat alpha_mat =
-      FDMat::Create(1, 1, alpha_.size(), FDDataType::FP32, alpha_.data());
-  FDTensor* alpha = CreateCachedGpuInputTensor(&alpha_mat, tensor_name);
+  gpu_alpha_.Resize({1, 1, static_cast<int>(alpha_.size())}, FDDataType::FP32,
+                    "alpha", Device::GPU);
+  cudaMemcpy(gpu_alpha_.Data(), alpha_.data(), gpu_alpha_.Nbytes(),
+             cudaMemcpyHostToDevice);

-  tensor_name = Name() + "_beta";
-  FDMat beta_mat =
-      FDMat::Create(1, 1, beta_.size(), FDDataType::FP32, beta_.data());
-  FDTensor* beta = CreateCachedGpuInputTensor(&beta_mat, tensor_name);
+  gpu_beta_.Resize({1, 1, static_cast<int>(beta_.size())}, FDDataType::FP32,
+                   "beta", Device::GPU);
+  cudaMemcpy(gpu_beta_.Data(), beta_.data(), gpu_beta_.Nbytes(),
+             cudaMemcpyHostToDevice);

-  int jobs = mat->Width() * mat->Height();
+  int jobs = 1 * mat->Width() * mat->Height();
   int threads = 256;
   int blocks = ceil(jobs / (float)threads);
   NormalizeAndPermuteKernel<<<blocks, threads, 0, mat->Stream()>>>(
       reinterpret_cast<uint8_t*>(src->Data()),
-      reinterpret_cast<float*>(dst->Data()),
-      reinterpret_cast<float*>(alpha->Data()),
-      reinterpret_cast<float*>(beta->Data()), mat->Channels(), swap_rb_, jobs);
+      reinterpret_cast<float*>(mat->output_cache->Data()),
+      reinterpret_cast<float*>(gpu_alpha_.Data()),
+      reinterpret_cast<float*>(gpu_beta_.Data()), mat->Channels(), swap_rb_, 1,
+      jobs);

-  mat->SetTensor(dst);
+  mat->SetTensor(mat->output_cache);
   mat->device = Device::GPU;
   mat->layout = Layout::CHW;
   mat->mat_type = ProcLib::CUDA;
   return true;
 }

+bool NormalizeAndPermute::ImplByCuda(FDMatBatch* mat_batch) {
+  // Prepare input tensor
+  FDTensor* src = CreateCachedGpuInputTensor(mat_batch);
+
+  // Prepare output tensor
+  mat_batch->output_cache->Resize(src->Shape(), FDDataType::FP32,
+                                  "output_cache", Device::GPU);
+  // NHWC -> NCHW
+  std::swap(mat_batch->output_cache->shape[1],
+            mat_batch->output_cache->shape[3]);
+
+  // Copy alpha and beta to GPU
+  gpu_alpha_.Resize({1, 1, static_cast<int>(alpha_.size())}, FDDataType::FP32,
+                    "alpha", Device::GPU);
+  cudaMemcpy(gpu_alpha_.Data(), alpha_.data(), gpu_alpha_.Nbytes(),
+             cudaMemcpyHostToDevice);
+
+  gpu_beta_.Resize({1, 1, static_cast<int>(beta_.size())}, FDDataType::FP32,
+                   "beta", Device::GPU);
+  cudaMemcpy(gpu_beta_.Data(), beta_.data(), gpu_beta_.Nbytes(),
+             cudaMemcpyHostToDevice);
+
+  int jobs =
+      mat_batch->output_cache->Numel() / mat_batch->output_cache->shape[1];
+  int threads = 256;
+  int blocks = ceil(jobs / (float)threads);
+  NormalizeAndPermuteKernel<<<blocks, threads, 0, mat_batch->Stream()>>>(
+      reinterpret_cast<uint8_t*>(src->Data()),
+      reinterpret_cast<float*>(mat_batch->output_cache->Data()),
+      reinterpret_cast<float*>(gpu_alpha_.Data()),
+      reinterpret_cast<float*>(gpu_beta_.Data()),
+      mat_batch->output_cache->shape[1], swap_rb_,
+      mat_batch->output_cache->shape[0], jobs);
+
+  mat_batch->SetTensor(mat_batch->output_cache);
+  mat_batch->device = Device::GPU;
+  mat_batch->layout = FDMatBatchLayout::NCHW;
+  mat_batch->mat_type = ProcLib::CUDA;
+  return true;
+}
+
 #ifdef ENABLE_CVCUDA
-bool NormalizeAndPermute::ImplByCvCuda(Mat* mat) { return ImplByCuda(mat); }
+bool NormalizeAndPermute::ImplByCvCuda(FDMat* mat) { return ImplByCuda(mat); }
+
+bool NormalizeAndPermute::ImplByCvCuda(FDMatBatch* mat_batch) {
+  return ImplByCuda(mat_batch);
+}
 #endif

 } // namespace vision
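The rewritten kernel folds the NHWC-to-NCHW permute and the optional R/B swap into its index arithmetic. The host-side sketch below replicates that mapping for a single pixel with assumed sizes, which is handy for spot-checking an element; it is not part of the commit.

// Host-side replica of the kernel's index math, for checking one element.
// Assumes a batch of N images of H*W pixels with C channels, NHWC uint8 input
// and NCHW float output, matching the kernel above.
#include <cstdio>

int main() {
  const int N = 2, H = 4, W = 4, C = 3;
  const bool swap_rb = true;
  const int img_size = H * W;       // pixels per image
  const int edge = N * img_size;    // total pixels over the batch

  const int idx = 21;               // a pixel id in [0, edge)
  const int n = idx / img_size;     // batch index (here: 1)
  const int p = idx - n * img_size; // pixel index within image n (here: 5)
  for (int i = 0; i < C; ++i) {
    const int j = swap_rb ? 2 - i : i;                        // source channel (R/B swapped)
    const int src_off = C * idx + j;                          // NHWC input offset
    const int dst_off = n * img_size * C + i * img_size + p;  // NCHW output offset
    std::printf("dst[%d] = src[%d] * alpha[%d] + beta[%d]\n",
                dst_off, src_off, i, i);
  }
  return 0;
}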
@@ -25,15 +25,17 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor {
                       const std::vector<float>& min = std::vector<float>(),
                       const std::vector<float>& max = std::vector<float>(),
                       bool swap_rb = false);
-  bool ImplByOpenCV(Mat* mat);
+  bool ImplByOpenCV(FDMat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFlyCV(Mat* mat);
+  bool ImplByFlyCV(FDMat* mat);
 #endif
 #ifdef WITH_GPU
-  bool ImplByCuda(Mat* mat);
+  bool ImplByCuda(FDMat* mat);
+  bool ImplByCuda(FDMatBatch* mat_batch);
 #endif
 #ifdef ENABLE_CVCUDA
-  bool ImplByCvCuda(Mat* mat);
+  bool ImplByCvCuda(FDMat* mat);
+  bool ImplByCvCuda(FDMatBatch* mat_batch);
 #endif
   std::string Name() { return "NormalizeAndPermute"; }

@@ -47,7 +49,7 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor {
   // There will be some precomputation in contruct function
   // and the `norm(mat)` only need to compute result = mat * alpha + beta
   // which will reduce lots of time
-  static bool Run(Mat* mat, const std::vector<float>& mean,
+  static bool Run(FDMat* mat, const std::vector<float>& mean,
                   const std::vector<float>& std, bool is_scale = true,
                   const std::vector<float>& min = std::vector<float>(),
                   const std::vector<float>& max = std::vector<float>(),
@@ -76,6 +78,8 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor {
  private:
   std::vector<float> alpha_;
   std::vector<float> beta_;
+  FDTensor gpu_alpha_;
+  FDTensor gpu_beta_;
   bool swap_rb_;
 };
 } // namespace vision
@@ -23,7 +23,7 @@
 namespace fastdeploy {
 namespace vision {
 
-bool Resize::ImplByOpenCV(Mat* mat) {
+bool Resize::ImplByOpenCV(FDMat* mat) {
   if (mat->layout != Layout::HWC) {
     FDERROR << "Resize: The format of input is not HWC." << std::endl;
     return false;
@@ -61,7 +61,7 @@ bool Resize::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool Resize::ImplByFlyCV(Mat* mat) {
+bool Resize::ImplByFlyCV(FDMat* mat) {
   if (mat->layout != Layout::HWC) {
     FDERROR << "Resize: The format of input is not HWC." << std::endl;
     return false;
@@ -123,7 +123,7 @@ bool Resize::ImplByFlyCV(Mat* mat) {
 #endif
 
 #ifdef ENABLE_CVCUDA
-bool Resize::ImplByCvCuda(Mat* mat) {
+bool Resize::ImplByCvCuda(FDMat* mat) {
   if (width_ == mat->Width() && height_ == mat->Height()) {
     return true;
   }
@@ -143,23 +143,20 @@ bool Resize::ImplByCvCuda(Mat* mat) {
   }
 
   // Prepare input tensor
-  std::string tensor_name = Name() + "_cvcuda_src";
-  FDTensor* src = CreateCachedGpuInputTensor(mat, tensor_name);
+  FDTensor* src = CreateCachedGpuInputTensor(mat);
   auto src_tensor = CreateCvCudaTensorWrapData(*src);
 
   // Prepare output tensor
-  tensor_name = Name() + "_cvcuda_dst";
-  FDTensor* dst =
-      UpdateAndGetCachedTensor({height_, width_, mat->Channels()}, mat->Type(),
-                               tensor_name, Device::GPU);
-  auto dst_tensor = CreateCvCudaTensorWrapData(*dst);
+  mat->output_cache->Resize({height_, width_, mat->Channels()}, mat->Type(),
+                            "output_cache", Device::GPU);
+  auto dst_tensor = CreateCvCudaTensorWrapData(*(mat->output_cache));
 
   // CV-CUDA Interp value is compatible with OpenCV
   cvcuda::Resize resize_op;
   resize_op(mat->Stream(), src_tensor, dst_tensor,
             NVCVInterpolationType(interp_));
 
-  mat->SetTensor(dst);
+  mat->SetTensor(mat->output_cache);
   mat->SetWidth(width_);
   mat->SetHeight(height_);
   mat->device = Device::GPU;
@@ -168,8 +165,8 @@ bool Resize::ImplByCvCuda(Mat* mat) {
 }
 #endif
 
-bool Resize::Run(Mat* mat, int width, int height, float scale_w, float scale_h,
-                 int interp, bool use_scale, ProcLib lib) {
+bool Resize::Run(FDMat* mat, int width, int height, float scale_w,
+                 float scale_h, int interp, bool use_scale, ProcLib lib) {
   if (mat->Height() == height && mat->Width() == width) {
     return true;
   }
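This hunk is the "move cache to mat" part of the commit: instead of each processor keeping its own named GPU tensor, every FDMat now carries an output_cache that the processor resizes, writes into, and then publishes via SetTensor. A rough Python illustration of the reuse idea (not FastDeploy API; the buffer only grows, mirroring the realloc-on-growth behaviour of FDTensor):

import numpy as np

class OutputCache:
    """Reusable scratch buffer owned by one image (stand-in for FDTensor)."""
    def __init__(self):
        self._buf = np.empty(0, dtype=np.uint8)

    def resize(self, shape, dtype):
        nbytes = int(np.prod(shape)) * np.dtype(dtype).itemsize
        if self._buf.nbytes < nbytes:      # reallocate only when growing
            self._buf = np.empty(nbytes, dtype=np.uint8)
        return self._buf[:nbytes].view(dtype).reshape(shape)

class FakeMat:
    def __init__(self, data):
        self.data = data                   # current tensor, HWC layout
        self.output_cache = OutputCache()

def resize_like_processor(mat, height, width):
    out = mat.output_cache.resize((height, width, mat.data.shape[2]), mat.data.dtype)
    # ... a real processor launches its CUDA / CV-CUDA kernel into `out` here ...
    mat.data = out                         # the SetTensor(mat->output_cache) step
    return True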
@@ -31,16 +31,16 @@ class FASTDEPLOY_DECL Resize : public Processor {
     use_scale_ = use_scale;
   }
 
-  bool ImplByOpenCV(Mat* mat);
+  bool ImplByOpenCV(FDMat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFlyCV(Mat* mat);
+  bool ImplByFlyCV(FDMat* mat);
 #endif
 #ifdef ENABLE_CVCUDA
-  bool ImplByCvCuda(Mat* mat);
+  bool ImplByCvCuda(FDMat* mat);
 #endif
   std::string Name() { return "Resize"; }
 
-  static bool Run(Mat* mat, int width, int height, float scale_w = -1.0,
+  static bool Run(FDMat* mat, int width, int height, float scale_w = -1.0,
                   float scale_h = -1.0, int interp = 1, bool use_scale = false,
                   ProcLib lib = ProcLib::DEFAULT);
 
@@ -23,7 +23,7 @@
 namespace fastdeploy {
 namespace vision {
 
-bool ResizeByShort::ImplByOpenCV(Mat* mat) {
+bool ResizeByShort::ImplByOpenCV(FDMat* mat) {
   cv::Mat* im = mat->GetOpenCVMat();
   int origin_w = im->cols;
   int origin_h = im->rows;
@@ -43,7 +43,7 @@ bool ResizeByShort::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool ResizeByShort::ImplByFlyCV(Mat* mat) {
+bool ResizeByShort::ImplByFlyCV(FDMat* mat) {
   fcv::Mat* im = mat->GetFlyCVMat();
   int origin_w = im->width();
   int origin_h = im->height();
@@ -87,10 +87,9 @@ bool ResizeByShort::ImplByFlyCV(Mat* mat) {
 #endif
 
 #ifdef ENABLE_CVCUDA
-bool ResizeByShort::ImplByCvCuda(Mat* mat) {
+bool ResizeByShort::ImplByCvCuda(FDMat* mat) {
   // Prepare input tensor
-  std::string tensor_name = Name() + "_cvcuda_src";
-  FDTensor* src = CreateCachedGpuInputTensor(mat, tensor_name);
+  FDTensor* src = CreateCachedGpuInputTensor(mat);
   auto src_tensor = CreateCvCudaTensorWrapData(*src);
 
   double scale = GenerateScale(mat->Width(), mat->Height());
@@ -98,23 +97,69 @@ bool ResizeByShort::ImplByCvCuda(Mat* mat) {
   int height = static_cast<int>(round(scale * mat->Height()));
 
   // Prepare output tensor
-  tensor_name = Name() + "_cvcuda_dst";
-  FDTensor* dst = UpdateAndGetCachedTensor(
-      {height, width, mat->Channels()}, mat->Type(), tensor_name, Device::GPU);
-  auto dst_tensor = CreateCvCudaTensorWrapData(*dst);
+  mat->output_cache->Resize({height, width, mat->Channels()}, mat->Type(),
+                            "output_cache", Device::GPU);
+  auto dst_tensor = CreateCvCudaTensorWrapData(*(mat->output_cache));
 
   // CV-CUDA Interp value is compatible with OpenCV
   cvcuda::Resize resize_op;
   resize_op(mat->Stream(), src_tensor, dst_tensor,
             NVCVInterpolationType(interp_));
 
-  mat->SetTensor(dst);
+  mat->SetTensor(mat->output_cache);
   mat->SetWidth(width);
   mat->SetHeight(height);
   mat->device = Device::GPU;
   mat->mat_type = ProcLib::CVCUDA;
   return true;
 }
 
+bool ResizeByShort::ImplByCvCuda(FDMatBatch* mat_batch) {
+  // TODO(wangxinyu): to support batched tensor as input
+  FDASSERT(mat_batch->has_batched_tensor == false,
+           "ResizeByShort doesn't support batched tensor as input for now.");
+  // Prepare input batch
+  std::string tensor_name = Name() + "_cvcuda_src";
+  std::vector<FDTensor*> src_tensors;
+  for (size_t i = 0; i < mat_batch->mats->size(); ++i) {
+    FDTensor* src = CreateCachedGpuInputTensor(&(*(mat_batch->mats))[i]);
+    src_tensors.push_back(src);
+  }
+  nvcv::ImageBatchVarShape src_batch(mat_batch->mats->size());
+  CreateCvCudaImageBatchVarShape(src_tensors, src_batch);
+
+  // Prepare output batch
+  tensor_name = Name() + "_cvcuda_dst";
+  std::vector<FDTensor*> dst_tensors;
+  for (size_t i = 0; i < mat_batch->mats->size(); ++i) {
+    FDMat* mat = &(*(mat_batch->mats))[i];
+    double scale = GenerateScale(mat->Width(), mat->Height());
+    int width = static_cast<int>(round(scale * mat->Width()));
+    int height = static_cast<int>(round(scale * mat->Height()));
+    mat->output_cache->Resize({height, width, mat->Channels()}, mat->Type(),
+                              "output_cache", Device::GPU);
+    dst_tensors.push_back(mat->output_cache);
+  }
+  nvcv::ImageBatchVarShape dst_batch(mat_batch->mats->size());
+  CreateCvCudaImageBatchVarShape(dst_tensors, dst_batch);
+
+  // CV-CUDA Interp value is compatible with OpenCV
+  cvcuda::Resize resize_op;
+  resize_op(mat_batch->Stream(), src_batch, dst_batch,
+            NVCVInterpolationType(interp_));
+
+  for (size_t i = 0; i < mat_batch->mats->size(); ++i) {
+    FDMat* mat = &(*(mat_batch->mats))[i];
+    mat->SetTensor(dst_tensors[i]);
+    mat->SetWidth(dst_tensors[i]->Shape()[1]);
+    mat->SetHeight(dst_tensors[i]->Shape()[0]);
+    mat->device = Device::GPU;
+    mat->mat_type = ProcLib::CVCUDA;
+  }
+  mat_batch->device = Device::GPU;
+  mat_batch->mat_type = ProcLib::CVCUDA;
+  return true;
+}
 #endif
 
 double ResizeByShort::GenerateScale(const int origin_w, const int origin_h) {
@@ -143,7 +188,7 @@ double ResizeByShort::GenerateScale(const int origin_w, const int origin_h) {
   return scale;
 }
 
-bool ResizeByShort::Run(Mat* mat, int target_size, int interp, bool use_scale,
+bool ResizeByShort::Run(FDMat* mat, int target_size, int interp, bool use_scale,
                         const std::vector<int>& max_hw, ProcLib lib) {
   auto r = ResizeByShort(target_size, interp, use_scale, max_hw);
   return r(mat, lib);
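For reference, the batch path above computes a separate scale per image, which is why it builds an nvcv::ImageBatchVarShape rather than one fixed-shape tensor. A hedged Python sketch of the resize-by-short-side rule; the max_hw clamping shown here is an assumption, the authoritative logic is GenerateScale in the C++ above:

def generate_scale(origin_w, origin_h, target_size, max_hw=None):
    # Scale so the short side becomes target_size ...
    short_side = min(origin_w, origin_h)
    long_side = max(origin_w, origin_h)
    scale = target_size / short_side
    # ... optionally capped so the long side stays within max_hw (assumed behaviour).
    if max_hw:
        limit = max(max_hw)
        if round(scale * long_side) > limit:
            scale = limit / long_side
    return scale

# Example: a 1280x720 frame with target_size=224 resizes to 398x224.
scale = generate_scale(1280, 720, target_size=224)
new_w, new_h = round(scale * 1280), round(scale * 720)  # (398, 224)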
@@ -28,16 +28,17 @@ class FASTDEPLOY_DECL ResizeByShort : public Processor {
     interp_ = interp;
     use_scale_ = use_scale;
   }
-  bool ImplByOpenCV(Mat* mat);
+  bool ImplByOpenCV(FDMat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFlyCV(Mat* mat);
+  bool ImplByFlyCV(FDMat* mat);
 #endif
 #ifdef ENABLE_CVCUDA
-  bool ImplByCvCuda(Mat* mat);
+  bool ImplByCvCuda(FDMat* mat);
+  bool ImplByCvCuda(FDMatBatch* mat_batch);
 #endif
   std::string Name() { return "ResizeByShort"; }
 
-  static bool Run(Mat* mat, int target_size, int interp = 1,
+  static bool Run(FDMat* mat, int target_size, int interp = 1,
                   bool use_scale = true,
                   const std::vector<int>& max_hw = std::vector<int>(),
                   ProcLib lib = ProcLib::DEFAULT);
@@ -16,6 +16,7 @@
 
 namespace fastdeploy {
 
+void BindProcessorManager(pybind11::module& m);
 void BindDetection(pybind11::module& m);
 void BindClassification(pybind11::module& m);
 void BindSegmentation(pybind11::module& m);
@@ -204,6 +205,7 @@ void BindVision(pybind11::module& m) {
   m.def("disable_flycv", &vision::DisableFlyCV,
         "Disable image preprocessing by FlyCV, change to use OpenCV.");
 
+  BindProcessorManager(m);
   BindDetection(m);
   BindClassification(m);
   BindSegmentation(m);
@@ -16,44 +16,40 @@ from __future__ import absolute_import
 import logging
 from .... import FastDeployModel, ModelFormat
 from .... import c_lib_wrap as C
+from ...common import ProcessorManager
 
 
-class PaddleClasPreprocessor:
+class PaddleClasPreprocessor(ProcessorManager):
     def __init__(self, config_file):
         """Create a preprocessor for PaddleClasModel from configuration file
 
         :param config_file: (str)Path of configuration file, e.g resnet50/inference_cls.yaml
         """
-        self._preprocessor = C.vision.classification.PaddleClasPreprocessor(
-            config_file)
-
-    def run(self, input_ims):
-        """Preprocess input images for PaddleClasModel
-
-        :param: input_ims: (list of numpy.ndarray)The input image
-        :return: list of FDTensor
-        """
-        return self._preprocessor.run(input_ims)
-
-    def use_cuda(self, enable_cv_cuda=False, gpu_id=-1):
-        """Use CUDA preprocessors
-
-        :param: enable_cv_cuda: Whether to enable CV-CUDA
-        :param: gpu_id: GPU device id
-        """
-        return self._preprocessor.use_cuda(enable_cv_cuda, gpu_id)
+        super(PaddleClasPreprocessor, self).__init__()
+        self._manager = C.vision.classification.PaddleClasPreprocessor(
+            config_file)
 
     def disable_normalize(self):
         """
         This function will disable normalize in preprocessing step.
         """
-        self._preprocessor.disable_normalize()
+        self._manager.disable_normalize()
 
     def disable_permute(self):
         """
         This function will disable hwc2chw in preprocessing step.
         """
-        self._preprocessor.disable_permute()
+        self._manager.disable_permute()
+
+    def initial_resize_on_cpu(self, v):
+        """
+        When the initial operator is Resize, and input image size is large,
+        maybe it's better to run resize on CPU, because the HostToDevice memcpy
+        is time consuming. Set this True to run the initial resize on CPU.
+
+        :param: v: True or False
+        """
+        self._manager.initial_resize_on_cpu(v)
 
 
 class PaddleClasPostprocessor:
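Taken together with the ProcessorManager base introduced later in this diff, the user-facing flow for the batched GPU path looks roughly like this; the config path, image file and gpu_id are placeholders, and run()/use_cuda() come from the shared base class rather than from PaddleClasPreprocessor itself:

import cv2
import fastdeploy as fd

preprocessor = fd.vision.classification.PaddleClasPreprocessor(
    "resnet50/inference_cls.yaml")
preprocessor.use_cuda(enable_cv_cuda=True, gpu_id=0)  # route ops to the CV-CUDA / CUDA processors
preprocessor.initial_resize_on_cpu(True)              # keep the first Resize on CPU for large inputs

ims = [cv2.imread("test.jpg")]
fd_tensors = preprocessor.run(ims)                    # list of FDTensor, preprocessed on the GPU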
python/fastdeploy/vision/common/__init__.py (new file)
@@ -0,0 +1,16 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+
+from .manager import ProcessorManager
python/fastdeploy/vision/common/manager.py (new file)
@@ -0,0 +1,36 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+
+
+class ProcessorManager:
+    def __init__(self):
+        self._manager = None
+
+    def run(self, input_ims):
+        """Process input image
+
+        :param: input_ims: (list of numpy.ndarray) The input images
+        :return: list of FDTensor
+        """
+        return self._manager.run(input_ims)
+
+    def use_cuda(self, enable_cv_cuda=False, gpu_id=-1):
+        """Use CUDA processors
+
+        :param: enable_cv_cuda: True: use CV-CUDA, False: use CUDA only
+        :param: gpu_id: GPU device id
+        """
+        return self._manager.use_cuda(enable_cv_cuda, gpu_id)
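The base class is intentionally thin: a subclass only has to point self._manager at a C++ processor-manager binding, and run()/use_cuda() are forwarded to it. A minimal sketch of that contract with an illustrative class name; PaddleClasPreprocessor above is the real subclass wired up by this commit:

from fastdeploy import c_lib_wrap as C          # same binding module the preprocessor uses
from fastdeploy.vision.common import ProcessorManager

class MyPreprocessor(ProcessorManager):
    def __init__(self, config_file):
        super(MyPreprocessor, self).__init__()
        # Any binding that exposes run()/use_cuda() works here; this one is from the commit.
        self._manager = C.vision.classification.PaddleClasPreprocessor(config_file)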
@@ -72,6 +72,7 @@ setup_configs["PADDLELITE_URL"] = os.getenv("PADDLELITE_URL", "OFF")
 setup_configs["ENABLE_VISION"] = os.getenv("ENABLE_VISION", "OFF")
 setup_configs["ENABLE_ENCRYPTION"] = os.getenv("ENABLE_ENCRYPTION", "OFF")
 setup_configs["ENABLE_FLYCV"] = os.getenv("ENABLE_FLYCV", "OFF")
+setup_configs["ENABLE_CVCUDA"] = os.getenv("ENABLE_CVCUDA", "OFF")
 setup_configs["ENABLE_TEXT"] = os.getenv("ENABLE_TEXT", "OFF")
 setup_configs["ENABLE_BENCHMARK"] = os.getenv("ENABLE_BENCHMARK", "OFF")
 setup_configs["WITH_GPU"] = os.getenv("WITH_GPU", "OFF")
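setup.py only forwards these switches from the environment, so enabling the new CV-CUDA path when building a wheel looks roughly like this; the flag names come from the hunk above, while the build invocation itself is an assumption and not part of this diff:

import os
import subprocess

os.environ["WITH_GPU"] = "ON"
os.environ["ENABLE_VISION"] = "ON"
os.environ["ENABLE_CVCUDA"] = "ON"   # the flag added by this commit
subprocess.check_call(["python", "setup.py", "bdist_wheel"])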