[Backend] cuda normalize and permute, cuda concat, optimized ppcls, ppdet & ppseg (#546)

* cuda normalize and permute, cuda concat

* add use cuda option for preprocessor

* ppyoloe use cuda normalize

* ppseg use cuda normalize

* add proclib cuda to processor base

* ppcls: add use-cuda preprocess api

* ppcls preprocessor set gpu id

* fix pybind

* refine ppcls preprocessing use gpu logic

* fdtensor device id is -1 by default

* refine assert message

Co-authored-by: heliqi <1101791222@qq.com>
This commit is contained in:
Wang Xinyu
2022-11-14 18:44:00 +08:00
committed by GitHub
parent 8dec2115d5
commit a36f5d3396
20 changed files with 204 additions and 26 deletions

View File

@@ -252,7 +252,8 @@ void FDTensor::FreeFn() {
}
}
void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes) {
void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes,
const Device& device, bool is_pinned_memory) {
if (device == Device::GPU) {
#ifdef WITH_GPU
FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) == 0,
@@ -295,7 +296,7 @@ FDTensor::FDTensor(const FDTensor& other)
size_t nbytes = Nbytes();
FDASSERT(ReallocFn(nbytes),
"The FastDeploy FDTensor allocate memory error");
CopyBuffer(buffer_, other.buffer_, nbytes);
CopyBuffer(buffer_, other.buffer_, nbytes, device, is_pinned_memory);
}
}
@@ -325,7 +326,7 @@ FDTensor& FDTensor::operator=(const FDTensor& other) {
} else {
Resize(other.shape, other.dtype, other.name, other.device);
size_t nbytes = Nbytes();
CopyBuffer(buffer_, other.buffer_, nbytes);
CopyBuffer(buffer_, other.buffer_, nbytes, device, is_pinned_memory);
}
external_data_ptr = other.external_data_ptr;
}