[Backend] cuda normalize and permute, cuda concat, optimized ppcls, ppdet & ppseg (#546)

* cuda normalize and permute, cuda concat
* add use cuda option for preprocessor
* ppyoloe use cuda normalize
* ppseg use cuda normalize
* add proclib cuda in processor base
* ppcls add use cuda preprocess api
* ppcls preprocessor set gpu id
* fix pybind
* refine ppcls preprocessing use gpu logic
* fdtensor device id is -1 by default
* refine assert message

Co-authored-by: heliqi <1101791222@qq.com>
2025-10-06 17:17:14 +08:00 · 2022-11-14 18:44:00 +08:00
parent 8dec2115d5
commit a36f5d3396
20 changed files with 204 additions and 26 deletions
--- a/fastdeploy/core/fd_tensor.h
+++ b/fastdeploy/core/fd_tensor.h
@@ -39,6 +39,9 @@ struct FASTDEPLOY_DECL FDTensor {
  // GPU to inference the model
  // so we can skip data transfer, which may improve the efficiency
  Device device = Device::CPU;
+  // By default the device id of FDTensor is -1, which means this value is
+  // invalid, and FDTensor is using the same device id as Runtime.
+  int device_id = -1;

  // Whether the data buffer is in pinned memory, which is allocated
  // with cudaMallocHost()
@@ -130,8 +133,9 @@ struct FASTDEPLOY_DECL FDTensor {

  ~FDTensor() { FreeFn(); }

- private:
-  void CopyBuffer(void* dst, const void* src, size_t nbytes);
+  static void CopyBuffer(void* dst, const void* src, size_t nbytes,
+                         const Device& device = Device::CPU,
+                        bool is_pinned_memory = false);
 };

 }  // namespace fastdeploy