FDTensor supports GPU device (#190)

* FDTensor supports GPU

* TRT backend supports GPU FDTensor

* Add FASTDEPLOY_DECL to FDHostAllocator

* Fix FDTensor Data

* Fix FDTensor dtype
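
What this means at a call site, as a minimal sketch (not part of this commit): the FDTensor and TrtBackend members used below (Resize, Data, Nbytes, device, Infer) are the ones exercised in the diffs that follow, while the helper name RunOnce, the input name "image", the shape, and the preprocessed buffer are assumptions.

#include <cstring>
#include <vector>
#include "fastdeploy/backends/tensorrt/trt_backend.h"

// Hypothetical call site for the updated Infer(). `backend` is assumed to be
// already initialized (e.g. via InitFromOnnx) and `host_pixels` to hold
// preprocessed FP32 data matching the assumed 1x3x224x224 shape.
bool RunOnce(fastdeploy::TrtBackend& backend, const float* host_pixels) {
  std::vector<fastdeploy::FDTensor> inputs(1), outputs;
  inputs[0].Resize({1, 3, 224, 224}, fastdeploy::FDDataType::FP32, "image");
  std::memcpy(inputs[0].Data(), host_pixels, inputs[0].Nbytes());

  // With the tensor left on the CPU (the default device), SetInputs() copies
  // the data into the backend's device buffer. If the tensor lived on the GPU
  // (inputs[0].device == Device::GPU), the new code binds its device pointer
  // directly via FDGenericBuffer::SetExternalData, with no extra copy.
  return backend.Infer(inputs, &outputs);
}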

Co-authored-by: Jason <jiangjiajun@baidu.com>
Author: heliqi
Date: 2022-09-08 03:53:08 -05:00
Committed by: GitHub
Parent: bc8e9e4dae
Commit: 4d1f264d01
17 changed files with 432 additions and 153 deletions

View File: fastdeploy/backends/tensorrt/trt_backend.cc

@@ -13,9 +13,11 @@
// limitations under the License.
#include "fastdeploy/backends/tensorrt/trt_backend.h"
#include <cstring>
#include "NvInferSafeRuntime.h"
#include "fastdeploy/utils/utils.h"
#include <cstring>
#ifdef ENABLE_PADDLE_FRONTEND
#include "paddle2onnx/converter.h"
#endif
@@ -210,9 +212,9 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
outputs_desc_.resize(onnx_reader.num_outputs);
for (int i = 0; i < onnx_reader.num_inputs; ++i) {
std::string name(onnx_reader.inputs[i].name);
std::vector<int64_t> shape(onnx_reader.inputs[i].shape,
onnx_reader.inputs[i].shape +
onnx_reader.inputs[i].rank);
std::vector<int64_t> shape(
onnx_reader.inputs[i].shape,
onnx_reader.inputs[i].shape + onnx_reader.inputs[i].rank);
inputs_desc_[i].name = name;
inputs_desc_[i].shape.assign(shape.begin(), shape.end());
inputs_desc_[i].dtype = ReaderDtypeToTrtDtype(onnx_reader.inputs[i].dtype);
@@ -231,9 +233,9 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
for (int i = 0; i < onnx_reader.num_outputs; ++i) {
std::string name(onnx_reader.outputs[i].name);
std::vector<int64_t> shape(onnx_reader.outputs[i].shape,
onnx_reader.outputs[i].shape +
onnx_reader.outputs[i].rank);
std::vector<int64_t> shape(
onnx_reader.outputs[i].shape,
onnx_reader.outputs[i].shape + onnx_reader.outputs[i].rank);
outputs_desc_[i].name = name;
outputs_desc_[i].shape.assign(shape.begin(), shape.end());
outputs_desc_[i].dtype =
@@ -286,24 +288,8 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
BuildTrtEngine();
}
AllocateBufferInDynamicShape(inputs, outputs);
std::vector<void*> input_binds(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
if (inputs[i].dtype == FDDataType::INT64) {
int64_t* data = static_cast<int64_t*>(inputs[i].Data());
std::vector<int32_t> casted_data(data, data + inputs[i].Numel());
FDASSERT(cudaMemcpyAsync(inputs_buffer_[inputs[i].name].data(),
static_cast<void*>(casted_data.data()),
inputs[i].Nbytes() / 2, cudaMemcpyHostToDevice,
stream_) == 0,
"[ERROR] Error occurs while copy memory from CPU to GPU.");
} else {
FDASSERT(cudaMemcpyAsync(inputs_buffer_[inputs[i].name].data(),
inputs[i].Data(), inputs[i].Nbytes(),
cudaMemcpyHostToDevice, stream_) == 0,
"[ERROR] Error occurs while copy memory from CPU to GPU.");
}
}
SetInputs(inputs);
AllocateOutputsBuffer(outputs);
if (!context_->enqueueV2(bindings_.data(), stream_, nullptr)) {
FDERROR << "Failed to Infer with TensorRT." << std::endl;
return false;
@@ -339,18 +325,50 @@ void TrtBackend::GetInputOutputInfo() {
bindings_.resize(num_binds);
}
void TrtBackend::AllocateBufferInDynamicShape(
const std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs) {
void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
for (const auto& item : inputs) {
auto idx = engine_->getBindingIndex(item.name.c_str());
std::vector<int> shape(item.shape.begin(), item.shape.end());
auto dims = ToDims(shape);
context_->setBindingDimensions(idx, dims);
if (item.Nbytes() > inputs_buffer_[item.name].nbBytes()) {
if (item.device == Device::GPU) {
if (item.dtype == FDDataType::INT64) {
// TODO(liqi): cast int64 to int32
// TRT don't support INT64
FDASSERT(false,
"TRT don't support INT64 input on GPU, "
"please use INT32 input");
} else {
// no copy
inputs_buffer_[item.name].SetExternalData(dims, item.Data());
}
} else {
// Allocate input buffer memory
inputs_buffer_[item.name].resize(dims);
bindings_[idx] = inputs_buffer_[item.name].data();
// copy from cpu to gpu
if (item.dtype == FDDataType::INT64) {
int64_t* data = static_cast<int64_t*>(const_cast<void*>(item.Data()));
std::vector<int32_t> casted_data(data, data + item.Numel());
FDASSERT(cudaMemcpyAsync(inputs_buffer_[item.name].data(),
static_cast<void*>(casted_data.data()),
item.Nbytes() / 2, cudaMemcpyHostToDevice,
stream_) == 0,
"Error occurs while copy memory from CPU to GPU.");
} else {
FDASSERT(cudaMemcpyAsync(inputs_buffer_[item.name].data(), item.Data(),
item.Nbytes(), cudaMemcpyHostToDevice,
stream_) == 0,
"Error occurs while copy memory from CPU to GPU.");
}
}
// binding input buffer
bindings_[idx] = inputs_buffer_[item.name].data();
}
}
void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs) {
if (outputs->size() != outputs_desc_.size()) {
outputs->resize(outputs_desc_.size());
}
@@ -365,13 +383,15 @@ void TrtBackend::AllocateBufferInDynamicShape(
"Cannot find output: %s of tensorrt network from the original model.",
outputs_desc_[i].name.c_str());
auto ori_idx = iter->second;
std::vector<int64_t> shape(output_dims.d, output_dims.d + output_dims.nbDims);
(*outputs)[ori_idx].Allocate(shape, GetFDDataType(outputs_desc_[i].dtype), outputs_desc_[i].name);
if ((*outputs)[ori_idx].Nbytes() >
outputs_buffer_[outputs_desc_[i].name].nbBytes()) {
outputs_buffer_[outputs_desc_[i].name].resize(output_dims);
bindings_[idx] = outputs_buffer_[outputs_desc_[i].name].data();
}
// set user's outputs info
std::vector<int64_t> shape(output_dims.d,
output_dims.d + output_dims.nbDims);
(*outputs)[ori_idx].Resize(shape, GetFDDataType(outputs_desc_[i].dtype),
outputs_desc_[i].name);
// Allocate output buffer memory
outputs_buffer_[outputs_desc_[i].name].resize(output_dims);
// binding output buffer
bindings_[idx] = outputs_buffer_[outputs_desc_[i].name].data();
}
}
@@ -580,4 +600,4 @@ TensorInfo TrtBackend::GetOutputInfo(int index) {
info.dtype = GetFDDataType(outputs_desc_[index].dtype);
return info;
}
} // namespace fastdeploy
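
One consequence of the new SetInputs() worth spelling out, shown as a hedged sketch (not code from this commit; the tensor name and shape are placeholders): INT64 data is only down-cast to INT32 on the CPU path, so INT64 tensors that already live on the GPU are rejected.

#include "fastdeploy/core/fd_tensor.h"

// Hypothetical INT64 input. With the tensor left on the CPU (the default
// device), SetInputs() casts the data to int32 on the host and copies
// item.Nbytes() / 2 bytes to the GPU, as in the branch above. If ids.device
// were set to Device::GPU instead, SetInputs() would hit the
// FDASSERT(false, ...) branch, because TensorRT has no INT64 binding type;
// such a tensor has to be converted to INT32 before reaching the backend.
fastdeploy::FDTensor MakeIdsTensor() {
  fastdeploy::FDTensor ids;
  ids.Resize({1, 128}, fastdeploy::FDDataType::INT64, "input_ids");
  return ids;
}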

View File: fastdeploy/backends/tensorrt/trt_backend.h

@@ -14,6 +14,8 @@
#pragma once
#include <cuda_runtime_api.h>
#include <iostream>
#include <map>
#include <string>
@@ -23,7 +25,6 @@
#include "NvOnnxParser.h"
#include "fastdeploy/backends/backend.h"
#include "fastdeploy/backends/tensorrt/utils.h"
#include <cuda_runtime_api.h>
namespace fastdeploy {
@@ -109,12 +110,12 @@ class TrtBackend : public BaseBackend {
std::map<std::string, ShapeRangeInfo> shape_range_info_;
void GetInputOutputInfo();
void AllocateBufferInDynamicShape(const std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs);
bool CreateTrtEngineFromOnnx(const std::string& onnx_model_buffer);
bool BuildTrtEngine();
bool LoadTrtCache(const std::string& trt_engine_file);
int ShapeRangeInfoUpdated(const std::vector<FDTensor>& inputs);
void SetInputs(const std::vector<FDTensor>& inputs);
void AllocateOutputsBuffer(std::vector<FDTensor>* outputs);
};
} // namespace fastdeploy

View File: fastdeploy/backends/tensorrt/utils.h

@@ -14,11 +14,9 @@
#pragma once
#include "NvInfer.h"
#include "fastdeploy/core/fd_tensor.h"
#include "fastdeploy/utils/utils.h"
#include <algorithm>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <iostream>
#include <map>
#include <memory>
@@ -26,17 +24,24 @@
#include <string>
#include <vector>
#include "NvInfer.h"
#include "fastdeploy/core/allocate.h"
#include "fastdeploy/core/fd_tensor.h"
#include "fastdeploy/utils/utils.h"
namespace fastdeploy {
struct FDInferDeleter {
template <typename T> void operator()(T* obj) const {
template <typename T>
void operator()(T* obj) const {
if (obj) {
obj->destroy();
}
}
};
template <typename T> using FDUniquePtr = std::unique_ptr<T, FDInferDeleter>;
template <typename T>
using FDUniquePtr = std::unique_ptr<T, FDInferDeleter>;
int64_t Volume(const nvinfer1::Dims& d);
@@ -64,13 +69,18 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& vec) {
return out;
}
template <typename AllocFunc, typename FreeFunc> class FDGenericBuffer {
template <typename AllocFunc, typename FreeFunc>
class FDGenericBuffer {
public:
//!
//! \brief Construct an empty buffer.
//!
explicit FDGenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT)
: mSize(0), mCapacity(0), mType(type), mBuffer(nullptr) {}
: mSize(0),
mCapacity(0),
mType(type),
mBuffer(nullptr),
mExternal_buffer(nullptr) {}
//!
//! \brief Construct a buffer with the specified allocation size in bytes.
@@ -82,8 +92,18 @@ template <typename AllocFunc, typename FreeFunc> class FDGenericBuffer {
}
}
//!
//! \brief This use to skip memory copy step.
//!
FDGenericBuffer(size_t size, nvinfer1::DataType type, void* buffer)
: mSize(size), mCapacity(size), mType(type) {
mExternal_buffer = buffer;
}
FDGenericBuffer(FDGenericBuffer&& buf)
: mSize(buf.mSize), mCapacity(buf.mCapacity), mType(buf.mType),
: mSize(buf.mSize),
mCapacity(buf.mCapacity),
mType(buf.mType),
mBuffer(buf.mBuffer) {
buf.mSize = 0;
buf.mCapacity = 0;
@@ -109,12 +129,18 @@ template <typename AllocFunc, typename FreeFunc> class FDGenericBuffer {
//!
//! \brief Returns pointer to underlying array.
//!
void* data() { return mBuffer; }
void* data() {
if (mExternal_buffer != nullptr) return mExternal_buffer;
return mBuffer;
}
//!
//! \brief Returns pointer to underlying array.
//!
const void* data() const { return mBuffer; }
const void* data() const {
if (mExternal_buffer != nullptr) return mExternal_buffer;
return mBuffer;
}
//!
//! \brief Returns the size (in number of elements) of the buffer.
@@ -126,11 +152,29 @@ template <typename AllocFunc, typename FreeFunc> class FDGenericBuffer {
//!
size_t nbBytes() const { return this->size() * TrtDataTypeSize(mType); }
//!
//! \brief Set user memory buffer for TRT Buffer
//!
void SetExternalData(size_t size, nvinfer1::DataType type, void* buffer) {
mSize = mCapacity = size;
mType = type;
mExternal_buffer = const_cast<void*>(buffer);
}
//!
//! \brief Set user memory buffer for TRT Buffer
//!
void SetExternalData(const nvinfer1::Dims& dims, const void* buffer) {
mSize = mCapacity = Volume(dims);
mExternal_buffer = const_cast<void*>(buffer);
}
//!
//! \brief Resizes the buffer. This is a no-op if the new size is smaller than
//! or equal to the current capacity.
//!
void resize(size_t newSize) {
mExternal_buffer = nullptr;
mSize = newSize;
if (mCapacity < newSize) {
freeFn(mBuffer);
@@ -146,28 +190,20 @@ template <typename AllocFunc, typename FreeFunc> class FDGenericBuffer {
//!
void resize(const nvinfer1::Dims& dims) { return this->resize(Volume(dims)); }
~FDGenericBuffer() { freeFn(mBuffer); }
~FDGenericBuffer() {
mExternal_buffer = nullptr;
freeFn(mBuffer);
}
private:
size_t mSize{0}, mCapacity{0};
nvinfer1::DataType mType;
void* mBuffer;
void* mExternal_buffer;
AllocFunc allocFn;
FreeFunc freeFn;
};
class FDDeviceAllocator {
public:
bool operator()(void** ptr, size_t size) const {
return cudaMalloc(ptr, size) == cudaSuccess;
}
};
class FDDeviceFree {
public:
void operator()(void* ptr) const { cudaFree(ptr); }
};
using FDDeviceBuffer = FDGenericBuffer<FDDeviceAllocator, FDDeviceFree>;
class FDTrtLogger : public nvinfer1::ILogger {
@@ -197,7 +233,7 @@ class FDTrtLogger : public nvinfer1::ILogger {
};
struct ShapeRangeInfo {
ShapeRangeInfo(const std::vector<int64_t>& new_shape) {
explicit ShapeRangeInfo(const std::vector<int64_t>& new_shape) {
shape.assign(new_shape.begin(), new_shape.end());
min.resize(new_shape.size());
max.resize(new_shape.size());
@@ -239,4 +275,4 @@ struct ShapeRangeInfo {
}
};
} // namespace fastdeploy
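
For reference, a minimal sketch of the zero-copy path this header adds. FDDeviceBuffer, SetExternalData, data() and resize() appear in the diff above; ToDims is assumed to be declared here as well (SetInputs() in trt_backend.cc calls it the same way), and the function name, device pointer d_ptr, and shape are placeholders.

#include <vector>
#include "fastdeploy/backends/tensorrt/utils.h"

// Hypothetical helper: expose an existing device pointer as a TensorRT input
// binding without allocating or copying. `d_ptr` is assumed to already hold
// the tensor's data on the GPU.
void* WrapDeviceTensor(fastdeploy::FDDeviceBuffer& buffer, void* d_ptr,
                       const std::vector<int>& shape) {
  nvinfer1::Dims dims = fastdeploy::ToDims(shape);  // same helper SetInputs() uses
  buffer.SetExternalData(dims, d_ptr);  // wrap user memory, no allocation/copy
  return buffer.data();                 // data() now returns the external pointer
}
// A later buffer.resize(...) clears the external pointer and switches the
// buffer back to memory it allocates and frees itself.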