mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-06 17:17:14 +08:00
first commit
This commit is contained in:
397
fastdeploy/backends/tensorrt/common/sampleDevice.h
Normal file
397
fastdeploy/backends/tensorrt/common/sampleDevice.h
Normal file
@@ -0,0 +1,397 @@
|
||||
/*
|
||||
* Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef TRT_SAMPLE_DEVICE_H
|
||||
#define TRT_SAMPLE_DEVICE_H
|
||||
|
||||
#include <cuda.h>
#include <cuda_runtime.h>

#include <cassert>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <thread>
|
||||
|
||||
namespace sample {
|
||||
|
||||
//!
//! \brief Abort the process if a CUDA runtime call did not succeed.
//!
//! \param ret return code of a CUDA runtime API call.
//! \param err stream that receives the diagnostic (std::cerr by default).
//!
inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) {
  if (ret != cudaSuccess) {
    err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl;
    // Fail fast: a sticky CUDA error would make every later call fail
    // mysteriously. std::abort is qualified; <cstdlib> provides it.
    std::abort();
  }
}
|
||||
|
||||
// Forward declaration: TrtCudaStream::wait() takes a TrtCudaEvent, which is
// defined after TrtCudaStream below.
class TrtCudaEvent;

namespace {

//! Host callback enqueued via cudaLaunchHostFunc to delay a stream.
//!
//! \param sleep points to a float holding the delay in milliseconds; the
//!        pointee must stay alive until the callback has executed.
//!
//! NOTE(review): uses std::chrono, which was previously pulled in only
//! transitively through <thread>; <chrono> is now included explicitly.
void cudaSleep(void* sleep) {
  std::this_thread::sleep_for(
      std::chrono::duration<float, std::milli>(*static_cast<float*>(sleep)));
}

}  // namespace
|
||||
|
||||
//!
|
||||
//! \class TrtCudaStream
|
||||
//! \brief Managed CUDA stream
|
||||
//!
|
||||
class TrtCudaStream {
|
||||
public:
|
||||
TrtCudaStream() { cudaCheck(cudaStreamCreate(&mStream)); }
|
||||
|
||||
TrtCudaStream(const TrtCudaStream&) = delete;
|
||||
|
||||
TrtCudaStream& operator=(const TrtCudaStream&) = delete;
|
||||
|
||||
TrtCudaStream(TrtCudaStream&&) = delete;
|
||||
|
||||
TrtCudaStream& operator=(TrtCudaStream&&) = delete;
|
||||
|
||||
~TrtCudaStream() { cudaCheck(cudaStreamDestroy(mStream)); }
|
||||
|
||||
cudaStream_t get() const { return mStream; }
|
||||
|
||||
void synchronize() { cudaCheck(cudaStreamSynchronize(mStream)); }
|
||||
|
||||
void wait(TrtCudaEvent& event);
|
||||
|
||||
void sleep(float* ms) {
|
||||
cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms));
|
||||
}
|
||||
|
||||
private:
|
||||
cudaStream_t mStream{};
|
||||
};
|
||||
|
||||
//!
|
||||
//! \class TrtCudaEvent
|
||||
//! \brief Managed CUDA event
|
||||
//!
|
||||
class TrtCudaEvent {
|
||||
public:
|
||||
explicit TrtCudaEvent(bool blocking = true) {
|
||||
const uint32_t flags = blocking ? cudaEventBlockingSync : cudaEventDefault;
|
||||
cudaCheck(cudaEventCreateWithFlags(&mEvent, flags));
|
||||
}
|
||||
|
||||
TrtCudaEvent(const TrtCudaEvent&) = delete;
|
||||
|
||||
TrtCudaEvent& operator=(const TrtCudaEvent&) = delete;
|
||||
|
||||
TrtCudaEvent(TrtCudaEvent&&) = delete;
|
||||
|
||||
TrtCudaEvent& operator=(TrtCudaEvent&&) = delete;
|
||||
|
||||
~TrtCudaEvent() { cudaCheck(cudaEventDestroy(mEvent)); }
|
||||
|
||||
cudaEvent_t get() const { return mEvent; }
|
||||
|
||||
void record(const TrtCudaStream& stream) {
|
||||
cudaCheck(cudaEventRecord(mEvent, stream.get()));
|
||||
}
|
||||
|
||||
void synchronize() { cudaCheck(cudaEventSynchronize(mEvent)); }
|
||||
|
||||
// Returns time elapsed time in milliseconds
|
||||
float operator-(const TrtCudaEvent& e) const {
|
||||
float time{0};
|
||||
cudaCheck(cudaEventElapsedTime(&time, e.get(), get()));
|
||||
return time;
|
||||
}
|
||||
|
||||
private:
|
||||
cudaEvent_t mEvent{};
|
||||
};
|
||||
|
||||
// Out-of-line because TrtCudaEvent is only forward-declared where
// TrtCudaStream is defined. Enqueues a wait on this stream for `event`;
// the final argument (flags) must be 0 for cudaStreamWaitEvent.
inline void TrtCudaStream::wait(TrtCudaEvent& event) {
  cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0));
}
|
||||
|
||||
//!
//! \class TrtCudaGraph
//! \brief Managed CUDA graph
//!
class TrtCudaGraph {
 public:
  explicit TrtCudaGraph() = default;

  // Owning handle: neither copyable nor movable.
  TrtCudaGraph(const TrtCudaGraph&) = delete;

  TrtCudaGraph& operator=(const TrtCudaGraph&) = delete;

  TrtCudaGraph(TrtCudaGraph&&) = delete;

  TrtCudaGraph& operator=(TrtCudaGraph&&) = delete;

  // Destroys the instantiated executable graph, if any. The captured
  // cudaGraph_t is already released in endCapture()/endCaptureOnError().
  ~TrtCudaGraph() {
    if (mGraphExec) {
      cudaGraphExecDestroy(mGraphExec);
    }
  }

  // Start capturing work submitted to `stream` into a CUDA graph.
  // Thread-local capture mode: unrelated work from other threads is not
  // pulled into the capture.
  void beginCapture(TrtCudaStream& stream) {
    cudaCheck(
        cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal));
  }

  // Launch the instantiated graph on `stream`.
  // Returns true on success; on failure the CUDA error state is left set
  // for the caller to inspect (deliberately not cudaCheck'ed).
  bool launch(TrtCudaStream& stream) {
    return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess;
  }

  // Finish a successful capture: instantiate the executable graph, then
  // release the intermediate cudaGraph_t (the exec graph keeps what it needs).
  void endCapture(TrtCudaStream& stream) {
    cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph));
    cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0));
    cudaCheck(cudaGraphDestroy(mGraph));
  }

  // Finish a capture that is known to have failed, leaving the stream and
  // the CUDA error state clean so they can be reused.
  void endCaptureOnError(TrtCudaStream& stream) {
    // There are two possibilities why stream capture would fail:
    // (1) stream is in cudaErrorStreamCaptureInvalidated state.
    // (2) TRT reports a failure.
    // In case (1), the returning mGraph should be nullptr.
    // In case (2), the returning mGraph is not nullptr, but it should not be
    // used.
    const auto ret = cudaStreamEndCapture(stream.get(), &mGraph);
    if (ret == cudaErrorStreamCaptureInvalidated) {
      assert(mGraph == nullptr);
    } else {
      assert(ret == cudaSuccess);
      assert(mGraph != nullptr);
      cudaCheck(cudaGraphDestroy(mGraph));
      mGraph = nullptr;
    }
    // Clean up any CUDA error.
    cudaGetLastError();
    // NOTE(review): sample::gLogWarning is not declared in this header;
    // presumably it comes from the samples' logging header — confirm the
    // include chain wherever this file is used.
    sample::gLogWarning << "The CUDA graph capture on the stream has failed."
                        << std::endl;
  }

 private:
  cudaGraph_t mGraph{};
  cudaGraphExec_t mGraphExec{};
};
|
||||
|
||||
//!
//! \class TrtCudaBuffer
//! \brief Owning buffer whose allocation strategy is supplied by policy
//! functors.
//!
//! \tparam A allocator policy: void operator()(void** ptr, size_t size).
//! \tparam D deallocator policy: void operator()(void* ptr).
//!
template <typename A, typename D>
class TrtCudaBuffer {
 public:
  TrtCudaBuffer() = default;

  TrtCudaBuffer(const TrtCudaBuffer&) = delete;
  TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete;

  //! Move construction steals the other buffer's allocation.
  TrtCudaBuffer(TrtCudaBuffer&& other) {
    reset(other.mPtr);
    other.mPtr = nullptr;
  }

  //! Move assignment frees any held allocation, then steals the other's.
  TrtCudaBuffer& operator=(TrtCudaBuffer&& other) {
    if (this != &other) {
      reset(other.mPtr);
      other.mPtr = nullptr;
    }
    return *this;
  }

  ~TrtCudaBuffer() {
    reset();
  }

  //! Construct with an immediate allocation of `size` bytes.
  TrtCudaBuffer(size_t size) {
    A()(&mPtr, size);
  }

  //! Release any current allocation, then allocate `size` bytes.
  void allocate(size_t size) {
    reset();
    A()(&mPtr, size);
  }

  //! Free the held allocation (if any) and take ownership of `ptr`.
  void reset(void* ptr = nullptr) {
    if (mPtr) {
      D()(mPtr);
    }
    mPtr = ptr;
  }

  //! Raw pointer; ownership stays with this object.
  void* get() const {
    return mPtr;
  }

 private:
  void* mPtr{nullptr};
};
|
||||
|
||||
// Allocation/deallocation policy functors plugged into TrtCudaBuffer.

//! Allocates device memory via cudaMalloc.
struct DeviceAllocator {
  void operator()(void** ptr, size_t size) { cudaCheck(cudaMalloc(ptr, size)); }
};

//! Frees memory obtained from cudaMalloc or cudaMallocManaged.
struct DeviceDeallocator {
  void operator()(void* ptr) { cudaCheck(cudaFree(ptr)); }
};

//! Allocates managed (unified) memory via cudaMallocManaged.
struct ManagedAllocator {
  void operator()(void** ptr, size_t size) {
    cudaCheck(cudaMallocManaged(ptr, size));
  }
};

//! Allocates page-locked (pinned) host memory via cudaMallocHost.
struct HostAllocator {
  void operator()(void** ptr, size_t size) {
    cudaCheck(cudaMallocHost(ptr, size));
  }
};

//! Frees memory obtained from cudaMallocHost.
struct HostDeallocator {
  void operator()(void* ptr) { cudaCheck(cudaFreeHost(ptr)); }
};

//! Buffer in device global memory.
using TrtDeviceBuffer = TrtCudaBuffer<DeviceAllocator, DeviceDeallocator>;
//! Buffer in managed (unified) memory; released with cudaFree.
using TrtManagedBuffer = TrtCudaBuffer<ManagedAllocator, DeviceDeallocator>;

//! Buffer in pinned host memory.
using TrtHostBuffer = TrtCudaBuffer<HostAllocator, HostDeallocator>;
|
||||
|
||||
//!
//! \class IMirroredBuffer
//! \brief Coupled host and device buffers
//!
//! Abstract interface over a buffer that exposes a host-side and a
//! device-side pointer and can copy contents between the two.
//!
class IMirroredBuffer {
 public:
  //!
  //! Allocate memory for the mirrored buffer given the size
  //! of the allocation.
  //!
  virtual void allocate(size_t size) = 0;

  //!
  //! Get the pointer to the device side buffer.
  //!
  //! \return pointer to device memory or nullptr if uninitialized.
  //!
  virtual void* getDeviceBuffer() const = 0;

  //!
  //! Get the pointer to the host side buffer.
  //!
  //! \return pointer to host memory or nullptr if uninitialized.
  //!
  virtual void* getHostBuffer() const = 0;

  //!
  //! Copy the memory from host to device.
  //!
  virtual void hostToDevice(TrtCudaStream& stream) = 0;

  //!
  //! Copy the memory from device to host.
  //!
  virtual void deviceToHost(TrtCudaStream& stream) = 0;

  //!
  //! Interface to get the size of the memory
  //!
  //! \return the size of memory allocated.
  //!
  virtual size_t getSize() const = 0;

  //!
  //! Virtual destructor declaration
  //!
  virtual ~IMirroredBuffer() = default;

};  // class IMirroredBuffer
|
||||
|
||||
//!
|
||||
//! Class to have a seperate memory buffer for discrete device and host
|
||||
//! allocations.
|
||||
//!
|
||||
class DiscreteMirroredBuffer : public IMirroredBuffer {
|
||||
public:
|
||||
void allocate(size_t size) {
|
||||
mSize = size;
|
||||
mHostBuffer.allocate(size);
|
||||
mDeviceBuffer.allocate(size);
|
||||
}
|
||||
|
||||
void* getDeviceBuffer() const { return mDeviceBuffer.get(); }
|
||||
|
||||
void* getHostBuffer() const { return mHostBuffer.get(); }
|
||||
|
||||
void hostToDevice(TrtCudaStream& stream) {
|
||||
cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize,
|
||||
cudaMemcpyHostToDevice, stream.get()));
|
||||
}
|
||||
|
||||
void deviceToHost(TrtCudaStream& stream) {
|
||||
cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize,
|
||||
cudaMemcpyDeviceToHost, stream.get()));
|
||||
}
|
||||
|
||||
size_t getSize() const { return mSize; }
|
||||
|
||||
private:
|
||||
size_t mSize{0};
|
||||
TrtHostBuffer mHostBuffer;
|
||||
TrtDeviceBuffer mDeviceBuffer;
|
||||
}; // class DiscreteMirroredBuffer
|
||||
|
||||
//!
|
||||
//! Class to have a unified memory buffer for embedded devices.
|
||||
//!
|
||||
class UnifiedMirroredBuffer : public IMirroredBuffer {
|
||||
public:
|
||||
void allocate(size_t size) {
|
||||
mSize = size;
|
||||
mBuffer.allocate(size);
|
||||
}
|
||||
|
||||
void* getDeviceBuffer() const { return mBuffer.get(); }
|
||||
|
||||
void* getHostBuffer() const { return mBuffer.get(); }
|
||||
|
||||
void hostToDevice(TrtCudaStream& stream) {
|
||||
// Does nothing since we are using unified memory.
|
||||
}
|
||||
|
||||
void deviceToHost(TrtCudaStream& stream) {
|
||||
// Does nothing since we are using unified memory.
|
||||
}
|
||||
|
||||
size_t getSize() const { return mSize; }
|
||||
|
||||
private:
|
||||
size_t mSize{0};
|
||||
TrtManagedBuffer mBuffer;
|
||||
}; // class UnifiedMirroredBuffer
|
||||
|
||||
inline void setCudaDevice(int device, std::ostream& os) {
|
||||
cudaCheck(cudaSetDevice(device));
|
||||
|
||||
cudaDeviceProp properties;
|
||||
cudaCheck(cudaGetDeviceProperties(&properties, device));
|
||||
|
||||
// clang-format off
|
||||
os << "=== Device Information ===" << std::endl;
|
||||
os << "Selected Device: " << properties.name << std::endl;
|
||||
os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl;
|
||||
os << "SMs: " << properties.multiProcessorCount << std::endl;
|
||||
os << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl;
|
||||
os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl;
|
||||
os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl;
|
||||
os << "Memory Bus Width: " << properties.memoryBusWidth << " bits"
|
||||
<< " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl;
|
||||
os << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl;
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
} // namespace sample
|
||||
|
||||
#endif // TRT_SAMPLE_DEVICE_H
|
Reference in New Issue
Block a user