mirror of
				https://github.com/PaddlePaddle/FastDeploy.git
				synced 2025-10-31 03:46:40 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			398 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			398 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
 | |
|  *
 | |
|  * Licensed under the Apache License, Version 2.0 (the "License");
 | |
|  * you may not use this file except in compliance with the License.
 | |
|  * You may obtain a copy of the License at
 | |
|  *
 | |
|  *     http://www.apache.org/licenses/LICENSE-2.0
 | |
|  *
 | |
|  * Unless required by applicable law or agreed to in writing, software
 | |
|  * distributed under the License is distributed on an "AS IS" BASIS,
 | |
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
|  * See the License for the specific language governing permissions and
 | |
|  * limitations under the License.
 | |
|  */
 | |
| 
 | |
| #ifndef TRT_SAMPLE_DEVICE_H
 | |
| #define TRT_SAMPLE_DEVICE_H
 | |
| 
 | |
#include <cassert>
#include <chrono>   // std::chrono::duration used by cudaSleep
#include <cstdlib>  // abort() used by cudaCheck
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <thread>
 | |
| 
 | |
| namespace sample {
 | |
| 
 | |
| inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) {
 | |
|   if (ret != cudaSuccess) {
 | |
|     err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl;
 | |
|     abort();
 | |
|   }
 | |
| }
 | |
| 
 | |
| class TrtCudaEvent;
 | |
| 
 | |
//! Host-side sleep callback, enqueued by TrtCudaStream::sleep through
//! cudaLaunchHostFunc.
//! \param sleep pointer to a float holding the sleep duration in milliseconds
//!        (void* because cudaHostFn_t fixes the signature).
//! NOTE(review): declared inline rather than inside an anonymous namespace --
//! an anonymous namespace in a header gives every including TU its own copy
//! of the function, which is an ODR/code-size anti-pattern.
inline void cudaSleep(void* sleep) {
  std::this_thread::sleep_for(
      std::chrono::duration<float, std::milli>(*static_cast<float*>(sleep)));
}
 | |
| 
 | |
| //!
 | |
| //! \class TrtCudaStream
 | |
| //! \brief Managed CUDA stream
 | |
| //!
 | |
| class TrtCudaStream {
 | |
|  public:
 | |
|   TrtCudaStream() { cudaCheck(cudaStreamCreate(&mStream)); }
 | |
| 
 | |
|   TrtCudaStream(const TrtCudaStream&) = delete;
 | |
| 
 | |
|   TrtCudaStream& operator=(const TrtCudaStream&) = delete;
 | |
| 
 | |
|   TrtCudaStream(TrtCudaStream&&) = delete;
 | |
| 
 | |
|   TrtCudaStream& operator=(TrtCudaStream&&) = delete;
 | |
| 
 | |
|   ~TrtCudaStream() { cudaCheck(cudaStreamDestroy(mStream)); }
 | |
| 
 | |
|   cudaStream_t get() const { return mStream; }
 | |
| 
 | |
|   void synchronize() { cudaCheck(cudaStreamSynchronize(mStream)); }
 | |
| 
 | |
|   void wait(TrtCudaEvent& event);
 | |
| 
 | |
|   void sleep(float* ms) {
 | |
|     cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms));
 | |
|   }
 | |
| 
 | |
|  private:
 | |
|   cudaStream_t mStream{};
 | |
| };
 | |
| 
 | |
| //!
 | |
| //! \class TrtCudaEvent
 | |
| //! \brief Managed CUDA event
 | |
| //!
 | |
| class TrtCudaEvent {
 | |
|  public:
 | |
|   explicit TrtCudaEvent(bool blocking = true) {
 | |
|     const uint32_t flags = blocking ? cudaEventBlockingSync : cudaEventDefault;
 | |
|     cudaCheck(cudaEventCreateWithFlags(&mEvent, flags));
 | |
|   }
 | |
| 
 | |
|   TrtCudaEvent(const TrtCudaEvent&) = delete;
 | |
| 
 | |
|   TrtCudaEvent& operator=(const TrtCudaEvent&) = delete;
 | |
| 
 | |
|   TrtCudaEvent(TrtCudaEvent&&) = delete;
 | |
| 
 | |
|   TrtCudaEvent& operator=(TrtCudaEvent&&) = delete;
 | |
| 
 | |
|   ~TrtCudaEvent() { cudaCheck(cudaEventDestroy(mEvent)); }
 | |
| 
 | |
|   cudaEvent_t get() const { return mEvent; }
 | |
| 
 | |
|   void record(const TrtCudaStream& stream) {
 | |
|     cudaCheck(cudaEventRecord(mEvent, stream.get()));
 | |
|   }
 | |
| 
 | |
|   void synchronize() { cudaCheck(cudaEventSynchronize(mEvent)); }
 | |
| 
 | |
|   // Returns time elapsed time in milliseconds
 | |
|   float operator-(const TrtCudaEvent& e) const {
 | |
|     float time{0};
 | |
|     cudaCheck(cudaEventElapsedTime(&time, e.get(), get()));
 | |
|     return time;
 | |
|   }
 | |
| 
 | |
|  private:
 | |
|   cudaEvent_t mEvent{};
 | |
| };
 | |
| 
 | |
// Defined out-of-line because it needs the complete TrtCudaEvent type.
// All work submitted to this stream after this call waits for `event`.
inline void TrtCudaStream::wait(TrtCudaEvent& event) {
  cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0));
}
 | |
| 
 | |
| //!
 | |
| //! \class TrtCudaGraph
 | |
| //! \brief Managed CUDA graph
 | |
| //!
 | |
class TrtCudaGraph {
 public:
  explicit TrtCudaGraph() = default;

  // Unique ownership of the graph/exec handles: non-copyable, non-movable.
  TrtCudaGraph(const TrtCudaGraph&) = delete;

  TrtCudaGraph& operator=(const TrtCudaGraph&) = delete;

  TrtCudaGraph(TrtCudaGraph&&) = delete;

  TrtCudaGraph& operator=(TrtCudaGraph&&) = delete;

  // Destroy the instantiated executable graph, if any. mGraph itself is
  // destroyed eagerly in endCapture()/endCaptureOnError(), so only
  // mGraphExec can still be live here.
  ~TrtCudaGraph() {
    if (mGraphExec) {
      cudaGraphExecDestroy(mGraphExec);
    }
  }

  // Start capturing work submitted to `stream` into a CUDA graph.
  // Thread-local capture mode keeps other threads' streams unaffected.
  void beginCapture(TrtCudaStream& stream) {
    cudaCheck(
        cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal));
  }

  // Launch the instantiated graph on `stream`. Unlike the other methods,
  // this reports failure via the return value instead of aborting.
  bool launch(TrtCudaStream& stream) {
    return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess;
  }

  // Finish a successful capture: instantiate the executable graph, then
  // release the intermediate cudaGraph_t immediately.
  void endCapture(TrtCudaStream& stream) {
    cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph));
    cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0));
    cudaCheck(cudaGraphDestroy(mGraph));
  }

  // Finish a capture that is known to have failed, releasing any partial
  // graph and clearing the sticky CUDA error state.
  void endCaptureOnError(TrtCudaStream& stream) {
    // There are two possibilities why stream capture would fail:
    // (1) stream is in cudaErrorStreamCaptureInvalidated state.
    // (2) TRT reports a failure.
    // In case (1), the returning mGraph should be nullptr.
    // In case (2), the returning mGraph is not nullptr, but it should not be
    // used.
    const auto ret = cudaStreamEndCapture(stream.get(), &mGraph);
    if (ret == cudaErrorStreamCaptureInvalidated) {
      assert(mGraph == nullptr);
    } else {
      assert(ret == cudaSuccess);
      assert(mGraph != nullptr);
      cudaCheck(cudaGraphDestroy(mGraph));
      mGraph = nullptr;
    }
    // Clean up any CUDA error.
    cudaGetLastError();
    // NOTE(review): relies on sample::gLogWarning being declared by a header
    // included before this one -- this file does not include it; verify.
    sample::gLogWarning << "The CUDA graph capture on the stream has failed."
                        << std::endl;
  }

 private:
  cudaGraph_t mGraph{};          // transient capture result
  cudaGraphExec_t mGraphExec{};  // instantiated executable graph
};
 | |
| 
 | |
//!
//! \class TrtCudaBuffer
//! \brief Managed buffer for host and device
//!
//! \tparam A allocator functor: void operator()(void** ptr, size_t size)
//! \tparam D deallocator functor: void operator()(void* ptr)
//!
template <typename A, typename D> class TrtCudaBuffer {
 public:
  TrtCudaBuffer() = default;

  TrtCudaBuffer(const TrtCudaBuffer&) = delete;

  TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete;

  //! Move operations are noexcept (they only transfer the raw pointer);
  //! this lets standard containers and algorithms rely on non-throwing moves.
  TrtCudaBuffer(TrtCudaBuffer&& rhs) noexcept {
    reset(rhs.mPtr);
    rhs.mPtr = nullptr;
  }

  TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs) noexcept {
    if (this != &rhs) {
      reset(rhs.mPtr);
      rhs.mPtr = nullptr;
    }
    return *this;
  }

  ~TrtCudaBuffer() { reset(); }

  //! Construct with an immediate allocation of \p size bytes.
  //! NOTE(review): intentionally left non-explicit to stay backward
  //! compatible with callers relying on implicit conversion from size_t.
  TrtCudaBuffer(size_t size) { A()(&mPtr, size); }

  //! Release any current allocation, then allocate \p size bytes.
  void allocate(size_t size) {
    reset();
    A()(&mPtr, size);
  }

  //! Free the owned memory (if any) and take ownership of \p ptr.
  void reset(void* ptr = nullptr) {
    if (mPtr) {
      D()(mPtr);
    }
    mPtr = ptr;
  }

  //! \return the owned pointer, or nullptr when empty.
  void* get() const { return mPtr; }

 private:
  void* mPtr{nullptr};  // owned, type-erased allocation
};
 | |
| 
 | |
// Functor allocating device memory via cudaMalloc; aborts on failure.
struct DeviceAllocator {
  void operator()(void** ptr, size_t size) { cudaCheck(cudaMalloc(ptr, size)); }
};
 | |
| 
 | |
// Functor releasing device memory via cudaFree; aborts on failure.
struct DeviceDeallocator {
  void operator()(void* ptr) { cudaCheck(cudaFree(ptr)); }
};
 | |
| 
 | |
// Functor allocating unified (managed) memory via cudaMallocManaged.
struct ManagedAllocator {
  void operator()(void** ptr, size_t size) {
    cudaCheck(cudaMallocManaged(ptr, size));
  }
};
 | |
| 
 | |
// Functor allocating pinned (page-locked) host memory via cudaMallocHost.
struct HostAllocator {
  void operator()(void** ptr, size_t size) {
    cudaCheck(cudaMallocHost(ptr, size));
  }
};
 | |
| 
 | |
// Functor releasing pinned host memory via cudaFreeHost.
struct HostDeallocator {
  void operator()(void* ptr) { cudaCheck(cudaFreeHost(ptr)); }
};
 | |
| 
 | |
| using TrtDeviceBuffer = TrtCudaBuffer<DeviceAllocator, DeviceDeallocator>;
 | |
| using TrtManagedBuffer = TrtCudaBuffer<ManagedAllocator, DeviceDeallocator>;
 | |
| 
 | |
| using TrtHostBuffer = TrtCudaBuffer<HostAllocator, HostDeallocator>;
 | |
| 
 | |
//!
//! \class IMirroredBuffer
//! \brief Interface for coupled host and device buffers.
//!
class IMirroredBuffer {
 public:
  //!
  //! Allocate memory for the mirrored buffer given the size
  //! of the allocation.
  //!
  virtual void allocate(size_t size) = 0;

  //!
  //! Get the pointer to the device side buffer.
  //!
  //! \return pointer to device memory or nullptr if uninitialized.
  //!
  virtual void* getDeviceBuffer() const = 0;

  //!
  //! Get the pointer to the host side buffer.
  //!
  //! \return pointer to host memory or nullptr if uninitialized.
  //!
  virtual void* getHostBuffer() const = 0;

  //!
  //! Copy the memory from host to device.
  //!
  virtual void hostToDevice(TrtCudaStream& stream) = 0;

  //!
  //! Copy the memory from device to host.
  //!
  virtual void deviceToHost(TrtCudaStream& stream) = 0;

  //!
  //! Interface to get the size of the memory
  //!
  //! \return the size of memory allocated.
  //!
  virtual size_t getSize() const = 0;

  //!
  //! Virtual destructor declaration, so deleting through the interface
  //! pointer is safe.
  //!
  virtual ~IMirroredBuffer() = default;

}; // class IMirroredBuffer
 | |
| 
 | |
| //!
 | |
| //! Class to have a seperate memory buffer for discrete device and host
 | |
| //! allocations.
 | |
| //!
 | |
| class DiscreteMirroredBuffer : public IMirroredBuffer {
 | |
|  public:
 | |
|   void allocate(size_t size) {
 | |
|     mSize = size;
 | |
|     mHostBuffer.allocate(size);
 | |
|     mDeviceBuffer.allocate(size);
 | |
|   }
 | |
| 
 | |
|   void* getDeviceBuffer() const { return mDeviceBuffer.get(); }
 | |
| 
 | |
|   void* getHostBuffer() const { return mHostBuffer.get(); }
 | |
| 
 | |
|   void hostToDevice(TrtCudaStream& stream) {
 | |
|     cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize,
 | |
|                               cudaMemcpyHostToDevice, stream.get()));
 | |
|   }
 | |
| 
 | |
|   void deviceToHost(TrtCudaStream& stream) {
 | |
|     cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize,
 | |
|                               cudaMemcpyDeviceToHost, stream.get()));
 | |
|   }
 | |
| 
 | |
|   size_t getSize() const { return mSize; }
 | |
| 
 | |
|  private:
 | |
|   size_t mSize{0};
 | |
|   TrtHostBuffer mHostBuffer;
 | |
|   TrtDeviceBuffer mDeviceBuffer;
 | |
| }; // class DiscreteMirroredBuffer
 | |
| 
 | |
| //!
 | |
| //! Class to have a unified memory buffer for embedded devices.
 | |
| //!
 | |
| class UnifiedMirroredBuffer : public IMirroredBuffer {
 | |
|  public:
 | |
|   void allocate(size_t size) {
 | |
|     mSize = size;
 | |
|     mBuffer.allocate(size);
 | |
|   }
 | |
| 
 | |
|   void* getDeviceBuffer() const { return mBuffer.get(); }
 | |
| 
 | |
|   void* getHostBuffer() const { return mBuffer.get(); }
 | |
| 
 | |
|   void hostToDevice(TrtCudaStream& stream) {
 | |
|     // Does nothing since we are using unified memory.
 | |
|   }
 | |
| 
 | |
|   void deviceToHost(TrtCudaStream& stream) {
 | |
|     // Does nothing since we are using unified memory.
 | |
|   }
 | |
| 
 | |
|   size_t getSize() const { return mSize; }
 | |
| 
 | |
|  private:
 | |
|   size_t mSize{0};
 | |
|   TrtManagedBuffer mBuffer;
 | |
| }; // class UnifiedMirroredBuffer
 | |
| 
 | |
// Select `device` as the current CUDA device and print a human-readable
// summary of its properties to `os`. Aborts (via cudaCheck) on any CUDA
// API failure.
inline void setCudaDevice(int device, std::ostream& os) {
  cudaCheck(cudaSetDevice(device));

  cudaDeviceProp properties;
  cudaCheck(cudaGetDeviceProperties(&properties, device));

  // clockRate/memoryClockRate are reported in kHz (per the CUDA runtime
  // docs), so dividing by 1e6 yields GHz; sizes are shifted bytes->MiB/KiB.
  // clang-format off
    os << "=== Device Information ===" << std::endl;
    os << "Selected Device: "      << properties.name                                               << std::endl;
    os << "Compute Capability: "   << properties.major << "." << properties.minor                   << std::endl;
    os << "SMs: "                  << properties.multiProcessorCount                                << std::endl;
    os << "Compute Clock Rate: "   << properties.clockRate / 1000000.0F << " GHz"                   << std::endl;
    os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB"                   << std::endl;
    os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB"       << std::endl;
    os << "Memory Bus Width: "     << properties.memoryBusWidth << " bits"
                        << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl;
    os << "Memory Clock Rate: "    << properties.memoryClockRate / 1000000.0F << " GHz"             << std::endl;
  // clang-format on
}
 | |
| 
 | |
| } // namespace sample
 | |
| 
 | |
| #endif // TRT_SAMPLE_DEVICE_H
 | 
