Files
FastDeploy/csrcs/fastdeploy/backends/tensorrt/common/buffers.h
Jason ffbc5cc42d Move cpp code to directory csrcs (#42)
* move cpp code to csrcs

* move cpp code to csrcs
2022-07-26 17:59:02 +08:00

427 lines
14 KiB
C++

/*
* Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef TENSORRT_BUFFERS_H
#define TENSORRT_BUFFERS_H
#include "NvInfer.h"
#include "common.h"
#include "half.h"
#include <cassert>
#include <cuda_runtime_api.h>
#include <iostream>
#include <iterator>
#include <memory>
#include <new>
#include <numeric>
#include <string>
#include <vector>
namespace samplesCommon {
//!
//! \brief The GenericBuffer class is a templated class for buffers.
//!
//! \details This templated RAII (Resource Acquisition Is Initialization) class
//! handles the allocation,
//! deallocation, querying of buffers on both the device and the host.
//! It can handle data of arbitrary types because it stores byte
//! buffers.
//! The template parameters AllocFunc and FreeFunc are used for the
//! allocation and deallocation of the buffer.
//! AllocFunc must be a functor that takes in (void** ptr, size_t size)
//! and returns bool. ptr is a pointer to where the allocated buffer
//! address should be stored.
//! size is the amount of memory in bytes to allocate.
//! The boolean indicates whether or not the memory allocation was
//! successful.
//! FreeFunc must be a functor that takes in (void* ptr) and returns
//! void.
//! ptr is the allocated buffer address. It must work with nullptr
//! input.
//!
template <typename AllocFunc, typename FreeFunc> class GenericBuffer {
public:
  //!
  //! \brief Construct an empty buffer of the given element type.
  //!
  GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT)
      : mSize(0), mCapacity(0), mType(type), mBuffer(nullptr) {}

  //!
  //! \brief Construct a buffer holding \p size elements of type \p type.
  //!
  //! \throw std::bad_alloc if the underlying allocation fails.
  //!
  GenericBuffer(size_t size, nvinfer1::DataType type)
      : mSize(size), mCapacity(size), mType(type), mBuffer(nullptr) {
    if (!allocFn(&mBuffer, this->nbBytes())) {
      throw std::bad_alloc();
    }
  }

  //!
  //! \brief Move constructor: steals ownership and leaves \p buf empty.
  //!
  GenericBuffer(GenericBuffer&& buf)
      : mSize(buf.mSize), mCapacity(buf.mCapacity), mType(buf.mType),
        mBuffer(buf.mBuffer) {
    buf.mSize = 0;
    buf.mCapacity = 0;
    buf.mType = nvinfer1::DataType::kFLOAT;
    buf.mBuffer = nullptr;
  }

  //!
  //! \brief Move assignment: frees the current buffer, then steals
  //!        ownership from \p buf and leaves it empty.
  //!
  GenericBuffer& operator=(GenericBuffer&& buf) {
    if (this != &buf) {
      freeFn(mBuffer);
      mSize = buf.mSize;
      mCapacity = buf.mCapacity;
      mType = buf.mType;
      mBuffer = buf.mBuffer;
      // Reset buf to the same state the move constructor leaves it in.
      // (Fix: the original omitted the mType reset here, leaving the
      // moved-from object inconsistent with the move constructor.)
      buf.mSize = 0;
      buf.mCapacity = 0;
      buf.mType = nvinfer1::DataType::kFLOAT;
      buf.mBuffer = nullptr;
    }
    return *this;
  }

  //!
  //! \brief Returns pointer to underlying array.
  //!
  void* data() { return mBuffer; }

  //!
  //! \brief Returns pointer to underlying array.
  //!
  const void* data() const { return mBuffer; }

  //!
  //! \brief Returns the size (in number of elements) of the buffer.
  //!
  size_t size() const { return mSize; }

  //!
  //! \brief Returns the size (in bytes) of the buffer.
  //!
  size_t nbBytes() const {
    return this->size() * samplesCommon::getElementSize(mType);
  }

  //!
  //! \brief Resizes the buffer. This is a no-op if the new size is smaller
  //!        than or equal to the current capacity.
  //!
  //! \throw std::bad_alloc if reallocation fails. On failure the buffer is
  //!        left empty rather than dangling, so destruction stays safe.
  //!
  void resize(size_t newSize) {
    mSize = newSize;
    if (mCapacity < newSize) {
      freeFn(mBuffer);
      // Fix: null out the pointer before reallocating. In the original,
      // a failed allocation threw while mBuffer still pointed at freed
      // memory, so the destructor performed a double free.
      mBuffer = nullptr;
      if (!allocFn(&mBuffer, this->nbBytes())) {
        mSize = 0;
        mCapacity = 0;
        throw std::bad_alloc{};
      }
      mCapacity = newSize;
    }
  }

  //!
  //! \brief Overload of resize that accepts Dims.
  //!
  void resize(const nvinfer1::Dims& dims) {
    return this->resize(samplesCommon::volume(dims));
  }

  ~GenericBuffer() { freeFn(mBuffer); }

private:
  size_t mSize{0}, mCapacity{0};   // element count / allocated element capacity
  nvinfer1::DataType mType;        // element type; drives nbBytes()
  void* mBuffer;                   // owned allocation (may be nullptr)
  AllocFunc allocFn;               // functor: bool(void**, size_t)
  FreeFunc freeFn;                 // functor: void(void*); must accept nullptr
};
class DeviceAllocator {
public:
bool operator()(void** ptr, size_t size) const {
return cudaMalloc(ptr, size) == cudaSuccess;
}
};
//! Deallocation functor for GenericBuffer that releases device (GPU) memory.
class DeviceFree {
public:
  //! Frees a device allocation; cudaFree(nullptr) is a safe no-op.
  void operator()(void* ptr) const {
    static_cast<void>(cudaFree(ptr));
  }
};
//! Allocation functor for GenericBuffer that obtains pageable host memory.
class HostAllocator {
public:
  //! Allocates \p size bytes with malloc; stores the address in *\p ptr and
  //! returns true iff the allocation succeeded.
  bool operator()(void** ptr, size_t size) const {
    void* const mem = malloc(size);
    *ptr = mem;
    return mem != nullptr;
  }
};
//! Deallocation functor for GenericBuffer that releases host memory.
class HostFree {
public:
  //! Frees a malloc'd host allocation; free(nullptr) is a safe no-op.
  void operator()(void* ptr) const {
    if (ptr != nullptr) {
      free(ptr);
    }
  }
};
//! Buffer specialization backed by DeviceAllocator/DeviceFree (GPU memory).
using DeviceBuffer = GenericBuffer<DeviceAllocator, DeviceFree>;
//! Buffer specialization backed by HostAllocator/HostFree (pageable host memory).
using HostBuffer = GenericBuffer<HostAllocator, HostFree>;
//!
//! \brief The ManagedBuffer class groups together a pair of corresponding
//! device and host buffers.
//!
class ManagedBuffer {
public:
  DeviceBuffer deviceBuffer; // device-side copy, bound to the engine binding
  HostBuffer hostBuffer;     // host-side staging copy, same element count/type
};
//!
//! \brief The BufferManager class handles host and device buffer allocation
//! and deallocation.
//!
//! \details This RAII class handles host and device buffer allocation and
//! deallocation,
//! memcpy between host and device buffers to aid with inference,
//! and debugging dumps to validate inference. The BufferManager class
//! is meant to be
//! used to simplify buffer management and any interactions between
//! buffers and the engine.
//!
class BufferManager {
public:
  // Sentinel returned by size() when the tensor name is not a binding.
  static const size_t kINVALID_SIZE_VALUE = ~size_t(0);

  //!
  //! \brief Create a BufferManager for handling buffer interactions with
  //!        engine.
  //!
  //! Allocates one host buffer and one device buffer per engine binding.
  //! For explicit-batch (full-dims) engines pass batchSize == 0; supply a
  //! context when binding dimensions are dynamic so the concrete shapes
  //! set on the context are used.
  //!
  BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine,
                const int batchSize = 0,
                const nvinfer1::IExecutionContext* context = nullptr)
      : mEngine(engine), mBatchSize(batchSize) {
    // Full Dims implies no batch size.
    assert(engine->hasImplicitBatchDimension() || mBatchSize == 0);
    // Create host and device buffers
    for (int i = 0; i < mEngine->getNbBindings(); i++) {
      // Prefer the context's dimensions: for dynamic shapes they hold the
      // values actually set on the context, not the engine's -1 wildcards.
      auto dims = context ? context->getBindingDimensions(i)
                          : mEngine->getBindingDimensions(i);
      // With a context (or batchSize == 0) the batch is already included in
      // dims; otherwise the implicit batch multiplies the per-sample volume.
      size_t vol = context || !mBatchSize ? 1 : static_cast<size_t>(mBatchSize);
      nvinfer1::DataType type = mEngine->getBindingDataType(i);
      int vecDim = mEngine->getBindingVectorizedDim(i);
      if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector
      {
        // Vectorized formats pack several scalars per element: round the
        // vectorized dimension up and scale the volume back accordingly.
        int scalarsPerVec = mEngine->getBindingComponentsPerElement(i);
        dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec);
        vol *= scalarsPerVec;
      }
      vol *= samplesCommon::volume(dims);
      std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
      manBuf->deviceBuffer = DeviceBuffer(vol, type);
      manBuf->hostBuffer = HostBuffer(vol, type);
      // The raw device pointers double as the bindings array handed to
      // IExecutionContext::execute/enqueue.
      mDeviceBindings.emplace_back(manBuf->deviceBuffer.data());
      mManagedBuffers.emplace_back(std::move(manBuf));
    }
  }

  //!
  //! \brief Returns a vector of device buffers that you can use directly as
  //!        bindings for the execute and enqueue methods of
  //!        IExecutionContext.
  //!
  std::vector<void*>& getDeviceBindings() { return mDeviceBindings; }

  //!
  //! \brief Returns a vector of device buffers.
  //!
  const std::vector<void*>& getDeviceBindings() const {
    return mDeviceBindings;
  }

  //!
  //! \brief Returns the device buffer corresponding to tensorName.
  //!        Returns nullptr if no such tensor can be found.
  //!
  void* getDeviceBuffer(const std::string& tensorName) const {
    return getBuffer(false, tensorName);
  }

  //!
  //! \brief Returns the host buffer corresponding to tensorName.
  //!        Returns nullptr if no such tensor can be found.
  //!
  void* getHostBuffer(const std::string& tensorName) const {
    return getBuffer(true, tensorName);
  }

  //!
  //! \brief Returns the size (in bytes) of the host and device buffers that
  //!        correspond to tensorName.
  //!        Returns kINVALID_SIZE_VALUE if no such tensor can be found.
  //!
  size_t size(const std::string& tensorName) const {
    int index = mEngine->getBindingIndex(tensorName.c_str());
    if (index == -1)
      return kINVALID_SIZE_VALUE;
    // Host and device buffers are allocated with identical volumes, so the
    // host buffer's byte size stands for both.
    return mManagedBuffers[index]->hostBuffer.nbBytes();
  }

  //!
  //! \brief Dump host buffer with specified tensorName to ostream.
  //!        Prints error message to std::ostream if no such tensor can be
  //!        found.
  //!
  void dumpBuffer(std::ostream& os, const std::string& tensorName) {
    int index = mEngine->getBindingIndex(tensorName.c_str());
    if (index == -1) {
      os << "Invalid tensor name" << std::endl;
      return;
    }
    void* buf = mManagedBuffers[index]->hostBuffer.data();
    size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes();
    nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index);
    // One printed row per innermost dimension (fall back to the batch size
    // for zero-rank bindings).
    size_t rowCount = static_cast<size_t>(
        bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize);
    int leadDim = mBatchSize;
    int* trailDims = bufDims.d;
    int nbDims = bufDims.nbDims;
    // Fix explicit Dimension networks: with no implicit batch (leadDim == 0)
    // treat the first binding dimension as the lead dimension instead.
    if (!leadDim && nbDims > 0) {
      leadDim = bufDims.d[0];
      ++trailDims;
      --nbDims;
    }
    // Header line, e.g. "[2, 3, 224, 224]".
    os << "[" << leadDim;
    for (int i = 0; i < nbDims; i++)
      os << ", " << trailDims[i];
    os << "]" << std::endl;
    switch (mEngine->getBindingDataType(index)) {
    case nvinfer1::DataType::kINT32:
      print<int32_t>(os, buf, bufSize, rowCount);
      break;
    case nvinfer1::DataType::kFLOAT:
      print<float>(os, buf, bufSize, rowCount);
      break;
    case nvinfer1::DataType::kHALF:
      print<half_float::half>(os, buf, bufSize, rowCount);
      break;
    case nvinfer1::DataType::kINT8:
      assert(0 && "Int8 network-level input and output is not supported");
      break;
    case nvinfer1::DataType::kBOOL:
      assert(0 && "Bool network-level input and output are not supported");
      break;
    }
  }

  //!
  //! \brief Templated print function that dumps buffers of arbitrary type to
  //!        std::ostream.
  //!        rowCount parameter controls how many elements are on each line.
  //!        A rowCount of 1 means that there is only 1 element on each line.
  //!
  template <typename T>
  void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount) {
    assert(rowCount != 0);
    assert(bufSize % sizeof(T) == 0);
    T* typedBuf = static_cast<T*>(buf);
    size_t numItems = bufSize / sizeof(T);
    for (int i = 0; i < static_cast<int>(numItems); i++) {
      // Handle rowCount == 1 case
      if (rowCount == 1 && i != static_cast<int>(numItems) - 1)
        os << typedBuf[i] << std::endl;
      else if (rowCount == 1)
        os << typedBuf[i];
      // Handle rowCount > 1 case
      else if (i % rowCount == 0)
        os << typedBuf[i];
      else if (i % rowCount == rowCount - 1)
        os << " " << typedBuf[i] << std::endl;
      else
        os << " " << typedBuf[i];
    }
  }

  //!
  //! \brief Copy the contents of input host buffers to input device buffers
  //!        synchronously.
  //!
  void copyInputToDevice() { memcpyBuffers(true, false, false); }

  //!
  //! \brief Copy the contents of output device buffers to output host buffers
  //!        synchronously.
  //!
  void copyOutputToHost() { memcpyBuffers(false, true, false); }

  //!
  //! \brief Copy the contents of input host buffers to input device buffers
  //!        asynchronously.
  //!
  void copyInputToDeviceAsync(const cudaStream_t& stream = 0) {
    memcpyBuffers(true, false, true, stream);
  }

  //!
  //! \brief Copy the contents of output device buffers to output host buffers
  //!        asynchronously.
  //!
  void copyOutputToHostAsync(const cudaStream_t& stream = 0) {
    memcpyBuffers(false, true, true, stream);
  }

  ~BufferManager() = default;

private:
  //! Looks up the binding index for tensorName and returns the matching
  //! host or device pointer, or nullptr if the name is unknown.
  void* getBuffer(const bool isHost, const std::string& tensorName) const {
    int index = mEngine->getBindingIndex(tensorName.c_str());
    if (index == -1)
      return nullptr;
    return (isHost ? mManagedBuffers[index]->hostBuffer.data()
                   : mManagedBuffers[index]->deviceBuffer.data());
  }

  //! Copies every input (copyInput == true) or every output binding in the
  //! direction selected by deviceToHost, synchronously or on the given
  //! stream. CHECK aborts on any CUDA error (macro from common.h).
  void memcpyBuffers(const bool copyInput, const bool deviceToHost,
                     const bool async, const cudaStream_t& stream = 0) {
    for (int i = 0; i < mEngine->getNbBindings(); i++) {
      void* dstPtr = deviceToHost ? mManagedBuffers[i]->hostBuffer.data()
                                  : mManagedBuffers[i]->deviceBuffer.data();
      const void* srcPtr = deviceToHost
                               ? mManagedBuffers[i]->deviceBuffer.data()
                               : mManagedBuffers[i]->hostBuffer.data();
      const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes();
      const cudaMemcpyKind memcpyType =
          deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;
      // Only touch bindings matching the requested direction: inputs when
      // copying toward the device, outputs when copying toward the host.
      if ((copyInput && mEngine->bindingIsInput(i)) ||
          (!copyInput && !mEngine->bindingIsInput(i))) {
        if (async)
          CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream));
        else
          CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType));
      }
    }
  }

  std::shared_ptr<nvinfer1::ICudaEngine> mEngine; //!< The pointer to the engine
  int mBatchSize; //!< The batch size for legacy networks, 0 otherwise.
  std::vector<std::unique_ptr<ManagedBuffer>>
      mManagedBuffers; //!< The vector of pointers to managed buffers
  std::vector<void*> mDeviceBindings; //!< The vector of device buffers needed
                                      //! for engine execution
};
} // namespace samplesCommon
#endif // TENSORRT_BUFFERS_H