/*
 * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>
#include <array>
#include <chrono>
#include <cuda_profiler_api.h>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <numeric>
#include <thread>
#include <utility>
#include <vector>

#if defined(__QNX__)
#include <sys/neutrino.h>
#include <sys/syspage.h>
#endif

#include "NvInfer.h"

#include "ErrorRecorder.h"
#include "logger.h"
#include "sampleDevice.h"
#include "sampleEngines.h"
#include "sampleInference.h"
#include "sampleOptions.h"
#include "sampleReporting.h"
#include "sampleUtils.h"

namespace sample {

template <class MapType, class EngineType>
bool validateTensorNames(const MapType& map, const EngineType* engine,
                         const int32_t endBindingIndex) {
  // Check if the provided input tensor names match the input tensors of the
  // engine.
  // Report an error if a provided input tensor name cannot be found, since
  // that implies a potential typo.
  for (const auto& item : map) {
    bool tensorNameFound{false};
    for (int32_t b = 0; b < endBindingIndex; ++b) {
      if (engine->bindingIsInput(b) &&
          engine->getBindingName(b) == item.first) {
        tensorNameFound = true;
        break;
      }
    }
    if (!tensorNameFound) {
      sample::gLogError
          << "Cannot find input tensor with name \"" << item.first
          << "\" in the engine bindings! "
          << "Please make sure the input tensor names are correct."
          << std::endl;
      return false;
    }
  }
  return true;
}

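//!
//! \class FillBindingClosure
//! \brief Fills the per-stream Bindings for every engine binding, using data
//!        loaded from files given via --loadInputs for matching inputs and
//!        random values for all other tensors.
//!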
template <class EngineType, class ContextType> class FillBindingClosure {
 private:
  using InputsMap = std::unordered_map<std::string, std::string>;
  using BindingsVector = std::vector<std::unique_ptr<Bindings>>;

  EngineType const* engine;
  ContextType const* context;
  InputsMap const& inputs;
  BindingsVector& bindings;
  int32_t batch;
  int32_t endBindingIndex;

  void fillOneBinding(int32_t bindingIndex, int64_t vol) {
    auto const dims = getDims(bindingIndex);
    auto const name = engine->getBindingName(bindingIndex);
    auto const isInput = engine->bindingIsInput(bindingIndex);
    auto const dataType = engine->getBindingDataType(bindingIndex);
    auto const* bindingInOutStr = isInput ? "input" : "output";
    for (auto& binding : bindings) {
      const auto input = inputs.find(name);
      if (isInput && input != inputs.end()) {
        sample::gLogInfo << "Using values loaded from " << input->second
                         << " for input " << name << std::endl;
        binding->addBinding(bindingIndex, name, isInput, vol, dataType,
                            input->second);
      } else {
        sample::gLogInfo << "Using random values for " << bindingInOutStr << " "
                         << name << std::endl;
        binding->addBinding(bindingIndex, name, isInput, vol, dataType);
      }
      sample::gLogInfo << "Created " << bindingInOutStr << " binding for "
                       << name << " with dimensions " << dims << std::endl;
    }
  }

  bool fillAllBindings(int32_t batch, int32_t endBindingIndex) {
    if (!validateTensorNames(inputs, engine, endBindingIndex)) {
      sample::gLogError << "Invalid tensor names found in --loadInputs flag."
                        << std::endl;
      return false;
    }

    for (int32_t b = 0; b < endBindingIndex; b++) {
      auto const dims = getDims(b);
      auto const comps = engine->getBindingComponentsPerElement(b);
      auto const strides = context->getStrides(b);
      int32_t const vectorDimIndex = engine->getBindingVectorizedDim(b);
      auto const vol = volume(dims, strides, vectorDimIndex, comps, batch);
      fillOneBinding(b, vol);
    }
    return true;
  }

  Dims getDims(int32_t bindingIndex);

 public:
  FillBindingClosure(EngineType const* _engine, ContextType const* _context,
                     InputsMap const& _inputs, BindingsVector& _bindings,
                     int32_t _batch, int32_t _endBindingIndex)
      : engine(_engine), context(_context), inputs(_inputs),
        bindings(_bindings), batch(_batch), endBindingIndex(_endBindingIndex) {}

  bool operator()() { return fillAllBindings(batch, endBindingIndex); }
};

template <>
Dims FillBindingClosure<nvinfer1::ICudaEngine, nvinfer1::IExecutionContext>::
    getDims(int32_t bindingIndex) {
  return context->getBindingDimensions(bindingIndex);
}

template <>
Dims FillBindingClosure<
    nvinfer1::safe::ICudaEngine,
    nvinfer1::safe::IExecutionContext>::getDims(int32_t bindingIndex) {
  return engine->getBindingDimensions(bindingIndex);
}

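//!
//! \brief Create an execution context and a Bindings object per stream,
//!        resolve dynamic input dimensions and shape tensors from --shapes
//!        (defaulting unspecified dynamic dimensions to 1), and fill the
//!        bindings with input data.
//!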
bool setUpInference(InferenceEnvironment& iEnv,
                    const InferenceOptions& inference) {
  int32_t device{};
  cudaCheck(cudaGetDevice(&device));

  cudaDeviceProp properties;
  cudaCheck(cudaGetDeviceProperties(&properties, device));
  // Use managed memory on integrated devices when transfers are skipped
  // and when it is explicitly requested on the commandline.
  bool useManagedMemory{(inference.skipTransfers && properties.integrated) ||
                        inference.useManaged};
  using FillSafeBindings =
      FillBindingClosure<nvinfer1::safe::ICudaEngine,
                         nvinfer1::safe::IExecutionContext>;
  if (iEnv.safe) {
    ASSERT(sample::hasSafeRuntime());
    auto* safeEngine = iEnv.safeEngine.get();
    for (int32_t s = 0; s < inference.streams; ++s) {
      iEnv.safeContext.emplace_back(safeEngine->createExecutionContext());
      iEnv.bindings.emplace_back(new Bindings(useManagedMemory));
    }
    const int32_t nBindings = safeEngine->getNbBindings();
    auto const* safeContext = iEnv.safeContext.front().get();
    // batch is set to 1 because the safety runtime only supports explicit
    // batch.
    return FillSafeBindings(iEnv.safeEngine.get(), safeContext,
                            inference.inputs, iEnv.bindings, 1, nBindings)();
  }

  using FillStdBindings =
      FillBindingClosure<nvinfer1::ICudaEngine, nvinfer1::IExecutionContext>;

  for (int32_t s = 0; s < inference.streams; ++s) {
    auto ec = iEnv.engine->createExecutionContext();
    if (ec == nullptr) {
      sample::gLogError << "Unable to create execution context for stream " << s
                        << "." << std::endl;
      return false;
    }
    iEnv.context.emplace_back(ec);
    iEnv.bindings.emplace_back(new Bindings(useManagedMemory));
  }
  if (iEnv.profiler) {
    iEnv.context.front()->setProfiler(iEnv.profiler.get());
    // Always run reportToProfiler() after enqueue launch
    iEnv.context.front()->setEnqueueEmitsProfile(false);
  }

  const int32_t nOptProfiles = iEnv.engine->getNbOptimizationProfiles();
  const int32_t nBindings = iEnv.engine->getNbBindings();
  const int32_t bindingsInProfile =
      nOptProfiles > 0 ? nBindings / nOptProfiles : 0;
  const int32_t endBindingIndex =
      bindingsInProfile ? bindingsInProfile : iEnv.engine->getNbBindings();

  if (nOptProfiles > 1) {
    sample::gLogWarning << "Multiple profiles are currently not supported. "
                           "Running with one profile."
                        << std::endl;
  }

  // Make sure that the tensor names provided in command-line args actually
  // exist in any of the engine bindings to avoid silent typos.
  if (!validateTensorNames(inference.shapes, iEnv.engine.get(),
                           endBindingIndex)) {
    sample::gLogError << "Invalid tensor names found in --shapes flag."
                      << std::endl;
    return false;
  }

  // Set all input dimensions before all bindings can be allocated
  for (int32_t b = 0; b < endBindingIndex; ++b) {
    if (iEnv.engine->bindingIsInput(b)) {
      auto dims = iEnv.context.front()->getBindingDimensions(b);
      const bool isScalar = dims.nbDims == 0;
      const bool isDynamicInput =
          std::any_of(dims.d, dims.d + dims.nbDims,
                      [](int32_t dim) { return dim == -1; }) ||
          iEnv.engine->isShapeBinding(b);
      if (isDynamicInput) {
        auto shape = inference.shapes.find(iEnv.engine->getBindingName(b));

        std::vector<int32_t> staticDims;
        if (shape == inference.shapes.end()) {
          // If no shape is provided, set dynamic dimensions to 1.
          constexpr int32_t DEFAULT_DIMENSION = 1;
          if (iEnv.engine->isShapeBinding(b)) {
            if (isScalar) {
              staticDims.push_back(1);
            } else {
              staticDims.resize(dims.d[0]);
              std::fill(staticDims.begin(), staticDims.end(),
                        DEFAULT_DIMENSION);
            }
          } else {
            staticDims.resize(dims.nbDims);
            std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(),
                           [&](int32_t dimension) {
                             return dimension >= 0 ? dimension
                                                   : DEFAULT_DIMENSION;
                           });
          }
          sample::gLogWarning << "Dynamic dimensions required for input: "
                              << iEnv.engine->getBindingName(b)
                              << ", but no shapes were provided. Automatically "
                                 "overriding shape to: "
                              << staticDims << std::endl;
        } else if (inference.inputs.count(shape->first) &&
                   iEnv.engine->isShapeBinding(b)) {
          if (isScalar || dims.nbDims == 1) {
            // Load shape tensor from file.
            size_t const size = isScalar ? 1 : dims.d[0];
            staticDims.resize(size);
            auto const& filename = inference.inputs.at(shape->first);
            auto dst = reinterpret_cast<char*>(staticDims.data());
            loadFromFile(filename, dst,
                         size * sizeof(decltype(staticDims)::value_type));
          } else {
            sample::gLogWarning << "Cannot load shape tensor " << shape->first
                                << " from file, "
                                << "ND-Shape isn't supported yet" << std::endl;
            // Fallback
            staticDims = shape->second;
          }
        } else {
          staticDims = shape->second;
        }

        for (auto& c : iEnv.context) {
          if (iEnv.engine->isShapeBinding(b)) {
            if (!c->setInputShapeBinding(b, staticDims.data())) {
              return false;
            }
          } else {
            if (!c->setBindingDimensions(b, toDims(staticDims))) {
              return false;
            }
          }
        }
      }
    }
  }

  auto* engine = iEnv.engine.get();
  auto const* context = iEnv.context.front().get();
  int32_t const batch =
      engine->hasImplicitBatchDimension() ? inference.batch : 1;
  return FillStdBindings(engine, context, inference.inputs, iEnv.bindings,
                         batch, endBindingIndex)();
}

namespace {

#if defined(__QNX__)
using TimePoint = double;
#else
using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
#endif

TimePoint getCurrentTime() {
#if defined(__QNX__)
  uint64_t const currentCycles = ClockCycles();
  uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec;
  // Return current timestamp in ms.
  return static_cast<TimePoint>(currentCycles) * 1000. / cyclesPerSecond;
#else
  return std::chrono::high_resolution_clock::now();
#endif
}

//!
//! \struct SyncStruct
//! \brief Thread synchronization structure
//!
struct SyncStruct {
  std::mutex mutex;
  TrtCudaStream mainStream;
  TrtCudaEvent gpuStart{cudaEventBlockingSync};
  TimePoint cpuStart{};
  float sleep{};
};

struct Enqueue {
  explicit Enqueue(nvinfer1::IExecutionContext& context, void** buffers)
      : mContext(context), mBuffers(buffers) {}

  nvinfer1::IExecutionContext& mContext;
  void** mBuffers{};
};

//!
//! \class EnqueueImplicit
//! \brief Functor to enqueue inference with implicit batch
//!
class EnqueueImplicit : private Enqueue {
 public:
  explicit EnqueueImplicit(nvinfer1::IExecutionContext& context, void** buffers,
                           int32_t batch)
      : Enqueue(context, buffers), mBatch(batch) {}

  bool operator()(TrtCudaStream& stream) const {
    if (mContext.enqueue(mBatch, mBuffers, stream.get(), nullptr)) {
      // Collect layer timing info from the current profile index of the
      // execution context.
      if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() &&
          !mContext.reportToProfiler()) {
        gLogWarning
            << "Failed to collect layer timing info from previous enqueue()"
            << std::endl;
      }
      return true;
    }
    return false;
  }

 private:
  int32_t mBatch;
};

//!
//! \class EnqueueExplicit
//! \brief Functor to enqueue inference with explicit batch
//!
class EnqueueExplicit : private Enqueue {
 public:
  explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, void** buffers)
      : Enqueue(context, buffers) {}

  bool operator()(TrtCudaStream& stream) const {
    if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) {
      // Collect layer timing info from the current profile index of the
      // execution context.
      if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() &&
          !mContext.reportToProfiler()) {
        gLogWarning
            << "Failed to collect layer timing info from previous enqueueV2()"
            << std::endl;
      }
      return true;
    }
    return false;
  }
};

//!
//! \class EnqueueGraph
//! \brief Functor to enqueue inference from CUDA Graph
//!
class EnqueueGraph {
 public:
  explicit EnqueueGraph(nvinfer1::IExecutionContext& context,
                        TrtCudaGraph& graph)
      : mGraph(graph), mContext(context) {}

  bool operator()(TrtCudaStream& stream) const {
    if (mGraph.launch(stream)) {
      // Collect layer timing info from the current profile index of the
      // execution context.
      if (mContext.getProfiler() && !mContext.reportToProfiler()) {
        gLogWarning << "Failed to collect layer timing info from previous CUDA "
                       "graph launch"
                    << std::endl;
      }
      return true;
    }
    return false;
  }

  TrtCudaGraph& mGraph;
  nvinfer1::IExecutionContext& mContext;
};

//!
//! \class EnqueueSafe
//! \brief Functor to enqueue safe execution context
//!
class EnqueueSafe {
 public:
  explicit EnqueueSafe(nvinfer1::safe::IExecutionContext& context,
                       void** buffers)
      : mContext(context), mBuffers(buffers) {}

  bool operator()(TrtCudaStream& stream) const {
    if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) {
      return true;
    }
    return false;
  }

  nvinfer1::safe::IExecutionContext& mContext;
  void** mBuffers{};
};

using EnqueueFunction = std::function<bool(TrtCudaStream&)>;

enum class StreamType : int32_t {
  kINPUT = 0,
  kCOMPUTE = 1,
  kOUTPUT = 2,
  kNUM = 3
};

enum class EventType : int32_t {
  kINPUT_S = 0,
  kINPUT_E = 1,
  kCOMPUTE_S = 2,
  kCOMPUTE_E = 3,
  kOUTPUT_S = 4,
  kOUTPUT_E = 5,
  kNUM = 6
};

using MultiStream =
    std::array<TrtCudaStream, static_cast<int32_t>(StreamType::kNUM)>;

using MultiEvent = std::array<std::unique_ptr<TrtCudaEvent>,
                              static_cast<int32_t>(EventType::kNUM)>;

using EnqueueTimes = std::array<TimePoint, 2>;

//!
//! \class Iteration
//! \brief Inference iteration and streams management
//!
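//! Each Iteration owns one input, one compute, and one output CUDA stream,
//! plus a set of timing events per buffering depth (depth = 1 + overlap), so
//! that H2D transfers, enqueue calls, and D2H transfers of consecutive
//! iterations can overlap.
//!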
template <class ContextType> class Iteration {
 public:
  Iteration(int32_t id, const InferenceOptions& inference, ContextType& context,
            Bindings& bindings)
      : mBindings(bindings), mStreamId(id), mDepth(1 + inference.overlap),
        mActive(mDepth), mEvents(mDepth), mEnqueueTimes(mDepth),
        mContext(&context) {
    for (int32_t d = 0; d < mDepth; ++d) {
      for (int32_t e = 0; e < static_cast<int32_t>(EventType::kNUM); ++e) {
        mEvents[d][e].reset(new TrtCudaEvent(!inference.spin));
      }
    }
    createEnqueueFunction(inference, context, bindings);
  }

  bool query(bool skipTransfers) {
    if (mActive[mNext]) {
      return true;
    }

    if (!skipTransfers) {
      record(EventType::kINPUT_S, StreamType::kINPUT);
      mBindings.transferInputToDevice(getStream(StreamType::kINPUT));
      record(EventType::kINPUT_E, StreamType::kINPUT);
      wait(EventType::kINPUT_E,
           StreamType::kCOMPUTE); // Wait for input DMA before compute
    }

    record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE);
    recordEnqueueTime();
    if (!mEnqueue(getStream(StreamType::kCOMPUTE))) {
      return false;
    }
    recordEnqueueTime();
    record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE);

    if (!skipTransfers) {
      wait(EventType::kCOMPUTE_E,
           StreamType::kOUTPUT); // Wait for compute before output DMA
      record(EventType::kOUTPUT_S, StreamType::kOUTPUT);
      mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT));
      record(EventType::kOUTPUT_E, StreamType::kOUTPUT);
    }

    mActive[mNext] = true;
    moveNext();
    return true;
  }

  float sync(const TimePoint& cpuStart, const TrtCudaEvent& gpuStart,
             std::vector<InferenceTrace>& trace, bool skipTransfers) {
    if (mActive[mNext]) {
      if (skipTransfers) {
        getEvent(EventType::kCOMPUTE_E).synchronize();
      } else {
        getEvent(EventType::kOUTPUT_E).synchronize();
      }
      trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers));
      mActive[mNext] = false;
      return getEvent(EventType::kCOMPUTE_S) - gpuStart;
    }
    return 0;
  }

  void syncAll(const TimePoint& cpuStart, const TrtCudaEvent& gpuStart,
               std::vector<InferenceTrace>& trace, bool skipTransfers) {
    for (int32_t d = 0; d < mDepth; ++d) {
      sync(cpuStart, gpuStart, trace, skipTransfers);
      moveNext();
    }
  }

  void wait(TrtCudaEvent& gpuStart) {
    getStream(StreamType::kINPUT).wait(gpuStart);
  }

  void setInputData() {
    mBindings.transferInputToDevice(getStream(StreamType::kINPUT));
  }

  void fetchOutputData() {
    mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT));
  }

 private:
  void moveNext() { mNext = mDepth - 1 - mNext; }

  TrtCudaStream& getStream(StreamType t) {
    return mStream[static_cast<int32_t>(t)];
  }

  TrtCudaEvent& getEvent(EventType t) {
    return *mEvents[mNext][static_cast<int32_t>(t)];
  }

  void record(EventType e, StreamType s) { getEvent(e).record(getStream(s)); }

  void recordEnqueueTime() {
    mEnqueueTimes[mNext][enqueueStart] = getCurrentTime();
    enqueueStart = 1 - enqueueStart;
  }

  TimePoint getEnqueueTime(bool start) {
    return mEnqueueTimes[mNext][start ? 0 : 1];
  }

  void wait(EventType e, StreamType s) { getStream(s).wait(getEvent(e)); }

  InferenceTrace getTrace(const TimePoint& cpuStart,
                          const TrtCudaEvent& gpuStart, bool skipTransfers) {
    float is = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart
                             : getEvent(EventType::kINPUT_S) - gpuStart;
    float ie = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart
                             : getEvent(EventType::kINPUT_E) - gpuStart;
    float os = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart
                             : getEvent(EventType::kOUTPUT_S) - gpuStart;
    float oe = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart
                             : getEvent(EventType::kOUTPUT_E) - gpuStart;

    return InferenceTrace(mStreamId,
                          std::chrono::duration<float, std::milli>(
                              getEnqueueTime(true) - cpuStart)
                              .count(),
                          std::chrono::duration<float, std::milli>(
                              getEnqueueTime(false) - cpuStart)
                              .count(),
                          is, ie, getEvent(EventType::kCOMPUTE_S) - gpuStart,
                          getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe);
  }

  void createEnqueueFunction(const InferenceOptions& inference,
                             nvinfer1::IExecutionContext& context,
                             Bindings& bindings) {
    if (inference.batch) {
      mEnqueue = EnqueueFunction(EnqueueImplicit(
          context, mBindings.getDeviceBuffers(), inference.batch));
    } else {
      mEnqueue = EnqueueFunction(
          EnqueueExplicit(context, mBindings.getDeviceBuffers()));
    }
    if (inference.graph) {
      TrtCudaStream& stream = getStream(StreamType::kCOMPUTE);
      // Avoid capturing initialization calls by executing the enqueue function
      // at least once before starting CUDA graph capture.
      const auto ret = mEnqueue(stream);
      assert(ret);
      stream.synchronize();

      mGraph.beginCapture(stream);
      // The built TRT engine may contain operations that are not permitted
      // under CUDA graph capture mode.
      // When the stream is capturing, the enqueue call may return false if the
      // current CUDA graph capture fails.
      if (mEnqueue(stream)) {
        mGraph.endCapture(stream);
        mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph));
      } else {
        mGraph.endCaptureOnError(stream);
        // Ensure any CUDA error has been cleaned up.
        cudaCheck(cudaGetLastError());
        sample::gLogWarning << "The built TensorRT engine contains operations "
                               "that are not permitted under "
                               "CUDA graph capture mode."
                            << std::endl;
        sample::gLogWarning << "The specified --useCudaGraph flag has been "
                               "ignored. The inference will be "
                               "launched without using CUDA graph launch."
                            << std::endl;
      }
    }
  }

  void createEnqueueFunction(const InferenceOptions&,
                             nvinfer1::safe::IExecutionContext& context,
                             Bindings&) {
    mEnqueue =
        EnqueueFunction(EnqueueSafe(context, mBindings.getDeviceBuffers()));
  }

  Bindings& mBindings;

  TrtCudaGraph mGraph;
  EnqueueFunction mEnqueue;

  int32_t mStreamId{0};
  int32_t mNext{0};
  int32_t mDepth{2}; // default to double buffer to hide DMA transfers

  std::vector<bool> mActive;
  MultiStream mStream;
  std::vector<MultiEvent> mEvents;

  int32_t enqueueStart{0};
  std::vector<EnqueueTimes> mEnqueueTimes;
  ContextType* mContext{nullptr};
};

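//!
//! \brief Run inference iterations on every stream until both the requested
//!        iteration count and the maximum duration have been reached;
//!        iterations that finish during the warmup period are not counted.
//!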
template <class ContextType>
bool inferenceLoop(
    std::vector<std::unique_ptr<Iteration<ContextType>>>& iStreams,
    const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, int iterations,
    float maxDurationMs, float warmupMs, std::vector<InferenceTrace>& trace,
    bool skipTransfers, float idleMs) {
  float durationMs = 0;
  int32_t skip = 0;

  for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs;
       ++i) {
    for (auto& s : iStreams) {
      if (!s->query(skipTransfers)) {
        return false;
      }
    }
    for (auto& s : iStreams) {
      durationMs = std::max(durationMs,
                            s->sync(cpuStart, gpuStart, trace, skipTransfers));
    }
    if (durationMs < warmupMs) // Warming up
    {
      if (durationMs) // Skip complete iterations
      {
        ++skip;
      }
      continue;
    }
    if (idleMs != 0.F) {
      std::this_thread::sleep_for(
          std::chrono::duration<float, std::milli>(idleMs));
    }
  }
  for (auto& s : iStreams) {
    s->syncAll(cpuStart, gpuStart, trace, skipTransfers);
  }
  return true;
}

template <class ContextType>
void inferenceExecution(const InferenceOptions& inference,
                        InferenceEnvironment& iEnv, SyncStruct& sync,
                        const int32_t threadIdx, const int32_t streamsPerThread,
                        int32_t device, std::vector<InferenceTrace>& trace) {
  float warmupMs = inference.warmup;
  float durationMs = inference.duration * 1000.F + warmupMs;

  cudaCheck(cudaSetDevice(device));

  std::vector<std::unique_ptr<Iteration<ContextType>>> iStreams;

  for (int32_t s = 0; s < streamsPerThread; ++s) {
    const int32_t streamId{threadIdx * streamsPerThread + s};
    auto* iteration = new Iteration<ContextType>(
        streamId, inference, *iEnv.template getContext<ContextType>(streamId),
        *iEnv.bindings[streamId]);
    if (inference.skipTransfers) {
      iteration->setInputData();
    }
    iStreams.emplace_back(iteration);
  }

  for (auto& s : iStreams) {
    s->wait(sync.gpuStart);
  }

  std::vector<InferenceTrace> localTrace;
  if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart,
                     inference.iterations, durationMs, warmupMs, localTrace,
                     inference.skipTransfers, inference.idle)) {
    iEnv.error = true;
  }

  if (inference.skipTransfers) {
    for (auto& s : iStreams) {
      s->fetchOutputData();
    }
  }

  sync.mutex.lock();
  trace.insert(trace.end(), localTrace.begin(), localTrace.end());
  sync.mutex.unlock();
}

inline std::thread makeThread(const InferenceOptions& inference,
                              InferenceEnvironment& iEnv, SyncStruct& sync,
                              int32_t threadIdx, int32_t streamsPerThread,
                              int32_t device,
                              std::vector<InferenceTrace>& trace) {
  if (iEnv.safe) {
    ASSERT(sample::hasSafeRuntime());
    return std::thread(inferenceExecution<nvinfer1::safe::IExecutionContext>,
                       std::cref(inference), std::ref(iEnv), std::ref(sync),
                       threadIdx, streamsPerThread, device, std::ref(trace));
  }

  return std::thread(inferenceExecution<nvinfer1::IExecutionContext>,
                     std::cref(inference), std::ref(iEnv), std::ref(sync),
                     threadIdx, streamsPerThread, device, std::ref(trace));
}

} // namespace

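//!
//! \brief Launch the inference threads, wait for them to finish, and collect
//!        the per-iteration traces sorted by start time. Returns false if any
//!        thread reported an error.
//!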
bool runInference(const InferenceOptions& inference, InferenceEnvironment& iEnv,
                  int32_t device, std::vector<InferenceTrace>& trace) {
  cudaCheck(cudaProfilerStart());

  trace.resize(0);

  SyncStruct sync;
  sync.sleep = inference.sleep;
  sync.mainStream.sleep(&sync.sleep);
  sync.cpuStart = getCurrentTime();
  sync.gpuStart.record(sync.mainStream);

  // When multiple streams are used, trtexec can run inference in two modes:
  // (1) if inference.threads is true, then run each stream on its own thread.
  // (2) if inference.threads is false, then run all streams on the same thread.
  const int32_t numThreads = inference.threads ? inference.streams : 1;
  const int32_t streamsPerThread = inference.threads ? 1 : inference.streams;

  std::vector<std::thread> threads;
  for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx) {
    threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx,
                                    streamsPerThread, device, trace));
  }
  for (auto& th : threads) {
    th.join();
  }

  cudaCheck(cudaProfilerStop());

  auto cmpTrace = [](const InferenceTrace& a, const InferenceTrace& b) {
    return a.h2dStart < b.h2dStart;
  };
  std::sort(trace.begin(), trace.end(), cmpTrace);

  return !iEnv.error;
}

namespace {
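//! Report the free and total GPU memory and return the number of bytes newly
//! allocated since the previous call.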
size_t reportGpuMemory() {
  static size_t prevFree{0};
  size_t free{0};
  size_t total{0};
  size_t newlyAllocated{0};
  cudaCheck(cudaMemGetInfo(&free, &total));
  sample::gLogInfo << "Free GPU memory = " << free / 1024.0_MiB << " GiB";
  if (prevFree != 0) {
    newlyAllocated = (prevFree - free);
    sample::gLogInfo << ", newly allocated GPU memory = "
                     << newlyAllocated / 1024.0_MiB << " GiB";
  }
  sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB"
                   << std::endl;
  prevFree = free;
  return newlyAllocated;
}
} // namespace

//! Returns true if deserialization is slower than expected or fails.
bool timeDeserialize(InferenceEnvironment& iEnv) {
  constexpr int32_t kNB_ITERS{20};
  std::unique_ptr<IRuntime> rt{
      createInferRuntime(sample::gLogger.getTRTLogger())};
  std::unique_ptr<ICudaEngine> engine;

  std::unique_ptr<safe::IRuntime> safeRT{
      sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())};
  std::unique_ptr<safe::ICudaEngine> safeEngine;

  if (iEnv.safe) {
    ASSERT(sample::hasSafeRuntime() && safeRT != nullptr);
    safeRT->setErrorRecorder(&gRecorder);
  }

  auto timeDeserializeFn = [&]() -> float {
    bool deserializeOK{false};
    engine.reset(nullptr);
    safeEngine.reset(nullptr);
    auto startClock = std::chrono::high_resolution_clock::now();
    if (iEnv.safe) {
      safeEngine.reset(safeRT->deserializeCudaEngine(iEnv.engineBlob.data(),
                                                     iEnv.engineBlob.size()));
      deserializeOK = (safeEngine != nullptr);
    } else {
      engine.reset(rt->deserializeCudaEngine(iEnv.engineBlob.data(),
                                             iEnv.engineBlob.size(), nullptr));
      deserializeOK = (engine != nullptr);
    }
    auto endClock = std::chrono::high_resolution_clock::now();
    // Return NAN if deserialization failed.
    return deserializeOK
               ? std::chrono::duration<float, std::milli>(endClock - startClock)
                     .count()
               : NAN;
  };

  // Warm up the caches to make sure that cache thrashing isn't throwing off
  // the results.
  {
    sample::gLogInfo << "Begin deserialization warmup..." << std::endl;
    for (int32_t i = 0, e = 2; i < e; ++i) {
      timeDeserializeFn();
    }
  }
  sample::gLogInfo << "Begin deserialization engine timing..." << std::endl;
  float const first = timeDeserializeFn();

  // Check if the first deserialization succeeded.
  if (std::isnan(first)) {
    sample::gLogError << "Engine deserialization failed." << std::endl;
    return true;
  }

  sample::gLogInfo << "First deserialization time = " << first
                   << " milliseconds" << std::endl;

  // Record the initial GPU memory state.
  reportGpuMemory();

  float totalTime{0.F};
  for (int32_t i = 0; i < kNB_ITERS; ++i) {
    totalTime += timeDeserializeFn();
  }
  const auto averageTime = totalTime / kNB_ITERS;
  // reportGpuMemory sometimes reports zero after a single deserialization of a
  // small engine, so use the memory allocated across all the iterations.
  const auto totalEngineSizeGpu = reportGpuMemory();
  sample::gLogInfo << "Total deserialization time = " << totalTime
                   << " milliseconds in " << kNB_ITERS
                   << " iterations, average time = " << averageTime
                   << " milliseconds, first time = " << first
                   << " milliseconds." << std::endl;
  sample::gLogInfo << "Deserialization Bandwidth = "
                   << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s"
                   << std::endl;

  // If the first deserialization is more than the tolerance factor slower than
  // the average deserialization, return true, which means an error occurred.
  // The tolerance is set to 2x since deserialization is quick and the first
  // timing is susceptible to caching effects.
  const auto tolerance = 2.0F;
  const bool isSlowerThanExpected = first > averageTime * tolerance;
  if (isSlowerThanExpected) {
    sample::gLogInfo << "First deserialization time divided by average time is "
                     << (first / averageTime) << ". Exceeds tolerance of "
                     << tolerance << "x." << std::endl;
  }
  return isSlowerThanExpected;
}

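//! Return the engine/layer information in the requested format, attaching the
//! first execution context (if any) to the engine inspector.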
std::string getLayerInformation(const InferenceEnvironment& iEnv,
                                nvinfer1::LayerInformationFormat format) {
  auto runtime = std::unique_ptr<IRuntime>(
      createInferRuntime(sample::gLogger.getTRTLogger()));
  auto inspector =
      std::unique_ptr<IEngineInspector>(iEnv.engine->createEngineInspector());
  if (!iEnv.context.empty()) {
    inspector->setExecutionContext(iEnv.context.front().get());
  }
  std::string result = inspector->getEngineInformation(format);
  return result;
}

} // namespace sample