/*
 * Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>
#include <array>
#include <chrono>
#include <cuda_profiler_api.h>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <numeric>
#include <thread>
#include <utility>
#include <vector>

#if defined(__QNX__)
#include <sys/neutrino.h>
#include <sys/syspage.h>
#endif

#include "NvInfer.h"

#include "ErrorRecorder.h"
#include "logger.h"
#include "sampleDevice.h"
#include "sampleEngines.h"
#include "sampleInference.h"
#include "sampleOptions.h"
#include "sampleReporting.h"
#include "sampleUtils.h"

namespace sample {

template <class MapType, class EngineType>
bool validateTensorNames(const MapType& map, const EngineType* engine,
                         const int32_t endBindingIndex) {
  // Check that the provided input tensor names match the input tensors of
  // the engine. Report an error if a provided name cannot be found, because
  // that implies a potential typo.
  for (const auto& item : map) {
    bool tensorNameFound{false};
    for (int32_t b = 0; b < endBindingIndex; ++b) {
      if (engine->bindingIsInput(b) &&
          engine->getBindingName(b) == item.first) {
        tensorNameFound = true;
        break;
      }
    }
    if (!tensorNameFound) {
      sample::gLogError << "Cannot find input tensor with name \"" << item.first
                        << "\" in the engine bindings! "
                        << "Please make sure the input tensor names are correct."
                        << std::endl;
      return false;
    }
  }
  return true;
}

template <class EngineType, class ContextType> class FillBindingClosure {
private:
  using InputsMap = std::unordered_map<std::string, std::string>;
  using BindingsVector = std::vector<std::unique_ptr<Bindings>>;

  EngineType const* engine;
  ContextType const* context;
  InputsMap const& inputs;
  BindingsVector& bindings;
  int32_t batch;
  int32_t endBindingIndex;

  void fillOneBinding(int32_t bindingIndex, int64_t vol) {
    auto const dims = getDims(bindingIndex);
    auto const name = engine->getBindingName(bindingIndex);
    auto const isInput = engine->bindingIsInput(bindingIndex);
    auto const dataType = engine->getBindingDataType(bindingIndex);
    auto const* bindingInOutStr = isInput ? "input" : "output";
    for (auto& binding : bindings) {
      const auto input = inputs.find(name);
      if (isInput && input != inputs.end()) {
        sample::gLogInfo << "Using values loaded from " << input->second
                         << " for input " << name << std::endl;
        binding->addBinding(bindingIndex, name, isInput, vol, dataType,
                            input->second);
      } else {
        sample::gLogInfo << "Using random values for " << bindingInOutStr << " "
                         << name << std::endl;
        binding->addBinding(bindingIndex, name, isInput, vol, dataType);
      }
      sample::gLogInfo << "Created " << bindingInOutStr << " binding for "
                       << name << " with dimensions " << dims << std::endl;
    }
  }

  bool fillAllBindings(int32_t batch, int32_t endBindingIndex) {
    if (!validateTensorNames(inputs, engine, endBindingIndex)) {
      sample::gLogError << "Invalid tensor names found in --loadInputs flag."
                        << std::endl;
      return false;
    }

    for (int32_t b = 0; b < endBindingIndex; b++) {
      auto const dims = getDims(b);
      auto const comps = engine->getBindingComponentsPerElement(b);
      auto const strides = context->getStrides(b);
      int32_t const vectorDimIndex = engine->getBindingVectorizedDim(b);
      auto const vol = volume(dims, strides, vectorDimIndex, comps, batch);
      fillOneBinding(b, vol);
    }
    return true;
  }

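  // getDims() is specialized below: the standard runtime reports binding
  // dimensions through the execution context, while the safe runtime reports
  // them through the engine.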
  Dims getDims(int32_t bindingIndex);

public:
  FillBindingClosure(EngineType const* _engine, ContextType const* _context,
                     InputsMap const& _inputs, BindingsVector& _bindings,
                     int32_t _batch, int32_t _endBindingIndex)
      : engine(_engine), context(_context), inputs(_inputs),
        bindings(_bindings), batch(_batch), endBindingIndex(_endBindingIndex) {}

  bool operator()() { return fillAllBindings(batch, endBindingIndex); }
};

template <>
Dims FillBindingClosure<nvinfer1::ICudaEngine, nvinfer1::IExecutionContext>::
    getDims(int32_t bindingIndex) {
  return context->getBindingDimensions(bindingIndex);
}

template <>
Dims FillBindingClosure<
    nvinfer1::safe::ICudaEngine,
    nvinfer1::safe::IExecutionContext>::getDims(int32_t bindingIndex) {
  return engine->getBindingDimensions(bindingIndex);
}

bool setUpInference(InferenceEnvironment& iEnv,
                    const InferenceOptions& inference) {
  int32_t device{};
  cudaCheck(cudaGetDevice(&device));

  cudaDeviceProp properties;
  cudaCheck(cudaGetDeviceProperties(&properties, device));
  // Use managed memory on integrated devices when transfers are skipped,
  // or when it is explicitly requested on the command line.
  bool useManagedMemory{(inference.skipTransfers && properties.integrated) ||
                        inference.useManaged};
  using FillSafeBindings =
      FillBindingClosure<nvinfer1::safe::ICudaEngine,
                         nvinfer1::safe::IExecutionContext>;
  if (iEnv.safe) {
    ASSERT(sample::hasSafeRuntime());
    auto* safeEngine = iEnv.safeEngine.get();
    for (int32_t s = 0; s < inference.streams; ++s) {
      iEnv.safeContext.emplace_back(safeEngine->createExecutionContext());
      iEnv.bindings.emplace_back(new Bindings(useManagedMemory));
    }
    const int32_t nBindings = safeEngine->getNbBindings();
    auto const* safeContext = iEnv.safeContext.front().get();
    // batch is set to 1 because the safe runtime only supports explicit batch.
    return FillSafeBindings(iEnv.safeEngine.get(), safeContext,
                            inference.inputs, iEnv.bindings, 1, nBindings)();
  }

  using FillStdBindings =
      FillBindingClosure<nvinfer1::ICudaEngine, nvinfer1::IExecutionContext>;

  for (int32_t s = 0; s < inference.streams; ++s) {
    auto ec = iEnv.engine->createExecutionContext();
    if (ec == nullptr) {
      sample::gLogError << "Unable to create execution context for stream " << s
                        << "." << std::endl;
      return false;
    }
    iEnv.context.emplace_back(ec);
    iEnv.bindings.emplace_back(new Bindings(useManagedMemory));
  }
  if (iEnv.profiler) {
    iEnv.context.front()->setProfiler(iEnv.profiler.get());
    // Always run reportToProfiler() after enqueue launch.
    iEnv.context.front()->setEnqueueEmitsProfile(false);
  }

  const int32_t nOptProfiles = iEnv.engine->getNbOptimizationProfiles();
  const int32_t nBindings = iEnv.engine->getNbBindings();
  const int32_t bindingsInProfile =
      nOptProfiles > 0 ? nBindings / nOptProfiles : 0;
  const int32_t endBindingIndex =
      bindingsInProfile ? bindingsInProfile : iEnv.engine->getNbBindings();
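
  // Note: in this TensorRT version, getNbBindings() counts bindings across
  // all optimization profiles (each profile carries its own copy of every
  // binding), so dividing by the profile count yields the bindings of the
  // first profile only.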

  if (nOptProfiles > 1) {
    sample::gLogWarning << "Multiple profiles are currently not supported. "
                           "Running with one profile."
                        << std::endl;
  }

  // Make sure that the tensor names provided in command-line args actually
  // exist in the engine bindings, to avoid silent typos.
  if (!validateTensorNames(inference.shapes, iEnv.engine.get(),
                           endBindingIndex)) {
    sample::gLogError << "Invalid tensor names found in --shapes flag."
                      << std::endl;
    return false;
  }

  // Set all input dimensions before any bindings can be allocated.
  for (int32_t b = 0; b < endBindingIndex; ++b) {
    if (iEnv.engine->bindingIsInput(b)) {
      auto dims = iEnv.context.front()->getBindingDimensions(b);
      const bool isScalar = dims.nbDims == 0;
      const bool isDynamicInput =
          std::any_of(dims.d, dims.d + dims.nbDims,
                      [](int32_t dim) { return dim == -1; }) ||
          iEnv.engine->isShapeBinding(b);
      if (isDynamicInput) {
        auto shape = inference.shapes.find(iEnv.engine->getBindingName(b));

        std::vector<int32_t> staticDims;
        if (shape == inference.shapes.end()) {
          // If no shape is provided, set dynamic dimensions to 1.
          constexpr int32_t DEFAULT_DIMENSION = 1;
          if (iEnv.engine->isShapeBinding(b)) {
            if (isScalar) {
              staticDims.push_back(1);
            } else {
              staticDims.resize(dims.d[0]);
              std::fill(staticDims.begin(), staticDims.end(),
                        DEFAULT_DIMENSION);
            }
          } else {
            staticDims.resize(dims.nbDims);
            std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(),
                           [&](int32_t dimension) {
                             return dimension >= 0 ? dimension
                                                   : DEFAULT_DIMENSION;
                           });
          }
          sample::gLogWarning << "Dynamic dimensions required for input: "
                              << iEnv.engine->getBindingName(b)
                              << ", but no shapes were provided. Automatically "
                                 "overriding shape to: "
                              << staticDims << std::endl;
        } else if (inference.inputs.count(shape->first) &&
                   iEnv.engine->isShapeBinding(b)) {
          if (isScalar || dims.nbDims == 1) {
            // Load the shape tensor from a file.
            size_t const size = isScalar ? 1 : dims.d[0];
            staticDims.resize(size);
            auto const& filename = inference.inputs.at(shape->first);
            auto dst = reinterpret_cast<char*>(staticDims.data());
            loadFromFile(filename, dst,
                         size * sizeof(decltype(staticDims)::value_type));
          } else {
            sample::gLogWarning << "Cannot load shape tensor " << shape->first
                                << " from file; "
                                << "N-dimensional shape tensors are not "
                                   "supported yet." << std::endl;
            // Fall back to the dimensions provided on the command line.
            staticDims = shape->second;
          }
        } else {
          staticDims = shape->second;
        }

        for (auto& c : iEnv.context) {
          if (iEnv.engine->isShapeBinding(b)) {
            if (!c->setInputShapeBinding(b, staticDims.data())) {
              return false;
            }
          } else {
            if (!c->setBindingDimensions(b, toDims(staticDims))) {
              return false;
            }
          }
        }
      }
    }
  }

  auto* engine = iEnv.engine.get();
  auto const* context = iEnv.context.front().get();
  int32_t const batch =
      engine->hasImplicitBatchDimension() ? inference.batch : 1;
  return FillStdBindings(engine, context, inference.inputs, iEnv.bindings,
                         batch, endBindingIndex)();
}

namespace {

#if defined(__QNX__)
using TimePoint = double;
#else
using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
#endif

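// On QNX, TimePoint is a plain double holding milliseconds derived from raw
// ClockCycles() readings (see getCurrentTime() below); elsewhere it is a
// std::chrono time_point. Both forms support the subtraction arithmetic used
// in Iteration::getTrace().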
TimePoint getCurrentTime() {
#if defined(__QNX__)
  uint64_t const currentCycles = ClockCycles();
  uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec;
  // Return the current timestamp in ms.
  return static_cast<TimePoint>(currentCycles) * 1000. / cyclesPerSecond;
#else
  return std::chrono::high_resolution_clock::now();
#endif
}

//!
//! \struct SyncStruct
//! \brief Thread synchronization structure
//!
struct SyncStruct {
  std::mutex mutex;
  TrtCudaStream mainStream;
  TrtCudaEvent gpuStart{cudaEventBlockingSync};
  TimePoint cpuStart{};
  float sleep{};
};

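// Common state for the enqueue functors below: the execution context to
// launch and the array of device buffer pointers to bind.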
struct Enqueue {
  explicit Enqueue(nvinfer1::IExecutionContext& context, void** buffers)
      : mContext(context), mBuffers(buffers) {}

  nvinfer1::IExecutionContext& mContext;
  void** mBuffers{};
};

//!
//! \class EnqueueImplicit
//! \brief Functor to enqueue inference with implicit batch
//!
class EnqueueImplicit : private Enqueue {
public:
  explicit EnqueueImplicit(nvinfer1::IExecutionContext& context, void** buffers,
                           int32_t batch)
      : Enqueue(context, buffers), mBatch(batch) {}

  bool operator()(TrtCudaStream& stream) const {
    if (mContext.enqueue(mBatch, mBuffers, stream.get(), nullptr)) {
      // Collect layer timing info from the current profile index of the
      // execution context.
      if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() &&
          !mContext.reportToProfiler()) {
        gLogWarning
            << "Failed to collect layer timing info from previous enqueue()"
            << std::endl;
      }
      return true;
    }
    return false;
  }

private:
  int32_t mBatch;
};

//!
//! \class EnqueueExplicit
//! \brief Functor to enqueue inference with explicit batch
//!
class EnqueueExplicit : private Enqueue {
public:
  explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, void** buffers)
      : Enqueue(context, buffers) {}

  bool operator()(TrtCudaStream& stream) const {
    if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) {
      // Collect layer timing info from the current profile index of the
      // execution context.
      if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() &&
          !mContext.reportToProfiler()) {
        gLogWarning
            << "Failed to collect layer timing info from previous enqueueV2()"
            << std::endl;
      }
      return true;
    }
    return false;
  }
};

//!
//! \class EnqueueGraph
//! \brief Functor to enqueue inference from a CUDA graph
//!
class EnqueueGraph {
public:
  explicit EnqueueGraph(nvinfer1::IExecutionContext& context,
                        TrtCudaGraph& graph)
      : mGraph(graph), mContext(context) {}

  bool operator()(TrtCudaStream& stream) const {
    if (mGraph.launch(stream)) {
      // Collect layer timing info from the current profile index of the
      // execution context.
      if (mContext.getProfiler() && !mContext.reportToProfiler()) {
        gLogWarning << "Failed to collect layer timing info from previous CUDA "
                       "graph launch"
                    << std::endl;
      }
      return true;
    }
    return false;
  }

  TrtCudaGraph& mGraph;
  nvinfer1::IExecutionContext& mContext;
};

//!
//! \class EnqueueSafe
//! \brief Functor to enqueue a safe execution context
//!
class EnqueueSafe {
public:
  explicit EnqueueSafe(nvinfer1::safe::IExecutionContext& context,
                       void** buffers)
      : mContext(context), mBuffers(buffers) {}

  bool operator()(TrtCudaStream& stream) const {
    if (mContext.enqueueV2(mBuffers, stream.get(), nullptr)) {
      return true;
    }
    return false;
  }

  nvinfer1::safe::IExecutionContext& mContext;
  void** mBuffers{};
};

using EnqueueFunction = std::function<bool(TrtCudaStream&)>;

enum class StreamType : int32_t {
  kINPUT = 0,
  kCOMPUTE = 1,
  kOUTPUT = 2,
  kNUM = 3
};

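// The _S/_E suffixes mark the start and end events recorded around each
// stage (input DMA, compute, output DMA) on its corresponding stream.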
enum class EventType : int32_t {
  kINPUT_S = 0,
  kINPUT_E = 1,
  kCOMPUTE_S = 2,
  kCOMPUTE_E = 3,
  kOUTPUT_S = 4,
  kOUTPUT_E = 5,
  kNUM = 6
};

using MultiStream =
    std::array<TrtCudaStream, static_cast<int32_t>(StreamType::kNUM)>;

using MultiEvent = std::array<std::unique_ptr<TrtCudaEvent>,
                              static_cast<int32_t>(EventType::kNUM)>;

using EnqueueTimes = std::array<TimePoint, 2>;

//!
//! \class Iteration
//! \brief Inference iteration and stream management
//!
template <class ContextType> class Iteration {
public:
  Iteration(int32_t id, const InferenceOptions& inference, ContextType& context,
            Bindings& bindings)
      : mBindings(bindings), mStreamId(id), mDepth(1 + inference.overlap),
        mActive(mDepth), mEvents(mDepth), mEnqueueTimes(mDepth),
        mContext(&context) {
    for (int32_t d = 0; d < mDepth; ++d) {
      for (int32_t e = 0; e < static_cast<int32_t>(EventType::kNUM); ++e) {
        mEvents[d][e].reset(new TrtCudaEvent(!inference.spin));
      }
    }
    createEnqueueFunction(inference, context, bindings);
  }

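  // Enqueue one iteration asynchronously: record input/compute/output events,
  // launch the H2D copy, the inference, and the D2H copy on their streams,
  // then return without waiting. Completion is observed later in sync().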
  bool query(bool skipTransfers) {
    if (mActive[mNext]) {
      return true;
    }

    if (!skipTransfers) {
      record(EventType::kINPUT_S, StreamType::kINPUT);
      mBindings.transferInputToDevice(getStream(StreamType::kINPUT));
      record(EventType::kINPUT_E, StreamType::kINPUT);
      wait(EventType::kINPUT_E,
           StreamType::kCOMPUTE); // Wait for input DMA before compute
    }

    record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE);
    recordEnqueueTime();
    if (!mEnqueue(getStream(StreamType::kCOMPUTE))) {
      return false;
    }
    recordEnqueueTime();
    record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE);

    if (!skipTransfers) {
      wait(EventType::kCOMPUTE_E,
           StreamType::kOUTPUT); // Wait for compute before output DMA
      record(EventType::kOUTPUT_S, StreamType::kOUTPUT);
      mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT));
      record(EventType::kOUTPUT_E, StreamType::kOUTPUT);
    }

    mActive[mNext] = true;
    moveNext();
    return true;
  }

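  // Wait for the in-flight iteration (if any) to finish, append its timing
  // trace, and return the elapsed GPU time in ms from gpuStart to this
  // iteration's compute start (used by the caller to track total duration).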
  float sync(const TimePoint& cpuStart, const TrtCudaEvent& gpuStart,
             std::vector<InferenceTrace>& trace, bool skipTransfers) {
    if (mActive[mNext]) {
      if (skipTransfers) {
        getEvent(EventType::kCOMPUTE_E).synchronize();
      } else {
        getEvent(EventType::kOUTPUT_E).synchronize();
      }
      trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers));
      mActive[mNext] = false;
      return getEvent(EventType::kCOMPUTE_S) - gpuStart;
    }
    return 0;
  }

  void syncAll(const TimePoint& cpuStart, const TrtCudaEvent& gpuStart,
               std::vector<InferenceTrace>& trace, bool skipTransfers) {
    for (int32_t d = 0; d < mDepth; ++d) {
      sync(cpuStart, gpuStart, trace, skipTransfers);
      moveNext();
    }
  }

  void wait(TrtCudaEvent& gpuStart) {
    getStream(StreamType::kINPUT).wait(gpuStart);
  }

  void setInputData() {
    mBindings.transferInputToDevice(getStream(StreamType::kINPUT));
  }

  void fetchOutputData() {
    mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT));
  }

private:
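  // Ping-pong between the (at most two) buffered iterations; when
  // inference.overlap is false, mDepth == 1 and this is a no-op.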
  void moveNext() { mNext = mDepth - 1 - mNext; }

  TrtCudaStream& getStream(StreamType t) {
    return mStream[static_cast<int32_t>(t)];
  }

  TrtCudaEvent& getEvent(EventType t) {
    return *mEvents[mNext][static_cast<int32_t>(t)];
  }

  void record(EventType e, StreamType s) { getEvent(e).record(getStream(s)); }

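  // Called twice per query(): the first call stores the enqueue start time in
  // slot 0, the second stores the end time in slot 1 (enqueueStart alternates
  // between the two slots).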
  void recordEnqueueTime() {
    mEnqueueTimes[mNext][enqueueStart] = getCurrentTime();
    enqueueStart = 1 - enqueueStart;
  }

  TimePoint getEnqueueTime(bool start) {
    return mEnqueueTimes[mNext][start ? 0 : 1];
  }

  void wait(EventType e, StreamType s) { getStream(s).wait(getEvent(e)); }

  InferenceTrace getTrace(const TimePoint& cpuStart,
                          const TrtCudaEvent& gpuStart, bool skipTransfers) {
    float is = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart
                             : getEvent(EventType::kINPUT_S) - gpuStart;
    float ie = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart
                             : getEvent(EventType::kINPUT_E) - gpuStart;
    float os = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart
                             : getEvent(EventType::kOUTPUT_S) - gpuStart;
    float oe = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart
                             : getEvent(EventType::kOUTPUT_E) - gpuStart;

    return InferenceTrace(mStreamId,
                          std::chrono::duration<float, std::milli>(
                              getEnqueueTime(true) - cpuStart)
                              .count(),
                          std::chrono::duration<float, std::milli>(
                              getEnqueueTime(false) - cpuStart)
                              .count(),
                          is, ie, getEvent(EventType::kCOMPUTE_S) - gpuStart,
                          getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe);
  }

  void createEnqueueFunction(const InferenceOptions& inference,
                             nvinfer1::IExecutionContext& context,
                             Bindings& bindings) {
    if (inference.batch) {
      mEnqueue = EnqueueFunction(EnqueueImplicit(
          context, mBindings.getDeviceBuffers(), inference.batch));
    } else {
      mEnqueue = EnqueueFunction(
          EnqueueExplicit(context, mBindings.getDeviceBuffers()));
    }
    if (inference.graph) {
      TrtCudaStream& stream = getStream(StreamType::kCOMPUTE);
      // Avoid capturing initialization calls by executing the enqueue
      // function at least once before starting CUDA graph capture.
      const auto ret = mEnqueue(stream);
      assert(ret);
      stream.synchronize();

      mGraph.beginCapture(stream);
      // The built TRT engine may contain operations that are not permitted
      // under CUDA graph capture mode. When the stream is capturing, the
      // enqueue call may return false if the current CUDA graph capture
      // fails.
      if (mEnqueue(stream)) {
        mGraph.endCapture(stream);
        mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph));
      } else {
        mGraph.endCaptureOnError(stream);
        // Ensure any CUDA error has been cleaned up.
        cudaCheck(cudaGetLastError());
        sample::gLogWarning << "The built TensorRT engine contains operations "
                               "that are not permitted under "
                               "CUDA graph capture mode."
                            << std::endl;
        sample::gLogWarning << "The specified --useCudaGraph flag has been "
                               "ignored. The inference will be "
                               "launched without using CUDA graph launch."
                            << std::endl;
      }
    }
  }

  void createEnqueueFunction(const InferenceOptions&,
                             nvinfer1::safe::IExecutionContext& context,
                             Bindings&) {
    mEnqueue =
        EnqueueFunction(EnqueueSafe(context, mBindings.getDeviceBuffers()));
  }

  Bindings& mBindings;

  TrtCudaGraph mGraph;
  EnqueueFunction mEnqueue;

  int32_t mStreamId{0};
  int32_t mNext{0};
  int32_t mDepth{2}; // default to double buffering to hide DMA transfers

  std::vector<bool> mActive;
  MultiStream mStream;
  std::vector<MultiEvent> mEvents;

  int32_t enqueueStart{0};
  std::vector<EnqueueTimes> mEnqueueTimes;
  ContextType* mContext{nullptr};
};

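// Drive all streams until both the requested iteration count and the maximum
// duration are satisfied. Iterations completed during the warmup window are
// excluded from the requested count via the `skip` counter.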
template <class ContextType>
bool inferenceLoop(
    std::vector<std::unique_ptr<Iteration<ContextType>>>& iStreams,
    const TimePoint& cpuStart, const TrtCudaEvent& gpuStart, int iterations,
    float maxDurationMs, float warmupMs, std::vector<InferenceTrace>& trace,
    bool skipTransfers, float idleMs) {
  float durationMs = 0;
  int32_t skip = 0;

  for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs;
       ++i) {
    for (auto& s : iStreams) {
      if (!s->query(skipTransfers)) {
        return false;
      }
    }
    for (auto& s : iStreams) {
      durationMs = std::max(durationMs,
                            s->sync(cpuStart, gpuStart, trace, skipTransfers));
    }
    if (durationMs < warmupMs) // Warming up
    {
      if (durationMs) // Skip complete iterations
      {
        ++skip;
      }
      continue;
    }
    if (idleMs != 0.F) {
      std::this_thread::sleep_for(
          std::chrono::duration<float, std::milli>(idleMs));
    }
  }
  for (auto& s : iStreams) {
    s->syncAll(cpuStart, gpuStart, trace, skipTransfers);
  }
  return true;
}

template <class ContextType>
void inferenceExecution(const InferenceOptions& inference,
                        InferenceEnvironment& iEnv, SyncStruct& sync,
                        const int32_t threadIdx, const int32_t streamsPerThread,
                        int32_t device, std::vector<InferenceTrace>& trace) {
  float warmupMs = inference.warmup;
  float durationMs = inference.duration * 1000.F + warmupMs;

  cudaCheck(cudaSetDevice(device));

  std::vector<std::unique_ptr<Iteration<ContextType>>> iStreams;

  for (int32_t s = 0; s < streamsPerThread; ++s) {
    const int32_t streamId{threadIdx * streamsPerThread + s};
    auto* iteration = new Iteration<ContextType>(
        streamId, inference, *iEnv.template getContext<ContextType>(streamId),
        *iEnv.bindings[streamId]);
    if (inference.skipTransfers) {
      iteration->setInputData();
    }
    iStreams.emplace_back(iteration);
  }

  for (auto& s : iStreams) {
    s->wait(sync.gpuStart);
  }

  std::vector<InferenceTrace> localTrace;
  if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart,
                     inference.iterations, durationMs, warmupMs, localTrace,
                     inference.skipTransfers, inference.idle)) {
    iEnv.error = true;
  }

  if (inference.skipTransfers) {
    for (auto& s : iStreams) {
      s->fetchOutputData();
    }
  }

  sync.mutex.lock();
  trace.insert(trace.end(), localTrace.begin(), localTrace.end());
  sync.mutex.unlock();
}

inline std::thread makeThread(const InferenceOptions& inference,
                              InferenceEnvironment& iEnv, SyncStruct& sync,
                              int32_t threadIdx, int32_t streamsPerThread,
                              int32_t device,
                              std::vector<InferenceTrace>& trace) {
  if (iEnv.safe) {
    ASSERT(sample::hasSafeRuntime());
    return std::thread(inferenceExecution<nvinfer1::safe::IExecutionContext>,
                       std::cref(inference), std::ref(iEnv), std::ref(sync),
                       threadIdx, streamsPerThread, device, std::ref(trace));
  }

  return std::thread(inferenceExecution<nvinfer1::IExecutionContext>,
                     std::cref(inference), std::ref(iEnv), std::ref(sync),
                     threadIdx, streamsPerThread, device, std::ref(trace));
}

} // namespace

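// Entry point for the timed inference run. A typical caller (e.g. trtexec)
// first calls setUpInference() to create contexts and bindings, then
// runInference() to execute and collect the per-iteration trace (a usage
// sketch inferred from this file, not a binding API contract).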
bool runInference(const InferenceOptions& inference, InferenceEnvironment& iEnv,
                  int32_t device, std::vector<InferenceTrace>& trace) {
  cudaCheck(cudaProfilerStart());

  trace.resize(0);

  SyncStruct sync;
  sync.sleep = inference.sleep;
  sync.mainStream.sleep(&sync.sleep);
  sync.cpuStart = getCurrentTime();
  sync.gpuStart.record(sync.mainStream);

  // When multiple streams are used, trtexec can run inference in two modes:
  // (1) if inference.threads is true, run each stream on its own thread;
  // (2) if inference.threads is false, run all streams on the same thread.
  const int32_t numThreads = inference.threads ? inference.streams : 1;
  const int32_t streamsPerThread = inference.threads ? 1 : inference.streams;

  std::vector<std::thread> threads;
  for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx) {
    threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx,
                                    streamsPerThread, device, trace));
  }
  for (auto& th : threads) {
    th.join();
  }

  cudaCheck(cudaProfilerStop());

  auto cmpTrace = [](const InferenceTrace& a, const InferenceTrace& b) {
    return a.h2dStart < b.h2dStart;
  };
  std::sort(trace.begin(), trace.end(), cmpTrace);

  return !iEnv.error;
}

namespace {
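// Report free/total GPU memory and return the number of bytes newly allocated
// since the previous call (tracked via a static snapshot of the free amount).
// The 1024.0_MiB user-defined literal comes from the sample utilities;
// dividing a byte count by it evidently yields GiB.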
size_t reportGpuMemory() {
  static size_t prevFree{0};
  size_t free{0};
  size_t total{0};
  size_t newlyAllocated{0};
  cudaCheck(cudaMemGetInfo(&free, &total));
  sample::gLogInfo << "Free GPU memory = " << free / 1024.0_MiB << " GiB";
  if (prevFree != 0) {
    newlyAllocated = (prevFree - free);
    sample::gLogInfo << ", newly allocated GPU memory = "
                     << newlyAllocated / 1024.0_MiB << " GiB";
  }
  sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB"
                   << std::endl;
  prevFree = free;
  return newlyAllocated;
}
} // namespace

//! Returns true if deserialization is slower than expected or fails.
bool timeDeserialize(InferenceEnvironment& iEnv) {
  constexpr int32_t kNB_ITERS{20};
  std::unique_ptr<IRuntime> rt{
      createInferRuntime(sample::gLogger.getTRTLogger())};
  std::unique_ptr<ICudaEngine> engine;

  std::unique_ptr<safe::IRuntime> safeRT{
      sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())};
  std::unique_ptr<safe::ICudaEngine> safeEngine;

  if (iEnv.safe) {
    ASSERT(sample::hasSafeRuntime() && safeRT != nullptr);
    safeRT->setErrorRecorder(&gRecorder);
  }

  auto timeDeserializeFn = [&]() -> float {
    bool deserializeOK{false};
    engine.reset(nullptr);
    safeEngine.reset(nullptr);
    auto startClock = std::chrono::high_resolution_clock::now();
    if (iEnv.safe) {
      safeEngine.reset(safeRT->deserializeCudaEngine(iEnv.engineBlob.data(),
                                                     iEnv.engineBlob.size()));
      deserializeOK = (safeEngine != nullptr);
    } else {
      engine.reset(rt->deserializeCudaEngine(iEnv.engineBlob.data(),
                                             iEnv.engineBlob.size(), nullptr));
      deserializeOK = (engine != nullptr);
    }
    auto endClock = std::chrono::high_resolution_clock::now();
    // Return NAN if deserialization failed.
    return deserializeOK
               ? std::chrono::duration<float, std::milli>(endClock - startClock)
                     .count()
               : NAN;
  };

  // Warm up the caches to make sure that cache thrashing isn't throwing off
  // the results.
  {
    sample::gLogInfo << "Begin deserialization warmup..." << std::endl;
    for (int32_t i = 0, e = 2; i < e; ++i) {
      timeDeserializeFn();
    }
  }
  sample::gLogInfo << "Begin deserialization engine timing..." << std::endl;
  float const first = timeDeserializeFn();

  // Check if the first deserialization succeeded.
  if (std::isnan(first)) {
    sample::gLogError << "Engine deserialization failed." << std::endl;
    return true;
  }

  sample::gLogInfo << "First deserialization time = " << first
                   << " milliseconds" << std::endl;

  // Record the initial GPU memory state.
  reportGpuMemory();

  float totalTime{0.F};
  for (int32_t i = 0; i < kNB_ITERS; ++i) {
    totalTime += timeDeserializeFn();
  }
  const auto averageTime = totalTime / kNB_ITERS;
  // reportGpuMemory sometimes reports zero after a single deserialization of
  // a small engine, so use the memory consumed across all the iterations.
  const auto totalEngineSizeGpu = reportGpuMemory();
  sample::gLogInfo << "Total deserialization time = " << totalTime
                   << " milliseconds in " << kNB_ITERS
                   << " iterations, average time = " << averageTime
                   << " milliseconds, first time = " << first
                   << " milliseconds." << std::endl;
  sample::gLogInfo << "Deserialization Bandwidth = "
                   << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s"
                   << std::endl;

  // If the first deserialization is more than `tolerance` times slower than
  // the average deserialization, return true, which means an error occurred.
  // The tolerance is set to 2x since deserialization is quick and the first
  // timing is susceptible to caching effects.
  const auto tolerance = 2.0F;
  const bool isSlowerThanExpected = first > averageTime * tolerance;
  if (isSlowerThanExpected) {
    sample::gLogInfo << "First deserialization time divided by average time is "
                     << (first / averageTime) << ". Exceeds tolerance of "
                     << tolerance << "x." << std::endl;
  }
  return isSlowerThanExpected;
}

std::string getLayerInformation(const InferenceEnvironment& iEnv,
                                nvinfer1::LayerInformationFormat format) {
  auto runtime = std::unique_ptr<IRuntime>(
      createInferRuntime(sample::gLogger.getTRTLogger()));
  auto inspector =
      std::unique_ptr<IEngineInspector>(iEnv.engine->createEngineInspector());
  if (!iEnv.context.empty()) {
    inspector->setExecutionContext(iEnv.context.front().get());
  }
  std::string result = inspector->getEngineInformation(format);
  return result;
}

} // namespace sample