From 969531dcc8ecfa1a69ca30d00d711da4fcad43a9 Mon Sep 17 00:00:00 2001
From: Jason
Date: Tue, 6 Sep 2022 10:53:05 +0800
Subject: [PATCH] Optimize TensorRT backend to support rebuilding the engine (#189)

* optimize tensorrt usage

* format code

* fix input shape error for onnx model

Co-authored-by: root
---
 .../backends/tensorrt/trt_backend.cc          | 482 ++++++++++--------
 .../backends/tensorrt/trt_backend.h           |  32 +-
 csrc/fastdeploy/backends/tensorrt/utils.cc    | 138 +++++
 csrc/fastdeploy/backends/tensorrt/utils.h     | 135 +++--
 external/eigen.cmake                          |   3 +-
 external/paddle2onnx.cmake                    |   4 +-
 6 files changed, 528 insertions(+), 266 deletions(-)
 create mode 100644 csrc/fastdeploy/backends/tensorrt/utils.cc

diff --git a/csrc/fastdeploy/backends/tensorrt/trt_backend.cc b/csrc/fastdeploy/backends/tensorrt/trt_backend.cc
index 510dc9961..a61800068 100644
--- a/csrc/fastdeploy/backends/tensorrt/trt_backend.cc
+++ b/csrc/fastdeploy/backends/tensorrt/trt_backend.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "fastdeploy/backends/tensorrt/trt_backend.h"
-#include 
 #include "NvInferSafeRuntime.h"
 #include "fastdeploy/utils/utils.h"
+#include 
 #ifdef ENABLE_PADDLE_FRONTEND
 #include "paddle2onnx/converter.h"
 #endif
@@ -24,117 +24,46 @@ namespace fastdeploy {
 
 FDTrtLogger* FDTrtLogger::logger = nullptr;
 
-size_t TrtDataTypeSize(const nvinfer1::DataType& dtype) {
-  if (dtype == nvinfer1::DataType::kFLOAT) {
-    return sizeof(float);
-  } else if (dtype == nvinfer1::DataType::kHALF) {
-    return sizeof(float) / 2;
-  } else if (dtype == nvinfer1::DataType::kINT8) {
-    return sizeof(int8_t);
-  } else if (dtype == nvinfer1::DataType::kINT32) {
-    return sizeof(int32_t);
-  }
-  // kBOOL
-  return sizeof(bool);
-}
-
-FDDataType GetFDDataType(const nvinfer1::DataType& dtype) {
-  if (dtype == nvinfer1::DataType::kFLOAT) {
-    return FDDataType::FP32;
-  } else if (dtype == nvinfer1::DataType::kHALF) {
-    return FDDataType::FP16;
-  } else if (dtype == nvinfer1::DataType::kINT8) {
-    return FDDataType::INT8;
-  } else if (dtype == nvinfer1::DataType::kINT32) {
-    return FDDataType::INT32;
-  }
-  // kBOOL
-  return FDDataType::BOOL;
-}
-
-std::vector<int> toVec(const nvinfer1::Dims& dim) {
-  std::vector<int> out(dim.d, dim.d + dim.nbDims);
-  return out;
-}
-
-bool CheckDynamicShapeConfig(const paddle2onnx::OnnxReader& reader,
-                             const TrtBackendOption& option) {
-  // paddle2onnx::ModelTensorInfo inputs[reader.NumInputs()];
-  // std::string input_shapes[reader.NumInputs()];
-  std::vector<paddle2onnx::ModelTensorInfo> inputs(reader.NumInputs());
-  std::vector<std::string> input_shapes(reader.NumInputs());
-  for (int i = 0; i < reader.NumInputs(); ++i) {
-    reader.GetInputInfo(i, &inputs[i]);
-
-    // change 0 to -1, when input_dim is a string, onnx will make it to zero
-    for (int j = 0; j < inputs[i].rank; ++j) {
-      if (inputs[i].shape[j] <= 0) {
-        inputs[i].shape[j] = -1;
+// Check whether the model can build a TensorRT engine now.
+// If the model has dynamic input shapes, it requires defined shape range
+// information, which can be set with SetTrtInputShape().
+// If the shape range is not defined, the engine cannot be built yet;
+// in this case, the engine will be built once input data is fed,
+// and the shape range information will be updated accordingly.
+bool CanBuildEngine(
+    const std::map<std::string, ShapeRangeInfo>& shape_range_info) {
+  for (auto iter = shape_range_info.begin(); iter != shape_range_info.end();
+       ++iter) {
+    bool is_full_static = true;
+    for (size_t i = 0; i < iter->second.shape.size(); ++i) {
+      if (iter->second.shape[i] < 0) {
+        is_full_static = false;
+        break;
       }
     }
-    input_shapes[i] = "";
-    for (int j = 0; j < inputs[i].rank; ++j) {
-      if (j != inputs[i].rank - 1) {
-        input_shapes[i] += (std::to_string(inputs[i].shape[j]) + ", ");
-      } else {
-        input_shapes[i] += std::to_string(inputs[i].shape[j]);
+    if (is_full_static) {
+      continue;
+    }
+    for (size_t i = 0; i < iter->second.shape.size(); ++i) {
+      if (iter->second.min[i] < 0 || iter->second.max[i] < 0) {
+        return false;
       }
     }
   }
-
-  bool all_check_passed = true;
-  for (int i = 0; i < reader.NumInputs(); ++i) {
-    bool contain_unknown_dim = false;
-    for (int j = 0; j < inputs[i].rank; ++j) {
-      if (inputs[i].shape[j] < 0) {
-        contain_unknown_dim = true;
-      }
-    }
-
-    std::string name(inputs[i].name, strlen(inputs[i].name));
-    FDINFO << "The loaded model's input tensor:" << name
-           << " has shape [" + input_shapes[i] << "]." << std::endl;
-    if (contain_unknown_dim) {
-      auto iter1 = option.min_shape.find(name);
-      auto iter2 = option.max_shape.find(name);
-      auto iter3 = option.opt_shape.find(name);
-      if (iter1 == option.min_shape.end() || iter2 == option.max_shape.end() ||
-          iter3 == option.opt_shape.end()) {
-        FDERROR << "The loaded model's input tensor:" << name
-                << " has dynamic shape [" + input_shapes[i] +
-                       "], but didn't configure it's shape for tensorrt with "
-                       "SetTrtInputShape correctly."
-                << std::endl;
-        all_check_passed = false;
-      }
-    }
-  }
-
-  return all_check_passed;
+  return true;
 }
 
-bool TrtBackend::InitFromTrt(const std::string& trt_engine_file,
-                             const TrtBackendOption& option) {
-  if (initialized_) {
-    FDERROR << "TrtBackend is already initlized, cannot initialize again."
-            << std::endl;
-    return false;
-  }
-  cudaSetDevice(option.gpu_id);
+bool TrtBackend::LoadTrtCache(const std::string& trt_engine_file) {
+  cudaSetDevice(option_.gpu_id);
 
-  std::ifstream fin(trt_engine_file, std::ios::binary | std::ios::in);
-  if (!fin) {
-    FDERROR << "Failed to open TensorRT Engine file " << trt_engine_file
-            << std::endl;
-    return false;
-  }
-  fin.seekg(0, std::ios::end);
   std::string engine_buffer;
-  engine_buffer.resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(engine_buffer.at(0)), engine_buffer.size());
-  fin.close();
+  if (!ReadBinaryFromFile(trt_engine_file, &engine_buffer)) {
+    FDERROR << "Failed to load TensorRT Engine from " << trt_engine_file << "."
+            << std::endl;
+    return false;
+  }
+
   FDUniquePtr<nvinfer1::IRuntime> runtime{
       nvinfer1::createInferRuntime(*FDTrtLogger::Get())};
   if (!runtime) {
@@ -152,10 +81,31 @@ bool TrtBackend::InitFromTrt(const std::string& trt_engine_file,
   context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
       engine_->createExecutionContext());
-  FDASSERT(cudaStreamCreate(&stream_) == 0,
-           "[ERROR] Error occurs while calling cudaStreamCreate().");
   GetInputOutputInfo();
-  initialized_ = true;
+
+  for (int32_t i = 0; i < engine_->getNbBindings(); ++i) {
+    if (!engine_->bindingIsInput(i)) {
+      continue;
+    }
+    auto min = ToVec(engine_->getProfileDimensions(
+        i, 0, nvinfer1::OptProfileSelector::kMIN));
+    auto max = ToVec(engine_->getProfileDimensions(
+        i, 0, nvinfer1::OptProfileSelector::kMAX));
+    auto name = std::string(engine_->getBindingName(i));
+    auto iter = shape_range_info_.find(name);
+    if (iter == shape_range_info_.end()) {
+      FDERROR << "There's no input named '" << name << "' in the loaded model."
+              << std::endl;
+      return false;
+    }
+    iter->second.Update(min);
+    iter->second.Update(max);
+  }
+  FDINFO << "Built TensorRT Engine from cache file: " << trt_engine_file
+         << " with shape range information as below:" << std::endl;
+  for (const auto& item : shape_range_info_) {
+    FDINFO << item.second << std::endl;
+  }
   return true;
 }
 
@@ -167,10 +117,11 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
             << std::endl;
     return false;
   }
+  option_ = option;
 
 #ifdef ENABLE_PADDLE_FRONTEND
   std::vector<paddle2onnx::CustomOp> custom_ops;
-  for (auto& item : option.custom_op_info_) {
+  for (auto& item : option_.custom_op_info_) {
     paddle2onnx::CustomOp op;
     std::strcpy(op.op_name, item.first.c_str());
     std::strcpy(op.export_op_name, item.second.c_str());
@@ -187,7 +138,7 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
     return false;
   }
 
-  if (option.remove_multiclass_nms_) {
+  if (option_.remove_multiclass_nms_) {
     char* new_model = nullptr;
     int new_model_size = 0;
     if (!paddle2onnx::RemoveMultiClassNMS(model_content_ptr, model_content_size,
@@ -222,7 +173,8 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
             << std::endl;
     return false;
   }
-  cudaSetDevice(option.gpu_id);
+  option_ = option;
+  cudaSetDevice(option_.gpu_id);
 
   std::string onnx_content = "";
   if (!from_memory_buffer) {
@@ -246,43 +198,95 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
   outputs_order_.clear();
   auto onnx_reader =
       paddle2onnx::OnnxReader(onnx_content.c_str(), onnx_content.size());
-  for (int i = 0; i < onnx_reader.NumOutputs(); ++i) {
-    std::string name(
-        onnx_reader.output_names[i],
-        onnx_reader.output_names[i] + strlen(onnx_reader.output_names[i]));
+  for (int i = 0; i < onnx_reader.num_outputs; ++i) {
+    std::string name(onnx_reader.outputs[i].name);
     outputs_order_[name] = i;
   }
-  if (!CheckDynamicShapeConfig(onnx_reader, option)) {
-    FDERROR << "TrtBackend::CheckDynamicShapeConfig failed." << std::endl;
-    return false;
-  }
-  if (option.serialize_file != "") {
-    std::ifstream fin(option.serialize_file, std::ios::binary | std::ios::in);
-    if (fin) {
-      FDINFO << "Detect serialized TensorRT Engine file in "
-             << option.serialize_file << ", will load it directly."
-             << std::endl;
-      fin.close();
-      return InitFromTrt(option.serialize_file, option);
+  shape_range_info_.clear();
+  inputs_desc_.clear();
+  outputs_desc_.clear();
+  inputs_desc_.resize(onnx_reader.num_inputs);
+  outputs_desc_.resize(onnx_reader.num_outputs);
+  for (int i = 0; i < onnx_reader.num_inputs; ++i) {
+    std::string name(onnx_reader.inputs[i].name);
+    std::vector<int64_t> shape(onnx_reader.inputs[i].shape,
+                               onnx_reader.inputs[i].shape +
+                                   onnx_reader.inputs[i].rank);
+    inputs_desc_[i].name = name;
+    inputs_desc_[i].shape.assign(shape.begin(), shape.end());
+    inputs_desc_[i].dtype = ReaderDtypeToTrtDtype(onnx_reader.inputs[i].dtype);
+    auto info = ShapeRangeInfo(shape);
+    info.name = name;
+    auto iter_min = option.min_shape.find(name);
+    auto iter_max = option.max_shape.find(name);
+    auto iter_opt = option.opt_shape.find(name);
+    if (iter_min != option.min_shape.end()) {
+      info.min.assign(iter_min->second.begin(), iter_min->second.end());
+      info.max.assign(iter_max->second.begin(), iter_max->second.end());
+      info.opt.assign(iter_opt->second.begin(), iter_opt->second.end());
     }
+    shape_range_info_.insert(std::make_pair(name, info));
   }
 
-  if (!CreateTrtEngine(onnx_content, option)) {
-    return false;
+  for (int i = 0; i < onnx_reader.num_outputs; ++i) {
+    std::string name(onnx_reader.outputs[i].name);
+    std::vector<int64_t> shape(onnx_reader.outputs[i].shape,
+                               onnx_reader.outputs[i].shape +
+                                   onnx_reader.outputs[i].rank);
+    outputs_desc_[i].name = name;
+    outputs_desc_[i].shape.assign(shape.begin(), shape.end());
+    outputs_desc_[i].dtype =
+        ReaderDtypeToTrtDtype(onnx_reader.outputs[i].dtype);
   }
 
-  context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
-      engine_->createExecutionContext());
   FDASSERT(cudaStreamCreate(&stream_) == 0,
            "[ERROR] Error occurs while calling cudaStreamCreate().");
-  GetInputOutputInfo();
+
+  if (!CreateTrtEngineFromOnnx(onnx_content)) {
+    FDERROR << "Failed to create TensorRT engine." << std::endl;
+    return false;
+  }
   initialized_ = true;
   return true;
 }
 
+int TrtBackend::ShapeRangeInfoUpdated(const std::vector<FDTensor>& inputs) {
+  bool need_update_engine = false;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    auto iter = shape_range_info_.find(inputs[i].name);
+    if (iter == shape_range_info_.end()) {
+      FDERROR << "There's no input named '" << inputs[i].name
+              << "' in the loaded model." << std::endl;
+      continue;
+    }
+    if (iter->second.Update(inputs[i].shape) == 1) {
+      need_update_engine = true;
+    }
+  }
+  return need_update_engine;
+}
+
 bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
                        std::vector<FDTensor>* outputs) {
+  if (inputs.size() != NumInputs()) {
+    FDERROR << "Require " << NumInputs() << " inputs, but got "
+            << inputs.size() << "." << std::endl;
+    return false;
+  }
+  if (ShapeRangeInfoUpdated(inputs)) {
+    // A new input shape is out of the predefined min/max shape range,
+    // so the TensorRT engine has to be rebuilt.
+    FDWARNING
+        << "The TensorRT engine will be rebuilt once the shape range "
+           "information changes; this may take a lot of time. You can set a "
+           "proper shape range before loading the model to avoid rebuilding. "
+           "Refer to https://github.com/PaddlePaddle/FastDeploy/docs/backends/"
+           "tensorrt.md for more details."
+        << std::endl;
+    BuildTrtEngine();
+  }
+
   AllocateBufferInDynamicShape(inputs, outputs);
   std::vector<void*> input_binds(inputs.size());
   for (size_t i = 0; i < inputs.size(); ++i) {
@@ -316,12 +320,14 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
 }
 
 void TrtBackend::GetInputOutputInfo() {
+  std::vector<TrtValueInfo>().swap(inputs_desc_);
+  std::vector<TrtValueInfo>().swap(outputs_desc_);
   inputs_desc_.clear();
   outputs_desc_.clear();
   auto num_binds = engine_->getNbBindings();
   for (auto i = 0; i < num_binds; ++i) {
     std::string name = std::string(engine_->getBindingName(i));
-    auto shape = toVec(engine_->getBindingDimensions(i));
+    auto shape = ToVec(engine_->getBindingDimensions(i));
     auto dtype = engine_->getBindingDataType(i);
     if (engine_->bindingIsInput(i)) {
       inputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
@@ -355,8 +361,10 @@ void TrtBackend::AllocateBufferInDynamicShape(
 
     // find the original index of output
     auto iter = outputs_order_.find(outputs_desc_[i].name);
-    FDASSERT(iter != outputs_order_.end(),
-             "Cannot find output: %s of tensorrt network from the original model.", outputs_desc_[i].name.c_str());
+    FDASSERT(
+        iter != outputs_order_.end(),
+        "Cannot find output: %s of tensorrt network from the original model.",
+        outputs_desc_[i].name.c_str());
     auto ori_idx = iter->second;
     (*outputs)[ori_idx].dtype = GetFDDataType(outputs_desc_[i].dtype);
     (*outputs)[ori_idx].shape.assign(output_dims.d,
@@ -372,32 +380,15 @@ void TrtBackend::AllocateBufferInDynamicShape(
   }
 }
 
-bool TrtBackend::CreateTrtEngine(const std::string& onnx_model,
-                                 const TrtBackendOption& option) {
-  const auto explicitBatch =
-      1U << static_cast<uint32_t>(
-          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
-
-  builder_ = FDUniquePtr<nvinfer1::IBuilder>(
-      nvinfer1::createInferBuilder(*FDTrtLogger::Get()));
-  if (!builder_) {
-    FDERROR << "Failed to call createInferBuilder()." << std::endl;
-    return false;
-  }
-  network_ = FDUniquePtr<nvinfer1::INetworkDefinition>(
-      builder_->createNetworkV2(explicitBatch));
-  if (!network_) {
-    FDERROR << "Failed to call createNetworkV2()." << std::endl;
-    return false;
-  }
-  auto config = FDUniquePtr<nvinfer1::IBuilderConfig>(
-      builder_->createBuilderConfig());
+bool TrtBackend::BuildTrtEngine() {
+  auto config =
+      FDUniquePtr<nvinfer1::IBuilderConfig>(builder_->createBuilderConfig());
   if (!config) {
     FDERROR << "Failed to call createBuilderConfig()." << std::endl;
     return false;
   }
-  if (option.enable_fp16) {
+  if (option_.enable_fp16) {
     if (!builder_->platformHasFastFp16()) {
       FDWARNING << "Detected FP16 is not supported in the current GPU, "
                    "will use FP32 instead."
@@ -407,56 +398,52 @@ bool TrtBackend::CreateTrtEngine(const std::string& onnx_model,
     }
   }
 
-  parser_ = FDUniquePtr<nvonnxparser::IParser>(
-      nvonnxparser::createParser(*network_, *FDTrtLogger::Get()));
-  if (!parser_) {
-    FDERROR << "Failed to call createParser()." << std::endl;
-    return false;
-  }
-  if (!parser_->parse(onnx_model.data(), onnx_model.size())) {
-    FDERROR << "Failed to parse ONNX model by TensorRT." << std::endl;
-    return false;
-  }
-
   FDINFO << "Start to build TensorRT Engine..." << std::endl;
-  bool fp16 = builder_->platformHasFastFp16();
-  builder_->setMaxBatchSize(option.max_batch_size);
-  config->setMaxWorkspaceSize(option.max_workspace_size);
-
-  if (option.max_shape.size() > 0) {
-    auto profile = builder_->createOptimizationProfile();
-    FDASSERT(option.max_shape.size() == option.min_shape.size() &&
-                 option.min_shape.size() == option.opt_shape.size(),
-             "[TrtBackend] Size of max_shape/opt_shape/min_shape in "
-             "TrtBackendOption should keep same.");
-    for (const auto& item : option.min_shape) {
-      // set min shape
-      FDASSERT(profile->setDimensions(item.first.c_str(),
-                                      nvinfer1::OptProfileSelector::kMIN,
-                                      ToDims(item.second)),
-               "[TrtBackend] Failed to set min_shape for input: %s in TrtBackend.", item.first.c_str());
-
-      // set optimization shape
-      auto iter = option.opt_shape.find(item.first);
-      FDASSERT(iter != option.opt_shape.end(),
-               "[TrtBackend] Cannot find input name: %s in TrtBackendOption::opt_shape.", item.first.c_str());
-      FDASSERT(profile->setDimensions(item.first.c_str(),
-                                      nvinfer1::OptProfileSelector::kOPT,
-                                      ToDims(iter->second)),
-               "[TrtBackend] Failed to set opt_shape for input: %s in TrtBackend.", item.first.c_str());
-      // set max shape
-      iter = option.max_shape.find(item.first);
-      FDASSERT(iter != option.max_shape.end(),
-               "[TrtBackend] Cannot find input name: %s in TrtBackendOption::max_shape.", item.first);
-      FDASSERT(profile->setDimensions(item.first.c_str(),
-                                      nvinfer1::OptProfileSelector::kMAX,
-                                      ToDims(iter->second)),
-               "[TrtBackend] Failed to set max_shape for input: %s in TrtBackend.", item.first);
-    }
-    config->addOptimizationProfile(profile);
+  if (context_) {
+    context_.reset();
+    engine_.reset();
   }
 
+  builder_->setMaxBatchSize(option_.max_batch_size);
+  config->setMaxWorkspaceSize(option_.max_workspace_size);
+  auto profile = builder_->createOptimizationProfile();
+  for (const auto& item : shape_range_info_) {
+    FDASSERT(
+        profile->setDimensions(item.first.c_str(),
+                               nvinfer1::OptProfileSelector::kMIN,
+                               ToDims(item.second.min)),
+        "[TrtBackend] Failed to set min_shape for input: %s in TrtBackend.",
+        item.first.c_str());
+    FDASSERT(
+        profile->setDimensions(item.first.c_str(),
+                               nvinfer1::OptProfileSelector::kMAX,
+                               ToDims(item.second.max)),
+        "[TrtBackend] Failed to set max_shape for input: %s in TrtBackend.",
+        item.first.c_str());
+    if (item.second.opt.size() == 0) {
+      FDASSERT(
+          profile->setDimensions(item.first.c_str(),
+                                 nvinfer1::OptProfileSelector::kOPT,
+                                 ToDims(item.second.max)),
+          "[TrtBackend] Failed to set opt_shape for input: %s in TrtBackend.",
+          item.first.c_str());
+    } else {
+      FDASSERT(
+          item.second.opt.size() == item.second.shape.size(),
+          "Require the dimension of opt in the shape range information to be "
+          "equal to the dimension of input: %s, but now it's %zu != %zu.",
+          item.first.c_str(), item.second.opt.size(), item.second.shape.size());
+      FDASSERT(
+          profile->setDimensions(item.first.c_str(),
+                                 nvinfer1::OptProfileSelector::kOPT,
+                                 ToDims(item.second.opt)),
+          "[TrtBackend] Failed to set opt_shape for input: %s in TrtBackend.",
+          item.first.c_str());
+    }
+  }
+  config->addOptimizationProfile(profile);
+
   FDUniquePtr<nvinfer1::IHostMemory> plan{
       builder_->buildSerializedNetwork(*network_, *config)};
   if (!plan) {
@@ -479,20 +466,24 @@
     return false;
   }
 
+  context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
+      engine_->createExecutionContext());
+  GetInputOutputInfo();
+
   FDINFO << "TensorRT Engine is built successfully." << std::endl;
-  if (option.serialize_file != "") {
-    FDINFO << "Serialize TensorRTEngine to local file " << option.serialize_file
-           << "." << std::endl;
-    std::ofstream engine_file(option.serialize_file.c_str());
+  if (option_.serialize_file != "") {
+    FDINFO << "Serialize TensorRTEngine to local file "
+           << option_.serialize_file << "." << std::endl;
+    std::ofstream engine_file(option_.serialize_file.c_str());
     if (!engine_file) {
-      FDERROR << "Failed to open " << option.serialize_file << " to write."
+      FDERROR << "Failed to open " << option_.serialize_file << " to write."
               << std::endl;
       return false;
    }
     engine_file.write(static_cast<char*>(plan->data()), plan->size());
     engine_file.close();
     FDINFO << "TensorRTEngine is serialized to local file "
-           << option.serialize_file
+           << option_.serialize_file
            << ", we can load this model from the serialized engine "
              "directly next time."
            << std::endl;
@@ -500,8 +491,82 @@
   }
   return true;
 }
 
+bool TrtBackend::CreateTrtEngineFromOnnx(const std::string& onnx_model_buffer) {
+  const auto explicitBatch =
+      1U << static_cast<uint32_t>(
+          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+
+  builder_ = FDUniquePtr<nvinfer1::IBuilder>(
+      nvinfer1::createInferBuilder(*FDTrtLogger::Get()));
+  if (!builder_) {
+    FDERROR << "Failed to call createInferBuilder()." << std::endl;
+    return false;
+  }
+  network_ = FDUniquePtr<nvinfer1::INetworkDefinition>(
+      builder_->createNetworkV2(explicitBatch));
+  if (!network_) {
+    FDERROR << "Failed to call createNetworkV2()." << std::endl;
+    return false;
+  }
+  parser_ = FDUniquePtr<nvonnxparser::IParser>(
+      nvonnxparser::createParser(*network_, *FDTrtLogger::Get()));
+  if (!parser_) {
+    FDERROR << "Failed to call createParser()." << std::endl;
+    return false;
+  }
+  if (!parser_->parse(onnx_model_buffer.data(), onnx_model_buffer.size())) {
+    FDERROR << "Failed to parse ONNX model by TensorRT." << std::endl;
+    return false;
+  }
+
+  if (option_.serialize_file != "") {
+    std::ifstream fin(option_.serialize_file, std::ios::binary | std::ios::in);
+    if (fin) {
+      FDINFO << "Detected a serialized TensorRT Engine file in "
+             << option_.serialize_file << ", will load it directly."
+             << std::endl;
+      fin.close();
+      // clear memory buffer of the temporary member
+      std::string().swap(onnx_model_buffer_);
+      return LoadTrtCache(option_.serialize_file);
+    }
+  }
+
+  if (!CanBuildEngine(shape_range_info_)) {
+    onnx_model_buffer_ = onnx_model_buffer;
+    FDWARNING << "Cannot build the engine right now, because the model has "
+                 "dynamic input shapes, listed as below:"
+              << std::endl;
+    for (int i = 0; i < NumInputs(); ++i) {
+      FDWARNING << "Input " << i << ": " << GetInputInfo(i) << std::endl;
+    }
+    FDWARNING
+        << "FastDeploy will build the engine during inference with the input "
+           "data, and will also collect the input shape range information. "
+           "Be aware that FastDeploy will rebuild the engine whenever a new "
+           "input shape is out of the collected shape range, which may slow "
+           "things down; refer to "
+           "https://github.com/PaddlePaddle/FastDeploy/docs/backends/"
+           "tensorrt.md for more details."
+        << std::endl;
+    initialized_ = true;
+    return true;
+  }
+
+  if (!BuildTrtEngine()) {
+    FDERROR << "Failed to build TensorRT engine." << std::endl;
+    return false;
+  }
+
+  // clear memory buffer of the temporary member
+  std::string().swap(onnx_model_buffer_);
+  return true;
+}
+
 TensorInfo TrtBackend::GetInputInfo(int index) {
-  FDASSERT(index < NumInputs(), "The index: %d should less than the number of inputs: %d.", index, NumInputs());
+  FDASSERT(index < NumInputs(),
+           "The index: %d should be less than the number of inputs: %d.",
+           index, NumInputs());
   TensorInfo info;
   info.name = inputs_desc_[index].name;
   info.shape.assign(inputs_desc_[index].shape.begin(),
@@ -512,7 +577,8 @@ TensorInfo TrtBackend::GetInputInfo(int index) {
 
 TensorInfo TrtBackend::GetOutputInfo(int index) {
   FDASSERT(index < NumOutputs(),
-           "The index: %d should less than the number of outputs: %d.", index, NumOutputs());
+           "The index: %d should be less than the number of outputs: %d.",
+           index, NumOutputs());
   TensorInfo info;
   info.name = outputs_desc_[index].name;
   info.shape.assign(outputs_desc_[index].shape.begin(),
@@ -520,4 +586,4 @@ TensorInfo TrtBackend::GetOutputInfo(int index) {
   info.dtype = GetFDDataType(outputs_desc_[index].dtype);
   return info;
 }
-}  // namespace fastdeploy
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/backends/tensorrt/trt_backend.h b/csrc/fastdeploy/backends/tensorrt/trt_backend.h
index c06c4e6ea..b04a75e55 100644
--- a/csrc/fastdeploy/backends/tensorrt/trt_backend.h
+++ b/csrc/fastdeploy/backends/tensorrt/trt_backend.h
@@ -19,11 +19,11 @@
 
 #include 
 #include 
+#include "NvInfer.h"
+#include "NvOnnxParser.h"
 #include "fastdeploy/backends/backend.h"
 #include "fastdeploy/backends/tensorrt/utils.h"
 #include 
-#include "NvOnnxParser.h"
-#include "NvInfer.h"
 
 namespace fastdeploy {
 
@@ -56,7 +56,6 @@ FDDataType GetFDDataType(const nvinfer1::DataType& dtype);
 class TrtBackend : public BaseBackend {
  public:
   TrtBackend() : engine_(nullptr), context_(nullptr) {}
-  virtual ~TrtBackend() = default;
 
   void BuildOption(const TrtBackendOption& option);
   bool InitFromPaddle(const std::string& model_file,
@@ -66,9 +65,6 @@ class TrtBackend : public BaseBackend {
   bool InitFromOnnx(const std::string& model_file,
                     const TrtBackendOption& option = TrtBackendOption(),
                     bool from_memory_buffer = false);
-  bool InitFromTrt(const std::string& trt_engine_file,
-                   const TrtBackendOption& option = TrtBackendOption());
-
   bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs);
 
   int NumInputs() const { return inputs_desc_.size(); }
@@ -76,7 +72,14 @@ class TrtBackend : public BaseBackend {
   TensorInfo GetInputInfo(int index);
   TensorInfo GetOutputInfo(int index);
 
+  ~TrtBackend() {
+    if (parser_) {
+      parser_.reset();
+    }
+  }
+
  private:
+  TrtBackendOption option_;
   std::shared_ptr<nvinfer1::ICudaEngine> engine_;
   std::shared_ptr<nvinfer1::IExecutionContext> context_;
   FDUniquePtr<nvonnxparser::IParser> parser_;
@@ -96,11 +99,22 @@ class TrtBackend : public BaseBackend {
   // order, to help recover the right order
   std::map<std::string, int> outputs_order_;
 
+  // Temporarily stores the ONNX model content;
+  // once it has been used to build the TensorRT engine,
+  // it will be released.
+  std::string onnx_model_buffer_;
+  // Stores the shape information of the loaded model.
+  // For dynamic shapes, it records their range information,
+  // which is also updated while inferencing.
+  std::map<std::string, ShapeRangeInfo> shape_range_info_;
+
   void GetInputOutputInfo();
   void AllocateBufferInDynamicShape(const std::vector<FDTensor>& inputs,
                                     std::vector<FDTensor>* outputs);
-  bool CreateTrtEngine(const std::string& onnx_model,
-                       const TrtBackendOption& option);
+  bool CreateTrtEngineFromOnnx(const std::string& onnx_model_buffer);
+  bool BuildTrtEngine();
+  bool LoadTrtCache(const std::string& trt_engine_file);
+  int ShapeRangeInfoUpdated(const std::vector<FDTensor>& inputs);
 };
-}  // namespace fastdeploy
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/backends/tensorrt/utils.cc b/csrc/fastdeploy/backends/tensorrt/utils.cc
new file mode 100644
index 000000000..20c997ecd
--- /dev/null
+++ b/csrc/fastdeploy/backends/tensorrt/utils.cc
@@ -0,0 +1,138 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/backends/tensorrt/utils.h"
+
+namespace fastdeploy {
+
+int ShapeRangeInfo::Update(const std::vector<int64_t>& new_shape) {
+  if (new_shape.size() != shape.size()) {
+    return -1;
+  }
+  int need_update_engine = 0;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    if (is_static[i] == 1 && new_shape[i] != shape[i]) {
+      return -1;
+    }
+    if (new_shape[i] < min[i] || min[i] < 0) {
+      need_update_engine = 1;
+    }
+    if (new_shape[i] > max[i] || max[i] < 0) {
+      need_update_engine = 1;
+    }
+  }
+
+  if (need_update_engine == 0) {
+    return 0;
+  }
+
+  FDWARNING << "[New Shape Out of Range] input name: " << name
+            << ", shape: " << new_shape
+            << ", the shape range before: min_shape=" << min
+            << ", max_shape=" << max << "." << std::endl;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    if (new_shape[i] < min[i] || min[i] < 0) {
+      min[i] = new_shape[i];
+    }
+    if (new_shape[i] > max[i] || max[i] < 0) {
+      max[i] = new_shape[i];
+    }
+  }
+  FDWARNING
+      << "[New Shape Out of Range] The updated shape range now: min_shape="
+      << min << ", max_shape=" << max << "."
+      << std::endl;
+  return need_update_engine;
+}
+
+size_t TrtDataTypeSize(const nvinfer1::DataType& dtype) {
+  if (dtype == nvinfer1::DataType::kFLOAT) {
+    return sizeof(float);
+  } else if (dtype == nvinfer1::DataType::kHALF) {
+    return sizeof(float) / 2;
+  } else if (dtype == nvinfer1::DataType::kINT8) {
+    return sizeof(int8_t);
+  } else if (dtype == nvinfer1::DataType::kINT32) {
+    return sizeof(int32_t);
+  }
+  // kBOOL
+  return sizeof(bool);
+}
+
+FDDataType GetFDDataType(const nvinfer1::DataType& dtype) {
+  if (dtype == nvinfer1::DataType::kFLOAT) {
+    return FDDataType::FP32;
+  } else if (dtype == nvinfer1::DataType::kHALF) {
+    return FDDataType::FP16;
+  } else if (dtype == nvinfer1::DataType::kINT8) {
+    return FDDataType::INT8;
+  } else if (dtype == nvinfer1::DataType::kINT32) {
+    return FDDataType::INT32;
+  }
+  // kBOOL
+  return FDDataType::BOOL;
+}
+
+nvinfer1::DataType ReaderDtypeToTrtDtype(int reader_dtype) {
+  if (reader_dtype == 0) {
+    return nvinfer1::DataType::kFLOAT;
+  } else if (reader_dtype == 1) {
+    FDASSERT(false, "TensorRT cannot support data type of double now.");
+  } else if (reader_dtype == 2) {
+    FDASSERT(false, "TensorRT cannot support data type of uint8 now.");
+  } else if (reader_dtype == 3) {
+    return nvinfer1::DataType::kINT8;
+  } else if (reader_dtype == 4) {
+    return nvinfer1::DataType::kINT32;
+  } else if (reader_dtype == 5) {
+    // regard int64 as int32
+    return nvinfer1::DataType::kINT32;
+  }
+  FDASSERT(false, "Received unexpected data type of %d", reader_dtype);
+  return nvinfer1::DataType::kFLOAT;
+}
+
+std::vector<int> ToVec(const nvinfer1::Dims& dim) {
+  std::vector<int> out(dim.d, dim.d + dim.nbDims);
+  return out;
+}
+
+int64_t Volume(const nvinfer1::Dims& d) {
+  return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
+}
+
+nvinfer1::Dims ToDims(const std::vector<int32_t>& vec) {
+  int limit = static_cast<int>(nvinfer1::Dims::MAX_DIMS);
+  if (static_cast<int>(vec.size()) > limit) {
+    FDWARNING << "Vector too long, only first 8 elements are used in dimension."
+              << std::endl;
+  }
+  // Pick first nvinfer1::Dims::MAX_DIMS elements
+  nvinfer1::Dims dims{std::min(static_cast<int>(vec.size()), limit), {}};
+  std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
+  return dims;
+}
+
+nvinfer1::Dims ToDims(const std::vector<int64_t>& vec) {
+  int limit = static_cast<int>(nvinfer1::Dims::MAX_DIMS);
+  if (static_cast<int>(vec.size()) > limit) {
+    FDWARNING << "Vector too long, only first 8 elements are used in dimension."
+              << std::endl;
+  }
+  // Pick first nvinfer1::Dims::MAX_DIMS elements
+  nvinfer1::Dims dims{std::min(static_cast<int>(vec.size()), limit), {}};
+  std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
+  return dims;
+}
+
+}  // namespace fastdeploy
diff --git a/csrc/fastdeploy/backends/tensorrt/utils.h b/csrc/fastdeploy/backends/tensorrt/utils.h
index 482bd6fb4..4739cedbe 100644
--- a/csrc/fastdeploy/backends/tensorrt/utils.h
+++ b/csrc/fastdeploy/backends/tensorrt/utils.h
@@ -14,53 +14,54 @@
 
 #pragma once
 
-#include 
-#include 
-#include 
-#include 
+#include "NvInfer.h"
+#include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/utils/utils.h"
 #include 
 #include 
-#include "NvInfer.h"
-#include "fastdeploy/utils/utils.h"
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
 namespace fastdeploy {
 
 struct FDInferDeleter {
-  template <typename T> void operator()(T* obj) const {
-    delete obj;
+  template <typename T> void operator()(T* obj) const {
+    if (obj) {
+      obj->destroy();
+    }
   }
 };
 
-template <typename T> using FDUniquePtr = std::unique_ptr<T, FDInferDeleter>;
+template <typename T> using FDUniquePtr = std::unique_ptr<T, FDInferDeleter>;
 
-inline uint32_t GetElementSize(nvinfer1::DataType t) noexcept {
-  switch (t) {
-  case nvinfer1::DataType::kINT32:
-    return 4;
-  case nvinfer1::DataType::kFLOAT:
-    return 4;
-  case nvinfer1::DataType::kHALF:
-    return 2;
-  case nvinfer1::DataType::kBOOL:
-  case nvinfer1::DataType::kINT8:
-    return 1;
+int64_t Volume(const nvinfer1::Dims& d);
+
+nvinfer1::Dims ToDims(const std::vector<int32_t>& vec);
+nvinfer1::Dims ToDims(const std::vector<int64_t>& vec);
+
+size_t TrtDataTypeSize(const nvinfer1::DataType& dtype);
+
+FDDataType GetFDDataType(const nvinfer1::DataType& dtype);
+
+nvinfer1::DataType ReaderDtypeToTrtDtype(int reader_dtype);
+
+std::vector<int> ToVec(const nvinfer1::Dims& dim);
+
+template <typename T>
+std::ostream& operator<<(std::ostream& out, const std::vector<T>& vec) {
+  out << "[";
+  for (size_t i = 0; i < vec.size(); ++i) {
+    if (i != vec.size() - 1) {
+      out << vec[i] << ", ";
+    } else {
+      out << vec[i];
+    }
   }
-  return 0;
-}
-
-inline int64_t Volume(const nvinfer1::Dims& d) {
-  return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
-}
-
-inline nvinfer1::Dims ToDims(const std::vector<int>& vec) {
-  int limit = static_cast<int>(nvinfer1::Dims::MAX_DIMS);
-  if (static_cast<int>(vec.size()) > limit) {
-    FDWARNING << "Vector too long, only first 8 elements are used in dimension." << std::endl;
-  }
-  // Pick first nvinfer1::Dims::MAX_DIMS elements
-  nvinfer1::Dims dims{std::min(static_cast<int>(vec.size()), limit), {}};
-  std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
-  return dims;
+  return out << "]";
 }
 
 template <typename T> class FDGenericBuffer {
@@ -123,9 +124,7 @@ template <typename T> class FDGenericBuffer {
   //!
   //! \brief Returns the size (in bytes) of the buffer.
   //!
-  size_t nbBytes() const {
-    return this->size() * GetElementSize(mType);
-  }
+  size_t nbBytes() const { return this->size() * TrtDataTypeSize(mType); }
 
   //!
   //! \brief Resizes the buffer. This is a no-op if the new size is smaller than
@@ -145,9 +144,7 @@ template <typename T> class FDGenericBuffer {
   //!
   //! \brief Overload of resize that accepts Dims
   //!
-  void resize(const nvinfer1::Dims& dims) {
-    return this->resize(Volume(dims));
-  }
+  void resize(const nvinfer1::Dims& dims) { return this->resize(Volume(dims)); }
 
   ~FDGenericBuffer() { freeFn(mBuffer); }
 
@@ -183,11 +180,14 @@ class FDTrtLogger : public nvinfer1::ILogger {
     logger = new FDTrtLogger();
     return logger;
   }
-  void log(nvinfer1::ILogger::Severity severity, const char* msg) noexcept override {
+  void log(nvinfer1::ILogger::Severity severity,
+           const char* msg) noexcept override {
     if (severity == nvinfer1::ILogger::Severity::kINFO) {
-      FDINFO << msg << std::endl;
+      // Disable this log
+      // FDINFO << msg << std::endl;
    } else if (severity == nvinfer1::ILogger::Severity::kWARNING) {
-      FDWARNING << msg << std::endl;
+      // Disable this log
+      // FDWARNING << msg << std::endl;
    } else if (severity == nvinfer1::ILogger::Severity::kERROR) {
       FDERROR << msg << std::endl;
     } else if (severity == nvinfer1::ILogger::Severity::kINTERNAL_ERROR) {
@@ -196,4 +196,47 @@ class FDTrtLogger : public nvinfer1::ILogger {
   }
 };
 
-}  // namespace fastdeploy
+struct ShapeRangeInfo {
+  ShapeRangeInfo(const std::vector<int64_t>& new_shape) {
+    shape.assign(new_shape.begin(), new_shape.end());
+    min.resize(new_shape.size());
+    max.resize(new_shape.size());
+    is_static.resize(new_shape.size());
+    for (size_t i = 0; i < new_shape.size(); ++i) {
+      if (new_shape[i] > 0) {
+        min[i] = new_shape[i];
+        max[i] = new_shape[i];
+        is_static[i] = 1;
+      } else {
+        min[i] = -1;
+        max[i] = -1;
+        is_static[i] = 0;
+      }
+    }
+  }
+
+  std::string name;
+  std::vector<int64_t> shape;
+  std::vector<int64_t> min;
+  std::vector<int64_t> max;
+  std::vector<int64_t> opt;
+  std::vector<int8_t> is_static;
+  // Return value of Update():
+  // -1: the new shape is illegal
+  //  0: the new shape can be inferenced directly
+  //  1: the new shape is out of range, and the engine needs to be rebuilt
+  int Update(const std::vector<int64_t>& new_shape);
+  int Update(const std::vector<int32_t>& new_shape) {
+    std::vector<int64_t> new_shape_int64(new_shape.begin(), new_shape.end());
+    return Update(new_shape_int64);
+  }
+
+  friend std::ostream& operator<<(std::ostream& out,
+                                  const ShapeRangeInfo& info) {
+    out << "Input name: " << info.name << ", shape=" << info.shape
+        << ", min=" << info.min << ", max=" << info.max << std::endl;
+    return out;
+  }
+};
+
+}  // namespace fastdeploy
diff --git a/external/eigen.cmake b/external/eigen.cmake
index 2248ee0fd..cbd9b1310 100644
--- a/external/eigen.cmake
+++ b/external/eigen.cmake
@@ -17,7 +17,8 @@ include(ExternalProject)
 # update eigen to the commit id f612df27 on 03/16/2021
 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
 set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
-set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
+#set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
+set(EIGEN_REPOSITORY https://gitee.com/jiangjiajun/eigen.git)
 set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee)
 
 if(WIN32)
diff --git a/external/paddle2onnx.cmake b/external/paddle2onnx.cmake
index ae6f4acda..de52b6abc 100644
--- a/external/paddle2onnx.cmake
+++ b/external/paddle2onnx.cmake
@@ -42,8 +42,8 @@ else()
       CACHE FILEPATH "paddle2onnx compile library." FORCE)
 endif(WIN32)
 
-set(PADDLE2ONNX_URL_BASE "https://bj.bcebos.com/paddle2onnx/libs/")
-set(PADDLE2ONNX_VERSION "1.0.0rc3")
+set(PADDLE2ONNX_URL_BASE "https://bj.bcebos.com/fastdeploy/third_libs/")
+set(PADDLE2ONNX_VERSION "1.0.1")
 if(WIN32)
   set(PADDLE2ONNX_FILE "paddle2onnx-win-x64-${PADDLE2ONNX_VERSION}.zip")
   if(NOT CMAKE_CL_64)
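
Usage note: to make the new workflow concrete, the sketch below shows how a caller could exercise this backend so the engine is built once at load time instead of being rebuilt during inference: predefine the shape range of each dynamic input through TrtBackendOption (the same min_shape/opt_shape/max_shape maps this patch reads in InitFromOnnx), and set serialize_file so the built engine is cached and picked up by LoadTrtCache on the next run. This is a minimal sketch against the types visible in this diff only; the map value types, the file names, and the input name "x" are illustrative assumptions, not verified against the full headers.

// Hedged usage sketch for the rebuilt TensorRT backend in this patch.
// Assumes TrtBackendOption stores shapes as std::map<std::string,
// std::vector<int32_t>>; inferred from this diff, not from the full headers.
#include "fastdeploy/backends/tensorrt/trt_backend.h"

int main() {
  fastdeploy::TrtBackendOption option;
  option.gpu_id = 0;
  // Fully define the range of the dynamic input "x" so that
  // CanBuildEngine() succeeds and the engine is built during
  // InitFromOnnx() instead of being deferred (and possibly
  // rebuilt) inside Infer().
  option.min_shape["x"] = {1, 3, 224, 224};
  option.opt_shape["x"] = {4, 3, 224, 224};
  option.max_shape["x"] = {16, 3, 224, 224};
  // Cache the serialized engine; the next run detects this file
  // and goes through LoadTrtCache() instead of rebuilding from ONNX.
  option.serialize_file = "model.trt";

  fastdeploy::TrtBackend backend;
  if (!backend.InitFromOnnx("model.onnx", option)) {
    return -1;
  }
  // Any batch size in [1, 16] now runs without triggering the
  // ShapeRangeInfoUpdated() -> BuildTrtEngine() rebuild path.
  return 0;
}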
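
The rebuild decision itself hinges on the three return values of ShapeRangeInfo::Update() documented in utils.h (-1 illegal, 0 in range, 1 out of range, rebuild needed). A minimal illustration of that contract, using only the struct defined in this patch:

// Sketch of the ShapeRangeInfo::Update() contract from utils.h/utils.cc.
#include "fastdeploy/backends/tensorrt/utils.h"
#include <cassert>
#include <vector>

int main() {
  // -1 marks dimension 0 (the batch) as dynamic; 3/224/224 are static.
  fastdeploy::ShapeRangeInfo info({-1, 3, 224, 224});
  info.name = "x";

  // First concrete shape: the collected range grows, engine must be built.
  assert(info.Update(std::vector<int64_t>{4, 3, 224, 224}) == 1);
  // Same shape again: inside the collected range, no rebuild.
  assert(info.Update(std::vector<int64_t>{4, 3, 224, 224}) == 0);
  // Larger batch: out of the collected range, rebuild again.
  assert(info.Update(std::vector<int64_t>{8, 3, 224, 224}) == 1);
  // Changing a static dimension is illegal.
  assert(info.Update(std::vector<int64_t>{4, 1, 224, 224}) == -1);
  return 0;
}

This mirrors what TrtBackend::ShapeRangeInfoUpdated() does per input inside Infer(): any return of 1 triggers BuildTrtEngine() with the widened min/max profile.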