[Other] FastDeploy TensorRT && ONNX backends support loading models from memory (#1130)

* Update all backends to load models from a memory buffer

* Delete redundant code

* Format code style

* Format code style

* Delete redundant code

* Delete redundant code

* Add some FDASSERTs

* Update load model from memory when cloning engine

* Update clone engine code

* Update set_model_buffer API parameters to char pointers

* Release memory buffer variables after backend initialization finishes

* Fix conflict

* Fix bug
huangjianhui
2023-02-01 11:36:09 +08:00
committed by GitHub
parent 5b7728e898
commit 76df90afc3
17 changed files with 201 additions and 154 deletions
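
For orientation, here is a minimal sketch (not part of the commit) of the workflow it enables: read the model and parameter files into `std::string` buffers and hand them to the runtime through the new string-based `SetModelBuffer` API. File names, the helper function, and the ONNX Runtime backend choice are placeholders.

```cpp
// Sketch only: load a Paddle model from memory with the new API.
#include <fstream>
#include <sstream>
#include <string>

#include "fastdeploy/runtime.h"

static std::string ReadFileToString(const std::string& path) {
  std::ifstream fin(path, std::ios::binary);
  std::ostringstream ss;
  ss << fin.rdbuf();
  return ss.str();
}

int main() {
  std::string model_buffer = ReadFileToString("model.pdmodel");
  std::string params_buffer = ReadFileToString("model.pdiparams");

  fastdeploy::RuntimeOption option;
  // New signature: buffers as std::string, sizes carried implicitly.
  option.SetModelBuffer(model_buffer, params_buffer,
                        fastdeploy::ModelFormat::PADDLE);
  option.UseOrtBackend();  // TensorRT and Paddle Inference also accept buffers

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {
    return -1;
  }
  return 0;
}
```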


@@ -84,8 +84,6 @@ void BindRuntime(pybind11::module& m) {
       .def_readwrite("backend", &RuntimeOption::backend)
       .def_readwrite("external_stream", &RuntimeOption::external_stream_)
       .def_readwrite("model_from_memory", &RuntimeOption::model_from_memory_)
-      .def_readwrite("model_buffer_size", &RuntimeOption::model_buffer_size_)
-      .def_readwrite("params_buffer_size", &RuntimeOption::params_buffer_size_)
       .def_readwrite("cpu_thread_num", &RuntimeOption::cpu_thread_num)
       .def_readwrite("device_id", &RuntimeOption::device_id)
       .def_readwrite("device", &RuntimeOption::device)


@@ -21,6 +21,7 @@
 #include "fastdeploy/core/fd_tensor.h"
 #include "fastdeploy/core/fd_type.h"
+#include "fastdeploy/runtime/runtime_option.h"
 
 namespace fastdeploy {
@@ -76,7 +77,8 @@ class BaseBackend {
   // Optional: For those backends which can share memory
   // while creating multiple inference engines with same model file
-  virtual std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
+  virtual std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
+                                             void *stream = nullptr,
                                              int device_id = -1) {
     FDERROR << "Clone no support" << std::endl;
     return nullptr;

fastdeploy/runtime/backends/openvino/ov_backend.cc (Executable file → Normal file)

@@ -237,7 +237,6 @@ bool OpenVINOBackend::InitFromOnnx(const std::string& model_file,
   option_ = option;
   std::shared_ptr<ov::Model> model = core_.read_model(model_file);
-
   if (option_.shape_infos.size() > 0) {
     std::map<std::string, ov::PartialShape> shape_infos;
     for (const auto& item : option_.shape_infos) {
@@ -380,8 +379,8 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
   return true;
 }
 
-std::unique_ptr<BaseBackend> OpenVINOBackend::Clone(void* stream,
-                                                    int device_id) {
+std::unique_ptr<BaseBackend> OpenVINOBackend::Clone(
+    RuntimeOption& runtime_option, void* stream, int device_id) {
   std::unique_ptr<BaseBackend> new_backend =
       utils::make_unique<OpenVINOBackend>();
   auto casted_backend = dynamic_cast<OpenVINOBackend*>(new_backend.get());


@@ -52,7 +52,8 @@ class OpenVINOBackend : public BaseBackend {
   std::vector<TensorInfo> GetInputInfos() override;
   std::vector<TensorInfo> GetOutputInfos() override;
 
-  std::unique_ptr<BaseBackend> Clone(void* stream = nullptr,
+  std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
+                                     void* stream = nullptr,
                                      int device_id = -1) override;
 
  private:


@@ -73,8 +73,8 @@ void OrtBackend::BuildOption(const OrtBackendOption& option) {
   }
 }
 
-bool OrtBackend::InitFromPaddle(const std::string& model_file,
-                                const std::string& params_file,
+bool OrtBackend::InitFromPaddle(const std::string& model_buffer,
+                                const std::string& params_buffer,
                                 const OrtBackendOption& option, bool verbose) {
   if (initialized_) {
     FDERROR << "OrtBackend is already initlized, cannot initialize again."
@@ -92,7 +92,8 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
   strcpy(ops[1].op_name, "pool2d");
   strcpy(ops[1].export_op_name, "AdaptivePool2d");
-  if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
+  if (!paddle2onnx::Export(model_buffer.c_str(), model_buffer.size(),
+                           params_buffer.c_str(), params_buffer.size(),
                            &model_content_ptr, &model_content_size, 11, true,
                            verbose, true, true, true, ops.data(), 2,
                            "onnxruntime", nullptr, 0, "", &save_external)) {
@@ -112,9 +113,8 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
                 model_file_name.c_str());
     f << onnx_model_proto;
     f.close();
-    return InitFromOnnx(model_file_name, option, false);
   }
-  return InitFromOnnx(onnx_model_proto, option, true);
+  return InitFromOnnx(onnx_model_proto, option);
 #else
   FDERROR << "Didn't compile with PaddlePaddle Frontend, you can try to "
              "call `InitFromOnnx` instead."
@@ -124,8 +124,7 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
 }
 
 bool OrtBackend::InitFromOnnx(const std::string& model_file,
-                              const OrtBackendOption& option,
-                              bool from_memory_buffer) {
+                              const OrtBackendOption& option) {
   if (initialized_) {
     FDERROR << "OrtBackend is already initlized, cannot initialize again."
             << std::endl;
@@ -134,17 +133,7 @@ bool OrtBackend::InitFromOnnx(const std::string& model_file,
   BuildOption(option);
   InitCustomOperators();
-  if (from_memory_buffer) {
-    session_ = {env_, model_file.data(), model_file.size(), session_options_};
-  } else {
-#ifdef _WIN32
-    session_ = {env_,
-                std::wstring(model_file.begin(), model_file.end()).c_str(),
-                session_options_};
-#else
-    session_ = {env_, model_file.c_str(), session_options_};
-#endif
-  }
+  session_ = {env_, model_file.data(), model_file.size(), session_options_};
 
   binding_ = std::make_shared<Ort::IoBinding>(session_);
   Ort::MemoryInfo memory_info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);


@@ -39,14 +39,13 @@ class OrtBackend : public BaseBackend {
   void BuildOption(const OrtBackendOption& option);
 
-  bool InitFromPaddle(const std::string& model_file,
-                      const std::string& params_file,
+  bool InitFromPaddle(const std::string& model_buffer,
+                      const std::string& params_buffer,
                       const OrtBackendOption& option = OrtBackendOption(),
                       bool verbose = false);
 
-  bool InitFromOnnx(const std::string& model_file,
-                    const OrtBackendOption& option = OrtBackendOption(),
-                    bool from_memory_buffer = false);
+  bool InitFromOnnx(const std::string& model_buffer,
+                    const OrtBackendOption& option = OrtBackendOption());
 
   bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
              bool copy_to_fd = true) override;


@@ -39,10 +39,7 @@ struct PaddleBackendOption {
   std::string model_file = "";   // Path of model file
   std::string params_file = "";  // Path of parameters file, can be empty
-  std::string model_buffer_ = "";  // load model and paramters from memory
-  std::string params_buffer_ = "";
-  size_t model_buffer_size_ = 0;
-  size_t params_buffer_size_ = 0;
   bool model_from_memory_ = false;
 
 #ifdef WITH_GPU


@@ -89,9 +89,10 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
   }
 }
 
-bool PaddleBackend::InitFromPaddle(const std::string& model_file,
-                                   const std::string& params_file,
+bool PaddleBackend::InitFromPaddle(const std::string& model_buffer,
+                                   const std::string& params_buffer,
                                    const PaddleBackendOption& option) {
+  // bool PaddleBackend::InitFromPaddle(const std::string& contents) {
   if (initialized_) {
     FDERROR << "PaddleBackend is already initlized, cannot initialize again."
             << std::endl;
@@ -102,16 +103,9 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_file,
   // PaddleReader instead now
   std::string contents;
-  if (option.model_from_memory_) {
-    config_.SetModelBuffer(model_file.c_str(), option.model_buffer_size_,
-                           params_file.c_str(), option.params_buffer_size_);
-    contents = model_file;
-  } else {
-    config_.SetModel(model_file, params_file);
-    if (!ReadBinaryFromFile(model_file, &contents)) {
-      return false;
-    }
-  }
+  config_.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
+                         params_buffer.c_str(), params_buffer.size());
+  contents = model_buffer;
   config_.EnableMemoryOptim();
   BuildOption(option);
   auto reader = paddle2onnx::PaddleReader(contents.c_str(), contents.size());
@@ -172,20 +166,16 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_file,
     // Set the shape info file.
     std::string curr_model_dir = "./";
     if (!option.model_from_memory_) {
-      curr_model_dir = GetDirFromPath(model_file);
+      curr_model_dir = GetDirFromPath(option.model_file);
     }
     std::string shape_range_info =
         PathJoin(curr_model_dir, "shape_range_info.pbtxt");
     if (!CheckFileExists(shape_range_info)) {
       FDINFO << "Start generating shape range info file." << std::endl;
       paddle_infer::Config analysis_config;
-      if (option.model_from_memory_) {
-        analysis_config.SetModelBuffer(
-            model_file.c_str(), option.model_buffer_size_, params_file.c_str(),
-            option.params_buffer_size_);
-      } else {
-        analysis_config.SetModel(model_file, params_file);
-      }
+      analysis_config.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
+                                     params_buffer.c_str(),
+                                     params_buffer.size());
       analysis_config.CollectShapeRangeInfo(shape_range_info);
       auto predictor_tmp = paddle_infer::CreatePredictor(analysis_config);
       std::map<std::string, std::vector<int>> max_shape;
@@ -258,7 +248,8 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
   return true;
 }
 
-std::unique_ptr<BaseBackend> PaddleBackend::Clone(void* stream, int device_id) {
+std::unique_ptr<BaseBackend> PaddleBackend::Clone(RuntimeOption& runtime_option,
+                                                  void* stream, int device_id) {
   std::unique_ptr<BaseBackend> new_backend =
       utils::make_unique<PaddleBackend>();
   auto casted_backend = dynamic_cast<PaddleBackend*>(new_backend.get());
@@ -266,8 +257,27 @@ std::unique_ptr<BaseBackend> PaddleBackend::Clone(void* stream, int device_id) {
     auto clone_option = option_;
     clone_option.gpu_id = device_id;
     clone_option.external_stream_ = stream;
-    casted_backend->InitFromPaddle(clone_option.model_file,
-                                   clone_option.params_file, clone_option);
+    if (runtime_option.model_from_memory_) {
+      FDASSERT(
+          casted_backend->InitFromPaddle(runtime_option.model_buffer_,
+                                         runtime_option.params_buffer_,
+                                         clone_option),
+          "Clone model from Paddle failed while initialize PaddleBackend.");
+    } else {
+      std::string model_buffer = "";
+      std::string params_buffer = "";
+      FDASSERT(
+          ReadBinaryFromFile(clone_option.model_file, &model_buffer),
+          "Fail to read binary from model file while cloning PaddleBackend");
+      FDASSERT(ReadBinaryFromFile(clone_option.params_file, &params_buffer),
+               "Fail to read binary from parameter file while cloning "
+               "PaddleBackend");
+      FDASSERT(
+          casted_backend->InitFromPaddle(model_buffer, params_buffer,
+                                         clone_option),
+          "Clone model from Paddle failed while initialize PaddleBackend.");
+    }
     FDWARNING << "The target device id:" << device_id
               << " is different from current device id:" << option_.gpu_id
               << ", cannot share memory with current engine." << std::endl;


@@ -53,8 +53,8 @@ class PaddleBackend : public BaseBackend {
   virtual ~PaddleBackend() = default;
   void BuildOption(const PaddleBackendOption& option);
 
-  bool
-  InitFromPaddle(const std::string& model_file, const std::string& params_file,
-                 const PaddleBackendOption& option = PaddleBackendOption());
+  bool InitFromPaddle(const std::string& model_buffer,
+                      const std::string& params_buffer,
+                      const PaddleBackendOption& option = PaddleBackendOption());
 
   bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
@@ -64,7 +64,8 @@ class PaddleBackend : public BaseBackend {
   int NumOutputs() const override { return outputs_desc_.size(); }
 
-  std::unique_ptr<BaseBackend> Clone(void* stream = nullptr,
+  std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
+                                     void* stream = nullptr,
                                      int device_id = -1) override;
 
   TensorInfo GetInputInfo(int index) override;


@@ -24,6 +24,7 @@ namespace fastdeploy {
 struct TrtBackendOption {
   std::string model_file = "";   // Path of model file
   std::string params_file = "";  // Path of parameters file, can be empty
+
   // format of input model
   ModelFormat model_format = ModelFormat::AUTOREC;


@@ -113,8 +113,8 @@ bool TrtBackend::LoadTrtCache(const std::string& trt_engine_file) {
   return true;
 }
 
-bool TrtBackend::InitFromPaddle(const std::string& model_file,
-                                const std::string& params_file,
+bool TrtBackend::InitFromPaddle(const std::string& model_buffer,
+                                const std::string& params_buffer,
                                 const TrtBackendOption& option, bool verbose) {
   if (initialized_) {
     FDERROR << "TrtBackend is already initlized, cannot initialize again."
@@ -132,7 +132,8 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
   int model_content_size = 0;
   char* calibration_cache_ptr;
   int calibration_cache_size = 0;
-  if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
+  if (!paddle2onnx::Export(model_buffer.c_str(), model_buffer.size(),
+                           params_buffer.c_str(), params_buffer.size(),
                            &model_content_ptr, &model_content_size, 11, true,
                            verbose, true, true, true, ops.data(), 1, "tensorrt",
                            &calibration_cache_ptr, &calibration_cache_size, "",
@@ -141,7 +142,6 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
             << std::endl;
     return false;
   }
-
   std::string onnx_model_proto(model_content_ptr,
                                model_content_ptr + model_content_size);
   delete[] model_content_ptr;
@@ -159,9 +159,8 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
                 model_file_name_.c_str());
     f << onnx_model_proto;
     f.close();
-    return InitFromOnnx(model_file_name_, option, false);
   }
-  return InitFromOnnx(onnx_model_proto, option, true);
+  return InitFromOnnx(onnx_model_proto, option);
 #else
   FDERROR << "Didn't compile with PaddlePaddle frontend, you can try to "
              "call `InitFromOnnx` instead."
@@ -170,9 +169,8 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
 #endif
 }
 
-bool TrtBackend::InitFromOnnx(const std::string& model_file,
-                              const TrtBackendOption& option,
-                              bool from_memory_buffer) {
+bool TrtBackend::InitFromOnnx(const std::string& model_buffer,
+                              const TrtBackendOption& option) {
   if (initialized_) {
     FDERROR << "TrtBackend is already initlized, cannot initialize again."
             << std::endl;
@@ -181,22 +179,7 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
   option_ = option;
   cudaSetDevice(option_.gpu_id);
 
-  std::string onnx_content = "";
-  if (!from_memory_buffer) {
-    std::ifstream fin(model_file.c_str(), std::ios::binary | std::ios::in);
-    if (!fin) {
-      FDERROR << "[ERROR] Failed to open ONNX model file: " << model_file
-              << std::endl;
-      return false;
-    }
-    fin.seekg(0, std::ios::end);
-    onnx_content.resize(fin.tellg());
-    fin.seekg(0, std::ios::beg);
-    fin.read(&(onnx_content.at(0)), onnx_content.size());
-    fin.close();
-  } else {
-    onnx_content = model_file;
-  }
+  std::string onnx_content = model_buffer;
 
   // This part of code will record the original outputs order
   // because the converted tensorrt network may exist wrong order of outputs
@@ -739,21 +722,41 @@ std::vector<TensorInfo> TrtBackend::GetOutputInfos() {
   return infos;
 }
 
-std::unique_ptr<BaseBackend> TrtBackend::Clone(void* stream, int device_id) {
+std::unique_ptr<BaseBackend> TrtBackend::Clone(RuntimeOption& runtime_option,
+                                               void* stream, int device_id) {
   std::unique_ptr<BaseBackend> new_backend = utils::make_unique<TrtBackend>();
   auto casted_backend = dynamic_cast<TrtBackend*>(new_backend.get());
   if (device_id > 0 && device_id != option_.gpu_id) {
     auto clone_option = option_;
     clone_option.gpu_id = device_id;
     clone_option.external_stream_ = stream;
-    if (option_.model_format == ModelFormat::ONNX) {
-      FDASSERT(casted_backend->InitFromOnnx(option_.model_file, clone_option),
-               "Clone model from ONNX failed while initialize TrtBackend.");
-    } else {
-      FDASSERT(casted_backend->InitFromPaddle(
-                   option_.model_file, option_.params_file, clone_option),
-               "Clone model from Paddle failed while initialize TrtBackend.");
-    }
+    if (runtime_option.model_from_memory_) {
+      FDASSERT(casted_backend->InitFromPaddle(runtime_option.model_buffer_,
+                                              runtime_option.params_buffer_,
+                                              clone_option),
+               "Clone model from Paddle failed while initialize TrtBackend.");
+    } else {
+      if (option_.model_format == ModelFormat::ONNX) {
+        std::string model_buffer = "";
+        FDASSERT(
+            ReadBinaryFromFile(clone_option.model_file, &model_buffer),
+            "Fail to read binary from model file while cloning TrtBackend");
+        FDASSERT(casted_backend->InitFromOnnx(model_buffer, clone_option),
+                 "Clone model from ONNX failed while initialize TrtBackend.");
+      } else {
+        std::string model_buffer = "";
+        std::string params_buffer = "";
+        FDASSERT(
+            ReadBinaryFromFile(clone_option.model_file, &model_buffer),
+            "Fail to read binary from model file while cloning TrtBackend");
+        FDASSERT(
+            ReadBinaryFromFile(clone_option.params_file, &params_buffer),
+            "Fail to read binary from parameter file while cloning TrtBackend");
+        FDASSERT(casted_backend->InitFromPaddle(model_buffer, params_buffer,
+                                                clone_option),
+                 "Clone model from Paddle failed while initialize TrtBackend.");
+      }
+    }
     FDWARNING << "The target device id:" << device_id
               << " is different from current device id:" << option_.gpu_id
               << ", cannot share memory with current engine." << std::endl;


@@ -72,13 +72,12 @@ class TrtBackend : public BaseBackend {
   TrtBackend() : engine_(nullptr), context_(nullptr) {}
   void BuildOption(const TrtBackendOption& option);
 
-  bool InitFromPaddle(const std::string& model_file,
-                      const std::string& params_file,
+  bool InitFromPaddle(const std::string& model_buffer,
+                      const std::string& params_buffer,
                       const TrtBackendOption& option = TrtBackendOption(),
                       bool verbose = false);
-  bool InitFromOnnx(const std::string& model_file,
-                    const TrtBackendOption& option = TrtBackendOption(),
-                    bool from_memory_buffer = false);
+  bool InitFromOnnx(const std::string& model_buffer,
+                    const TrtBackendOption& option = TrtBackendOption());
 
   bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
              bool copy_to_fd = true) override;
@@ -88,7 +87,8 @@ class TrtBackend : public BaseBackend {
   TensorInfo GetOutputInfo(int index);
   std::vector<TensorInfo> GetInputInfos() override;
   std::vector<TensorInfo> GetOutputInfos() override;
-  std::unique_ptr<BaseBackend> Clone(void* stream = nullptr,
+  std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
+                                     void* stream = nullptr,
                                      int device_id = -1) override;
 
   ~TrtBackend() {


@@ -208,6 +208,15 @@ FDTensor* Runtime::GetOutputTensor(const std::string& name) {
   return nullptr;
 }
 
+void Runtime::ReleaseModelMemoryBuffer() {
+  if (option.model_from_memory_) {
+    option.model_buffer_.clear();
+    option.model_buffer_.shrink_to_fit();
+    option.params_buffer_.clear();
+    option.params_buffer_.shrink_to_fit();
+  }
+}
+
 void Runtime::CreatePaddleBackend() {
   FDASSERT(
       option.device == Device::CPU || option.device == Device::GPU ||
@@ -231,12 +240,6 @@ void Runtime::CreatePaddleBackend() {
   pd_option.enable_pinned_memory = option.enable_pinned_memory;
   pd_option.external_stream_ = option.external_stream_;
   pd_option.model_from_memory_ = option.model_from_memory_;
-  if (pd_option.model_from_memory_) {
-    pd_option.model_buffer_ = option.model_buffer_;
-    pd_option.params_buffer_ = option.params_buffer_;
-    pd_option.model_buffer_size_ = option.model_buffer_size_;
-    pd_option.params_buffer_size_ = option.params_buffer_size_;
-  }
 #ifdef ENABLE_TRT_BACKEND
   if (pd_option.use_gpu && option.pd_enable_trt) {
     pd_option.enable_trt = true;
@@ -276,9 +279,16 @@ void Runtime::CreatePaddleBackend() {
     FDASSERT(casted_backend->InitFromPaddle(option.model_buffer_,
                                             option.params_buffer_, pd_option),
              "Load model from Paddle failed while initliazing PaddleBackend.");
+    ReleaseModelMemoryBuffer();
   } else {
-    FDASSERT(casted_backend->InitFromPaddle(option.model_file,
-                                            option.params_file, pd_option),
+    std::string model_buffer = "";
+    std::string params_buffer = "";
+    FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
+             "Fail to read binary from model file");
+    FDASSERT(ReadBinaryFromFile(option.params_file, &params_buffer),
+             "Fail to read binary from parameter file");
+    FDASSERT(
+        casted_backend->InitFromPaddle(model_buffer, params_buffer, pd_option),
              "Load model from Paddle failed while initliazing PaddleBackend.");
   }
 #else
@@ -291,6 +301,10 @@ void Runtime::CreatePaddleBackend() {
 }
 
 void Runtime::CreateOpenVINOBackend() {
+  // TODO(huangjianhui) OpenVINO only supports to load ONNX format model from
+  // memory Temporarily disable this function
+  FDASSERT(option.model_from_memory_ == false,
+           "OpenVINOBackend don't support to load model from memory");
   FDASSERT(option.device == Device::CPU,
            "Backend::OPENVINO only supports Device::CPU");
   FDASSERT(option.model_format == ModelFormat::PADDLE ||
@@ -342,16 +356,37 @@ void Runtime::CreateOrtBackend() {
   ort_option.use_gpu = (option.device == Device::GPU) ? true : false;
   ort_option.gpu_id = option.device_id;
   ort_option.external_stream_ = option.external_stream_;
 
   backend_ = utils::make_unique<OrtBackend>();
   auto casted_backend = dynamic_cast<OrtBackend*>(backend_.get());
   if (option.model_format == ModelFormat::ONNX) {
-    FDASSERT(casted_backend->InitFromOnnx(option.model_file, ort_option),
-             "Load model from ONNX failed while initliazing OrtBackend.");
+    if (option.model_from_memory_) {
+      FDASSERT(casted_backend->InitFromOnnx(option.model_buffer_, ort_option),
+               "Load model from ONNX failed while initliazing OrtBackend.");
+      ReleaseModelMemoryBuffer();
+    } else {
+      std::string model_buffer = "";
+      FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
+               "Fail to read binary from model file");
+      FDASSERT(casted_backend->InitFromOnnx(model_buffer, ort_option),
+               "Load model from ONNX failed while initliazing OrtBackend.");
+    }
   } else {
-    FDASSERT(casted_backend->InitFromPaddle(option.model_file,
-                                            option.params_file, ort_option),
-             "Load model from Paddle failed while initliazing OrtBackend.");
+    if (option.model_from_memory_) {
+      FDASSERT(casted_backend->InitFromPaddle(
+                   option.model_buffer_, option.params_buffer_, ort_option),
+               "Load model from Paddle failed while initliazing OrtBackend.");
+      ReleaseModelMemoryBuffer();
+    } else {
+      std::string model_buffer = "";
+      std::string params_buffer = "";
+      FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
+               "Fail to read binary from model file");
+      FDASSERT(ReadBinaryFromFile(option.params_file, &params_buffer),
+               "Fail to read binary from parameter file");
+      FDASSERT(casted_backend->InitFromPaddle(model_buffer, params_buffer,
+                                              ort_option),
+               "Load model from Paddle failed while initliazing OrtBackend.");
+    }
   }
 #else
   FDASSERT(false,
@@ -385,16 +420,37 @@ void Runtime::CreateTrtBackend() {
   trt_option.serialize_file = option.trt_serialize_file;
   trt_option.enable_pinned_memory = option.enable_pinned_memory;
   trt_option.external_stream_ = option.external_stream_;
 
   backend_ = utils::make_unique<TrtBackend>();
   auto casted_backend = dynamic_cast<TrtBackend*>(backend_.get());
   if (option.model_format == ModelFormat::ONNX) {
-    FDASSERT(casted_backend->InitFromOnnx(option.model_file, trt_option),
-             "Load model from ONNX failed while initliazing TrtBackend.");
+    if (option.model_from_memory_) {
+      FDASSERT(casted_backend->InitFromOnnx(option.model_buffer_, trt_option),
+               "Load model from ONNX failed while initliazing TrtBackend.");
+      ReleaseModelMemoryBuffer();
+    } else {
+      std::string model_buffer = "";
+      FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
+               "Fail to read binary from model file");
+      FDASSERT(casted_backend->InitFromOnnx(model_buffer, trt_option),
+               "Load model from ONNX failed while initliazing TrtBackend.");
+    }
   } else {
-    FDASSERT(casted_backend->InitFromPaddle(option.model_file,
-                                            option.params_file, trt_option),
-             "Load model from Paddle failed while initliazing TrtBackend.");
+    if (option.model_from_memory_) {
+      FDASSERT(casted_backend->InitFromPaddle(
+                   option.model_buffer_, option.params_buffer_, trt_option),
+               "Load model from Paddle failed while initliazing TrtBackend.");
+      ReleaseModelMemoryBuffer();
+    } else {
+      std::string model_buffer = "";
+      std::string params_buffer = "";
+      FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
+               "Fail to read binary from model file");
+      FDASSERT(ReadBinaryFromFile(option.params_file, &params_buffer),
+               "Fail to read binary from parameter file");
+      FDASSERT(casted_backend->InitFromPaddle(model_buffer, params_buffer,
+                                              trt_option),
+               "Load model from Paddle failed while initliazing TrtBackend.");
+    }
   }
 #else
   FDASSERT(false,
@@ -406,6 +462,9 @@ void Runtime::CreateTrtBackend() {
 }
 
 void Runtime::CreateLiteBackend() {
+#ifdef ENABLE_LITE_BACKEND
+  FDASSERT(option.model_from_memory_ == false,
+           "LiteBackend don't support to load model from memory");
   FDASSERT(option.device == Device::CPU || option.device == Device::TIMVX ||
                option.device == Device::KUNLUNXIN ||
                option.device == Device::ASCEND,
@@ -413,7 +472,6 @@ void Runtime::CreateLiteBackend() {
            "Device::CPU/Device::TIMVX/Device::KUNLUNXIN/Device::ASCEND.");
   FDASSERT(option.model_format == ModelFormat::PADDLE,
            "LiteBackend only support model format of ModelFormat::PADDLE");
-#ifdef ENABLE_LITE_BACKEND
   backend_ = utils::make_unique<LiteBackend>();
   auto casted_backend = dynamic_cast<LiteBackend*>(backend_.get());
   FDASSERT(casted_backend->InitFromPaddle(option.model_file, option.params_file,
@@ -429,6 +487,8 @@ void Runtime::CreateLiteBackend() {
 }
 
 void Runtime::CreateRKNPU2Backend() {
+  FDASSERT(option.model_from_memory_ == false,
+           "RKNPU2Backend don't support to load model from memory");
   FDASSERT(option.device == Device::RKNPU,
            "Backend::RKNPU2 only supports Device::RKNPU2");
   FDASSERT(option.model_format == ModelFormat::RKNN,
@@ -451,11 +511,14 @@ void Runtime::CreateRKNPU2Backend() {
 }
 
 void Runtime::CreateSophgoNPUBackend() {
+#ifdef ENABLE_SOPHGO_BACKEND
+  auto sophgo_option = SophgoBackendOption();
+  FDASSERT(option.model_from_memory_ == false,
+           "SophgoBackend don't support to load model from memory");
   FDASSERT(option.device == Device::SOPHGOTPUD,
            "Backend::SOPHGO only supports Device::SOPHGO");
   FDASSERT(option.model_format == ModelFormat::SOPHGO,
            "SophgoBackend only support model format of ModelFormat::SOPHGO");
-#ifdef ENABLE_SOPHGO_BACKEND
   auto sophgo_option = SophgoBackendOption();
   backend_ = utils::make_unique<SophgoBackend>();
   auto casted_backend = dynamic_cast<SophgoBackend*>(backend_.get());
@@ -486,7 +549,7 @@ Runtime* Runtime::Clone(void* stream, int device_id) {
   FDINFO << "Runtime Clone with Backend:: " << option.backend << " in "
          << option.device << "." << std::endl;
   runtime->option = option;
-  runtime->backend_ = backend_->Clone(stream, device_id);
+  runtime->backend_ = backend_->Clone(option, stream, device_id);
 
   return runtime;
 }


@@ -83,6 +83,8 @@ struct FASTDEPLOY_DECL Runtime {
    */
   Runtime* Clone(void* stream = nullptr, int device_id = -1);
 
+  void ReleaseModelMemoryBuffer();
+
   RuntimeOption option;
 
   /** \brief Compile TorchScript Module, only for Poros backend


@@ -38,29 +38,21 @@ void RuntimeOption::SetModelPath(const std::string& model_path,
   }
 }
 
-void RuntimeOption::SetModelBuffer(const char* model_buffer,
-                                   size_t model_buffer_size,
-                                   const char* params_buffer,
-                                   size_t params_buffer_size,
+void RuntimeOption::SetModelBuffer(const std::string& model_buffer,
+                                   const std::string& params_buffer,
                                    const ModelFormat& format) {
-  model_buffer_size_ = model_buffer_size;
-  params_buffer_size_ = params_buffer_size;
   model_from_memory_ = true;
   if (format == ModelFormat::PADDLE) {
-    model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
-    params_buffer_ =
-        std::string(params_buffer, params_buffer + params_buffer_size);
+    model_buffer_ = model_buffer;
+    params_buffer_ = params_buffer;
     model_format = ModelFormat::PADDLE;
   } else if (format == ModelFormat::ONNX) {
-    model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
+    model_buffer_ = model_buffer;
     model_format = ModelFormat::ONNX;
-  } else if (format == ModelFormat::TORCHSCRIPT) {
-    model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
-    model_format = ModelFormat::TORCHSCRIPT;
   } else {
     FDASSERT(false,
              "The model format only can be "
-             "ModelFormat::PADDLE/ModelFormat::ONNX/ModelFormat::TORCHSCRIPT.");
+             "ModelFormat::PADDLE/ModelFormat::ONNX.");
   }
 }
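
For callers migrating to the new C++ signature, a small sketch of the call-site change (the wrapper function, variable names, and the header path reused from the include added in backend.h above are illustrative, not part of the commit):

```cpp
#include <string>

#include "fastdeploy/runtime/runtime_option.h"  // assumed public header

void ConfigureFromMemory(fastdeploy::RuntimeOption& option,
                         const std::string& model_buf,
                         const std::string& params_buf) {
  // Old overload (removed in this commit): raw pointers plus explicit sizes.
  // option.SetModelBuffer(model_buf.c_str(), model_buf.size(),
  //                       params_buf.c_str(), params_buf.size(),
  //                       fastdeploy::ModelFormat::PADDLE);

  // New overload: the std::string arguments carry their own sizes.
  option.SetModelBuffer(model_buf, params_buf,
                        fastdeploy::ModelFormat::PADDLE);
}
```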


@@ -50,14 +50,12 @@ struct FASTDEPLOY_DECL RuntimeOption {
   /** \brief Specify the memory buffer of model and parameter. Used when model and params are loaded directly from memory
    *
-   * \param[in] model_buffer The memory buffer of model
-   * \param[in] model_buffer_size The size of the model data
-   * \param[in] params_buffer The memory buffer of the combined parameters file
-   * \param[in] params_buffer_size The size of the combined parameters data
+   * \param[in] model_buffer The string of model memory buffer
+   * \param[in] params_buffer The string of parameters memory buffer
    * \param[in] format Format of the loaded model
    */
-  void SetModelBuffer(const char* model_buffer, size_t model_buffer_size,
-                      const char* params_buffer, size_t params_buffer_size,
+  void SetModelBuffer(const std::string& model_buffer,
+                      const std::string& params_buffer = "",
                       const ModelFormat& format = ModelFormat::PADDLE);
 
   /// Use cpu to inference, the runtime will inference on CPU by default
@@ -431,8 +429,6 @@ struct FASTDEPLOY_DECL RuntimeOption {
   std::string model_buffer_ = "";
   std::string params_buffer_ = "";
-  size_t model_buffer_size_ = 0;
-  size_t params_buffer_size_ = 0;
   bool model_from_memory_ = false;
 };


@@ -229,20 +229,14 @@ class RuntimeOption:
     def set_model_buffer(self,
                          model_buffer,
-                         model_buffer_size,
-                         params_buffer,
-                         params_buffer_size,
+                         params_buffer="",
                          model_format=ModelFormat.PADDLE):
         """Specify the memory buffer of model and parameter. Used when model and params are loaded directly from memory
 
         :param model_buffer: (bytes)The memory buffer of model
-        :param model_buffer_size: (unsigned int)The size of the model data.
-        :param params_buffer: (bytes)The memory buffer of the combined parameters file
-        :param params_buffer_size: (unsigned inst)The size of the combined parameters data
+        :param params_buffer: (bytes)The memory buffer of the parameters
         :param model_format: (ModelFormat)Format of model, support ModelFormat.PADDLE/ModelFormat.ONNX/ModelFormat.TORCHSCRIPT
         """
-        return self._option.set_model_buffer(model_buffer, model_buffer_size,
-                                             params_buffer, params_buffer_size,
-                                             model_format)
+        return self._option.set_model_buffer(model_buffer, params_buffer,
+                                             model_format)
 
     def use_gpu(self, device_id=0):
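
A hedged Python sketch of the trimmed-down two-argument call shown in the hunk above (file names are placeholders and the ONNX Runtime backend is chosen only for illustration):

```python
# Sketch only: load a Paddle model from memory with the updated API.
import fastdeploy as fd

with open("model.pdmodel", "rb") as f:
    model_buffer = f.read()
with open("model.pdiparams", "rb") as f:
    params_buffer = f.read()

option = fd.RuntimeOption()
# Buffer sizes are no longer passed explicitly.
option.set_model_buffer(model_buffer, params_buffer, fd.ModelFormat.PADDLE)
option.use_ort_backend()

runtime = fd.Runtime(option)
```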