[Other] FastDeploy TensorRT && ONNX backends support loading models from memory (#1130)

* Update all backends to load model from buffer

* Delete redundant code

* Format code style

* Format code style

* Delete redundant code

* Delete redundant code

* Add some FDASSERTs

* Update loading model from memory when cloning engine

* Update clone engine code

* Update set_model_buffer API parameters to char pointer

* Release memory buffer variables after backend initialization finishes

* Fix conflict

* Fix bug
huangjianhui authored on 2023-02-01 11:36:09 +08:00, committed by GitHub
parent 5b7728e898 · commit 76df90afc3
17 changed files with 201 additions and 154 deletions
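For context, a minimal usage sketch (not part of this commit) of the in-memory loading flow described by the commit message above. The file paths and the helper for reading files into strings are placeholders; the calls are hedged against the signatures shown in the diffs below.

#include "fastdeploy/runtime.h"

#include <fstream>
#include <sstream>
#include <string>

// Placeholder helper: read a whole file into a std::string buffer.
static std::string ReadFileToString(const std::string& path) {
  std::ifstream fin(path, std::ios::binary);
  std::ostringstream ss;
  ss << fin.rdbuf();
  return ss.str();
}

int main() {
  // Placeholder paths for an exported Paddle inference model.
  std::string model_buffer = ReadFileToString("model.pdmodel");
  std::string params_buffer = ReadFileToString("model.pdiparams");

  fastdeploy::RuntimeOption option;
  // Updated signature: buffers are passed as std::string, so the explicit
  // *_buffer_size parameters are no longer needed.
  option.SetModelBuffer(model_buffer, params_buffer,
                        fastdeploy::ModelFormat::PADDLE);

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {
    return -1;
  }
  return 0;
}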


@@ -84,8 +84,6 @@ void BindRuntime(pybind11::module& m) {
.def_readwrite("backend", &RuntimeOption::backend)
.def_readwrite("external_stream", &RuntimeOption::external_stream_)
.def_readwrite("model_from_memory", &RuntimeOption::model_from_memory_)
.def_readwrite("model_buffer_size", &RuntimeOption::model_buffer_size_)
.def_readwrite("params_buffer_size", &RuntimeOption::params_buffer_size_)
.def_readwrite("cpu_thread_num", &RuntimeOption::cpu_thread_num)
.def_readwrite("device_id", &RuntimeOption::device_id)
.def_readwrite("device", &RuntimeOption::device)


@@ -21,6 +21,7 @@
#include "fastdeploy/core/fd_tensor.h"
#include "fastdeploy/core/fd_type.h"
#include "fastdeploy/runtime/runtime_option.h"
namespace fastdeploy {
@@ -76,7 +77,8 @@ class BaseBackend {
// Optional: For those backends which can share memory
// while creating multiple inference engines with same model file
virtual std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
virtual std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
void *stream = nullptr,
int device_id = -1) {
FDERROR << "Clone no support" << std::endl;
return nullptr;

fastdeploy/runtime/backends/openvino/ov_backend.cc (Executable file → Normal file)

@@ -237,7 +237,6 @@ bool OpenVINOBackend::InitFromOnnx(const std::string& model_file,
option_ = option;
std::shared_ptr<ov::Model> model = core_.read_model(model_file);
if (option_.shape_infos.size() > 0) {
std::map<std::string, ov::PartialShape> shape_infos;
for (const auto& item : option_.shape_infos) {
@@ -380,8 +379,8 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
return true;
}
std::unique_ptr<BaseBackend> OpenVINOBackend::Clone(void* stream,
int device_id) {
std::unique_ptr<BaseBackend> OpenVINOBackend::Clone(
RuntimeOption& runtime_option, void* stream, int device_id) {
std::unique_ptr<BaseBackend> new_backend =
utils::make_unique<OpenVINOBackend>();
auto casted_backend = dynamic_cast<OpenVINOBackend*>(new_backend.get());


@@ -52,7 +52,8 @@ class OpenVINOBackend : public BaseBackend {
std::vector<TensorInfo> GetInputInfos() override;
std::vector<TensorInfo> GetOutputInfos() override;
std::unique_ptr<BaseBackend> Clone(void* stream = nullptr,
std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
void* stream = nullptr,
int device_id = -1) override;
private:


@@ -73,8 +73,8 @@ void OrtBackend::BuildOption(const OrtBackendOption& option) {
}
}
bool OrtBackend::InitFromPaddle(const std::string& model_file,
const std::string& params_file,
bool OrtBackend::InitFromPaddle(const std::string& model_buffer,
const std::string& params_buffer,
const OrtBackendOption& option, bool verbose) {
if (initialized_) {
FDERROR << "OrtBackend is already initlized, cannot initialize again."
@@ -92,7 +92,8 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
strcpy(ops[1].op_name, "pool2d");
strcpy(ops[1].export_op_name, "AdaptivePool2d");
if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
if (!paddle2onnx::Export(model_buffer.c_str(), model_buffer.size(),
params_buffer.c_str(), params_buffer.size(),
&model_content_ptr, &model_content_size, 11, true,
verbose, true, true, true, ops.data(), 2,
"onnxruntime", nullptr, 0, "", &save_external)) {
@@ -112,9 +113,8 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
model_file_name.c_str());
f << onnx_model_proto;
f.close();
return InitFromOnnx(model_file_name, option, false);
}
return InitFromOnnx(onnx_model_proto, option, true);
return InitFromOnnx(onnx_model_proto, option);
#else
FDERROR << "Didn't compile with PaddlePaddle Frontend, you can try to "
"call `InitFromOnnx` instead."
@@ -124,8 +124,7 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
}
bool OrtBackend::InitFromOnnx(const std::string& model_file,
const OrtBackendOption& option,
bool from_memory_buffer) {
const OrtBackendOption& option) {
if (initialized_) {
FDERROR << "OrtBackend is already initlized, cannot initialize again."
<< std::endl;
@@ -134,17 +133,7 @@ bool OrtBackend::InitFromOnnx(const std::string& model_file,
BuildOption(option);
InitCustomOperators();
if (from_memory_buffer) {
session_ = {env_, model_file.data(), model_file.size(), session_options_};
} else {
#ifdef _WIN32
session_ = {env_,
std::wstring(model_file.begin(), model_file.end()).c_str(),
session_options_};
#else
session_ = {env_, model_file.c_str(), session_options_};
#endif
}
binding_ = std::make_shared<Ort::IoBinding>(session_);
Ort::MemoryInfo memory_info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);


@@ -39,14 +39,13 @@ class OrtBackend : public BaseBackend {
void BuildOption(const OrtBackendOption& option);
bool InitFromPaddle(const std::string& model_file,
const std::string& params_file,
bool InitFromPaddle(const std::string& model_buffer,
const std::string& params_buffer,
const OrtBackendOption& option = OrtBackendOption(),
bool verbose = false);
bool InitFromOnnx(const std::string& model_file,
const OrtBackendOption& option = OrtBackendOption(),
bool from_memory_buffer = false);
bool InitFromOnnx(const std::string& model_buffer,
const OrtBackendOption& option = OrtBackendOption());
bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
bool copy_to_fd = true) override;


@@ -39,10 +39,7 @@ struct PaddleBackendOption {
std::string model_file = ""; // Path of model file
std::string params_file = ""; // Path of parameters file, can be empty
std::string model_buffer_ = "";
std::string params_buffer_ = "";
size_t model_buffer_size_ = 0;
size_t params_buffer_size_ = 0;
// load model and paramters from memory
bool model_from_memory_ = false;
#ifdef WITH_GPU


@@ -89,9 +89,10 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
}
}
bool PaddleBackend::InitFromPaddle(const std::string& model_file,
const std::string& params_file,
bool PaddleBackend::InitFromPaddle(const std::string& model_buffer,
const std::string& params_buffer,
const PaddleBackendOption& option) {
// bool PaddleBackend::InitFromPaddle(const std::string& contents) {
if (initialized_) {
FDERROR << "PaddleBackend is already initlized, cannot initialize again."
<< std::endl;
@@ -102,16 +103,9 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_file,
// PaddleReader instead now
std::string contents;
if (option.model_from_memory_) {
config_.SetModelBuffer(model_file.c_str(), option.model_buffer_size_,
params_file.c_str(), option.params_buffer_size_);
contents = model_file;
} else {
config_.SetModel(model_file, params_file);
if (!ReadBinaryFromFile(model_file, &contents)) {
return false;
}
}
config_.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
params_buffer.c_str(), params_buffer.size());
contents = model_buffer;
config_.EnableMemoryOptim();
BuildOption(option);
auto reader = paddle2onnx::PaddleReader(contents.c_str(), contents.size());
@@ -172,20 +166,16 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_file,
// Set the shape info file.
std::string curr_model_dir = "./";
if (!option.model_from_memory_) {
curr_model_dir = GetDirFromPath(model_file);
curr_model_dir = GetDirFromPath(option.model_file);
}
std::string shape_range_info =
PathJoin(curr_model_dir, "shape_range_info.pbtxt");
if (!CheckFileExists(shape_range_info)) {
FDINFO << "Start generating shape range info file." << std::endl;
paddle_infer::Config analysis_config;
if (option.model_from_memory_) {
analysis_config.SetModelBuffer(
model_file.c_str(), option.model_buffer_size_, params_file.c_str(),
option.params_buffer_size_);
} else {
analysis_config.SetModel(model_file, params_file);
}
analysis_config.SetModelBuffer(model_buffer.c_str(), model_buffer.size(),
params_buffer.c_str(),
params_buffer.size());
analysis_config.CollectShapeRangeInfo(shape_range_info);
auto predictor_tmp = paddle_infer::CreatePredictor(analysis_config);
std::map<std::string, std::vector<int>> max_shape;
@@ -258,7 +248,8 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
return true;
}
std::unique_ptr<BaseBackend> PaddleBackend::Clone(void* stream, int device_id) {
std::unique_ptr<BaseBackend> PaddleBackend::Clone(RuntimeOption& runtime_option,
void* stream, int device_id) {
std::unique_ptr<BaseBackend> new_backend =
utils::make_unique<PaddleBackend>();
auto casted_backend = dynamic_cast<PaddleBackend*>(new_backend.get());
@@ -266,8 +257,27 @@ std::unique_ptr<BaseBackend> PaddleBackend::Clone(void* stream, int device_id) {
auto clone_option = option_;
clone_option.gpu_id = device_id;
clone_option.external_stream_ = stream;
casted_backend->InitFromPaddle(clone_option.model_file,
clone_option.params_file, clone_option);
if (runtime_option.model_from_memory_) {
FDASSERT(
casted_backend->InitFromPaddle(runtime_option.model_buffer_,
runtime_option.params_buffer_,
clone_option),
"Clone model from Paddle failed while initialize PaddleBackend.");
} else {
std::string model_buffer = "";
std::string params_buffer = "";
FDASSERT(
ReadBinaryFromFile(clone_option.model_file, &model_buffer),
"Fail to read binary from model file while cloning PaddleBackend");
FDASSERT(ReadBinaryFromFile(clone_option.params_file, &params_buffer),
"Fail to read binary from parameter file while cloning "
"PaddleBackend");
FDASSERT(
casted_backend->InitFromPaddle(model_buffer, params_buffer,
clone_option),
"Clone model from Paddle failed while initialize PaddleBackend.");
}
FDWARNING << "The target device id:" << device_id
<< " is different from current device id:" << option_.gpu_id
<< ", cannot share memory with current engine." << std::endl;


@@ -53,8 +53,8 @@ class PaddleBackend : public BaseBackend {
virtual ~PaddleBackend() = default;
void BuildOption(const PaddleBackendOption& option);
bool
InitFromPaddle(const std::string& model_file, const std::string& params_file,
bool InitFromPaddle(const std::string& model_buffer,
const std::string& params_buffer,
const PaddleBackendOption& option = PaddleBackendOption());
bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
@@ -64,7 +64,8 @@ class PaddleBackend : public BaseBackend {
int NumOutputs() const override { return outputs_desc_.size(); }
std::unique_ptr<BaseBackend> Clone(void* stream = nullptr,
std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
void* stream = nullptr,
int device_id = -1) override;
TensorInfo GetInputInfo(int index) override;


@@ -24,6 +24,7 @@ namespace fastdeploy {
struct TrtBackendOption {
std::string model_file = ""; // Path of model file
std::string params_file = ""; // Path of parameters file, can be empty
// format of input model
ModelFormat model_format = ModelFormat::AUTOREC;


@@ -113,8 +113,8 @@ bool TrtBackend::LoadTrtCache(const std::string& trt_engine_file) {
return true;
}
bool TrtBackend::InitFromPaddle(const std::string& model_file,
const std::string& params_file,
bool TrtBackend::InitFromPaddle(const std::string& model_buffer,
const std::string& params_buffer,
const TrtBackendOption& option, bool verbose) {
if (initialized_) {
FDERROR << "TrtBackend is already initlized, cannot initialize again."
@@ -132,7 +132,8 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
int model_content_size = 0;
char* calibration_cache_ptr;
int calibration_cache_size = 0;
if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
if (!paddle2onnx::Export(model_buffer.c_str(), model_buffer.size(),
params_buffer.c_str(), params_buffer.size(),
&model_content_ptr, &model_content_size, 11, true,
verbose, true, true, true, ops.data(), 1, "tensorrt",
&calibration_cache_ptr, &calibration_cache_size, "",
@@ -141,7 +142,6 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
<< std::endl;
return false;
}
std::string onnx_model_proto(model_content_ptr,
model_content_ptr + model_content_size);
delete[] model_content_ptr;
@@ -159,9 +159,8 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
model_file_name_.c_str());
f << onnx_model_proto;
f.close();
return InitFromOnnx(model_file_name_, option, false);
}
return InitFromOnnx(onnx_model_proto, option, true);
return InitFromOnnx(onnx_model_proto, option);
#else
FDERROR << "Didn't compile with PaddlePaddle frontend, you can try to "
"call `InitFromOnnx` instead."
@@ -170,9 +169,8 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
#endif
}
bool TrtBackend::InitFromOnnx(const std::string& model_file,
const TrtBackendOption& option,
bool from_memory_buffer) {
bool TrtBackend::InitFromOnnx(const std::string& model_buffer,
const TrtBackendOption& option) {
if (initialized_) {
FDERROR << "TrtBackend is already initlized, cannot initialize again."
<< std::endl;
@@ -181,22 +179,7 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
option_ = option;
cudaSetDevice(option_.gpu_id);
std::string onnx_content = "";
if (!from_memory_buffer) {
std::ifstream fin(model_file.c_str(), std::ios::binary | std::ios::in);
if (!fin) {
FDERROR << "[ERROR] Failed to open ONNX model file: " << model_file
<< std::endl;
return false;
}
fin.seekg(0, std::ios::end);
onnx_content.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(onnx_content.at(0)), onnx_content.size());
fin.close();
} else {
onnx_content = model_file;
}
std::string onnx_content = model_buffer;
// This part of code will record the original outputs order
// because the converted tensorrt network may exist wrong order of outputs
@@ -739,21 +722,41 @@ std::vector<TensorInfo> TrtBackend::GetOutputInfos() {
return infos;
}
std::unique_ptr<BaseBackend> TrtBackend::Clone(void* stream, int device_id) {
std::unique_ptr<BaseBackend> TrtBackend::Clone(RuntimeOption& runtime_option,
void* stream, int device_id) {
std::unique_ptr<BaseBackend> new_backend = utils::make_unique<TrtBackend>();
auto casted_backend = dynamic_cast<TrtBackend*>(new_backend.get());
if (device_id > 0 && device_id != option_.gpu_id) {
auto clone_option = option_;
clone_option.gpu_id = device_id;
clone_option.external_stream_ = stream;
if (runtime_option.model_from_memory_) {
FDASSERT(casted_backend->InitFromPaddle(runtime_option.model_buffer_,
runtime_option.params_buffer_,
clone_option),
"Clone model from Paddle failed while initialize TrtBackend.");
} else {
if (option_.model_format == ModelFormat::ONNX) {
FDASSERT(casted_backend->InitFromOnnx(option_.model_file, clone_option),
std::string model_buffer = "";
FDASSERT(
ReadBinaryFromFile(clone_option.model_file, &model_buffer),
"Fail to read binary from model file while cloning TrtBackend");
FDASSERT(casted_backend->InitFromOnnx(model_buffer, clone_option),
"Clone model from ONNX failed while initialize TrtBackend.");
} else {
FDASSERT(casted_backend->InitFromPaddle(
option_.model_file, option_.params_file, clone_option),
std::string model_buffer = "";
std::string params_buffer = "";
FDASSERT(
ReadBinaryFromFile(clone_option.model_file, &model_buffer),
"Fail to read binary from model file while cloning TrtBackend");
FDASSERT(
ReadBinaryFromFile(clone_option.params_file, &params_buffer),
"Fail to read binary from parameter file while cloning TrtBackend");
FDASSERT(casted_backend->InitFromPaddle(model_buffer, params_buffer,
clone_option),
"Clone model from Paddle failed while initialize TrtBackend.");
}
}
FDWARNING << "The target device id:" << device_id
<< " is different from current device id:" << option_.gpu_id
<< ", cannot share memory with current engine." << std::endl;


@@ -72,13 +72,12 @@ class TrtBackend : public BaseBackend {
TrtBackend() : engine_(nullptr), context_(nullptr) {}
void BuildOption(const TrtBackendOption& option);
bool InitFromPaddle(const std::string& model_file,
const std::string& params_file,
bool InitFromPaddle(const std::string& model_buffer,
const std::string& params_buffer,
const TrtBackendOption& option = TrtBackendOption(),
bool verbose = false);
bool InitFromOnnx(const std::string& model_file,
const TrtBackendOption& option = TrtBackendOption(),
bool from_memory_buffer = false);
bool InitFromOnnx(const std::string& model_buffer,
const TrtBackendOption& option = TrtBackendOption());
bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs,
bool copy_to_fd = true) override;
@@ -88,7 +87,8 @@ class TrtBackend : public BaseBackend {
TensorInfo GetOutputInfo(int index);
std::vector<TensorInfo> GetInputInfos() override;
std::vector<TensorInfo> GetOutputInfos() override;
std::unique_ptr<BaseBackend> Clone(void* stream = nullptr,
std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
void* stream = nullptr,
int device_id = -1) override;
~TrtBackend() {


@@ -208,6 +208,15 @@ FDTensor* Runtime::GetOutputTensor(const std::string& name) {
return nullptr;
}
void Runtime::ReleaseModelMemoryBuffer() {
if (option.model_from_memory_) {
option.model_buffer_.clear();
option.model_buffer_.shrink_to_fit();
option.params_buffer_.clear();
option.params_buffer_.shrink_to_fit();
}
}
void Runtime::CreatePaddleBackend() {
FDASSERT(
option.device == Device::CPU || option.device == Device::GPU ||
@@ -231,12 +240,6 @@ void Runtime::CreatePaddleBackend() {
pd_option.enable_pinned_memory = option.enable_pinned_memory;
pd_option.external_stream_ = option.external_stream_;
pd_option.model_from_memory_ = option.model_from_memory_;
if (pd_option.model_from_memory_) {
pd_option.model_buffer_ = option.model_buffer_;
pd_option.params_buffer_ = option.params_buffer_;
pd_option.model_buffer_size_ = option.model_buffer_size_;
pd_option.params_buffer_size_ = option.params_buffer_size_;
}
#ifdef ENABLE_TRT_BACKEND
if (pd_option.use_gpu && option.pd_enable_trt) {
pd_option.enable_trt = true;
@@ -276,9 +279,16 @@ void Runtime::CreatePaddleBackend() {
FDASSERT(casted_backend->InitFromPaddle(option.model_buffer_,
option.params_buffer_, pd_option),
"Load model from Paddle failed while initliazing PaddleBackend.");
ReleaseModelMemoryBuffer();
} else {
FDASSERT(casted_backend->InitFromPaddle(option.model_file,
option.params_file, pd_option),
std::string model_buffer = "";
std::string params_buffer = "";
FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
"Fail to read binary from model file");
FDASSERT(ReadBinaryFromFile(option.params_file, &params_buffer),
"Fail to read binary from parameter file");
FDASSERT(
casted_backend->InitFromPaddle(model_buffer, params_buffer, pd_option),
"Load model from Paddle failed while initliazing PaddleBackend.");
}
#else
@@ -291,6 +301,10 @@ void Runtime::CreatePaddleBackend() {
}
void Runtime::CreateOpenVINOBackend() {
// TODO(huangjianhui) OpenVINO only supports to load ONNX format model from
// memory Temporarily disable this function
FDASSERT(option.model_from_memory_ == false,
"OpenVINOBackend don't support to load model from memory");
FDASSERT(option.device == Device::CPU,
"Backend::OPENVINO only supports Device::CPU");
FDASSERT(option.model_format == ModelFormat::PADDLE ||
@@ -342,16 +356,37 @@ void Runtime::CreateOrtBackend() {
ort_option.use_gpu = (option.device == Device::GPU) ? true : false;
ort_option.gpu_id = option.device_id;
ort_option.external_stream_ = option.external_stream_;
backend_ = utils::make_unique<OrtBackend>();
auto casted_backend = dynamic_cast<OrtBackend*>(backend_.get());
if (option.model_format == ModelFormat::ONNX) {
FDASSERT(casted_backend->InitFromOnnx(option.model_file, ort_option),
if (option.model_from_memory_) {
FDASSERT(casted_backend->InitFromOnnx(option.model_buffer_, ort_option),
"Load model from ONNX failed while initliazing OrtBackend.");
ReleaseModelMemoryBuffer();
} else {
FDASSERT(casted_backend->InitFromPaddle(option.model_file,
option.params_file, ort_option),
std::string model_buffer = "";
FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
"Fail to read binary from model file");
FDASSERT(casted_backend->InitFromOnnx(model_buffer, ort_option),
"Load model from ONNX failed while initliazing OrtBackend.");
}
} else {
if (option.model_from_memory_) {
FDASSERT(casted_backend->InitFromPaddle(
option.model_buffer_, option.params_buffer_, ort_option),
"Load model from Paddle failed while initliazing OrtBackend.");
ReleaseModelMemoryBuffer();
} else {
std::string model_buffer = "";
std::string params_buffer = "";
FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
"Fail to read binary from model file");
FDASSERT(ReadBinaryFromFile(option.params_file, &params_buffer),
"Fail to read binary from parameter file");
FDASSERT(casted_backend->InitFromPaddle(model_buffer, params_buffer,
ort_option),
"Load model from Paddle failed while initliazing OrtBackend.");
}
}
#else
FDASSERT(false,
@@ -385,16 +420,37 @@ void Runtime::CreateTrtBackend() {
trt_option.serialize_file = option.trt_serialize_file;
trt_option.enable_pinned_memory = option.enable_pinned_memory;
trt_option.external_stream_ = option.external_stream_;
backend_ = utils::make_unique<TrtBackend>();
auto casted_backend = dynamic_cast<TrtBackend*>(backend_.get());
if (option.model_format == ModelFormat::ONNX) {
FDASSERT(casted_backend->InitFromOnnx(option.model_file, trt_option),
if (option.model_from_memory_) {
FDASSERT(casted_backend->InitFromOnnx(option.model_buffer_, trt_option),
"Load model from ONNX failed while initliazing TrtBackend.");
ReleaseModelMemoryBuffer();
} else {
FDASSERT(casted_backend->InitFromPaddle(option.model_file,
option.params_file, trt_option),
std::string model_buffer = "";
FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
"Fail to read binary from model file");
FDASSERT(casted_backend->InitFromOnnx(model_buffer, trt_option),
"Load model from ONNX failed while initliazing TrtBackend.");
}
} else {
if (option.model_from_memory_) {
FDASSERT(casted_backend->InitFromPaddle(
option.model_buffer_, option.params_buffer_, trt_option),
"Load model from Paddle failed while initliazing TrtBackend.");
ReleaseModelMemoryBuffer();
} else {
std::string model_buffer = "";
std::string params_buffer = "";
FDASSERT(ReadBinaryFromFile(option.model_file, &model_buffer),
"Fail to read binary from model file");
FDASSERT(ReadBinaryFromFile(option.params_file, &params_buffer),
"Fail to read binary from parameter file");
FDASSERT(casted_backend->InitFromPaddle(model_buffer, params_buffer,
trt_option),
"Load model from Paddle failed while initliazing TrtBackend.");
}
}
#else
FDASSERT(false,
@@ -406,6 +462,9 @@ void Runtime::CreateTrtBackend() {
}
void Runtime::CreateLiteBackend() {
#ifdef ENABLE_LITE_BACKEND
FDASSERT(option.model_from_memory_ == false,
"LiteBackend don't support to load model from memory");
FDASSERT(option.device == Device::CPU || option.device == Device::TIMVX ||
option.device == Device::KUNLUNXIN ||
option.device == Device::ASCEND,
@@ -413,7 +472,6 @@ void Runtime::CreateLiteBackend() {
"Device::CPU/Device::TIMVX/Device::KUNLUNXIN/Device::ASCEND.");
FDASSERT(option.model_format == ModelFormat::PADDLE,
"LiteBackend only support model format of ModelFormat::PADDLE");
#ifdef ENABLE_LITE_BACKEND
backend_ = utils::make_unique<LiteBackend>();
auto casted_backend = dynamic_cast<LiteBackend*>(backend_.get());
FDASSERT(casted_backend->InitFromPaddle(option.model_file, option.params_file,
@@ -429,6 +487,8 @@ void Runtime::CreateLiteBackend() {
}
void Runtime::CreateRKNPU2Backend() {
FDASSERT(option.model_from_memory_ == false,
"RKNPU2Backend don't support to load model from memory");
FDASSERT(option.device == Device::RKNPU,
"Backend::RKNPU2 only supports Device::RKNPU2");
FDASSERT(option.model_format == ModelFormat::RKNN,
@@ -451,11 +511,14 @@ void Runtime::CreateRKNPU2Backend() {
}
void Runtime::CreateSophgoNPUBackend() {
#ifdef ENABLE_SOPHGO_BACKEND
auto sophgo_option = SophgoBackendOption();
FDASSERT(option.model_from_memory_ == false,
"SophgoBackend don't support to load model from memory");
FDASSERT(option.device == Device::SOPHGOTPUD,
"Backend::SOPHGO only supports Device::SOPHGO");
FDASSERT(option.model_format == ModelFormat::SOPHGO,
"SophgoBackend only support model format of ModelFormat::SOPHGO");
#ifdef ENABLE_SOPHGO_BACKEND
auto sophgo_option = SophgoBackendOption();
backend_ = utils::make_unique<SophgoBackend>();
auto casted_backend = dynamic_cast<SophgoBackend*>(backend_.get());
@@ -486,7 +549,7 @@ Runtime* Runtime::Clone(void* stream, int device_id) {
FDINFO << "Runtime Clone with Backend:: " << option.backend << " in "
<< option.device << "." << std::endl;
runtime->option = option;
runtime->backend_ = backend_->Clone(stream, device_id);
runtime->backend_ = backend_->Clone(option, stream, device_id);
return runtime;
}


@@ -83,6 +83,8 @@ struct FASTDEPLOY_DECL Runtime {
*/
Runtime* Clone(void* stream = nullptr, int device_id = -1);
void ReleaseModelMemoryBuffer();
RuntimeOption option;
/** \brief Compile TorchScript Module, only for Poros backend

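A hedged fragment (not from this commit) of how the cloning path might be exercised after the change to Clone; the option setup and device id are illustrative, and the buffers are assumed to have been set via SetModelBuffer as in the sketch near the top.

// Illustrative only: Runtime::Clone keeps its public signature; internally the
// backend's Clone now receives the RuntimeOption so the cloned engine can be
// re-initialized from the in-memory buffers when model_from_memory_ is set.
fastdeploy::Runtime runtime;
runtime.Init(option);  // `option` prepared with SetModelBuffer(...) as above
std::unique_ptr<fastdeploy::Runtime> cloned(
    runtime.Clone(/*stream=*/nullptr, /*device_id=*/1));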

@@ -38,29 +38,21 @@ void RuntimeOption::SetModelPath(const std::string& model_path,
}
}
void RuntimeOption::SetModelBuffer(const char* model_buffer,
size_t model_buffer_size,
const char* params_buffer,
size_t params_buffer_size,
void RuntimeOption::SetModelBuffer(const std::string& model_buffer,
const std::string& params_buffer,
const ModelFormat& format) {
model_buffer_size_ = model_buffer_size;
params_buffer_size_ = params_buffer_size;
model_from_memory_ = true;
if (format == ModelFormat::PADDLE) {
model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
params_buffer_ =
std::string(params_buffer, params_buffer + params_buffer_size);
model_buffer_ = model_buffer;
params_buffer_ = params_buffer;
model_format = ModelFormat::PADDLE;
} else if (format == ModelFormat::ONNX) {
model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
model_buffer_ = model_buffer;
model_format = ModelFormat::ONNX;
} else if (format == ModelFormat::TORCHSCRIPT) {
model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
model_format = ModelFormat::TORCHSCRIPT;
} else {
FDASSERT(false,
"The model format only can be "
"ModelFormat::PADDLE/ModelFormat::ONNX/ModelFormat::TORCHSCRIPT.");
"ModelFormat::PADDLE/ModelFormat::ONNX.");
}
}


@@ -50,14 +50,12 @@ struct FASTDEPLOY_DECL RuntimeOption {
/** \brief Specify the memory buffer of model and parameter. Used when model and params are loaded directly from memory
*
* \param[in] model_buffer The memory buffer of model
* \param[in] model_buffer_size The size of the model data
* \param[in] params_buffer The memory buffer of the combined parameters file
* \param[in] params_buffer_size The size of the combined parameters data
* \param[in] model_buffer The string of model memory buffer
* \param[in] params_buffer The string of parameters memory buffer
* \param[in] format Format of the loaded model
*/
void SetModelBuffer(const char* model_buffer, size_t model_buffer_size,
const char* params_buffer, size_t params_buffer_size,
void SetModelBuffer(const std::string& model_buffer,
const std::string& params_buffer = "",
const ModelFormat& format = ModelFormat::PADDLE);
/// Use cpu to inference, the runtime will inference on CPU by default
@@ -431,8 +429,6 @@ struct FASTDEPLOY_DECL RuntimeOption {
std::string model_buffer_ = "";
std::string params_buffer_ = "";
size_t model_buffer_size_ = 0;
size_t params_buffer_size_ = 0;
bool model_from_memory_ = false;
};


@@ -229,20 +229,14 @@ class RuntimeOption:
def set_model_buffer(self,
model_buffer,
model_buffer_size,
params_buffer,
params_buffer_size,
params_buffer="",
model_format=ModelFormat.PADDLE):
"""Specify the memory buffer of model and parameter. Used when model and params are loaded directly from memory
:param model_buffer: (bytes)The memory buffer of model
:param model_buffer_size: (unsigned int)The size of the model data.
:param params_buffer: (bytes)The memory buffer of the combined parameters file
:param params_buffer_size: (unsigned inst)The size of the combined parameters data
:param params_buffer: (bytes)The memory buffer of the parameters
:param model_format: (ModelFormat)Format of model, support ModelFormat.PADDLE/ModelFormat.ONNX/ModelFormat.TORCHSCRIPT
"""
return self._option.set_model_buffer(model_buffer, model_buffer_size,
params_buffer, params_buffer_size,
return self._option.set_model_buffer(model_buffer, params_buffer,
model_format)
def use_gpu(self, device_id=0):