Merge pull request #1305 from wwbitejotunn/set_stream_infer-shareExData

[Backend] Set inference with extra stream and prebind output tensor with share ex data
2025-12-24 13:28:13 +08:00 · 2023-02-14 17:08:56 +08:00
parent cd5a54cd75 a0fc0d5e49
commit 41c5f2de5a
10 changed files with 177 additions and 35 deletions
--- a/fastdeploy/runtime/backends/paddle/paddle_backend.cc
+++ b/fastdeploy/runtime/backends/paddle/paddle_backend.cc
@@ -25,6 +25,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
  if (option.device == Device::GPU) {
    config_.EnableUseGpu(option.gpu_mem_init_size, option.device_id);
    if (option_.external_stream_) {
+      FDINFO << "Will use external stream for Paddle Backend." << std::endl;
      config_.SetExecStream(option_.external_stream_);
    }
    if (option.enable_trt) {
@@ -226,23 +227,47 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
            << inputs_desc_.size() << ")." << std::endl;
    return false;
  }
+  // output share backend memory only support CPU or GPU
+  if (option_.device == Device::IPU) {
+    copy_to_fd = true;
+  }

  RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto handle = predictor_->GetInputHandle(inputs[i].name);
    ShareTensorFromFDTensor(handle.get(), inputs[i]);
  }
+  std::unordered_set<std::string> prebinded_output_name;
+  // prebinded output only support for GPU
+  if (!copy_to_fd) {
+    for (size_t i = 0; i < (*outputs).size(); ++i) {
+      auto output_name = (*outputs)[i].name;
+      // if a output is not prebinded,
+      // the name of output is expected to be empty.
+      // We skip here
+      if (output_name.empty()) {
+        continue;
+      }
+      // Record the prebinded output_name.
+      // Those outputs do not need PaddleTensorToFDTensor
+      // after predictor_.Run()
+      prebinded_output_name.insert(output_name);
+      auto handle = predictor_->GetOutputHandle(output_name);
+      ShareOutTensorFromFDTensor(handle.get(), (*outputs)[i]);
+    }
+  }

  RUNTIME_PROFILE_LOOP_BEGIN(1)
  predictor_->Run();
  RUNTIME_PROFILE_LOOP_END

-  // output share backend memory only support CPU or GPU
-  if (option_.device == Device::IPU) {
-    copy_to_fd = true;
-  }
  outputs->resize(outputs_desc_.size());
  for (size_t i = 0; i < outputs_desc_.size(); ++i) {
+    // skip prebinded output
+    if (copy_to_fd == false &&
+        prebinded_output_name.count(outputs_desc_[i].name)) {
+      continue;
+    }
    auto handle = predictor_->GetOutputHandle(outputs_desc_[i].name);
    if (copy_to_fd) {
      (*outputs)[i].is_pinned_memory = option_.enable_pinned_memory;
--- a/fastdeploy/runtime/backends/paddle/paddle_backend.h
+++ b/fastdeploy/runtime/backends/paddle/paddle_backend.h
@@ -35,6 +35,9 @@ paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device);
 // Share memory buffer with paddle_infer::Tensor from fastdeploy::FDTensor
 void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor, FDTensor& fd_tensor);

+void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
+                             FDTensor& fd_tensor);
+
 // convert paddle_infer::Tensor to fastdeploy::FDTensor
 // if copy_to_fd is true, copy memory data to FDTensor
 /// else share memory to FDTensor
@@ -89,4 +92,4 @@ class PaddleBackend : public BaseBackend {
  std::vector<TensorInfo> inputs_desc_;
  std::vector<TensorInfo> outputs_desc_;
 };
-}  // namespace fastdeploy
+}  // namespace fastdeploy
--- a/fastdeploy/runtime/backends/paddle/util.cc
+++ b/fastdeploy/runtime/backends/paddle/util.cc
@@ -61,6 +61,43 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
           Str(fd_tensor.dtype).c_str());
 }

+void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
+                                FDTensor& fd_tensor) {
+  std::vector<int> shape(fd_tensor.shape.begin(), fd_tensor.shape.end());
+  auto place = ConvertFDDeviceToPlace(fd_tensor.device);
+  if (fd_tensor.dtype == FDDataType::FP32) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<float*>(fd_tensor.MutableData()),
+                                shape, place);
+    } else {
+      tensor->CopyToCpu(static_cast<float*>(fd_tensor.MutableData()));
+    }
+    return;
+  } else if (fd_tensor.dtype == FDDataType::INT32) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<int32_t*>(fd_tensor.MutableData()),
+                                shape, place);
+    } else {
+      tensor->CopyToCpu(static_cast<int32_t*>(fd_tensor.MutableData()));
+    }
+    return;
+  } else if (fd_tensor.dtype == FDDataType::INT64) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<int64_t*>(fd_tensor.MutableData()),
+                                shape, place);
+    } else {
+      tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor.MutableData()));
+    }
+    return;
+  } else if (fd_tensor.dtype == FDDataType::UINT8) {
+    tensor->ShareExternalData(static_cast<uint8_t*>(fd_tensor.MutableData()),
+                              shape, paddle_infer::PlaceType::kCPU);
+    return;
+  }
+  FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
+           Str(fd_tensor.dtype).c_str());
+}
+
 void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
                            FDTensor* fd_tensor, bool copy_to_fd) {
  auto fd_dtype = PaddleDataTypeToFD(tensor->type());
--- a/fastdeploy/runtime/option_pybind.cc
+++ b/fastdeploy/runtime/option_pybind.cc
@@ -49,6 +49,10 @@ void BindOption(pybind11::module& m) {
      .def_readwrite("poros_option", &RuntimeOption::poros_option)
      .def_readwrite("paddle_infer_option", &RuntimeOption::paddle_infer_option)
      .def("set_external_stream", &RuntimeOption::SetExternalStream)
+      .def("set_external_raw_stream",
+           [](RuntimeOption& self, size_t external_stream) {
+             self.SetExternalStream(reinterpret_cast<void*>(external_stream));
+           })
      .def("set_cpu_thread_num", &RuntimeOption::SetCpuThreadNum)
      .def("use_paddle_backend", &RuntimeOption::UsePaddleBackend)
      .def("use_poros_backend", &RuntimeOption::UsePorosBackend)
--- a/fastdeploy/runtime/runtime.cc
+++ b/fastdeploy/runtime/runtime.cc
@@ -224,6 +224,25 @@ void Runtime::BindInputTensor(const std::string& name, FDTensor& input) {
  }
 }

+void Runtime::BindOutputTensor(const std::string& name, FDTensor& output) {
+  bool is_exist = false;
+  for (auto& t : output_tensors_) {
+    if (t.name == name) {
+      FDINFO << "The output name [" << name << "] is exist." << std::endl;
+      is_exist = true;
+      t.SetExternalData(output.shape, output.dtype, output.MutableData(),
+                        output.device, output.device_id);
+      break;
+    }
+  }
+  if (!is_exist) {
+    FDINFO << "The output name [" << name << "] is prebinded added into output tensor list." << std::endl;
+    FDTensor new_tensor(name);
+    new_tensor.SetExternalData(output.shape, output.dtype, output.MutableData(),
+                               output.device, output.device_id);
+    output_tensors_.emplace_back(std::move(new_tensor));
+  }
+}
 FDTensor* Runtime::GetOutputTensor(const std::string& name) {
  for (auto& t : output_tensors_) {
    if (t.name == name) {
--- a/fastdeploy/runtime/runtime.h
+++ b/fastdeploy/runtime/runtime.h
@@ -75,6 +75,12 @@ struct FASTDEPLOY_DECL Runtime {
  /** \brief Bind FDTensor by name, no copy and share input memory
   */
  void BindInputTensor(const std::string& name, FDTensor& input);
+
+  /** \brief Bind FDTensor by name, no copy and share output memory.
+   *  Please make share the correctness of tensor shape of output.
+   */
+  void BindOutputTensor(const std::string& name, FDTensor& output);
+
  /** \brief Get output FDTensor by name, no copy and share backend output memory
   */
  FDTensor* GetOutputTensor(const std::string& name);
--- a/fastdeploy/runtime/runtime_option.h
+++ b/fastdeploy/runtime/runtime_option.h
@@ -110,6 +110,7 @@ struct FASTDEPLOY_DECL RuntimeOption {
                    bool enable_multi_stream = false);

  void SetExternalStream(void* external_stream);
+
  /*
   * @brief Set number of cpu threads while inference on CPU, by default it will decided by the different backends
   */