From abfa9fd850025d4c94d45578f72f3aa51124f06e Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Mon, 13 Feb 2023 03:11:31 +0000 Subject: [PATCH] prebind output by shareExternalData --- fastdeploy/pybind/fd_tensor.cc | 93 +++++++++++++------ fastdeploy/pybind/runtime.cc | 1 + .../runtime/backends/paddle/paddle_backend.cc | 38 ++++++-- .../runtime/backends/paddle/paddle_backend.h | 5 +- fastdeploy/runtime/backends/paddle/util.cc | 37 ++++++++ fastdeploy/runtime/runtime.cc | 20 ++++ fastdeploy/runtime/runtime.h | 8 +- python/fastdeploy/runtime.py | 11 ++- 8 files changed, 174 insertions(+), 39 deletions(-) diff --git a/fastdeploy/pybind/fd_tensor.cc b/fastdeploy/pybind/fd_tensor.cc index 0f1d145b3..6e34019f5 100644 --- a/fastdeploy/pybind/fd_tensor.cc +++ b/fastdeploy/pybind/fd_tensor.cc @@ -15,9 +15,9 @@ #include #include "fastdeploy/core/fd_type.h" -#include "fastdeploy/utils/utils.h" #include "fastdeploy/fastdeploy_model.h" #include "fastdeploy/pybind/main.h" +#include "fastdeploy/utils/utils.h" namespace fastdeploy { @@ -68,8 +68,8 @@ DLDataType FDToDlpackType(FDDataType fd_dtype) { break; default: - FDASSERT(false, - "Convert to DlPack, FDType \"%s\" is not supported.", Str(fd_dtype).c_str()); + FDASSERT(false, "Convert to DlPack, FDType \"%s\" is not supported.", + Str(fd_dtype).c_str()); } dl_dtype.code = dl_code; @@ -77,10 +77,8 @@ DLDataType FDToDlpackType(FDDataType fd_dtype) { return dl_dtype; } -FDDataType -DlpackToFDType(const DLDataType& data_type) { - FDASSERT(data_type.lanes == 1, - "FDTensor does not support dlpack lanes != 1") +FDDataType DlpackToFDType(const DLDataType& data_type) { + FDASSERT(data_type.lanes == 1, "FDTensor does not support dlpack lanes != 1") if (data_type.code == DLDataTypeCode::kDLFloat) { if (data_type.bits == 16) { @@ -152,7 +150,7 @@ pybind11::capsule FDTensorToDLPack(FDTensor& fd_tensor) { dlpack_tensor->dl_tensor.dtype = FDToDlpackType(fd_tensor.dtype); dlpack_tensor->dl_tensor.device.device_id = fd_tensor.device_id; - if(fd_tensor.device == Device::GPU) { + if (fd_tensor.device == Device::GPU) { if (fd_tensor.is_pinned_memory) { dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCUDAHost; } else { @@ -162,8 +160,8 @@ pybind11::capsule FDTensorToDLPack(FDTensor& fd_tensor) { dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCPU; } - return pybind11::capsule( - static_cast(dlpack_tensor), "dltensor", &DeleteUnusedDltensor); + return pybind11::capsule(static_cast(dlpack_tensor), "dltensor", + &DeleteUnusedDltensor); } FDTensor FDTensorFromDLPack(const std::string& name, @@ -178,9 +176,8 @@ FDTensor FDTensorFromDLPack(const std::string& name, int64_t* strides = dl_managed_tensor->dl_tensor.strides; int ndim = dl_managed_tensor->dl_tensor.ndim; - std::vector dims( - dl_managed_tensor->dl_tensor.shape, - dl_managed_tensor->dl_tensor.shape + ndim); + std::vector dims(dl_managed_tensor->dl_tensor.shape, + dl_managed_tensor->dl_tensor.shape + ndim); // Check if the input is contiguous and in C order if (strides != nullptr) { @@ -196,8 +193,8 @@ FDTensor FDTensorFromDLPack(const std::string& name, } FDASSERT(is_contiguous_c_order, - "DLPack tensor is not contiguous. Only contiguous DLPack " - "tensors that are stored in C-Order are supported."); + "DLPack tensor is not contiguous. 
Only contiguous DLPack " + "tensors that are stored in C-Order are supported."); } Device device; @@ -216,21 +213,20 @@ FDTensor FDTensorFromDLPack(const std::string& name, is_pinned_memory = true; break; default: - FDASSERT(false, + FDASSERT( + false, ("DLDevice type " + - std::to_string(dl_managed_tensor->dl_tensor.device.device_type) + - " is not support by Python backend.").c_str()); + std::to_string(dl_managed_tensor->dl_tensor.device.device_type) + + " is not support by Python backend.") + .c_str()); break; } - FDDataType dtype = - DlpackToFDType(dl_managed_tensor->dl_tensor.dtype); + FDDataType dtype = DlpackToFDType(dl_managed_tensor->dl_tensor.dtype); PyCapsule_SetName(dlpack_tensor.ptr(), "used_dlpack"); FDTensor fd_tensor(name); - fd_tensor.SetExternalData( - dims, dtype, memory_ptr, device, device_id - ); + fd_tensor.SetExternalData(dims, dtype, memory_ptr, device, device_id); fd_tensor.is_pinned_memory = is_pinned_memory; return fd_tensor; } @@ -242,15 +238,52 @@ void BindFDTensor(pybind11::module& m) { .def_readonly("shape", &FDTensor::shape) .def_readonly("dtype", &FDTensor::dtype) .def_readonly("device", &FDTensor::device) - .def("numpy", [](FDTensor& self) { - return TensorToPyArray(self); - }) + .def("numpy", [](FDTensor& self) { return TensorToPyArray(self); }) .def("data", &FDTensor::MutableData) - .def("from_numpy", [](FDTensor& self, pybind11::array& pyarray, bool share_buffer = false) { - PyArrayToTensor(pyarray, &self, share_buffer); - }) + .def("from_numpy", + [](FDTensor& self, pybind11::array& pyarray, + bool share_buffer = false) { + PyArrayToTensor(pyarray, &self, share_buffer); + }) + .def("from_external_data", + [](const std::string& name, size_t data_addr, + const std::vector& shape, const std::string& data_type, + const std::string& data_place, int device_id) { + auto fd_data_type = FDDataType::UNKNOWN1; + if (data_type == "FP32") { + fd_data_type = FDDataType::FP32; + } else if (data_type == "FP16") { + fd_data_type = FDDataType::FP16; + } else if (data_type == "INT32") { + fd_data_type = FDDataType::INT32; + } else if (data_type == "INT64") { + fd_data_type = FDDataType::INT64; + } else { + FDASSERT(false, + "FDTensor.from_external_data, datatype \"%s\" is not " + "supported.", + data_type.c_str()); + } + + Device fd_data_place; + if (data_place.find("gpu") != data_place.npos) { + fd_data_place = Device::GPU; + } else { + FDASSERT(false, + ("Device type " + data_place + + " is not support by FDTensor.from_external_data.") + .c_str()); + } + void* data_ptr = nullptr; + data_ptr = reinterpret_cast(data_addr); + FDTensor fd_tensor(name); + fd_tensor.SetExternalData(shape, fd_data_type, + static_cast(data_ptr), + fd_data_place, device_id); + return fd_tensor; + }) .def("to_dlpack", &FDTensorToDLPack) - .def("from_dlpack",&FDTensorFromDLPack) + .def("from_dlpack", &FDTensorFromDLPack) .def("print_info", &FDTensor::PrintInfo); } diff --git a/fastdeploy/pybind/runtime.cc b/fastdeploy/pybind/runtime.cc index ca2f4886b..408c3ced2 100644 --- a/fastdeploy/pybind/runtime.cc +++ b/fastdeploy/pybind/runtime.cc @@ -110,6 +110,7 @@ void BindRuntime(pybind11::module& m) { return outputs; }) .def("bind_input_tensor", &Runtime::BindInputTensor) + .def("bind_output_tensor", &Runtime::BindOutputTensor) .def("infer", [](Runtime& self) { self.Infer(); }) .def("get_output_tensor", [](Runtime& self, const std::string& name) { diff --git a/fastdeploy/runtime/backends/paddle/paddle_backend.cc b/fastdeploy/runtime/backends/paddle/paddle_backend.cc index 19493f90b..49cb2532b 100644 
--- a/fastdeploy/runtime/backends/paddle/paddle_backend.cc
+++ b/fastdeploy/runtime/backends/paddle/paddle_backend.cc
@@ -25,6 +25,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
   if (option.device == Device::GPU) {
     config_.EnableUseGpu(option.gpu_mem_init_size, option.device_id);
     if (option_.external_stream_) {
+      FDINFO << "Will use external stream for Paddle Backend." << std::endl;
       config_.SetExecStream(option_.external_stream_);
     }
     if (option.enable_trt) {
@@ -47,7 +48,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
         config_.SetOptimCacheDir(option.trt_option.serialize_file);
       }
       config_.EnableTensorRtEngine(option.trt_option.max_workspace_size,
-                                   option.trt_option.max_batch_size, 3,
+                                   option.trt_option.max_batch_size, 20,
                                    precision, use_static);
       SetTRTDynamicShapeToConfig(option);
     }
@@ -124,9 +125,10 @@ bool PaddleBackend::InitFromPaddle(const std::string& model_buffer,
                      "file will save to the directory where paddle model saved."
                   << std::endl;
         use_static = true;
+        config_.SetOptimCacheDir(option.trt_option.serialize_file);
       }
       config_.EnableTensorRtEngine(option.trt_option.max_workspace_size,
-                                   option.trt_option.max_batch_size, 3,
+                                   option.trt_option.max_batch_size, 20,
                                    paddle_infer::PrecisionType::kInt8,
                                    use_static, false);
       SetTRTDynamicShapeToConfig(option);
@@ -223,23 +225,47 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
             << inputs_desc_.size() << ")." << std::endl;
     return false;
   }
+  // Sharing output memory with the backend only supports CPU and GPU.
+  if (option_.device == Device::IPU) {
+    copy_to_fd = true;
+  }
 
   RUNTIME_PROFILE_LOOP_H2D_D2H_BEGIN
   for (size_t i = 0; i < inputs.size(); ++i) {
     auto handle = predictor_->GetInputHandle(inputs[i].name);
     ShareTensorFromFDTensor(handle.get(), inputs[i]);
   }
+  std::unordered_set<std::string> prebinded_output_name;
+  // Prebound outputs are only supported on GPU.
+  if (!copy_to_fd) {
+    for (size_t i = 0; i < (*outputs).size(); ++i) {
+      auto output_name = (*outputs)[i].name;
+      // If an output is not prebound,
+      // its name is expected to be empty,
+      // so we skip it here.
+      if (output_name.empty()) {
+        continue;
+      }
+      // Record the prebound output name.
+      // These outputs do not need PaddleTensorToFDTensor
+      // after predictor_->Run().
+      prebinded_output_name.insert(output_name);
+      auto handle = predictor_->GetOutputHandle(output_name);
+      ShareOutTensorFromFDTensor(handle.get(), (*outputs)[i]);
+    }
+  }
 
   RUNTIME_PROFILE_LOOP_BEGIN(1)
   predictor_->Run();
   RUNTIME_PROFILE_LOOP_END
 
-  // output share backend memory only support CPU or GPU
-  if (option_.device == Device::IPU) {
-    copy_to_fd = true;
-  }
   outputs->resize(outputs_desc_.size());
   for (size_t i = 0; i < outputs_desc_.size(); ++i) {
+    // Skip prebound outputs.
+    if (copy_to_fd == false &&
+        prebinded_output_name.count(outputs_desc_[i].name)) {
+      continue;
+    }
     auto handle = predictor_->GetOutputHandle(outputs_desc_[i].name);
     if (copy_to_fd) {
       (*outputs)[i].is_pinned_memory = option_.enable_pinned_memory;
diff --git a/fastdeploy/runtime/backends/paddle/paddle_backend.h b/fastdeploy/runtime/backends/paddle/paddle_backend.h
index 02c430ade..60079fed6 100755
--- a/fastdeploy/runtime/backends/paddle/paddle_backend.h
+++ b/fastdeploy/runtime/backends/paddle/paddle_backend.h
@@ -35,6 +35,9 @@ paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device);
 // Share memory buffer with paddle_infer::Tensor from fastdeploy::FDTensor
 void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor, FDTensor& fd_tensor);
 
+void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
+                                FDTensor& fd_tensor);
+
 // convert paddle_infer::Tensor to fastdeploy::FDTensor
 // if copy_to_fd is true, copy memory data to FDTensor
 /// else share memory to FDTensor
@@ -89,4 +92,4 @@ class PaddleBackend : public BaseBackend {
   std::vector<TensorInfo> inputs_desc_;
   std::vector<TensorInfo> outputs_desc_;
 };
-}  // namespace fastdeploy
\ No newline at end of file
+}  // namespace fastdeploy
diff --git a/fastdeploy/runtime/backends/paddle/util.cc b/fastdeploy/runtime/backends/paddle/util.cc
index f117a49bc..bd7ff0944 100644
--- a/fastdeploy/runtime/backends/paddle/util.cc
+++ b/fastdeploy/runtime/backends/paddle/util.cc
@@ -61,6 +61,43 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
            Str(fd_tensor.dtype).c_str());
 }
 
+void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
+                                FDTensor& fd_tensor) {
+  std::vector<int> shape(fd_tensor.shape.begin(), fd_tensor.shape.end());
+  auto place = ConvertFDDeviceToPlace(fd_tensor.device);
+  if (fd_tensor.dtype == FDDataType::FP32) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<float*>(fd_tensor.MutableData()),
+                                shape, place);
+    } else {
+      tensor->CopyToCpu(static_cast<float*>(fd_tensor.MutableData()));
+    }
+    return;
+  } else if (fd_tensor.dtype == FDDataType::INT32) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<int32_t*>(fd_tensor.MutableData()),
+                                shape, place);
+    } else {
+      tensor->CopyToCpu(static_cast<int32_t*>(fd_tensor.MutableData()));
+    }
+    return;
+  } else if (fd_tensor.dtype == FDDataType::INT64) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<int64_t*>(fd_tensor.MutableData()),
+                                shape, place);
+    } else {
+      tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor.MutableData()));
+    }
+    return;
+  } else if (fd_tensor.dtype == FDDataType::UINT8) {
+    tensor->ShareExternalData(static_cast<uint8_t*>(fd_tensor.MutableData()),
+                              shape, paddle_infer::PlaceType::kCPU);
+    return;
+  }
+  FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
+           Str(fd_tensor.dtype).c_str());
+}
+
 void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
                             FDTensor* fd_tensor, bool copy_to_fd) {
   auto fd_dtype = PaddleDataTypeToFD(tensor->type());
diff --git a/fastdeploy/runtime/runtime.cc b/fastdeploy/runtime/runtime.cc
index 70714e4f0..3f3ccd031 100644
--- a/fastdeploy/runtime/runtime.cc
+++ b/fastdeploy/runtime/runtime.cc
@@ -198,6 +198,26 @@ void Runtime::BindInputTensor(const std::string& name, FDTensor& input) {
   }
 }
 
+void Runtime::BindOutputTensor(const std::string& name, FDTensor& output) {
+  bool is_exist = false;
+  for (auto& t : output_tensors_) {
+    if (t.name == name) {
+      // FDWARNING << "The output name [" << name << "] already exists." << std::endl;
+      is_exist = true;
+      t.SetExternalData(output.shape, output.dtype, output.MutableData(),
+                        output.device, output.device_id);
+      break;
+    }
+  }
+  if (!is_exist) {
+    // FDWARNING << "The output name [" << name << "] doesn't exist." <<
+    // std::endl;
+    FDTensor new_tensor(name);
+    new_tensor.SetExternalData(output.shape, output.dtype, output.MutableData(),
+                               output.device, output.device_id);
+    output_tensors_.emplace_back(std::move(new_tensor));
+  }
+}
 FDTensor* Runtime::GetOutputTensor(const std::string& name) {
   for (auto& t : output_tensors_) {
     if (t.name == name) {
diff --git a/fastdeploy/runtime/runtime.h b/fastdeploy/runtime/runtime.h
index 6e7dc9629..66ce9c94e 100755
--- a/fastdeploy/runtime/runtime.h
+++ b/fastdeploy/runtime/runtime.h
@@ -72,6 +72,12 @@ struct FASTDEPLOY_DECL Runtime {
   /** \brief Bind FDTensor by name, no copy and share input memory
    */
   void BindInputTensor(const std::string& name, FDTensor& input);
+
+  /** \brief Bind FDTensor by name, no copy and share output memory.
+   * Please make sure the shape of the output tensor is correct.
+   */
+  void BindOutputTensor(const std::string& name, FDTensor& output);
+
   /** \brief Get output FDTensor by name, no copy and share backend output memory
    */
   FDTensor* GetOutputTensor(const std::string& name);
@@ -99,7 +105,7 @@ struct FASTDEPLOY_DECL Runtime {
    */
   double GetProfileTime() {
     return backend_->benchmark_result_.time_of_runtime;
-  }
+  }
 
  private:
   void CreateOrtBackend();
diff --git a/python/fastdeploy/runtime.py b/python/fastdeploy/runtime.py
index 47659c98c..1251b955c 100644
--- a/python/fastdeploy/runtime.py
+++ b/python/fastdeploy/runtime.py
@@ -72,6 +72,14 @@ class Runtime:
         """
         self._runtime.bind_input_tensor(name, fdtensor)
 
+    def bind_output_tensor(self, name, fdtensor):
+        """Bind FDTensor by name, no copy and share output memory.
+
+        :param name: (str)The name of output data.
+        :param fdtensor: (fastdeploy.FDTensor)The output FDTensor.
+        """
+        self._runtime.bind_output_tensor(name, fdtensor)
+
     def zero_copy_infer(self):
         """No params inference the model.
 
@@ -656,7 +664,8 @@ class RuntimeOption:
                 continue
             if hasattr(getattr(self._option, attr), "__call__"):
                 continue
-            message += "  {} : {}\t\n".format(attr, getattr(self._option, attr))
+            message += "  {} : {}\t\n".format(attr,
+                                              getattr(self._option, attr))
         message.strip("\n")
         message += ")"
         return message
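
Usage sketch for the prebound-output path added by this patch. This is a minimal, hedged example, not part of the diff: the model paths, tensor names, output shape, and the GPU buffer address are placeholders, the input FDTensor is assumed to be prepared elsewhere, and FDTensor.from_external_data is assumed to be reachable (and callable unbound) through fastdeploy.C.FDTensor.

    import fastdeploy as fd

    option = fd.RuntimeOption()
    option.set_model_path("model.pdmodel", "model.pdiparams")  # placeholder paths
    option.use_gpu(0)
    option.use_paddle_infer_backend()
    runtime = fd.Runtime(option)

    # `out_addr` is assumed to be the address of a pre-allocated CUDA buffer that is
    # large enough for the output (for example, the data pointer of a framework
    # tensor); allocating it is outside the scope of this patch.
    out = fd.C.FDTensor.from_external_data(
        "output_0",            # placeholder output name
        out_addr,              # address of the pre-allocated GPU buffer
        [1, 1000],             # placeholder output shape
        "FP32", "gpu", 0)

    runtime.bind_input_tensor("x", input_tensor)   # input_tensor prepared elsewhere
    runtime.bind_output_tensor("output_0", out)    # prebind the output buffer
    runtime.zero_copy_infer()                      # results are written into out_addr

Because the output is prebound, PaddleBackend shares the external buffer via ShareExternalData before Run() and skips the PaddleTensorToFDTensor copy for that output afterwards, so the result stays in the caller-owned GPU memory.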