[Backend & Serving] Serving and Runtime support Clone (#464)

* Add Clone support to Serving and Runtime

* Support the TensorRT, OpenVINO and Paddle backends

Co-authored-by: Jason <jiangjiajun@baidu.com>
Author: heliqi
Date: 2022-11-04 17:16:40 +08:00
Committed by: GitHub
Parent: 61634caf28
Commit: 277bec38c7
13 changed files with 343 additions and 150 deletions (only the two TensorRT backend files appear below)
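
For context before the diffs: a minimal usage sketch of what this Clone capability enables at the Runtime level. This is not taken from the diff; Runtime::Clone mirroring the backend's (stream, device_id) signature, its Runtime* return type, and the exact RuntimeOption setters are assumptions based on the commit title and the FastDeploy API of this period.

// Hypothetical sketch: build one Runtime, then clone it; on the same
// device the clone shares the already-built TensorRT engine (see
// TrtBackend::Clone below) instead of building a second one.
#include <vector>
#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  option.SetModelPath("model.onnx", "", fastdeploy::ModelFormat::ONNX);
  option.UseGpu(0);
  option.UseTrtBackend();

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) return -1;

  // Same device id as the source: shares engine_, gets its own
  // IExecutionContext and CUDA stream (return type assumed).
  auto* clone = runtime.Clone(/*stream=*/nullptr, /*device_id=*/0);

  std::vector<fastdeploy::FDTensor> inputs, outputs;
  // ... fill and name inputs to match the model, then:
  clone->Infer(inputs, &outputs);
  return 0;
}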

fastdeploy/backends/tensorrt/trt_backend.cc

@@ -285,6 +285,7 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
BuildTrtEngine();
}
+  cudaSetDevice(option_.gpu_id);
SetInputs(inputs);
AllocateOutputsBuffer(outputs);
@@ -356,13 +357,17 @@ void TrtBackend::GetInputOutputInfo() {
outputs_device_buffer_[name] = FDDeviceBuffer(dtype);
casted_output_tensors_[name] = FDTensor();
}
+    io_name_index_[name] = i;
}
bindings_.resize(num_binds);
}
void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
for (const auto& item : inputs) {
-    auto idx = engine_->getBindingIndex(item.name.c_str());
+    // auto idx = engine_->getBindingIndex(item.name.c_str());
+    auto iter = io_name_index_.find(item.name);
+    FDASSERT(iter != io_name_index_.end(),
+             "TrtBackend::SetInputs cannot find input name: %s", item.name.c_str());
+    auto idx = iter->second;
std::vector<int> shape(item.shape.begin(), item.shape.end());
auto dims = ToDims(shape);
context_->setBindingDimensions(idx, dims);
@@ -410,7 +415,10 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs) {
outputs->resize(outputs_desc_.size());
}
for (size_t i = 0; i < outputs_desc_.size(); ++i) {
-    auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
+    // auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
+    auto idx_iter = io_name_index_.find(outputs_desc_[i].name);
+    FDASSERT(idx_iter != io_name_index_.end(),
+             "TrtBackend cannot find output name: %s", outputs_desc_[i].name.c_str());
+    auto idx = idx_iter->second;
auto output_dims = context_->getBindingDimensions(idx);
// find the original index of output
@@ -673,4 +681,47 @@ std::vector<TensorInfo> TrtBackend::GetOutputInfos() {
return infos;
}
+std::unique_ptr<BaseBackend> TrtBackend::Clone(void* stream, int device_id) {
+  std::unique_ptr<BaseBackend> new_backend = utils::make_unique<TrtBackend>();
+  auto casted_backend = dynamic_cast<TrtBackend*>(new_backend.get());
+  if (device_id > 0 && device_id != option_.gpu_id) {
+    // Different device: the engine cannot be shared across devices, so
+    // rebuild from the original model files under a copied option.
+    auto clone_option = option_;
+    clone_option.gpu_id = device_id;
+    clone_option.external_stream_ = stream;
+    if (option_.model_format == ModelFormat::ONNX) {
+      FDASSERT(casted_backend->InitFromOnnx(option_.model_file, clone_option),
+               "Clone model from ONNX failed while initialize TrtBackend.");
+    } else {
+      FDASSERT(casted_backend->InitFromPaddle(option_.model_file,
+                                              option_.params_file, clone_option),
+               "Clone model from Paddle failed while initialize TrtBackend.");
+    }
+    FDWARNING << "The target device id: " << device_id
+              << " is different from the current device id: " << option_.gpu_id
+              << ", cannot share memory with the current engine." << std::endl;
+    return new_backend;
+  }
+  // Same device: share the built engine; the clone only gets a fresh
+  // execution context and its own CUDA stream.
+  cudaSetDevice(option_.gpu_id);
+  casted_backend->option_.gpu_id = option_.gpu_id;
+  if (stream) {
+    casted_backend->stream_ = reinterpret_cast<cudaStream_t>(stream);
+  } else {
+    FDASSERT(cudaStreamCreate(&casted_backend->stream_) == 0,
+             "[ERROR] Error occurs while clone calling cudaStreamCreate().");
+  }
+  casted_backend->inputs_desc_.assign(inputs_desc_.begin(), inputs_desc_.end());
+  casted_backend->outputs_desc_.assign(outputs_desc_.begin(), outputs_desc_.end());
+  casted_backend->outputs_order_.insert(outputs_order_.begin(), outputs_order_.end());
+  casted_backend->shape_range_info_.insert(shape_range_info_.begin(),
+                                           shape_range_info_.end());
+  casted_backend->engine_ = engine_;
+  casted_backend->context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
+      casted_backend->engine_->createExecutionContext());
+  casted_backend->GetInputOutputInfo();
+  FDINFO << "TrtBackend clone finished." << std::endl;
+  return new_backend;
+}
} // namespace fastdeploy
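
Clone() above takes two paths. If the requested device differs from the engine's, the clone re-initializes from the original model files under a copied option and the FDWARNING fires. Otherwise the clone shares the built engine_ and only gets its own IExecutionContext and CUDA stream, which is what makes per-thread clones cheap. A hedged sketch of the serving pattern this enables; the worker code, the Runtime* return type of Clone, and the one-clone-per-thread discipline are assumptions, not part of this commit:

#include <thread>
#include <vector>
#include "fastdeploy/runtime.h"

// One TensorRT engine, N execution contexts and streams: each worker
// clones on the same GPU, so the weights are loaded exactly once.
void Serve(fastdeploy::Runtime* base, int num_workers) {
  std::vector<std::thread> workers;
  for (int i = 0; i < num_workers; ++i) {
    workers.emplace_back([base] {
      auto* rt = base->Clone();  // defaults: same device, new stream
      std::vector<fastdeploy::FDTensor> inputs, outputs;
      // ... per request: fill inputs, then
      rt->Infer(inputs, &outputs);
    });
  }
  for (auto& w : workers) w.join();
}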

fastdeploy/backends/tensorrt/trt_backend.h

@@ -25,6 +25,7 @@
#include "NvOnnxParser.h"
#include "fastdeploy/backends/backend.h"
#include "fastdeploy/backends/tensorrt/utils.h"
#include "fastdeploy/utils/unique_ptr.h"
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
public:
@@ -45,7 +46,7 @@ class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
void writeCalibrationCache(const void* cache,
size_t length) noexcept override {
std::cout << "NOT IMPLEMENT." << std::endl;
fastdeploy::FDERROR << "NOT IMPLEMENT." << std::endl;
}
private:
@@ -62,6 +63,11 @@ struct TrtValueInfo {
};
struct TrtBackendOption {
+  std::string model_file = "";   // Path of model file
+  std::string params_file = "";  // Path of parameters file, can be empty
+  // Format of the input model
+  ModelFormat model_format = ModelFormat::AUTOREC;
int gpu_id = 0;
bool enable_fp16 = false;
bool enable_int8 = false;
@@ -99,6 +105,8 @@ class TrtBackend : public BaseBackend {
TensorInfo GetOutputInfo(int index);
std::vector<TensorInfo> GetInputInfos() override;
std::vector<TensorInfo> GetOutputInfos() override;
+  std::unique_ptr<BaseBackend> Clone(void* stream = nullptr,
+                                     int device_id = -1) override;
~TrtBackend() {
if (parser_) {
@@ -119,6 +127,7 @@ class TrtBackend : public BaseBackend {
std::vector<TrtValueInfo> outputs_desc_;
std::map<std::string, FDDeviceBuffer> inputs_device_buffer_;
std::map<std::string, FDDeviceBuffer> outputs_device_buffer_;
+  std::map<std::string, int> io_name_index_;
std::string calibration_str_;
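
The new io_name_index_ member backs the lookups in SetInputs() and AllocateOutputsBuffer() above: binding indices are cached once while GetInputOutputInfo() enumerates the engine's bindings, so the per-inference engine_->getBindingIndex() calls (left commented out in the diff) are no longer needed, and a cloned backend resolves names against its own table built from its own context. A standalone sketch of that caching pattern, with a plain assert standing in for FDASSERT:

#include <cassert>
#include <map>
#include <string>

class BindingTable {
 public:
  // Fill once, e.g. while enumerating engine bindings at init time.
  void Register(const std::string& name, int index) {
    io_name_index_[name] = index;
  }

  // Hot-path lookup used on every inference call.
  int IndexOf(const std::string& name) const {
    auto iter = io_name_index_.find(name);
    assert(iter != io_name_index_.end() && "unknown binding name");
    return iter->second;
  }

 private:
  std::map<std::string, int> io_name_index_;
};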