Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[Serving][backend] serving support multi stream and backend support external stream (#431)

* serving support multi stream
* pybind add external stream

Co-authored-by: Jason <jiangjiajun@baidu.com>
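The commit threads a caller-owned CUDA stream (type-erased as a void*) from RuntimeOption down into the ONNX Runtime, Paddle Inference, and TensorRT backends, so the Triton serving layer can run every backend on its own per-instance stream instead of each backend creating a private one. A minimal sketch of how the new API is meant to be driven from C++ (model paths are placeholders; Runtime::Init is the existing FastDeploy entry point, unchanged by this commit):

    #include <cuda_runtime_api.h>

    #include "fastdeploy/runtime.h"

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);  // caller-owned stream

      fastdeploy::RuntimeOption option;
      option.SetModelPath("model.pdmodel", "model.pdiparams");  // placeholder paths
      option.UseGpu(0);
      option.SetExternalStream(static_cast<void*>(stream));  // new in this commit

      fastdeploy::Runtime runtime;
      runtime.Init(option);  // each backend now picks up option.external_stream_
      return 0;
    }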
@@ -63,6 +63,10 @@ void OrtBackend::BuildOption(const OrtBackendOption& option) {
   } else {
     OrtCUDAProviderOptions cuda_options;
     cuda_options.device_id = option.gpu_id;
+    if (option.external_stream_) {
+      cuda_options.has_user_compute_stream = 1;
+      cuda_options.user_compute_stream = option.external_stream_;
+    }
     session_options_.AppendExecutionProvider_CUDA(cuda_options);
   }
 }
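In ONNX Runtime terms this maps directly onto the CUDA execution provider's user compute stream: when has_user_compute_stream is set, ORT enqueues its kernels on the supplied stream instead of creating an internal one. The same knob used standalone against the ORT C++ API (a sketch; device id 0 is illustrative):

    #include <cuda_runtime_api.h>
    #include <onnxruntime_cxx_api.h>

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      OrtCUDAProviderOptions cuda_options;
      cuda_options.device_id = 0;                 // illustrative device
      cuda_options.has_user_compute_stream = 1;   // don't create an internal stream
      cuda_options.user_compute_stream = stream;  // enqueue kernels on the caller's stream

      Ort::SessionOptions session_options;
      session_options.AppendExecutionProvider_CUDA(cuda_options);
      return 0;
    }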
@@ -44,6 +44,7 @@ struct OrtBackendOption {
   int execution_mode = -1;
   bool use_gpu = false;
   int gpu_id = 0;
+  void* external_stream_ = nullptr;
 
   // inside parameter, maybe remove next version
   bool remove_multiclass_nms_ = false;
@@ -66,7 +67,8 @@ class OrtBackend : public BaseBackend {
                  const OrtBackendOption& option = OrtBackendOption(),
                  bool from_memory_buffer = false);
 
-  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs) override;
+  bool Infer(std::vector<FDTensor>& inputs,
+             std::vector<FDTensor>* outputs) override;
 
   int NumInputs() const override { return inputs_desc_.size(); }
 
@@ -22,6 +22,9 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
   option_ = option;
   if (option.use_gpu) {
     config_.EnableUseGpu(option.gpu_mem_init_size, option.gpu_id);
+    if (option_.external_stream_) {
+      config_.SetExecStream(option_.external_stream_);
+    }
     if (option.enable_trt) {
 #ifdef ENABLE_TRT_BACKEND
       auto precision = paddle_infer::PrecisionType::kFloat32;
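The Paddle backend forwards the same pointer through paddle_infer::Config::SetExecStream, which takes the raw cudaStream_t as a void*. The equivalent standalone configuration looks roughly like this (a sketch assuming a Paddle Inference build that ships SetExecStream; paths and sizes are placeholders):

    #include <cuda_runtime_api.h>

    #include "paddle_inference_api.h"  // header path varies by install

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      paddle_infer::Config config;
      config.SetModel("model.pdmodel", "model.pdiparams");  // placeholder paths
      config.EnableUseGpu(/*memory_pool_init_size_mb=*/100, /*device_id=*/0);
      config.SetExecStream(static_cast<void*>(stream));     // run on this stream
      return 0;
    }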
@@ -54,6 +54,7 @@ struct PaddleBackendOption {
   // gpu device id
   int gpu_id = 0;
   bool enable_pinned_memory = false;
+  void* external_stream_ = nullptr;
 
   std::vector<std::string> delete_pass_names = {};
 };
@@ -258,8 +258,12 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
         ReaderDtypeToTrtDtype(onnx_reader.outputs[i].dtype);
   }
 
+  if (option_.external_stream_) {
+    stream_ = reinterpret_cast<cudaStream_t>(option_.external_stream_);
+  } else {
   FDASSERT(cudaStreamCreate(&stream_) == 0,
            "[ERROR] Error occurs while calling cudaStreamCreate().");
+  }
 
   if (!CreateTrtEngineFromOnnx(onnx_content)) {
     FDERROR << "Failed to create tensorrt engine." << std::endl;
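Note the asymmetry this hunk introduces: with an external stream the TRT backend merely borrows it (reinterpret_cast back from the type-erased void*), while without one it creates, and therefore owns, stream_. One practical payoff of sharing a single stream is that the caller's copies and the backend's kernels serialize by stream ordering alone, with no device-wide synchronization (a sketch; buffer names and sizes are illustrative):

    #include <cuda_runtime_api.h>

    #include <vector>

    void example(cudaStream_t stream) {
      std::vector<float> host_input(3 * 224 * 224);  // illustrative size
      float* dev_input = nullptr;
      size_t nbytes = host_input.size() * sizeof(float);
      cudaMalloc(&dev_input, nbytes);

      cudaMemcpyAsync(dev_input, host_input.data(), nbytes,
                      cudaMemcpyHostToDevice, stream);  // caller's copy ...
      // ... runtime.Infer(inputs, &outputs) would enqueue onto the same stream ...
      cudaStreamSynchronize(stream);  // one ordering point, not cudaDeviceSynchronize()
    }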
@@ -71,6 +71,7 @@ struct TrtBackendOption {
   std::map<std::string, std::vector<int32_t>> opt_shape;
   std::string serialize_file = "";
   bool enable_pinned_memory = false;
+  void* external_stream_ = nullptr;
 
   // inside parameter, maybe remove next version
   bool remove_multiclass_nms_ = false;
@@ -22,6 +22,7 @@ void BindRuntime(pybind11::module& m) {
       .def("set_model_path", &RuntimeOption::SetModelPath)
      .def("use_gpu", &RuntimeOption::UseGpu)
      .def("use_cpu", &RuntimeOption::UseCpu)
+      .def("set_external_stream", &RuntimeOption::SetExternalStream)
      .def("set_cpu_thread_num", &RuntimeOption::SetCpuThreadNum)
      .def("use_paddle_backend", &RuntimeOption::UsePaddleBackend)
      .def("use_poros_backend", &RuntimeOption::UsePorosBackend)
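A caveat on the new binding: pybind11 maps a void* parameter to a Python capsule, so a raw integer stream handle (what most Python CUDA libraries expose) cannot be passed to set_external_stream directly. A friendlier overload, purely hypothetical and not part of this commit, could accept the handle as an integer and cast:

    // Hypothetical convenience binding (not in this commit): accept the handle as
    // an integer and cast it back to the pointer the C++ side expects.
    .def("set_external_raw_stream",
         [](fastdeploy::RuntimeOption& self, std::uintptr_t stream_handle) {
           self.SetExternalStream(reinterpret_cast<void*>(stream_handle));
         })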
@@ -52,6 +53,7 @@ void BindRuntime(pybind11::module& m) {
       .def_readwrite("params_file", &RuntimeOption::params_file)
      .def_readwrite("model_format", &RuntimeOption::model_format)
      .def_readwrite("backend", &RuntimeOption::backend)
+      .def_readwrite("external_stream_", &RuntimeOption::external_stream_)
      .def_readwrite("cpu_thread_num", &RuntimeOption::cpu_thread_num)
      .def_readwrite("device_id", &RuntimeOption::device_id)
      .def_readwrite("device", &RuntimeOption::device)
@@ -223,6 +223,10 @@ void RuntimeOption::UseGpu(int gpu_id) {
 
 void RuntimeOption::UseCpu() { device = Device::CPU; }
 
+void RuntimeOption::SetExternalStream(void* external_stream) {
+  external_stream_ = external_stream;
+}
+
 void RuntimeOption::SetCpuThreadNum(int thread_num) {
   FDASSERT(thread_num > 0, "The thread_num must be greater than 0.");
   cpu_thread_num = thread_num;
@@ -508,6 +512,7 @@ void Runtime::CreatePaddleBackend() {
   pd_option.delete_pass_names = option.pd_delete_pass_names;
   pd_option.cpu_thread_num = option.cpu_thread_num;
   pd_option.enable_pinned_memory = option.enable_pinned_memory;
+  pd_option.external_stream_ = option.external_stream_;
 #ifdef ENABLE_TRT_BACKEND
   if (pd_option.use_gpu && option.pd_enable_trt) {
     pd_option.enable_trt = true;
@@ -574,6 +579,7 @@ void Runtime::CreateOrtBackend() {
   ort_option.execution_mode = option.ort_execution_mode;
   ort_option.use_gpu = (option.device == Device::GPU) ? true : false;
   ort_option.gpu_id = option.device_id;
+  ort_option.external_stream_ = option.external_stream_;
 
   // TODO(jiangjiajun): inside usage, maybe remove this later
   ort_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
@@ -613,6 +619,7 @@ void Runtime::CreateTrtBackend() {
   trt_option.opt_shape = option.trt_opt_shape;
   trt_option.serialize_file = option.trt_serialize_file;
   trt_option.enable_pinned_memory = option.enable_pinned_memory;
+  trt_option.external_stream_ = option.external_stream_;
 
   // TODO(jiangjiajun): inside usage, maybe remove this later
   trt_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
@@ -102,6 +102,8 @@ struct FASTDEPLOY_DECL RuntimeOption {
   /// Use Nvidia GPU to inference
   void UseGpu(int gpu_id = 0);
 
+  void SetExternalStream(void* external_stream);
+
   /*
    * @brief Set number of cpu threads while inference on CPU, by default it will decided by the different backends
    */
@@ -232,6 +234,8 @@ struct FASTDEPLOY_DECL RuntimeOption {
 
   Device device = Device::CPU;
 
+  void* external_stream_ = nullptr;
+
   bool enable_pinned_memory = false;
 
   // ======Only for ORT Backend========
@@ -379,6 +379,7 @@ TRITONSERVER_Error* ModelState::LoadModel(
   if ((instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) ||
       (instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_AUTO)) {
     runtime_options_->UseGpu(instance_group_device_id);
+    runtime_options_->SetExternalStream((void*)stream);
   } else {
     runtime_options_->UseCpu();
   }
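Here `stream` is the CUDA stream Triton already manages for this model instance, type-erased with a C-style cast to match the void* plumbing. Because Triton owns that stream, the borrow-don't-own behavior added to TrtBackend above matters: FastDeploy only calls cudaStreamCreate, and is only responsible for a stream's lifetime, when no external one is provided.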
@@ -1001,9 +1002,7 @@ TRITONSERVER_Error* ModelInstanceState::Run(
   runtime_->Infer(input_tensors_, &output_tensors_);
 #ifdef TRITON_ENABLE_GPU
   if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
-    // TODO: stream controll
-    cudaDeviceSynchronize();
-    // cudaStreamSynchronize(CudaStream());
+    cudaStreamSynchronize(CudaStream());
   }
 #endif
   return nullptr;
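This last hunk is the payoff on the serving side: since the runtime now does all of its work on Triton's own stream (CudaStream()), a stream-level synchronization is sufficient, and the heavyweight cudaDeviceSynchronize(), which stalls every stream on the device and would defeat multi-stream serving, can be dropped.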