[Model] Yolov5/v5lite/v6/v7/v7end2end: CUDA preprocessing (#370)
* add yolo cuda preprocessing
* cmake build cuda src
* yolov5 support cuda preprocessing
* yolov5 cuda preprocessing configurable
* yolov5 update get mat data api
* yolov5 check cuda preprocess args
* refactor cuda function name
* yolo cuda preprocess padding value configurable
* yolov5 release cuda memory
* cuda preprocess pybind api update
* move use_cuda_preprocessing option to yolov5 model
* yolov5lite cuda preprocessing
* yolov6 cuda preprocessing
* yolov7 cuda preprocessing
* yolov7_e2e cuda preprocessing
* remove cuda preprocessing in runtime option
* refine log and cmake variable name
* fix model runtime ptr type

Co-authored-by: Jason <jiangjiajun@baidu.com>
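For context, a minimal caller-side sketch of how the new API fits together. The model path, image path, and buffer size below are placeholders, not part of this commit:

#include <opencv2/opencv.hpp>
#include "fastdeploy/vision.h"

int main() {
  auto model = fastdeploy::vision::detection::YOLOv7("yolov7.onnx");
  if (!model.Initialized()) return -1;

  // Opt in to CUDA preprocessing before the first Predict call. The argument
  // is the largest expected input size in pixels (height * width); it is used
  // to allocate the pinned-host and device staging buffers once up front.
  model.UseCudaPreprocessing(1920 * 1080);

  cv::Mat im = cv::imread("test.jpg");
  fastdeploy::vision::DetectionResult res;
  if (!model.Predict(&im, &res)) return -1;
  return 0;
}

Note that UseCudaPreprocessing forces is_scale_up = true and CudaPreprocess rejects is_mini_pad and is_no_pad, so callers should keep the default letterbox options when enabling the CUDA path.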
@@ -15,6 +15,9 @@
 #include "fastdeploy/vision/detection/contrib/yolov7.h"
 #include "fastdeploy/utils/perf.h"
 #include "fastdeploy/vision/utils/utils.h"
+#ifdef ENABLE_CUDA_PREPROCESS
+#include "fastdeploy/vision/utils/cuda_utils.h"
+#endif  // ENABLE_CUDA_PREPROCESS

 namespace fastdeploy {
 namespace vision {
@@ -106,6 +109,16 @@ bool YOLOv7::Initialize() {
   return true;
 }

+YOLOv7::~YOLOv7() {
+#ifdef ENABLE_CUDA_PREPROCESS
+  if (use_cuda_preprocessing_) {
+    CUDA_CHECK(cudaFreeHost(input_img_cuda_buffer_host_));
+    CUDA_CHECK(cudaFree(input_img_cuda_buffer_device_));
+    CUDA_CHECK(cudaFree(input_tensor_cuda_buffer_device_));
+  }
+#endif  // ENABLE_CUDA_PREPROCESS
+}
+
 bool YOLOv7::Preprocess(Mat* mat, FDTensor* output,
                         std::map<std::string, std::array<float, 2>>* im_info) {
   // process after image load
@@ -145,6 +158,65 @@ bool YOLOv7::Preprocess(Mat* mat, FDTensor* output,
   return true;
 }

+void YOLOv7::UseCudaPreprocessing(int max_image_size) {
+#ifdef ENABLE_CUDA_PREPROCESS
+  use_cuda_preprocessing_ = true;
+  is_scale_up = true;
+  if (input_img_cuda_buffer_host_ == nullptr) {
+    // prepare input data cache in GPU pinned memory
+    CUDA_CHECK(cudaMallocHost((void**)&input_img_cuda_buffer_host_,
+                              max_image_size * 3));
+    // prepare input data cache in GPU device memory
+    CUDA_CHECK(cudaMalloc((void**)&input_img_cuda_buffer_device_,
+                          max_image_size * 3));
+    CUDA_CHECK(cudaMalloc((void**)&input_tensor_cuda_buffer_device_,
+                          3 * size[0] * size[1] * sizeof(float)));
+  }
+#else
+  FDWARNING << "FastDeploy was not compiled with BUILD_CUDA_SRC=ON, "
+               "CUDA preprocessing is not available."
+            << std::endl;
+  use_cuda_preprocessing_ = false;
+#endif
+}
+
+bool YOLOv7::CudaPreprocess(
+    Mat* mat, FDTensor* output,
+    std::map<std::string, std::array<float, 2>>* im_info) {
+#ifdef ENABLE_CUDA_PREPROCESS
+  if (is_mini_pad != false || is_no_pad != false || is_scale_up != true) {
+    FDERROR << "Preprocessing with CUDA is only available when the arguments "
+               "satisfy (is_mini_pad=false, is_no_pad=false, is_scale_up=true)."
+            << std::endl;
+    return false;
+  }
+
+  // Record the shape of image and the shape of preprocessed image
+  (*im_info)["input_shape"] = {static_cast<float>(mat->Height()),
+                               static_cast<float>(mat->Width())};
+  (*im_info)["output_shape"] = {static_cast<float>(mat->Height()),
+                                static_cast<float>(mat->Width())};
+
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+  int src_img_buf_size = mat->Height() * mat->Width() * mat->Channels();
+  memcpy(input_img_cuda_buffer_host_, mat->Data(), src_img_buf_size);
+  CUDA_CHECK(cudaMemcpyAsync(input_img_cuda_buffer_device_,
+                             input_img_cuda_buffer_host_, src_img_buf_size,
+                             cudaMemcpyHostToDevice, stream));
+  utils::CudaYoloPreprocess(input_img_cuda_buffer_device_, mat->Width(),
+                            mat->Height(), input_tensor_cuda_buffer_device_,
+                            size[0], size[1], padding_value, stream);
+  cudaStreamSynchronize(stream);
+  cudaStreamDestroy(stream);
+
+  // Record output shape of preprocessed image
+  (*im_info)["output_shape"] = {static_cast<float>(size[0]),
+                                static_cast<float>(size[1])};
+
+  output->SetExternalData({mat->Channels(), size[0], size[1]},
+                          FDDataType::FP32, input_tensor_cuda_buffer_device_);
+  output->device = Device::GPU;
+  output->shape.insert(output->shape.begin(), 1);  // reshape to n, c, h, w
+  return true;
+#else
+  FDERROR << "FastDeploy was not compiled with BUILD_CUDA_SRC=ON, "
+             "CUDA preprocessing is not available."
+          << std::endl;
+  return false;
+#endif  // ENABLE_CUDA_PREPROCESS
+}
+
 bool YOLOv7::Postprocess(
     FDTensor& infer_result, DetectionResult* result,
     const std::map<std::string, std::array<float, 2>>& im_info,
@@ -227,9 +299,16 @@ bool YOLOv7::Predict(cv::Mat* im, DetectionResult* result, float conf_threshold,
   im_info["output_shape"] = {static_cast<float>(mat.Height()),
                              static_cast<float>(mat.Width())};

-  if (!Preprocess(&mat, &input_tensors[0], &im_info)) {
-    FDERROR << "Failed to preprocess input image." << std::endl;
-    return false;
-  }
+  if (use_cuda_preprocessing_) {
+    if (!CudaPreprocess(&mat, &input_tensors[0], &im_info)) {
+      FDERROR << "Failed to preprocess input image." << std::endl;
+      return false;
+    }
+  } else {
+    if (!Preprocess(&mat, &input_tensors[0], &im_info)) {
+      FDERROR << "Failed to preprocess input image." << std::endl;
+      return false;
+    }
+  }

   input_tensors[0].name = InputInfoOfRuntime(0).name;
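The utils::CudaYoloPreprocess kernel invoked above is defined in cuda_utils, not in this file. As a rough illustration only — not the commit's actual kernel, which presumably uses bilinear sampling — the letterbox transform it performs (aspect-preserving resize, constant padding, packed BGR uint8 HWC in, normalized RGB float CHW out) looks like this nearest-neighbor sketch:

#include <cstdint>
#include <cuda_runtime.h>

// Illustrative only: shows the letterbox math, not FastDeploy's real kernel.
__global__ void LetterboxSketch(const uint8_t* src, int src_w, int src_h,
                                float* dst, int dst_w, int dst_h,
                                float scale, int pad_x, int pad_y,
                                float pad_value) {
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= dst_w || y >= dst_h) return;

  int plane = dst_w * dst_h;  // one CHW plane per output channel
  for (int c = 0; c < 3; ++c) {
    float v = pad_value;  // constant border outside the resized image
    if (x >= pad_x && y >= pad_y) {
      int sx = static_cast<int>((x - pad_x) / scale);
      int sy = static_cast<int>((y - pad_y) / scale);
      if (sx < src_w && sy < src_h) {
        // Gather from packed BGR HWC input, swapping to RGB.
        v = static_cast<float>(src[(sy * src_w + sx) * 3 + (2 - c)]);
      }
    }
    dst[c * plane + y * dst_w + x] = v / 255.0f;  // normalize to [0, 1]
  }
}

Here scale = min(dst_w / (float)src_w, dst_h / (float)src_h) and pad_x/pad_y center the resized image, matching the CPU letterbox path with is_scale_up=true.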