Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-06 17:17:14 +08:00
[Backend] Add TensorRT FP16 support for AdaptivePool2d (#1116)
* add fp16 cuda kernel
* fix code bug
* update code
54  fastdeploy/runtime/backends/common/cuda/adaptive_pool2d_kernel.cu  (Executable file → Normal file)
fastdeploy/runtime/backends/common/cuda/adaptive_pool2d_kernel.cu

@@ -17,8 +17,8 @@
 #include "adaptive_pool2d_kernel.h"
 
 namespace fastdeploy {
-
-__global__ void CudaCastKernel(const float* in, float* out, int edge,
+template <typename T1, typename T2>
+__global__ void CudaCastKernel(const T1* in, T2* out, int edge,
                                int out_bc_offset, int in_bc_offset, int ih,
                                int iw, int oh, int ow, bool is_avg) {
   int position = blockDim.x * blockIdx.x + threadIdx.x;
@@ -32,29 +32,37 @@ __global__ void CudaCastKernel(const float* in, float* out, int edge,
   int hend = ceilf(static_cast<float>((h + 1) * ih) / oh);
   int wstart = floorf(static_cast<float>(w * iw) / ow);
   int wend = ceilf(static_cast<float>((w + 1) * iw) / ow);
+  float ele_val = 0.0;
   if (is_avg) {
-    out[position] = 0.0;
+    ele_val = 0.0;
   } else {
-    out[position] = in[offset * in_bc_offset + hstart * iw + wstart];
+    ele_val =
+        static_cast<float>(in[offset * in_bc_offset + hstart * iw + wstart]);
   }
   for (int h = hstart; h < hend; ++h) {
     for (int w = wstart; w < wend; ++w) {
       int input_idx = h * iw + w;
       if (is_avg) {
-        out[position] = out[position] + in[offset * in_bc_offset + input_idx];
+        ele_val =
+            ele_val + static_cast<float>(in[offset * in_bc_offset + input_idx]);
       } else {
-        out[position] =
-            max(out[position], in[offset * in_bc_offset + input_idx]);
+        ele_val =
+            (ele_val >
+             static_cast<float>(in[offset * in_bc_offset + input_idx]))
+                ? ele_val
+                : static_cast<float>(in[offset * in_bc_offset + input_idx]);
       }
     }
   }
-  out[position] = out[position] / ((hend - hstart) * (wend - wstart));
+  out[position] = static_cast<T2>(
+      ele_val / static_cast<float>(((hend - hstart) * (wend - wstart))));
 }
 
 void CudaAdaptivePool(const std::vector<int64_t>& input_dims,
-                      const std::vector<int64_t>& output_dims, float* output,
-                      const float* input, void* compute_stream,
-                      const std::string& pooling_type) {
+                      const std::vector<int64_t>& output_dims, void* output,
+                      const void* input, void* compute_stream,
+                      const std::string& pooling_type, const std::string& dtype,
+                      const std::string& out_dtype) {
   auto casted_compute_stream = reinterpret_cast<cudaStream_t>(compute_stream);
   int out_bc_offset = output_dims[2] * output_dims[3];
   int in_bc_offset = input_dims[2] * input_dims[3];
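For reference, a minimal host-side sketch (not part of this commit) of the window arithmetic the kernel uses above. The values ih = 5 and oh = 3 are illustrative; they show how the floor/ceil expressions produce overlapping adaptive windows:

// Illustrative only: reproduces the kernel's hstart/hend arithmetic on the
// host (std::floor/std::ceil stand in for the device floorf/ceilf) for a
// hypothetical input height of 5 pooled to an output height of 3.
#include <cmath>
#include <cstdio>

int main() {
  const int ih = 5, oh = 3;
  for (int h = 0; h < oh; ++h) {
    int hstart = static_cast<int>(std::floor(static_cast<float>(h * ih) / oh));
    int hend = static_cast<int>(std::ceil(static_cast<float>((h + 1) * ih) / oh));
    // Prints: h=0 -> [0,2)  h=1 -> [1,4)  h=2 -> [3,5)
    printf("h=%d -> [%d,%d)\n", h, hstart, hend);
  }
  return 0;
}

The window expressions themselves are unchanged by the commit; what changes is that the accumulation now happens in a local float ele_val instead of out[position], so the result can be cast to T2 once at the end.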
@@ -65,9 +73,27 @@ void CudaAdaptivePool(const std::vector<int64_t>& input_dims,
   bool is_avg = pooling_type == "avg";
   int threads = 256;
   int blocks = ceil(jobs / static_cast<float>(threads));
-  CudaCastKernel<<<blocks, threads, 0, casted_compute_stream>>>(
-      input, output, jobs, out_bc_offset, in_bc_offset, int(input_dims[2]),
-      int(input_dims[3]), int(output_dims[2]), int(output_dims[3]), is_avg);
+  if (dtype == "float") {
+    CudaCastKernel<float, float><<<blocks, threads, 0, casted_compute_stream>>>(
+        static_cast<const float*>(input), static_cast<float*>(output), jobs,
+        out_bc_offset, in_bc_offset, int(input_dims[2]), int(input_dims[3]),
+        int(output_dims[2]), int(output_dims[3]), is_avg);
+  } else if (dtype == "half") {
+    if (out_dtype == "half") {
+      CudaCastKernel<half, half><<<blocks, threads, 0, casted_compute_stream>>>(
+          static_cast<const half*>(input), static_cast<half*>(output), jobs,
+          out_bc_offset, in_bc_offset, int(input_dims[2]), int(input_dims[3]),
+          int(output_dims[2]), int(output_dims[3]), is_avg);
+    }
+    if (out_dtype == "float") {
+      CudaCastKernel<half, float>
+          <<<blocks, threads, 0, casted_compute_stream>>>(
+              static_cast<const half*>(input), static_cast<float*>(output),
+              jobs, out_bc_offset, in_bc_offset, int(input_dims[2]),
+              int(input_dims[3]), int(output_dims[2]), int(output_dims[3]),
+              is_avg);
+    }
+  }
 }
 
 } // namespace fastdeploy
 #endif
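A minimal sketch of how the new entry point might be called for an FP16 input producing an FP32 output. The wrapper name RunHalfAvgPool, the buffer names, and the shapes are illustrative assumptions; only the CudaAdaptivePool signature and the "avg"/"half"/"float" strings come from this commit:

// Hypothetical caller: average-pools a 1x8x32x32 half tensor down to 1x8x1x1
// float. Assumes d_in and d_out are valid device buffers and stream is a
// valid CUDA stream created by the caller.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <vector>

#include "adaptive_pool2d_kernel.h"

void RunHalfAvgPool(const half* d_in, float* d_out, cudaStream_t stream) {
  std::vector<int64_t> in_dims = {1, 8, 32, 32};  // NCHW input shape
  std::vector<int64_t> out_dims = {1, 8, 1, 1};   // adaptive output shape
  fastdeploy::CudaAdaptivePool(in_dims, out_dims, static_cast<void*>(d_out),
                               static_cast<const void*>(d_in), stream, "avg",
                               "half", "float");
}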
fastdeploy/runtime/backends/common/cuda/adaptive_pool2d_kernel.h

@@ -15,6 +15,7 @@
 
 #pragma once
 
+#include <cuda_fp16.h>
 #include <cstdint>
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -25,8 +26,10 @@
 namespace fastdeploy {
 
 void CudaAdaptivePool(const std::vector<int64_t>& input_dims,
-                      const std::vector<int64_t>& output_dims, float* output,
-                      const float* input, void* compute_stream,
-                      const std::string& pooling_type);
+                      const std::vector<int64_t>& output_dims, void* output,
+                      const void* input, void* compute_stream,
+                      const std::string& pooling_type,
+                      const std::string& dtype = "float",
+                      const std::string& out_dtype = "float");
 
 } // namespace fastdeploy
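Since dtype and out_dtype default to "float", call sites written against the old six-argument signature should keep compiling and keep selecting the FP32 kernel. A minimal sketch of such an unchanged call, with hypothetical buffer and stream arguments:

#include <vector>

#include "adaptive_pool2d_kernel.h"

// Hypothetical legacy call site: no dtype/out_dtype arguments, so the new
// defaults ("float", "float") apply and behaviour matches the old API.
void LegacyFloatCall(void* d_out, const void* d_in, void* stream) {
  std::vector<int64_t> in_dims = {1, 8, 32, 32};
  std::vector<int64_t> out_dims = {1, 8, 1, 1};
  fastdeploy::CudaAdaptivePool(in_dims, out_dims, d_out, d_in, stream, "avg");
}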
AdaptivePool2d TensorRT plugin source:

@@ -63,11 +63,6 @@ int AdaptivePool2d::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
                             const nvinfer1::PluginTensorDesc* outputDesc,
                             const void* const* inputs, void* const* outputs,
                             void* workspace, cudaStream_t stream) noexcept {
-  if (inputDesc[0].type != nvinfer1::DataType::kFLOAT) {
-    return -1;
-  }
-  auto const* data = static_cast<float const*>(inputs[0]);
-  auto* result = static_cast<float*>(outputs[0]);
   int nums = outputDesc[0].dims.d[0] * outputDesc[0].dims.d[1] *
              outputDesc[0].dims.d[2] * outputDesc[0].dims.d[3];
   std::vector<int64_t> input_size, output_size;
||||||
@@ -75,8 +70,18 @@ int AdaptivePool2d::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
|
|||||||
input_size.push_back(inputDesc[0].dims.d[i]);
|
input_size.push_back(inputDesc[0].dims.d[i]);
|
||||||
output_size.push_back(outputDesc[0].dims.d[i]);
|
output_size.push_back(outputDesc[0].dims.d[i]);
|
||||||
}
|
}
|
||||||
CudaAdaptivePool(input_size, output_size, result, data, stream,
|
if (inputDesc[0].type == nvinfer1::DataType::kHALF) {
|
||||||
pooling_type_);
|
if (outputDesc[0].type == nvinfer1::DataType::kHALF) {
|
||||||
|
CudaAdaptivePool(input_size, output_size, outputs[0], inputs[0], stream,
|
||||||
|
pooling_type_, "half", "half");
|
||||||
|
} else if (outputDesc[0].type == nvinfer1::DataType::kFLOAT) {
|
||||||
|
CudaAdaptivePool(input_size, output_size, outputs[0], inputs[0], stream,
|
||||||
|
pooling_type_, "half", "float");
|
||||||
|
}
|
||||||
|
} else if (inputDesc[0].type == nvinfer1::DataType::kFLOAT) {
|
||||||
|
CudaAdaptivePool(input_size, output_size, outputs[0], inputs[0], stream,
|
||||||
|
pooling_type_, "float", "float");
|
||||||
|
}
|
||||||
return cudaPeekAtLastError();
|
return cudaPeekAtLastError();
|
||||||
}
|
}
|
||||||
|
|
||||||
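The branches above translate TensorRT's tensor types into the dtype strings that CudaAdaptivePool expects. A hypothetical helper (not in this commit) expressing the same mapping, assuming only kFLOAT and kHALF ever reach the plugin, which is what supportsFormatCombination below now enforces:

#include <string>

#include "NvInfer.h"

// Hypothetical helper mirroring the enqueue() dispatch: kHALF maps to "half",
// kFLOAT (the only other type the plugin accepts) maps to "float".
inline std::string TrtTypeToDtype(nvinfer1::DataType t) {
  return (t == nvinfer1::DataType::kHALF) ? "half" : "float";
}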
@@ -106,7 +111,12 @@ nvinfer1::DataType AdaptivePool2d::getOutputDataType(
 bool AdaptivePool2d::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
     int nbOutputs) noexcept {
-  return (inOut[pos].format == nvinfer1::PluginFormat::kLINEAR);
+  if ((inOut[pos].format == nvinfer1::PluginFormat::kLINEAR) &&
+      (inOut[pos].type == nvinfer1::DataType::kFLOAT ||
+       inOut[pos].type == nvinfer1::DataType::kHALF)) {
+    return true;
+  }
+  return false;
 }
 
 int AdaptivePool2d::initialize() noexcept { return 0; }
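Finally, a hypothetical standalone check (not part of this commit) that runs the same 1x1x4x4 average pool through both the FP32 path and the FP16-input path and compares the two FP32 outputs on the host. Buffer handling, sizes, and the simple printout are illustrative assumptions:

// Compares CudaAdaptivePool's float32 path against its float16-input path on
// a tiny tensor; both runs write float32 outputs, so the difference reflects
// only the half-precision input representation.
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#include <cmath>
#include <cstdio>
#include <vector>

#include "adaptive_pool2d_kernel.h"

int main() {
  const int n = 16;  // one 1x1x4x4 feature map
  std::vector<float> h_in(n);
  std::vector<half> h_in_fp16(n);
  for (int i = 0; i < n; ++i) {
    h_in[i] = 0.1f * i;
    h_in_fp16[i] = __float2half(h_in[i]);
  }

  float *d_in_fp32, *d_out_fp32, *d_out_from_fp16;
  half* d_in_fp16;
  cudaMalloc(&d_in_fp32, n * sizeof(float));
  cudaMalloc(&d_in_fp16, n * sizeof(half));
  cudaMalloc(&d_out_fp32, sizeof(float));
  cudaMalloc(&d_out_from_fp16, sizeof(float));
  cudaMemcpy(d_in_fp32, h_in.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_fp16, h_in_fp16.data(), n * sizeof(half),
             cudaMemcpyHostToDevice);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  std::vector<int64_t> in_dims = {1, 1, 4, 4}, out_dims = {1, 1, 1, 1};
  fastdeploy::CudaAdaptivePool(in_dims, out_dims, d_out_fp32, d_in_fp32, stream,
                               "avg", "float", "float");
  fastdeploy::CudaAdaptivePool(in_dims, out_dims, d_out_from_fp16, d_in_fp16,
                               stream, "avg", "half", "float");
  cudaStreamSynchronize(stream);

  float ref = 0.0f, from_fp16 = 0.0f;
  cudaMemcpy(&ref, d_out_fp32, sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(&from_fp16, d_out_from_fp16, sizeof(float),
             cudaMemcpyDeviceToHost);
  printf("fp32: %f  fp16 input: %f  |diff|: %f\n", ref, from_fp16,
         std::fabs(ref - from_fp16));

  cudaFree(d_in_fp32);
  cudaFree(d_in_fp16);
  cudaFree(d_out_fp32);
  cudaFree(d_out_from_fp16);
  cudaStreamDestroy(stream);
  return 0;
}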