[Backend] TRT cast GPU input from int64 to int32, output from int32 to int64, and Windows support building CUDA files (#426)

* TRT cast int64 to int32

* windows cmake build cuda src

* fix windows cmake error when build cuda src

* add a notice in windows gpu build doc

* cmake add cuda std=11

* TRT cast output from int32 to int64

* nits

* trt get original input output dtype
This commit is contained in:
Wang Xinyu
2022-10-28 13:38:06 +08:00
committed by GitHub
parent 04704c8411
commit caa369f64a
9 changed files with 181 additions and 25 deletions

View File

@@ -0,0 +1,45 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/function/cuda_cast.h"
namespace fastdeploy {
// Element-wise type conversion kernel: out[i] = T_OUT(in[i]) for i in [0, edge).
//
// Expects a 1-D grid of 1-D blocks with one thread per element. Threads whose
// flat index is at or beyond `edge` do nothing, so the grid may overshoot the
// element count. Uses no shared memory.
template <typename T_IN, typename T_OUT>
__global__ void CudaCastKernel(const T_IN* in, T_OUT* out, int edge) {
  // Flat global index of this thread within the 1-D launch.
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < edge) {
    out[idx] = static_cast<T_OUT>(in[idx]);
  }
}
// Converts the elements of `in` into `out` on the GPU.
//
// Supported conversions are INT64 -> INT32 and INT32 -> INT64, selected from
// `in.dtype` and `out->dtype`; any other combination fails an FDASSERT.
// The kernel is enqueued on `stream` and this function does not synchronize,
// so the result is only valid once work on `stream` has completed.
//
// @param in      Source tensor; `in.Numel()` elements are converted.
// @param out     Destination tensor. Its dtype must already be set. This
//                function does not resize it, so its buffer presumably must
//                already hold `in.Numel()` elements (verify against callers).
// @param stream  CUDA stream the cast kernel is launched on.
void CudaCast(const FDTensor& in, FDTensor* out, cudaStream_t stream) {
  const bool narrow =
      in.dtype == FDDataType::INT64 && out->dtype == FDDataType::INT32;
  const bool widen =
      in.dtype == FDDataType::INT32 && out->dtype == FDDataType::INT64;
  FDASSERT(narrow || widen,
           "CudaCast only supports INT64 -> INT32 and INT32 -> INT64.");

  const int jobs = in.Numel();
  // Nothing to convert. A launch with a zero-sized grid would be rejected by
  // the CUDA runtime as an invalid configuration, so bail out early.
  if (jobs <= 0) return;

  const int threads = 256;
  // Integer ceil-division. Going through float loses precision once `jobs`
  // exceeds 2^24 and can leave the tail of the buffer unconverted. Written as
  // (jobs - 1) / threads + 1 so it cannot overflow for jobs near INT_MAX.
  const int blocks = (jobs - 1) / threads + 1;

  if (narrow) {
    CudaCastKernel<int64_t, int32_t><<<blocks, threads, 0, stream>>>(
        static_cast<const int64_t*>(in.Data()),
        static_cast<int32_t*>(out->MutableData()), jobs);
  } else {
    CudaCastKernel<int32_t, int64_t><<<blocks, threads, 0, stream>>>(
        static_cast<const int32_t*>(in.Data()),
        static_cast<int64_t*>(out->MutableData()), jobs);
  }
  // Kernel launches do not return a status; launch failures are reported
  // through cudaGetLastError().
  FDASSERT(cudaGetLastError() == cudaSuccess,
           "CudaCast failed to launch the cast kernel.");
}
} // namespace fastdeploy

View File

@@ -0,0 +1,29 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "fastdeploy/core/fd_tensor.h"
namespace fastdeploy {
/** Cast the element type of the data in a GPU buffer.

    The conversion is chosen from the dtypes of the two tensors. The
    implementation handles INT64 -> INT32 and INT32 -> INT64; any other
    combination fails an FDASSERT. The work is launched on `stream` without
    synchronizing, so the output is ready only after the stream has finished.

@param in The input tensor; in.Numel() elements are converted.
@param out The output tensor. Its dtype selects the target type and must be
           set before the call. The implementation does not resize it, so it
           presumably must already provide room for in.Numel() elements
           (TODO confirm against callers).
@param stream CUDA stream on which the cast kernel is launched.
*/
FASTDEPLOY_DECL void CudaCast(const FDTensor& in, FDTensor* out,
cudaStream_t stream);
} // namespace fastdeploy