[Backend] TRT cast GPU input from int64 to int32, output from int32 to int64, and Windows support building CUDA files (#426)

* TRT cast int64 to int32 * windows cmake build cuda src * fix windows cmake error when build cuda src * add a notice in windows gpu build doc * cmake add cuda std=11 * TRT cast output from int32 to int64 * nits * trt get original input output dtype
2025-10-05 16:48:03 +08:00 · 2022-10-28 13:38:06 +08:00
parent 04704c8411
commit caa369f64a
9 changed files with 181 additions and 25 deletions
--- a/fastdeploy/function/cuda_cast.cu
+++ b/fastdeploy/function/cuda_cast.cu
@@ -0,0 +1,45 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/function/cuda_cast.h"
+
+namespace fastdeploy {
+
+// Element-wise cast kernel: every thread converts at most one element.
+//   in   - source buffer holding values of type T_IN
+//   out  - destination buffer receiving the values converted to T_OUT
+//   edge - number of elements to convert; threads whose flat index is
+//          greater than or equal to edge do nothing
+// The flat index is derived from the x components of the block and thread
+// indices only, so a 1D launch covering at least `edge` threads is expected.
+template <typename T_IN, typename T_OUT>
+__global__ void CudaCastKernel(const T_IN* in, T_OUT* out, int edge) {
+  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx < edge) {
+    out[idx] = static_cast<T_OUT>(in[idx]);
+  }
+}
+
+// Casts the elements of `in` into `out` by launching CudaCastKernel on
+// `stream`. Only the dtype pairs INT64 -> INT32 and INT32 -> INT64 are
+// supported; any other combination fails an FDASSERT.
+//   in     - input tensor; in.Numel() elements are read from in.Data()
+//   out    - output tensor; its dtype selects the cast direction and the
+//            results are written to out->MutableData(). This function does
+//            not resize or allocate `out`.
+//            NOTE(review): presumably the caller has already allocated at
+//            least in.Numel() elements in `out` - confirm at call sites.
+//   stream - CUDA stream the kernel is enqueued on. No synchronization is
+//            performed here, so the caller must synchronize before reading
+//            the result.
+void CudaCast(const FDTensor& in, FDTensor* out, cudaStream_t stream) {
+  FDASSERT(out != nullptr, "CudaCast requires a non-null output tensor.");
+
+  const bool int64_to_int32 =
+      in.dtype == FDDataType::INT64 && out->dtype == FDDataType::INT32;
+  const bool int32_to_int64 =
+      in.dtype == FDDataType::INT32 && out->dtype == FDDataType::INT64;
+  FDASSERT(int64_to_int32 || int32_to_int64,
+           "CudaCast only supports INT64 -> INT32 and INT32 -> INT64.");
+
+  const int jobs = in.Numel();
+  // A grid with zero blocks is an invalid launch configuration, so an empty
+  // tensor is handled without touching the device.
+  if (jobs <= 0) {
+    return;
+  }
+
+  const int threads = 256;
+  // Integer ceil-division. Going through float would lose precision once the
+  // element count exceeds 2^24 and could leave the tail unconverted. The sum
+  // is widened to 64 bits so that jobs + threads - 1 cannot overflow.
+  const int blocks =
+      static_cast<int>((static_cast<int64_t>(jobs) + threads - 1) / threads);
+
+  if (int64_to_int32) {
+    CudaCastKernel<int64_t, int32_t><<<blocks, threads, 0, stream>>>(
+        static_cast<const int64_t*>(in.Data()),
+        static_cast<int32_t*>(out->MutableData()),
+        jobs);
+  } else {
+    CudaCastKernel<int32_t, int64_t><<<blocks, threads, 0, stream>>>(
+        static_cast<const int32_t*>(in.Data()),
+        static_cast<int64_t*>(out->MutableData()),
+        jobs);
+  }
+
+  // Kernel launches do not return a status; configuration errors only
+  // surface through cudaGetLastError().
+  const cudaError_t status = cudaGetLastError();
+  FDASSERT(status == cudaSuccess, "CudaCast kernel launch failed: %s",
+           cudaGetErrorString(status));
+}
+
+}  // namespace fastdeploy
--- a/fastdeploy/function/cuda_cast.h
+++ b/fastdeploy/function/cuda_cast.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fastdeploy/core/fd_tensor.h"
+
+namespace fastdeploy {
+
+/** Cast the type of the data in GPU buffer.
+    The cast direction is selected from the dtypes of the two tensors. The
+    implementation in cuda_cast.cu handles INT64 -> INT32 and INT32 -> INT64
+    and asserts on any other combination.
+    The work is enqueued on `stream` and this function does not synchronize,
+    so the caller must synchronize the stream before reading `out`.
+    @param in The input tensor. Its dtype and element count determine what
+              is read.
+    @param out The output tensor. Its dtype must already be set to the
+               target type; the results are written into its existing
+               buffer. NOTE(review): presumably it must already be allocated
+               with as many elements as `in` - confirm at call sites.
+    @param stream CUDA stream on which the cast kernel is launched.
+*/
+FASTDEPLOY_DECL void CudaCast(const FDTensor& in, FDTensor* out,
+                              cudaStream_t stream);
+
+}  // namespace fastdeploy