[Backend] cuda normalize and permute, cuda concat, optimized ppcls, ppdet & ppseg (#546)

* cuda normalize and permute, cuda concat

* add a use-CUDA option for the preprocessor

* ppyoloe use cuda normalize

* ppseg use cuda normalize

* add proclib cuda in processor base

* add use-CUDA preprocess API to ppcls

* ppcls preprocessor set gpu id

* fix pybind

* refine ppcls preprocessing GPU-usage logic

* FDTensor device id is -1 by default

* refine assert message

Co-authored-by: heliqi <1101791222@qq.com>
This commit is contained in:
Wang Xinyu
2022-11-14 18:44:00 +08:00
committed by GitHub
parent 8dec2115d5
commit a36f5d3396
20 changed files with 204 additions and 26 deletions

View File

@@ -85,8 +85,9 @@ struct ConcatFunctor {
int64_t col_len = input_cols[j];
const T* input_data = reinterpret_cast<const T*>(input[j].Data());
for (int64_t k = 0; k < out_rows; ++k) {
std::memcpy(output_data + k * out_cols + col_idx,
input_data + k * col_len, sizeof(T) * col_len);
FDTensor::CopyBuffer(output_data + k * out_cols + col_idx,
input_data + k * col_len, sizeof(T) * col_len,
input[j].device, input[j].is_pinned_memory);
}
col_idx += col_len;
}
@@ -97,7 +98,8 @@ template <typename T>
void ConcatKernel(const std::vector<FDTensor>& input, FDTensor* output,
int axis) {
auto output_shape = ComputeAndCheckConcatOutputShape(input, axis);
output->Allocate(output_shape, TypeToDataType<T>::dtype);
output->Resize(output_shape, TypeToDataType<T>::dtype, output->name,
input[0].device);
ConcatFunctor<T> functor;
functor(input, axis, output);
@@ -115,10 +117,9 @@ void Concat(const std::vector<FDTensor>& x, FDTensor* out, int axis) {
if (axis < 0) {
axis += rank;
}
FDTensor out_temp;
FD_VISIT_ALL_TYPES(x[0].dtype, "Concat",
([&] { ConcatKernel<data_t>(x, &out_temp, axis); }));
*out = std::move(out_temp);
([&] { ConcatKernel<data_t>(x, out, axis); }));
}
} // namespace function