[Backend] cuda normalize and permute, cuda concat, optimized ppcls, ppdet & ppseg (#546)

* cuda normalize and permute, cuda concat

* add a use-CUDA option for the preprocessor

* ppyoloe use cuda normalize

* ppseg use cuda normalize

* add proclib cuda in processor base

* add use-CUDA preprocess API to ppcls

* ppcls preprocessor set gpu id

* fix pybind

* refine ppcls preprocessing GPU-usage logic

* FDTensor device id is -1 by default

* refine assert message

Co-authored-by: heliqi <1101791222@qq.com>
This commit is contained in:
Wang Xinyu
2022-11-14 18:44:00 +08:00
committed by GitHub
parent 8dec2115d5
commit a36f5d3396
20 changed files with 204 additions and 26 deletions

View File

@@ -85,8 +85,9 @@ struct ConcatFunctor {
int64_t col_len = input_cols[j];
const T* input_data = reinterpret_cast<const T*>(input[j].Data());
for (int64_t k = 0; k < out_rows; ++k) {
std::memcpy(output_data + k * out_cols + col_idx,
input_data + k * col_len, sizeof(T) * col_len);
FDTensor::CopyBuffer(output_data + k * out_cols + col_idx,
input_data + k * col_len, sizeof(T) * col_len,
input[j].device, input[j].is_pinned_memory);
}
col_idx += col_len;
}
@@ -97,7 +98,8 @@ template <typename T>
void ConcatKernel(const std::vector<FDTensor>& input, FDTensor* output,
int axis) {
auto output_shape = ComputeAndCheckConcatOutputShape(input, axis);
output->Allocate(output_shape, TypeToDataType<T>::dtype);
output->Resize(output_shape, TypeToDataType<T>::dtype, output->name,
input[0].device);
ConcatFunctor<T> functor;
functor(input, axis, output);
@@ -115,10 +117,9 @@ void Concat(const std::vector<FDTensor>& x, FDTensor* out, int axis) {
if (axis < 0) {
axis += rank;
}
FDTensor out_temp;
FD_VISIT_ALL_TYPES(x[0].dtype, "Concat",
([&] { ConcatKernel<data_t>(x, &out_temp, axis); }));
*out = std::move(out_temp);
([&] { ConcatKernel<data_t>(x, out, axis); }));
}
} // namespace function