[Backend] cuda normalize and permute, cuda concat, optimized ppcls, ppdet & ppseg (#546)

* cuda normalize and permute, cuda concat

* add use cuda option for preprocessor

* ppyoloe use cuda normalize

* ppseg use cuda normalize

* add proclib cuda to processor base

* ppcls: add use-cuda preprocess api

* ppcls preprocessor set gpu id

* fix pybind

* refine ppcls preprocessing use gpu logic

* fdtensor device id is -1 by default

* refine assert message

Co-authored-by: heliqi <1101791222@qq.com>
This commit is contained in:
Wang Xinyu
2022-11-14 18:44:00 +08:00
committed by GitHub
parent 8dec2115d5
commit a36f5d3396
20 changed files with 204 additions and 26 deletions

View File

@@ -252,7 +252,8 @@ void FDTensor::FreeFn() {
}
}
void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes) {
void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes,
const Device& device, bool is_pinned_memory) {
if (device == Device::GPU) {
#ifdef WITH_GPU
FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) == 0,
@@ -295,7 +296,7 @@ FDTensor::FDTensor(const FDTensor& other)
size_t nbytes = Nbytes();
FDASSERT(ReallocFn(nbytes),
"The FastDeploy FDTensor allocate memory error");
CopyBuffer(buffer_, other.buffer_, nbytes);
CopyBuffer(buffer_, other.buffer_, nbytes, device, is_pinned_memory);
}
}
@@ -325,7 +326,7 @@ FDTensor& FDTensor::operator=(const FDTensor& other) {
} else {
Resize(other.shape, other.dtype, other.name, other.device);
size_t nbytes = Nbytes();
CopyBuffer(buffer_, other.buffer_, nbytes);
CopyBuffer(buffer_, other.buffer_, nbytes, device, is_pinned_memory);
}
external_data_ptr = other.external_data_ptr;
}