// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "fastdeploy/core/fd_tensor.h"

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <numeric>

#include "fastdeploy/core/float16.h"
#include "fastdeploy/utils/utils.h"
#ifdef WITH_GPU
#include <cuda_runtime_api.h>
#endif

namespace fastdeploy {

void* FDTensor::MutableData() {
  if (external_data_ptr != nullptr) {
    return external_data_ptr;
  }
  return buffer_;
}

void* FDTensor::Data() {
  if (external_data_ptr != nullptr) {
    return external_data_ptr;
  }
  return buffer_;
}

const void* FDTensor::Data() const {
  if (external_data_ptr != nullptr) {
    return external_data_ptr;
  }
  return buffer_;
}

void FDTensor::StopSharing() {
  if (IsShared()) {
    ReallocFn(Nbytes());
    CopyBuffer(buffer_, external_data_ptr, Nbytes());
    external_data_ptr = nullptr;
  }
}

const void* FDTensor::CpuData() const {
  if (device == Device::GPU) {
#ifdef WITH_GPU
    auto* cpu_ptr = const_cast<std::vector<int8_t>*>(&temporary_cpu_buffer);
    cpu_ptr->resize(Nbytes());
    // Need to copy the CUDA memory to the CPU first.
    if (external_data_ptr != nullptr) {
      FDASSERT(cudaMemcpy(cpu_ptr->data(), external_data_ptr, Nbytes(),
                          cudaMemcpyDeviceToHost) == cudaSuccess,
               "[ERROR] Error occurred while copying memory from GPU to CPU");
    } else {
      FDASSERT(cudaMemcpy(cpu_ptr->data(), buffer_, Nbytes(),
                          cudaMemcpyDeviceToHost) == cudaSuccess,
               "[ERROR] Error occurred while copying buffer memory from GPU "
               "to CPU");
    }
    return cpu_ptr->data();
#else
    FDASSERT(false,
             "FastDeploy was not compiled with -DWITH_GPU=ON, so this is an "
             "unexpected problem.");
#endif
  }
  return Data();
}

void FDTensor::SetExternalData(const std::vector<int64_t>& new_shape,
                               const FDDataType& data_type, void* data_buffer,
                               const Device& new_device, int new_device_id) {
  dtype = data_type;
  shape.assign(new_shape.begin(), new_shape.end());
  external_data_ptr = data_buffer;
  device = new_device;
  device_id = new_device_id;
}

void FDTensor::ExpandDim(int64_t axis) {
  size_t ndim = shape.size();
  FDASSERT(axis >= 0 && axis <= static_cast<int64_t>(ndim),
           "The allowed 'axis' must be in range of [0, %lu]!", ndim);
  shape.insert(shape.begin() + axis, 1);
}

void FDTensor::Squeeze(int64_t axis) {
  size_t ndim = shape.size();
  FDASSERT(axis >= 0 && axis < static_cast<int64_t>(ndim),
           "The allowed 'axis' must be in range of [0, %lu)!", ndim);
  FDASSERT(shape[axis] == 1,
           "The No.%ld dimension of shape should be 1, but it is %ld!",
           (long)axis, (long)shape[axis]);
  shape.erase(shape.begin() + axis);
}

void FDTensor::Allocate(const std::vector<int64_t>& new_shape,
                        const FDDataType& data_type,
                        const std::string& tensor_name,
                        const Device& new_device) {
  dtype = data_type;
  name = tensor_name;
  shape.assign(new_shape.begin(), new_shape.end());
  device = new_device;
  size_t nbytes = Nbytes();
  FDASSERT(ReallocFn(nbytes),
           "The FastDeploy FDTensor failed to allocate memory.");
}

int FDTensor::Nbytes() const { return Numel() * FDDataTypeSize(dtype); }

int FDTensor::Numel() const {
  return std::accumulate(shape.begin(), shape.end(), 1,
                         std::multiplies<int>());
}

void FDTensor::Resize(size_t new_nbytes) { ReallocFn(new_nbytes); }
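// A minimal usage sketch of the two ways to bind data to an FDTensor
// (illustrative comment only; `user_data` is a hypothetical caller-side
// buffer, not part of FastDeploy):
//
//   FDTensor t("input");
//   // Owned allocation: the tensor manages buffer_ and frees it in FreeFn().
//   t.Allocate({1, 3, 224, 224}, FDDataType::FP32, "input", Device::CPU);
//
//   // Zero-copy view: the pointer is recorded in external_data_ptr and is
//   // never freed by the tensor; StopSharing() later takes an owned copy.
//   std::vector<float> user_data(1 * 3 * 224 * 224);
//   t.SetExternalData({1, 3, 224, 224}, FDDataType::FP32, user_data.data(),
//                     Device::CPU);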
void FDTensor::Resize(const std::vector<int64_t>& new_shape) {
  int numel = Numel();
  int new_numel = std::accumulate(new_shape.begin(), new_shape.end(), 1,
                                  std::multiplies<int>());
  // Only grow the buffer; a shrinking resize keeps the old allocation.
  if (new_numel > numel || external_data_ptr != nullptr) {
    size_t nbytes = new_numel * FDDataTypeSize(dtype);
    ReallocFn(nbytes);
  }
  shape.assign(new_shape.begin(), new_shape.end());
  external_data_ptr = nullptr;
}

void FDTensor::SetQuantizationInfo(int32_t zero_point,
                                   std::vector<float>& scale) {
  quantized_parameter_.first = zero_point;
  quantized_parameter_.second = scale;
}

void FDTensor::Resize(const std::vector<int64_t>& new_shape,
                      const FDDataType& data_type,
                      const std::string& tensor_name,
                      const Device& new_device) {
  external_data_ptr = nullptr;
  name = tensor_name;
  device = new_device;
  dtype = data_type;
  int new_nbytes = std::accumulate(new_shape.begin(), new_shape.end(), 1,
                                   std::multiplies<int>()) *
                   FDDataTypeSize(data_type);
  ReallocFn(new_nbytes);
  shape.assign(new_shape.begin(), new_shape.end());
}

bool FDTensor::Reshape(const std::vector<int64_t>& new_shape) {
  int numel = Numel();
  const int64_t unk_dim_val = -1;
  const int64_t copy_dim_val = 0;
  std::vector<int64_t> output_shape(new_shape.size(), 0);
  int64_t capacity = 1;
  int unk_dim_idx = -1;
  for (size_t i = 0; i < new_shape.size(); ++i) {
    if (new_shape[i] == unk_dim_val) {
      FDASSERT(unk_dim_idx == -1,
               "Only one dimension value of 'shape' in ReshapeOp can "
               "be -1. But received shape = [%s], shape[%d] is also -1.",
               Str(new_shape).c_str(), i);
      unk_dim_idx = i;
    } else if (new_shape[i] == copy_dim_val) {
      FDASSERT(i < shape.size(),
               "The index of 0 in `shape` must be less than "
               "the input tensor X's dimensions. "
               "But received shape = [%s], shape[%d] = 0, X's shape = [%s], "
               "X's dimensions = %d.",
               Str(new_shape).c_str(), i, Str(shape).c_str(), shape.size());
    } else {
      FDASSERT(new_shape[i] > 0,
               "Each dimension value of 'shape' in ReshapeOp must not "
               "be negative except one unknown dimension. "
               "But received shape = [%s], shape[%d] = %d.",
               Str(new_shape).c_str(), i, new_shape[i]);
    }
    capacity *= (new_shape[i] ? new_shape[i] : shape[i]);
    output_shape[i] = (new_shape[i] ? new_shape[i] : shape[i]);
  }
  if (unk_dim_idx != -1) {
    // `capacity` is negative here (it absorbed the -1 factor), so the
    // inferred dimension is numel / |capacity|.
    output_shape[unk_dim_idx] = -numel / capacity;
    FDASSERT(output_shape[unk_dim_idx] * capacity == -numel,
             "The 'shape' attribute in ReshapeOp is invalid. "
             "The input tensor X's size must be divisible by the known "
             "capacity of 'shape'. "
             "But received X's shape = [%s], X's size = %d, "
             "'shape' is [%s], known capacity of 'shape' is %d.",
             Str(shape).c_str(), numel, Str(new_shape).c_str(), capacity);
  } else {
    FDASSERT(numel == capacity,
             "The 'shape' in ReshapeOp is invalid. "
             "The input tensor X's size must be equal to the capacity of "
             "'shape'. "
             "But received X's shape = [%s], X's size = %d, 'shape' is "
             "[%s], the capacity of 'shape' is %d.",
             Str(shape).c_str(), numel, Str(new_shape).c_str(), capacity);
  }
  shape = output_shape;
  return true;
}
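// Reshape convention sketch (mirrors the checks above): -1 infers one
// dimension from Numel(), 0 copies the input dimension at the same index.
// Illustrative values for a tensor `t` whose current shape is [2, 3, 4]:
//
//   t.Reshape({-1, 4});    // -> [6, 4]    (-1 inferred as 24 / 4)
//   t.Reshape({0, 0, 4});  // -> [2, 3, 4] (0 copies dims 0 and 1)
//   t.Reshape({4, 6});     // -> [4, 6]    (capacity must equal Numel())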
" "But received X's shape = [%s], X's size = %d, 'shape' is " "[%s], the capacity of 'shape' is %d.", Str(shape).c_str(), numel, Str(shape).c_str(), capacity); } shape = output_shape; return true; } template void CalculateStatisInfo(const void* src_ptr, int size, double* mean, double* max, double* min) { const T* ptr = static_cast(src_ptr); *mean = 0; *max = -99999999; *min = 99999999; for (int i = 0; i < size; ++i) { if (*(ptr + i) > *max) { *max = *(ptr + i); } if (*(ptr + i) < *min) { *min = *(ptr + i); } *mean += *(ptr + i); } *mean = *mean / size; } void FDTensor::PrintInfo(const std::string& prefix) const { double mean = 0; double max = -99999999; double min = 99999999; if (dtype == FDDataType::FP32) { CalculateStatisInfo(CpuData(), Numel(), &mean, &max, &min); } else if (dtype == FDDataType::FP64) { CalculateStatisInfo(CpuData(), Numel(), &mean, &max, &min); } else if (dtype == FDDataType::INT8) { CalculateStatisInfo(CpuData(), Numel(), &mean, &max, &min); } else if (dtype == FDDataType::UINT8) { CalculateStatisInfo(CpuData(), Numel(), &mean, &max, &min); } else if (dtype == FDDataType::INT32) { CalculateStatisInfo(CpuData(), Numel(), &mean, &max, &min); } else if (dtype == FDDataType::INT64) { CalculateStatisInfo(CpuData(), Numel(), &mean, &max, &min); } else if (dtype == FDDataType::FP16) { CalculateStatisInfo(CpuData(), Numel(), &mean, &max, &min); } else { FDASSERT(false, "PrintInfo function doesn't support current situation, maybe you " "need enhance this function now."); } std::cout << prefix << ": name=" << name << ", shape="; for (int i = 0; i < shape.size(); ++i) { std::cout << shape[i] << " "; } std::cout << ", dtype=" << Str(dtype) << ", mean=" << mean << ", max=" << max << ", min=" << min << std::endl; } bool FDTensor::ReallocFn(size_t nbytes) { if (device == Device::GPU) { #ifdef WITH_GPU size_t original_nbytes = Nbytes(); if (nbytes > original_nbytes) { if (buffer_ != nullptr) { FDDeviceFree()(buffer_); } FDDeviceAllocator()(&buffer_, nbytes); } return buffer_ != nullptr; #else FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under " "-DWITH_GPU=ON," "so this is an unexpected problem happend."); #endif } else { if (is_pinned_memory) { #ifdef WITH_GPU size_t original_nbytes = Nbytes(); if (nbytes > original_nbytes) { if (buffer_ != nullptr) { FDDeviceHostFree()(buffer_); } FDDeviceHostAllocator()(&buffer_, nbytes); } return buffer_ != nullptr; #else FDASSERT(false, "The FastDeploy FDTensor allocator didn't compile under " "-DWITH_GPU=ON," "so this is an unexpected problem happend."); #endif } buffer_ = realloc(buffer_, nbytes); return buffer_ != nullptr; } } void FDTensor::FreeFn() { if (external_data_ptr != nullptr) external_data_ptr = nullptr; if (buffer_ != nullptr) { if (device == Device::GPU) { #ifdef WITH_GPU FDDeviceFree()(buffer_); #endif } else { if (is_pinned_memory) { #ifdef WITH_GPU FDDeviceHostFree()(buffer_); #endif } else { FDHostFree()(buffer_); } } buffer_ = nullptr; } } // TODO(liqi): no src_device and dst_device // should support copy from cpu or gpu to cpu or gpu void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes, const Device& device, bool is_pinned_memory) { if (device == Device::GPU) { #ifdef WITH_GPU FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) == 0, "[ERROR] Error occurs while copy memory from GPU to GPU"); #else FDASSERT(false, "The FastDeploy didn't compile under -DWITH_GPU=ON, so copying " "gpu buffer is " "an unexpected problem happend."); #endif } else { if (is_pinned_memory) { #ifdef 
// TODO(liqi): no src_device and dst_device
// should support copy from cpu or gpu to cpu or gpu
void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes,
                          const Device& device, bool is_pinned_memory) {
  if (device == Device::GPU) {
#ifdef WITH_GPU
    FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) ==
                 cudaSuccess,
             "[ERROR] Error occurred while copying memory from GPU to GPU");
#else
    FDASSERT(false,
             "FastDeploy was not compiled with -DWITH_GPU=ON, so copying a "
             "GPU buffer is an unexpected problem.");
#endif
  } else {
    if (is_pinned_memory) {
#ifdef WITH_GPU
      FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToHost) ==
                   cudaSuccess,
               "[ERROR] Error occurred while copying memory from host to "
               "host");
#else
      FDASSERT(false,
               "FastDeploy was not compiled with -DWITH_GPU=ON, so copying a "
               "pinned-host buffer is an unexpected problem.");
#endif
    } else {
      std::memcpy(dst, src, nbytes);
    }
  }
}

FDTensor::FDTensor(const std::string& tensor_name) { name = tensor_name; }

FDTensor::FDTensor(const char* tensor_name) { name = tensor_name; }

FDTensor::FDTensor(const Scalar& scalar) {
  Allocate({1}, scalar.dtype());
  switch (scalar.dtype()) {
    case FDDataType::BOOL:
      (reinterpret_cast<bool*>(Data()))[0] = scalar.to<bool>();
      break;
    case FDDataType::UINT8:
      (reinterpret_cast<uint8_t*>(Data()))[0] = scalar.to<uint8_t>();
      break;
    case FDDataType::INT8:
      (reinterpret_cast<int8_t*>(Data()))[0] = scalar.to<int8_t>();
      break;
    case FDDataType::INT16:
      (reinterpret_cast<int16_t*>(Data()))[0] = scalar.to<int16_t>();
      break;
    case FDDataType::INT32:
      (reinterpret_cast<int32_t*>(Data()))[0] = scalar.to<int32_t>();
      break;
    case FDDataType::INT64:
      (reinterpret_cast<int64_t*>(Data()))[0] = scalar.to<int64_t>();
      break;
    case FDDataType::FP16:
      (reinterpret_cast<float16*>(Data()))[0] = scalar.to<float16>();
      break;
    case FDDataType::FP32:
      (reinterpret_cast<float*>(Data()))[0] = scalar.to<float>();
      break;
    case FDDataType::FP64:
      (reinterpret_cast<double*>(Data()))[0] = scalar.to<double>();
      break;
    default:
      break;
  }
}

FDTensor::FDTensor(const FDTensor& other)
    : shape(other.shape),
      name(other.name),
      dtype(other.dtype),
      device(other.device),
      external_data_ptr(other.external_data_ptr),
      device_id(other.device_id) {
  // Copy buffer
  if (other.buffer_ == nullptr) {
    buffer_ = nullptr;
  } else {
    size_t nbytes = Nbytes();
    FDASSERT(ReallocFn(nbytes),
             "The FastDeploy FDTensor failed to allocate memory.");
    CopyBuffer(buffer_, other.buffer_, nbytes, device, is_pinned_memory);
  }
}

FDTensor::FDTensor(FDTensor&& other)
    : buffer_(other.buffer_),
      shape(std::move(other.shape)),
      name(std::move(other.name)),
      dtype(other.dtype),
      external_data_ptr(other.external_data_ptr),
      device(other.device),
      device_id(other.device_id) {
  other.name = "";
  // Note(zhoushunjie): Avoid double free.
  other.buffer_ = nullptr;
  other.external_data_ptr = nullptr;
}

FDTensor& FDTensor::operator=(const FDTensor& other) {
  if (&other != this) {
    // Copy buffer
    device_id = other.device_id;
    if (other.buffer_ == nullptr) {
      FreeFn();
      buffer_ = nullptr;
      shape = other.shape;
      name = other.name;
      dtype = other.dtype;
      device = other.device;
    } else {
      Resize(other.shape, other.dtype, other.name, other.device);
      size_t nbytes = Nbytes();
      CopyBuffer(buffer_, other.buffer_, nbytes, device, is_pinned_memory);
    }
    external_data_ptr = other.external_data_ptr;
  }
  return *this;
}

FDTensor& FDTensor::operator=(FDTensor&& other) {
  if (&other != this) {
    FreeFn();
    buffer_ = other.buffer_;
    external_data_ptr = other.external_data_ptr;
    shape = std::move(other.shape);
    name = std::move(other.name);
    dtype = other.dtype;
    device = other.device;
    device_id = other.device_id;
    other.name = "";
    // Note(zhoushunjie): Avoid double free.
    other.buffer_ = nullptr;
    other.external_data_ptr = nullptr;
  }
  return *this;
}

const std::pair<int32_t, std::vector<float>> FDTensor::GetQuantizationInfo()
    const {
  return quantized_parameter_;
}

}  // namespace fastdeploy
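// Copy vs. move semantics sketch (illustrative): copying deep-copies the
// owned buffer through CopyBuffer(), while moving steals buffer_ and
// external_data_ptr and nulls them in the source to avoid a double free.
// Assumes the header's default arguments for Allocate(), as used by the
// Scalar constructor above:
//
//   fastdeploy::FDTensor a("a");
//   a.Allocate({2, 2}, fastdeploy::FDDataType::FP32);
//   fastdeploy::FDTensor b = a;             // deep copy of the 16-byte buffer
//   fastdeploy::FDTensor c = std::move(a);  // a.buffer_ is now nullptr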